Safekeeper peer recovery rfc.

Bump cryptography from 41.0.2 to 41.0.3 (#4870 )
Tar Remote Extensions (#4715 )
2026-05-21 07:00:38 +00:00 · 2023-08-03 14:15:14 +03:00 · 2023-08-02 14:10:36 +03:00 · 2023-08-02 12:38:12 +03:00 · 2023-08-01 19:18:25 +03:00 · 2023-08-01 18:28:18 +03:00
69 changed files with 2595 additions and 669 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -740,6 +740,9 @@ name = "cc"
 version = "1.0.79"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f"
+dependencies = [
+ "jobserver",
+]

 [[package]]
 name = "cexpr"
@@ -907,12 +910,14 @@ dependencies = [
 "opentelemetry",
 "postgres",
 "regex",
+ "remote_storage",
 "reqwest",
 "serde",
 "serde_json",
 "tar",
 "tokio",
 "tokio-postgres",
+ "toml_edit",
 "tracing",
 "tracing-opentelemetry",
 "tracing-subscriber",
@@ -920,6 +925,7 @@ dependencies = [
 "url",
 "utils",
 "workspace_hack",
+ "zstd",
 ]

 [[package]]
@@ -980,6 +986,7 @@ dependencies = [
 "tar",
 "thiserror",
 "toml",
+ "tracing",
 "url",
 "utils",
 "workspace_hack",
@@ -1972,6 +1979,15 @@ version = "1.0.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "453ad9f582a441959e5f0d088b02ce04cfe8d51a8eaf077f12ac6d3e94164ca6"

+[[package]]
+name = "jobserver"
+version = "0.1.26"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "936cfd212a0155903bcbc060e316fb6cc7cbf2e1907329391ebadc1fe0ce77c2"
+dependencies = [
+ "libc",
+]
+
 [[package]]
 name = "js-sys"
 version = "0.3.63"
@@ -5296,6 +5312,7 @@ version = "0.1.0"
 dependencies = [
 "anyhow",
 "bytes",
+ "cc",
 "chrono",
 "clap",
 "clap_builder",
@@ -5396,3 +5413,33 @@ name = "zeroize"
 version = "1.6.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2a0956f1ba7c7909bfb66c2e9e4124ab6f6482560f6628b5aaeba39207c9aad9"
+
+[[package]]
+name = "zstd"
+version = "0.12.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1a27595e173641171fc74a1232b7b1c7a7cb6e18222c11e9dfb9888fa424c53c"
+dependencies = [
+ "zstd-safe",
+]
+
+[[package]]
+name = "zstd-safe"
+version = "6.0.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ee98ffd0b48ee95e6c5168188e44a54550b1564d9d530ee21d5f0eaed1069581"
+dependencies = [
+ "libc",
+ "zstd-sys",
+]
+
+[[package]]
+name = "zstd-sys"
+version = "2.0.8+zstd.1.5.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5556e6ee25d32df2586c098bbfa278803692a20d0ab9565e049480d52707ec8c"
+dependencies = [
+ "cc",
+ "libc",
+ "pkg-config",
+]
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -551,10 +551,8 @@ FROM build-deps AS pg-embedding-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 ENV PATH "/usr/local/pgsql/bin/:$PATH"
-# eeb3ba7c3a60c95b2604dd543c64b2f1bb4a3703 made on 15/07/2023
-# There is no release tag yet
-RUN wget https://github.com/neondatabase/pg_embedding/archive/eeb3ba7c3a60c95b2604dd543c64b2f1bb4a3703.tar.gz -O pg_embedding.tar.gz && \
-    echo "030846df723652f99a8689ce63b66fa0c23477a7fd723533ab8a6b28ab70730f pg_embedding.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/neondatabase/pg_embedding/archive/refs/tags/0.3.1.tar.gz -O pg_embedding.tar.gz && \
+    echo "c4ae84eef36fa8ec5868f6e061f39812f19ee5ba3604d428d40935685c7be512 pg_embedding.tar.gz" | sha256sum --check && \
    mkdir pg_embedding-src && cd pg_embedding-src && tar xvzf ../pg_embedding.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -32,3 +32,6 @@ url.workspace = true
 compute_api.workspace = true
 utils.workspace = true
 workspace_hack.workspace = true
+toml_edit.workspace = true
+remote_storage = { version = "0.1", path = "../libs/remote_storage/" }
+zstd = "0.12.4"
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -5,6 +5,8 @@
 //! - `compute_ctl` accepts cluster (compute node) specification as a JSON file.
 //! - Every start is a fresh start, so the data directory is removed and
 //!   initialized again on each run.
+//! - If `--remote-ext-config` is provided, it is used to fetch the list of available
+//!   extensions and to download `shared_preload_libraries` from remote storage.
 //! - Next it will put configuration files into the `PGDATA` directory.
 //! - Sync safekeepers and get commit LSN.
 //! - Get `basebackup` from pageserver using the returned on the previous step LSN.
@@ -27,7 +29,8 @@
 //! compute_ctl -D /var/db/postgres/compute \
 //!             -C 'postgresql://cloud_admin@localhost/postgres' \
 //!             -S /var/db/postgres/specs/current.json \
-//!             -b /usr/local/bin/postgres
+//!             -b /usr/local/bin/postgres \
+//!             -r '{"bucket": "neon-dev-extensions-eu-central-1", "region": "eu-central-1"}'
 //! ```
 //!
 use std::collections::HashMap;
@@ -35,7 +38,7 @@ use std::fs::File;
 use std::panic;
 use std::path::Path;
 use std::process::exit;
-use std::sync::{mpsc, Arc, Condvar, Mutex};
+use std::sync::{mpsc, Arc, Condvar, Mutex, OnceLock, RwLock};
 use std::{thread, time::Duration};

 use anyhow::{Context, Result};
@@ -48,22 +51,33 @@ use compute_api::responses::ComputeStatus;

 use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec};
 use compute_tools::configurator::launch_configurator;
+use compute_tools::extension_server::{get_pg_version, init_remote_storage};
 use compute_tools::http::api::launch_http_server;
 use compute_tools::logger::*;
 use compute_tools::monitor::launch_monitor;
 use compute_tools::params::*;
 use compute_tools::spec::*;

-const BUILD_TAG_DEFAULT: &str = "local";
+// This is an arbitrary build tag. It is only used as a default (e.g. for testing)
+// when the BUILD_TAG environment variable is not set at compile time.
+const BUILD_TAG_DEFAULT: &str = "5670669815";

 fn main() -> Result<()> {
    init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;

-    let build_tag = option_env!("BUILD_TAG").unwrap_or(BUILD_TAG_DEFAULT);
-
+    let build_tag = option_env!("BUILD_TAG")
+        .unwrap_or(BUILD_TAG_DEFAULT)
+        .to_string();
    info!("build_tag: {build_tag}");

    let matches = cli().get_matches();
+    let pgbin_default = String::from("postgres");
+    let pgbin = matches.get_one::<String>("pgbin").unwrap_or(&pgbin_default);
+
+    let remote_ext_config = matches.get_one::<String>("remote-ext-config");
+    let ext_remote_storage = remote_ext_config.map(|x| {
+        init_remote_storage(x).expect("cannot initialize remote extension storage from config")
+    });

    let http_port = *matches
        .get_one::<u16>("http-port")
@@ -128,9 +142,6 @@ fn main() -> Result<()> {
    let compute_id = matches.get_one::<String>("compute-id");
    let control_plane_uri = matches.get_one::<String>("control-plane-uri");

-    // Try to use just 'postgres' if no path is provided
-    let pgbin = matches.get_one::<String>("pgbin").unwrap();
-
    let spec;
    let mut live_config_allowed = false;
    match spec_json {
@@ -168,6 +179,7 @@ fn main() -> Result<()> {

    let mut new_state = ComputeState::new();
    let spec_set;
+
    if let Some(spec) = spec {
        let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow::anyhow!(msg))?;
        new_state.pspec = Some(pspec);
@@ -179,9 +191,15 @@ fn main() -> Result<()> {
        connstr: Url::parse(connstr).context("cannot parse connstr as a URL")?,
        pgdata: pgdata.to_string(),
        pgbin: pgbin.to_string(),
+        pgversion: get_pg_version(pgbin),
        live_config_allowed,
        state: Mutex::new(new_state),
        state_changed: Condvar::new(),
+        ext_remote_storage,
+        ext_remote_paths: OnceLock::new(),
+        ext_download_progress: RwLock::new(HashMap::new()),
+        library_index: OnceLock::new(),
+        build_tag,
    };
    let compute = Arc::new(compute_node);

@@ -190,9 +208,18 @@ fn main() -> Result<()> {
    let _http_handle =
        launch_http_server(http_port, &compute).expect("cannot launch http endpoint thread");

+    let extension_server_port: u16 = http_port;
+
    if !spec_set {
        // No spec provided, hang waiting for it.
        info!("no compute spec provided, waiting");
+
+        // TODO this can stall startups in the unlikely event that we bind
+        //      this compute node while it's busy prewarming. It's not too
+        //      bad because it's just 100ms and unlikely, but it's an
+        //      avoidable problem.
+        compute.prewarm_postgres()?;
+
        let mut state = compute.state.lock().unwrap();
        while state.status != ComputeStatus::ConfigurationPending {
            state = compute.state_changed.wait(state).unwrap();
@@ -229,7 +256,7 @@ fn main() -> Result<()> {
    // Start Postgres
    let mut delay_exit = false;
    let mut exit_code = None;
-    let pg = match compute.start_compute() {
+    let pg = match compute.start_compute(extension_server_port) {
        Ok(pg) => Some(pg),
        Err(err) => {
            error!("could not start the compute node: {:?}", err);
@@ -358,6 +385,12 @@ fn cli() -> clap::Command {
                .long("control-plane-uri")
                .value_name("CONTROL_PLANE_API_BASE_URI"),
        )
+        .arg(
+            Arg::new("remote-ext-config")
+                .short('r')
+                .long("remote-ext-config")
+                .value_name("REMOTE_EXT_CONFIG"),
+        )
 }

 #[test]
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -1,16 +1,20 @@
+use std::collections::HashMap;
 use std::fs;
 use std::io::BufRead;
 use std::os::unix::fs::PermissionsExt;
 use std::path::Path;
 use std::process::{Command, Stdio};
 use std::str::FromStr;
-use std::sync::{Condvar, Mutex};
+use std::sync::{Condvar, Mutex, OnceLock, RwLock};

 use anyhow::{Context, Result};
 use chrono::{DateTime, Utc};
+use futures::future::join_all;
 use futures::stream::FuturesUnordered;
 use futures::StreamExt;
 use postgres::{Client, NoTls};
+use regex::Regex;
+use tokio;
 use tokio_postgres;
 use tracing::{error, info, instrument, warn};
 use utils::id::{TenantId, TimelineId};
@@ -20,10 +24,12 @@ use compute_api::responses::{ComputeMetrics, ComputeStatus};
 use compute_api::spec::{ComputeMode, ComputeSpec};
 use utils::measured_stream::MeasuredReader;

-use crate::config;
+use remote_storage::{GenericRemoteStorage, RemotePath};
+
 use crate::pg_helpers::*;
 use crate::spec::*;
 use crate::sync_sk::{check_if_synced, ping_safekeeper};
+use crate::{config, extension_server};

 /// Compute node info shared across several `compute_ctl` threads.
 pub struct ComputeNode {
@@ -31,6 +37,7 @@ pub struct ComputeNode {
    pub connstr: url::Url,
    pub pgdata: String,
    pub pgbin: String,
+    pub pgversion: String,
    /// We should only allow live re- / configuration of the compute node if
    /// it uses 'pull model', i.e. it can go to control-plane and fetch
    /// the latest configuration. Otherwise, there could be a case:
@@ -50,6 +57,24 @@ pub struct ComputeNode {
    pub state: Mutex<ComputeState>,
    /// `Condvar` to allow notifying waiters about state changes.
    pub state_changed: Condvar,
+    /// The remote storage (S3 bucket) that we search for extensions in.
+    pub ext_remote_storage: Option<GenericRemoteStorage>,
+    // (key: extension name, value: path to extension archive in remote storage)
+    pub ext_remote_paths: OnceLock<HashMap<String, RemotePath>>,
+    // (key: library name, value: name of extension containing this library)
+    pub library_index: OnceLock<HashMap<String, String>>,
+    // (key: extension archive name, value: (download start time, download completed?))
+    pub ext_download_progress: RwLock<HashMap<String, (DateTime<Utc>, bool)>>,
+    pub build_tag: String,
+}
+
+/// Metrics about remote extension downloads that might impact startup time.
+#[derive(Clone, Debug)]
+pub struct RemoteExtensionMetrics {
+    num_ext_downloaded: u64,
+    largest_ext_size: u64,
+    total_ext_download_size: u64,
+    prep_extensions_ms: u64,
 }

 #[derive(Clone, Debug)]
@@ -473,14 +498,22 @@ impl ComputeNode {
    /// Do all the preparations like PGDATA directory creation, configuration,
    /// safekeepers sync, basebackup, etc.
    #[instrument(skip_all)]
-    pub fn prepare_pgdata(&self, compute_state: &ComputeState) -> Result<()> {
+    pub fn prepare_pgdata(
+        &self,
+        compute_state: &ComputeState,
+        extension_server_port: u16,
+    ) -> Result<()> {
        let pspec = compute_state.pspec.as_ref().expect("spec must be set");
        let spec = &pspec.spec;
        let pgdata_path = Path::new(&self.pgdata);

        // Remove/create an empty pgdata directory and put configuration there.
        self.create_pgdata()?;
-        config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &pspec.spec)?;
+        config::write_postgres_conf(
+            &pgdata_path.join("postgresql.conf"),
+            &pspec.spec,
+            Some(extension_server_port),
+        )?;

        // Syncing safekeepers is only safe with primary nodes: if a primary
        // is already connected it will be kicked out, so a secondary (standby)
@@ -532,6 +565,50 @@ impl ComputeNode {
        Ok(())
    }

+    /// Start and stop a throwaway postgres (in `<pgdata>.warmup`) to warm up the VM for startup.
+    pub fn prewarm_postgres(&self) -> Result<()> {
+        info!("prewarming");
+
+        // Create pgdata
+        let pgdata = &format!("{}.warmup", self.pgdata);
+        create_pgdata(pgdata)?;
+
+        // Run initdb to completion; its exit status and output are not inspected here
+        info!("running initdb");
+        let initdb_bin = Path::new(&self.pgbin).with_file_name("initdb");
+        Command::new(initdb_bin)
+            .args(["-D", pgdata])
+            .output()
+            .context("cannot start initdb process")?;
+
+        // Write conf
+        use std::io::Write;
+        let conf_path = Path::new(pgdata).join("postgresql.conf");
+        let mut file = std::fs::File::create(conf_path)?;
+        writeln!(file, "shared_buffers=65536")?;
+        writeln!(file, "port=51055")?; // Nobody should be connecting
+        writeln!(file, "shared_preload_libraries = 'neon'")?;
+
+        // Start postgres; a spawn failure is reported as an error instead of a panic
+        info!("starting postgres");
+        let mut pg = Command::new(&self.pgbin)
+            .args(["-D", pgdata])
+            .spawn()
+            .context("cannot start postgres process")?;
+
+        // Stop it when it's ready
+        info!("waiting for postgres");
+        wait_for_postgres(&mut pg, Path::new(pgdata))?;
+        pg.kill()?;
+        info!("sent kill signal");
+        pg.wait()?;
+        info!("done prewarming");
+
+        // clean up (best effort: a failure to remove the directory is ignored)
+        let _ok = fs::remove_dir_all(pgdata);
+        Ok(())
+    }
+
    /// Start Postgres as a child process and manage DBs/roles.
    /// After that this will hang waiting on the postmaster process to exit.
    #[instrument(skip_all)]
@@ -626,7 +703,7 @@ impl ComputeNode {

        // Write new config
        let pgdata_path = Path::new(&self.pgdata);
-        config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &spec)?;
+        config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &spec, None)?;

        let mut client = Client::connect(self.connstr.as_str(), NoTls)?;
        self.pg_reload_conf(&mut client)?;
@@ -656,7 +733,7 @@ impl ComputeNode {
    }

    #[instrument(skip_all)]
-    pub fn start_compute(&self) -> Result<std::process::Child> {
+    pub fn start_compute(&self, extension_server_port: u16) -> Result<std::process::Child> {
        let compute_state = self.state.lock().unwrap().clone();
        let pspec = compute_state.pspec.as_ref().expect("spec must be set");
        info!(
@@ -667,7 +744,31 @@ impl ComputeNode {
            pspec.timeline_id,
        );

-        self.prepare_pgdata(&compute_state)?;
+        // This part is synchronous, because we need to download any
+        // remote shared_preload_libraries before postgres starts.
+        {
+            let library_load_start_time = Utc::now();
+            let remote_ext_metrics = self.prepare_preload_libraries(&compute_state)?;
+
+            let library_load_time = Utc::now()
+                .signed_duration_since(library_load_start_time)
+                .to_std()
+                .unwrap()
+                .as_millis() as u64;
+            let mut state = self.state.lock().unwrap();
+            state.metrics.load_ext_ms = library_load_time;
+            state.metrics.num_ext_downloaded = remote_ext_metrics.num_ext_downloaded;
+            state.metrics.largest_ext_size = remote_ext_metrics.largest_ext_size;
+            state.metrics.total_ext_download_size = remote_ext_metrics.total_ext_download_size;
+            state.metrics.prep_extensions_ms = remote_ext_metrics.prep_extensions_ms;
+            info!(
+                "Loading shared_preload_libraries took {:?}ms",
+                library_load_time
+            );
+            info!("{:?}", remote_ext_metrics);
+        }
+
+        self.prepare_pgdata(&compute_state, extension_server_port)?;

        let start_time = Utc::now();
        let pg = self.start_postgres(pspec.storage_auth_token.clone())?;
@@ -815,4 +916,200 @@ LIMIT 100",
            "{{\"pg_stat_statements\": []}}".to_string()
        }
    }
+
+    // When remote extension storage is configured, download the extension
+    // control files and remember the archive paths and the library index.
+    pub async fn prepare_external_extensions(&self, compute_state: &ComputeState) -> Result<()> {
+        let Some(ext_remote_storage) = self.ext_remote_storage.as_ref() else {
+            return Ok(());
+        };
+        let pspec = compute_state.pspec.as_ref().expect("spec must be set");
+        let custom_ext = pspec.spec.custom_extensions.clone().unwrap_or_default();
+        info!("custom extensions: {:?}", &custom_ext);
+        let (remote_paths, lib_index) = extension_server::get_available_extensions(
+            ext_remote_storage,
+            &self.pgbin,
+            &self.pgversion,
+            &custom_ext,
+            &self.build_tag,
+        )
+        .await?;
+        self.ext_remote_paths
+            .set(remote_paths)
+            .expect("this is the only time we set ext_remote_paths");
+        self.library_index
+            .set(lib_index)
+            .expect("this is the only time we set library_index");
+        Ok(())
+    }
+
+    /// Download an extension archive, unpack it and place the files in the correct
+    /// locations. Returns `Ok(0)` when the archive was already downloaded.
+    pub async fn download_extension(&self, ext_name: &str, is_library: bool) -> Result<u64> {
+        match &self.ext_remote_storage {
+            None => anyhow::bail!("No remote extension storage"),
+            Some(remote_storage) => {
+                let mut real_ext_name = ext_name.to_string();
+                if is_library {
+                    // sometimes library names might have a suffix like
+                    // library.so or library.so.3. We strip this off
+                    // because library_index is based on the name without the file extension
+                    let strip_lib_suffix = Regex::new(r"\.so.*").unwrap();
+                    let lib_raw_name = strip_lib_suffix.replace(&real_ext_name, "").to_string();
+                    real_ext_name = self
+                        .library_index
+                        .get()
+                        .expect("must have already downloaded the library_index")[&lib_raw_name]
+                        .clone();
+                }
+
+                let ext_path = &self
+                    .ext_remote_paths
+                    .get()
+                    .expect("error accessing ext_remote_paths")[&real_ext_name];
+                let ext_archive_name = ext_path.object_name().expect("bad path");
+
+                // Look up and, if missing, register this download under a single
+                // write lock. Checking under a read lock and inserting under a
+                // separate write lock would let two concurrent callers both see
+                // "not started" and both believe they are the first to download.
+                let (first_try, (download_start, download_completed)) = {
+                    let mut progress = self.ext_download_progress.write().expect("lock err");
+                    let first_try = !progress.contains_key(ext_archive_name);
+                    let entry = *progress
+                        .entry(ext_archive_name.to_string())
+                        .or_insert_with(|| (Utc::now(), false));
+                    (first_try, entry)
+                };
+                let start_time_delta = Utc::now()
+                    .signed_duration_since(download_start)
+                    .to_std()
+                    .unwrap()
+                    .as_millis() as u64;
+
+                // how long to wait for extension download if it was started by another process
+                const HANG_TIMEOUT: u64 = 3000; // milliseconds
+
+                if download_completed {
+                    info!("extension already downloaded, skipping re-download");
+                    return Ok(0);
+                } else if start_time_delta < HANG_TIMEOUT && !first_try {
+                    info!("download {ext_archive_name} already started by another process, hanging untill completion or timeout");
+                    let mut interval =
+                        tokio::time::interval(tokio::time::Duration::from_millis(500));
+                    loop {
+                        info!("waiting for download");
+                        interval.tick().await;
+                        let (_, download_completed_now) =
+                            self.ext_download_progress.read().expect("lock")[ext_archive_name];
+                        if download_completed_now {
+                            info!("download finished by whoever else downloaded it");
+                            return Ok(0);
+                        }
+                    }
+                    // NOTE: the above loop will get terminated
+                    // based on the timeout of the download function
+                }
+
+                // if extension hasn't been downloaded before or the previous
+                // attempt to download was at least HANG_TIMEOUT ms ago
+                // then we try to download it here
+                info!("downloading new extension {ext_archive_name}");
+
+                let download_size = extension_server::download_extension(
+                    &real_ext_name,
+                    ext_path,
+                    remote_storage,
+                    &self.pgbin,
+                )
+                .await;
+                // NOTE(review): the entry is marked as completed even when the download
+                // returned an error, so later calls will skip the download; confirm intended.
+                self.ext_download_progress
+                    .write()
+                    .expect("bad lock")
+                    .insert(ext_archive_name.to_string(), (download_start, true));
+                download_size
+            }
+        }
+    }
+
+    /// Download the libraries listed in `shared_preload_libraries` (other than
+    /// `neon`) from remote extension storage and collect download metrics.
+    ///
+    /// The library list is taken from `spec.cluster.settings` and from the raw
+    /// `postgresql_conf` in the spec. `#[tokio::main]` makes this a blocking
+    /// call for the (synchronous) caller. Returns all-zero metrics when no
+    /// remote extension storage is configured.
+    #[tokio::main]
+    pub async fn prepare_preload_libraries(
+        &self,
+        compute_state: &ComputeState,
+    ) -> Result<RemoteExtensionMetrics> {
+        let mut remote_ext_metrics = RemoteExtensionMetrics {
+            num_ext_downloaded: 0,
+            largest_ext_size: 0,
+            total_ext_download_size: 0,
+            prep_extensions_ms: 0,
+        };
+        if self.ext_remote_storage.is_none() {
+            return Ok(remote_ext_metrics);
+        }
+        let pspec = compute_state.pspec.as_ref().expect("spec must be set");
+        let spec = &pspec.spec;
+
+        // Split a `shared_preload_libraries` value into library names, dropping
+        // quotes, separators and the `neon` library (which is never downloaded).
+        let parse_libs = |value: &str| -> Vec<String> {
+            value
+                .split(&[',', '\'', ' '])
+                .filter(|s| *s != "neon" && !s.is_empty())
+                .map(str::to_string)
+                .collect()
+        };
+
+        info!("parse shared_preload_libraries from spec.cluster.settings");
+        let mut libs_vec = Vec::new();
+        if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") {
+            libs_vec = parse_libs(&libs);
+        }
+        info!("parse shared_preload_libraries from provided postgresql.conf");
+        // that is used in neon_local and python tests
+        if let Some(conf) = &spec.cluster.postgresql_conf {
+            // The last `shared_preload_libraries` line wins. Split at the first
+            // `=` so that both `name='a,b'` and `name = 'a,b'` are accepted.
+            let value = conf
+                .split('\n')
+                .rfind(|line| line.starts_with("shared_preload_libraries"))
+                .and_then(|line| line.split_once('='))
+                .map(|(_, value)| value);
+            if let Some(value) = value {
+                libs_vec.extend(parse_libs(value));
+            }
+        }
+
+        info!("Download ext_index.json, find the extension paths");
+        let prep_ext_start_time = Utc::now();
+        self.prepare_external_extensions(compute_state).await?;
+        let prep_ext_time_delta = Utc::now()
+            .signed_duration_since(prep_ext_start_time)
+            .to_std()
+            .unwrap()
+            .as_millis() as u64;
+        info!("Prepare extensions took {prep_ext_time_delta}ms");
+        remote_ext_metrics.prep_extensions_ms = prep_ext_time_delta;
+
+        info!("Downloading to shared preload libraries: {:?}", &libs_vec);
+        let download_tasks = libs_vec
+            .iter()
+            .map(|library| self.download_extension(library, true));
+        for result in join_all(download_tasks).await {
+            let download_size = result?;
+            remote_ext_metrics.num_ext_downloaded += 1;
+            remote_ext_metrics.largest_ext_size =
+                std::cmp::max(remote_ext_metrics.largest_ext_size, download_size);
+            remote_ext_metrics.total_ext_download_size += download_size;
+        }
+        Ok(remote_ext_metrics)
+    }
 }
--- a/compute_tools/src/config.rs
+++ b/compute_tools/src/config.rs
@@ -33,7 +33,11 @@ pub fn line_in_file(path: &Path, line: &str) -> Result<bool> {
 }

 /// Create or completely rewrite configuration file specified by `path`
-pub fn write_postgres_conf(path: &Path, spec: &ComputeSpec) -> Result<()> {
+pub fn write_postgres_conf(
+    path: &Path,
+    spec: &ComputeSpec,
+    extension_server_port: Option<u16>,
+) -> Result<()> {
    // File::create() destroys the file content if it exists.
    let mut file = File::create(path)?;

@@ -87,5 +91,9 @@ pub fn write_postgres_conf(path: &Path, spec: &ComputeSpec) -> Result<()> {
        writeln!(file, "# Managed by compute_ctl: end")?;
    }

+    if let Some(port) = extension_server_port {
+        writeln!(file, "neon.extension_server_port={}", port)?;
+    }
+
    Ok(())
 }
--- a/compute_tools/src/extension_server.rs
+++ b/compute_tools/src/extension_server.rs
@@ -0,0 +1,275 @@
+// Download extension files from the extension store
+// and put them in the right place in the postgres directory (share / lib)
+/*
+The layout of the S3 bucket is as follows:
+5615610098 // this is an extension build number
+├── v14
+│   ├── extensions
+│   │   ├── anon.tar.zst
+│   │   └── embedding.tar.zst
+│   └── ext_index.json
+└── v15
+    ├── extensions
+    │   ├── anon.tar.zst
+    │   └── embedding.tar.zst
+    └── ext_index.json
+5615261079
+├── v14
+│   ├── extensions
+│   │   └── anon.tar.zst
+│   └── ext_index.json
+└── v15
+    ├── extensions
+    │   └── anon.tar.zst
+    └── ext_index.json
+5623261088
+├── v14
+│   ├── extensions
+│   │   └── embedding.tar.zst
+│   └── ext_index.json
+└── v15
+    ├── extensions
+    │   └── embedding.tar.zst
+    └── ext_index.json
+
+Note that build number cannot be part of prefix because we might need extensions
+from other build numbers.
+
+ext_index.json stores the control files and location of extension archives
+It also stores a list of public extensions and a library_index
+
+We don't need to duplicate extension.tar.zst files.
+We only need to upload a new one if it is updated.
+(Although currently we just upload every time anyways, hopefully will change
+this sometime)
+
+*access* is controlled by spec
+
+More specifically, here is an example ext_index.json
+{
+    "public_extensions": [
+        "anon",
+        "pg_buffercache"
+    ],
+    "library_index": {
+        "anon": "anon",
+        "pg_buffercache": "pg_buffercache"
+    },
+    "extension_data": {
+        "pg_buffercache": {
+            "control_data": {
+                "pg_buffercache.control": "# pg_buffercache extension \ncomment = 'examine the shared buffer cache' \ndefault_version = '1.3' \nmodule_pathname = '$libdir/pg_buffercache' \nrelocatable = true \ntrusted=true"
+            },
+            "archive_path": "5670669815/v14/extensions/pg_buffercache.tar.zst"
+        },
+        "anon": {
+            "control_data": {
+                "anon.control": "# PostgreSQL Anonymizer (anon) extension \ncomment = 'Data anonymization tools' \ndefault_version = '1.1.0' \ndirectory='extension/anon' \nrelocatable = false \nrequires = 'pgcrypto' \nsuperuser = false \nmodule_pathname = '$libdir/anon' \ntrusted = true \n"
+            },
+            "archive_path": "5670669815/v14/extensions/anon.tar.zst"
+        }
+    }
+}
+*/
+use anyhow::Context;
+use anyhow::{self, Result};
+use futures::future::join_all;
+use remote_storage::*;
+use serde_json;
+use std::collections::HashMap;
+use std::io::Read;
+use std::num::{NonZeroU32, NonZeroUsize};
+use std::path::Path;
+use std::str;
+use tar::Archive;
+use tokio::io::AsyncReadExt;
+use tracing::info;
+use tracing::log::warn;
+use zstd::stream::read::Decoder;
+
+fn get_pg_config(argument: &str, pgbin: &str) -> String {
+    // gives the result of `pg_config [argument]`
+    // where argument is a flag like `--version` or `--sharedir`
+    //
+    // pg_config is looked up next to the postgres binary. Swapping the file name
+    // (instead of concatenating strings) also handles a bare `postgres`, which
+    // resolves `pg_config` via PATH rather than the non-existent `/pg_config`.
+    let pgconfig = Path::new(pgbin).with_file_name("pg_config");
+    let config_output = std::process::Command::new(pgconfig)
+        .arg(argument)
+        .output()
+        .expect("pg_config error");
+    std::str::from_utf8(&config_output.stdout)
+        .expect("pg_config error")
+        .trim()
+        .to_string()
+}
+
+pub fn get_pg_version(pgbin: &str) -> String {
+    // pg_config --version returns a (platform specific) human readable string such as
+    // "PostgreSQL 15.4". Only the leading major number is matched, so "14.15" stays v14.
+    let human_version = get_pg_config("--version", pgbin);
+    let from_digits = human_version.trim_start_matches(|c: char| !c.is_ascii_digit());
+    match from_digits.split(|c: char| !c.is_ascii_digit()).next() {
+        Some("15") => "v15".to_string(),
+        Some("14") => "v14".to_string(),
+        _ => panic!("Unsuported postgres version {human_version}"),
+    }
+}
+
+/// Download `ext_index.json`, write the control files of all enabled (public +
+/// custom) extensions into the local share dir, and return the map from extension
+/// name to remote archive path together with the library index.
+pub async fn get_available_extensions(
+    remote_storage: &GenericRemoteStorage,
+    pgbin: &str,
+    pg_version: &str,
+    custom_extensions: &[String],
+    build_tag: &str,
+) -> Result<(HashMap<String, RemotePath>, HashMap<String, String>)> {
+    let local_sharedir = Path::new(&get_pg_config("--sharedir", pgbin)).join("extension");
+    let index_path = format!("{build_tag}/{pg_version}/ext_index.json");
+    let index_path = RemotePath::new(Path::new(&index_path)).context("error forming path")?;
+    info!("download ext_index.json from: {:?}", &index_path);
+
+    let mut download = remote_storage.download(&index_path).await?;
+    let mut ext_idx_buffer = Vec::new();
+    download
+        .download_stream
+        .read_to_end(&mut ext_idx_buffer)
+        .await?;
+    info!("ext_index downloaded");
+
+    #[derive(Debug, serde::Deserialize)]
+    struct Index {
+        public_extensions: Vec<String>,
+        library_index: HashMap<String, String>,
+        extension_data: HashMap<String, ExtensionData>,
+    }
+
+    #[derive(Debug, serde::Deserialize)]
+    struct ExtensionData {
+        control_data: HashMap<String, String>,
+        archive_path: String,
+    }
+
+    let ext_index_full = serde_json::from_slice::<Index>(&ext_idx_buffer)?;
+    let mut enabled_extensions = ext_index_full.public_extensions;
+    enabled_extensions.extend_from_slice(custom_extensions);
+    let library_index = ext_index_full.library_index;
+    let all_extension_data = ext_index_full.extension_data;
+    info!("library_index: {:?}", library_index);
+    info!("enabled_extensions: {:?}", enabled_extensions);
+    let mut ext_remote_paths = HashMap::new();
+    let mut file_create_tasks = Vec::new();
+    for extension in enabled_extensions {
+        let ext_data = all_extension_data
+            .get(&extension)
+            .with_context(|| format!("extension {extension} is not present in ext_index.json"))?;
+        for (control_file, control_contents) in &ext_data.control_data {
+            let extension_name = control_file
+                .strip_suffix(".control")
+                .context("control files must end in .control")?;
+            ext_remote_paths.insert(
+                extension_name.to_string(),
+                RemotePath::from_string(&ext_data.archive_path)?,
+            );
+            let control_path = local_sharedir.join(control_file);
+            info!("writing file {:?}{:?}", control_path, control_contents);
+            file_create_tasks.push(tokio::fs::write(control_path, control_contents));
+        }
+    }
+    for result in join_all(file_create_tasks).await {
+        result?;
+    }
+    info!("ext_remote_paths {:?}", ext_remote_paths);
+    Ok((ext_remote_paths, library_index))
+}
+
+/// Downloads the archive for one extension and installs its files locally.
+///
+/// The archive at `ext_path` is read fully into memory, decompressed, and
+/// unpacked into `<prefix>/download_extensions`, where `<prefix>` is `pgbin`
+/// without its `/bin/postgres` suffix. The contents of the unpacked
+/// `share/extension` and `lib` directories are then moved into the
+/// directories derived from `pg_config --sharedir` and `pg_config --libdir`.
+///
+/// Returns the size of the downloaded (still compressed) archive in bytes.
+///
+/// # Errors
+///
+/// Fails if the download, decompression, or unpacking fails, or if either
+/// unpacked directory cannot be listed. A failure to move an individual file
+/// is only logged as a warning.
+///
+/// # Panics
+///
+/// Panics if `pgbin` does not end in `/bin/postgres`.
+pub async fn download_extension(
+    ext_name: &str,
+    ext_path: &RemotePath,
+    remote_storage: &GenericRemoteStorage,
+    pgbin: &str,
+) -> Result<u64> {
+    info!("Download extension {:?} from {:?}", ext_name, ext_path);
+    let mut download = remote_storage.download(ext_path).await?;
+    let mut download_buffer = Vec::new();
+    download
+        .download_stream
+        .read_to_end(&mut download_buffer)
+        .await?;
+    // size of the compressed archive; this is the value returned to the caller
+    let download_size = download_buffer.len() as u64;
+    // it's unclear whether it is more performant to decompress into memory or not
+    // TODO: decompressing into memory can be avoided
+    // NOTE(review): `Decoder` is presumably the zstd stream decoder (archives
+    // are named `*.tar.zst`) -- confirm against the file's imports
+    let mut decoder = Decoder::new(download_buffer.as_slice())?;
+    let mut decompress_buffer = Vec::new();
+    decoder.read_to_end(&mut decompress_buffer)?;
+    let mut archive = Archive::new(decompress_buffer.as_slice());
+    let unzip_dest = pgbin
+        .strip_suffix("/bin/postgres")
+        .expect("bad pgbin")
+        .to_string()
+        + "/download_extensions";
+    archive.unpack(&unzip_dest)?;
+    info!("Download + unzip {:?} completed successfully", &ext_path);
+
+    // each pair is (directory inside the unpacked archive, local install directory)
+    let sharedir_paths = (
+        unzip_dest.to_string() + "/share/extension",
+        Path::new(&get_pg_config("--sharedir", pgbin)).join("extension"),
+    );
+    let libdir_paths = (
+        unzip_dest.to_string() + "/lib",
+        Path::new(&get_pg_config("--libdir", pgbin)).join("postgresql"),
+    );
+    // move contents of the libdir / sharedir in unzipped archive to the correct local paths
+    for paths in [sharedir_paths, libdir_paths] {
+        let (zip_dir, real_dir) = paths;
+        info!("mv {zip_dir:?}/*  {real_dir:?}");
+        for file in std::fs::read_dir(zip_dir)? {
+            let old_file = file?.path();
+            let new_file =
+                Path::new(&real_dir).join(old_file.file_name().context("error parsing file")?);
+            info!("moving {old_file:?} to {new_file:?}");
+
+            // a rename can fail with "Directory not empty (os error 39)";
+            // such failures are logged and deliberately not propagated
+            match std::fs::rename(old_file, new_file) {
+                Ok(()) => info!("move succeeded"),
+                Err(e) => {
+                    warn!("move failed, probably because the extension already exists: {e}")
+                }
+            }
+        }
+    }
+    info!("done moving extension {ext_name}");
+    Ok(download_size)
+}
+
+/// Builds the [`GenericRemoteStorage`] used to fetch extensions.
+///
+/// `remote_ext_config` is a JSON object with the string fields `bucket` and
+/// `region` and the optional string fields `endpoint` and `prefix`.
+///
+/// # Errors
+///
+/// Fails if `remote_ext_config` cannot be parsed or if the storage cannot be
+/// created from the resulting configuration.
+pub fn init_remote_storage(remote_ext_config: &str) -> anyhow::Result<GenericRemoteStorage> {
+    #[derive(Debug, serde::Deserialize)]
+    struct RemoteExtJson {
+        bucket: String,
+        region: String,
+        endpoint: Option<String>,
+        prefix: Option<String>,
+    }
+
+    let RemoteExtJson {
+        bucket,
+        region,
+        endpoint,
+        prefix,
+    } = serde_json::from_str::<RemoteExtJson>(remote_ext_config)?;
+
+    let s3_config = S3Config {
+        bucket_name: bucket,
+        bucket_region: region,
+        prefix_in_bucket: prefix,
+        endpoint,
+        concurrency_limit: NonZeroUsize::new(100).expect("100 != 0"),
+        max_keys_per_list_response: None,
+    };
+    let storage_config = RemoteStorageConfig {
+        max_concurrent_syncs: NonZeroUsize::new(100).expect("100 != 0"),
+        max_sync_errors: NonZeroU32::new(100).expect("100 != 0"),
+        storage: RemoteStorageKind::AwsS3(s3_config),
+    };
+    GenericRemoteStorage::from_config(&storage_config)
+}
--- a/compute_tools/src/http/api.rs
+++ b/compute_tools/src/http/api.rs
@@ -121,6 +121,37 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
            }
        }

+        // download extension files from S3 on demand
+        (&Method::POST, route) if route.starts_with("/extension_server/") => {
+            info!("serving {:?} POST request", route);
+            info!("req.uri {:?}", req.uri());
+
+            let mut is_library = false;
+            if let Some(params) = req.uri().query() {
+                info!("serving {:?} POST request with params: {}", route, params);
+                if params == "is_library=true" {
+                    is_library = true;
+                } else {
+                    let mut resp = Response::new(Body::from("Wrong request parameters"));
+                    *resp.status_mut() = StatusCode::BAD_REQUEST;
+                    return resp;
+                }
+            }
+
+            let filename = route.split('/').last().unwrap().to_string();
+            info!("serving /extension_server POST request, filename: {filename:?} is_library: {is_library}");
+
+            match compute.download_extension(&filename, is_library).await {
+                Ok(_) => Response::new(Body::from("OK")),
+                Err(e) => {
+                    error!("extension download failed: {}", e);
+                    let mut resp = Response::new(Body::from(e.to_string()));
+                    *resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
+                    resp
+                }
+            }
+        }
+
        // Return the `404 Not Found` for any other routes.
        _ => {
            let mut not_found = Response::new(Body::from("404 Not Found"));
--- a/compute_tools/src/http/openapi_spec.yaml
+++ b/compute_tools/src/http/openapi_spec.yaml
@@ -139,6 +139,34 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/GenericError"
+  /extension_server:
+    post:
+      tags:
+      - Extension
+      summary: Download extension from S3 to local folder.
+      description: ""
+      operationId: downloadExtension
+      responses:
+        200:
+          description: Extension downloaded
+          content:
+            text/plain:
+              schema:
+                type: string
+                description: Error text or 'OK' if download succeeded.
+                example: "OK"
+        400:
+          description: Request is invalid.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/GenericError"
+        500:
+          description: Extension download request failed.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/GenericError"

 components:
  securitySchemes:
--- a/compute_tools/src/lib.rs
+++ b/compute_tools/src/lib.rs
@@ -9,6 +9,7 @@ pub mod http;
 #[macro_use]
 pub mod logger;
 pub mod compute;
+pub mod extension_server;
 pub mod monitor;
 pub mod params;
 pub mod pg_helpers;
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -124,7 +124,7 @@ pub fn get_spec_from_control_plane(
 pub fn handle_configuration(spec: &ComputeSpec, pgdata_path: &Path) -> Result<()> {
    // File `postgresql.conf` is no longer included into `basebackup`, so just
    // always write all config into it creating new file.
-    config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec)?;
+    config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec, None)?;

    update_pg_hba(pgdata_path)?;

--- a/control_plane/Cargo.toml
+++ b/control_plane/Cargo.toml
@@ -32,3 +32,4 @@ utils.workspace = true

 compute_api.workspace = true
 workspace_hack.workspace = true
+tracing.workspace = true
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -658,6 +658,8 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                .get_one::<String>("endpoint_id")
                .ok_or_else(|| anyhow!("No endpoint ID was provided to start"))?;

+            let remote_ext_config = sub_args.get_one::<String>("remote-ext-config");
+
            // If --safekeepers argument is given, use only the listed safekeeper nodes.
            let safekeepers =
                if let Some(safekeepers_str) = sub_args.get_one::<String>("safekeepers") {
@@ -699,7 +701,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                    _ => {}
                }
                println!("Starting existing endpoint {endpoint_id}...");
-                endpoint.start(&auth_token, safekeepers)?;
+                endpoint.start(&auth_token, safekeepers, remote_ext_config)?;
            } else {
                let branch_name = sub_args
                    .get_one::<String>("branch-name")
@@ -743,7 +745,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                    pg_version,
                    mode,
                )?;
-                ep.start(&auth_token, safekeepers)?;
+                ep.start(&auth_token, safekeepers, remote_ext_config)?;
            }
        }
        "stop" => {
@@ -1003,6 +1005,12 @@ fn cli() -> Command {
        .help("Additional pageserver's configuration options or overrides, refer to pageserver's 'config-override' CLI parameter docs for more")
        .required(false);

+    let remote_ext_config_args = Arg::new("remote-ext-config")
+        .long("remote-ext-config")
+        .num_args(1)
+        .help("Configure the S3 bucket that we search for extensions in.")
+        .required(false);
+
    let lsn_arg = Arg::new("lsn")
        .long("lsn")
        .help("Specify Lsn on the timeline to start from. By default, end of the timeline would be used.")
@@ -1161,6 +1169,7 @@ fn cli() -> Command {
                    .arg(pg_version_arg)
                    .arg(hot_standby_arg)
                    .arg(safekeepers_arg)
+                    .arg(remote_ext_config_args)
                )
                .subcommand(
                    Command::new("stop")
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -313,7 +313,7 @@ impl Endpoint {

                // TODO: use future host field from safekeeper spec
                // Pass the list of safekeepers to the replica so that it can connect to any of them,
-                // whichever is availiable.
+                // whichever is available.
                let sk_ports = self
                    .env
                    .safekeepers
@@ -420,7 +420,12 @@ impl Endpoint {
        Ok(())
    }

-    pub fn start(&self, auth_token: &Option<String>, safekeepers: Vec<NodeId>) -> Result<()> {
+    pub fn start(
+        &self,
+        auth_token: &Option<String>,
+        safekeepers: Vec<NodeId>,
+        remote_ext_config: Option<&String>,
+    ) -> Result<()> {
        if self.status() == "running" {
            anyhow::bail!("The endpoint is already running");
        }
@@ -488,6 +493,7 @@ impl Endpoint {
            pageserver_connstring: Some(pageserver_connstring),
            safekeeper_connstrings,
            storage_auth_token: auth_token.clone(),
+            custom_extensions: Some(vec![]),
        };
        let spec_path = self.endpoint_path().join("spec.json");
        std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
@@ -519,6 +525,11 @@ impl Endpoint {
            .stdin(std::process::Stdio::null())
            .stderr(logfile.try_clone()?)
            .stdout(logfile);
+
+        if let Some(remote_ext_config) = remote_ext_config {
+            cmd.args(["--remote-ext-config", remote_ext_config]);
+        }
+
        let child = cmd.spawn()?;

        // Write down the pid so we can wait for it when we want to stop
--- a/docs/rfcs/024-extension-loading.md
+++ b/docs/rfcs/024-extension-loading.md
@@ -0,0 +1,236 @@
+# Supporting custom user Extensions (Dynamic Extension Loading)
+Created 2023-05-03
+
+## Motivation
+
+There are many extensions in the PostgreSQL ecosystem, and not all extensions
+are of a quality that we can confidently support them. Additionally, our
+current extension inclusion mechanism has several problems because we build all
+extensions into the primary Compute image: We build the extensions every time
+we build the compute image regardless of whether we actually need to rebuild
+the image, and the inclusion of these extensions in the image adds a hard
+dependency on all supported extensions - thus increasing the image size, and
+with it the time it takes to download that image - increasing first start
+latency.
+
+This RFC proposes a dynamic loading mechanism that solves most of these
+problems.
+
+## Summary
+
+`compute_ctl` is made responsible for loading extensions on-demand into
+the container's file system for dynamically loaded extensions, and will also
+make sure that the extensions in `shared_preload_libraries` are downloaded
+before the compute node starts.
+
+## Components
+
+compute_ctl, PostgreSQL, neon (extension), Compute Host Node, Extension Store
+
+## Requirements
+
+Compute nodes with no extra extensions should not be negatively impacted by
+the existence of support for many extensions.
+
+Installing an extension into PostgreSQL should be easy.
+
+Non-preloaded extensions shouldn't impact startup latency.
+
+Uninstalled extensions shouldn't impact query latency.
+
+A small latency penalty for dynamically loaded extensions is acceptable in
+the first seconds of compute startup, but not in steady-state operations.
+
+## Proposed implementation
+
+### On-demand, JIT-loading of extensions
+
+Before postgres starts we download 
+- control files for all extensions available to that compute node;
+- all `shared_preload_libraries`;
+
+After postgres is running, `compute_ctl` listens for requests to load files.
+When PostgreSQL requests a file, `compute_ctl` downloads it.
+
+PostgreSQL requests files in the following cases:
+- When loading a preload library set in `local_preload_libraries`
+- When explicitly loading a library with `LOAD`
+- When creating an extension with `CREATE EXTENSION` (download sql scripts, (optional) extension data files and (optional) library files)
+
+
+#### Summary
+
+Pros:
+ - Startup is only as slow as it takes to load all (shared_)preload_libraries
+ - Supports BYO Extension
+
+Cons:
+ - O(sizeof(extensions)) IO requirement for loading all extensions.
+
+### Alternative solutions
+
+1. Allow users to add their extensions to the base image
+   
+   Pros:
+    - Easy to deploy
+
+   Cons:
+    - Doesn't scale - first start size is dependent on image size;
+    - All extensions are shared across all users: It doesn't allow users to
+      bring their own restrictive-licensed extensions
+
+2. Bring Your Own compute image
+   
+   Pros:
+    - Still easy to deploy
+    - User can bring own patched version of PostgreSQL
+
+   Cons:
+    - First start latency is O(sizeof(extensions image))
+    - Warm instance pool for skipping pod schedule latency is not feasible with
+      O(n) custom images
+    - Support channels are difficult to manage
+
+3. Download all user extensions in bulk on compute start
+   
+   Pros:
+    - Easy to deploy
+    - No startup latency issues for "clean" users.
+    - Warm instance pool for skipping pod schedule latency is possible
+
+   Cons:
+    - Downloading all extensions in advance takes a lot of time, thus startup
+      latency issues
+
+4. Store user's extensions in persistent storage
+   
+   Pros:
+    - Easy to deploy
+    - No startup latency issues
+    - Warm instance pool for skipping pod schedule latency is possible
+
+   Cons:
+    - EC2 instances have only limited number of attachments shared between EBS
+      volumes, direct-attached NVMe drives, and ENIs.
+    - Compute instance migration isn't trivially solved for EBS mounts (e.g.
+      the device is unavailable whilst moving the mount between instances).
+    - EBS can only mount on one instance at a time (except the expensive IO2
+      device type).
+
+5. Store user's extensions in network drive
+   
+   Pros:
+    - Easy to deploy
+    - Few startup latency issues
+    - Warm instance pool for skipping pod schedule latency is possible
+
+   Cons:
+    - We'd need networked drives, and a lot of them, which would store many
+      duplicate extensions.
+    - **UNCHECKED:** Compute instance migration may not work nicely with
+      networked IOs
+
+
+### Idea extensions
+
+The extension store does not have to be S3 directly, but could be a Node-local
+caching service on top of S3. This would reduce the load on the network for
+popular extensions.
+
+## Extension Storage implementation
+
+The layout of the S3 bucket is as follows:
+```
+5615610098 // this is an extension build number
+├── v14
+│   ├── extensions
+│   │   ├── anon.tar.zst
+│   │   └── embedding.tar.zst
+│   └── ext_index.json
+└── v15
+    ├── extensions
+    │   ├── anon.tar.zst
+    │   └── embedding.tar.zst
+    └── ext_index.json
+5615261079
+├── v14
+│   ├── extensions
+│   │   └── anon.tar.zst
+│   └── ext_index.json
+└── v15
+    ├── extensions
+    │   └── anon.tar.zst
+    └── ext_index.json
+5623261088
+├── v14
+│   ├── extensions
+│   │   └── embedding.tar.zst
+│   └── ext_index.json
+└── v15
+    ├── extensions
+    │   └── embedding.tar.zst
+    └── ext_index.json
+```
+
+Note that build number cannot be part of prefix because we might need extensions
+from other build numbers.
+
+`ext_index.json` stores the control files and location of extension archives. 
+It also stores a list of public extensions and a library_index
+
+We don't need to duplicate `extension.tar.zst` files.
+We only need to upload a new one if it is updated.
+(Although currently we just upload every time anyways, hopefully will change
+this sometime)
+
+*access* is controlled by spec
+
+More specifically, here is an example ext_index.json
+```
+{
+    "public_extensions": [
+        "anon",
+        "pg_buffercache"
+    ],
+    "library_index": {
+        "anon": "anon",
+        "pg_buffercache": "pg_buffercache"
+        // for more complex extensions like postgis
+        // we might have something like:
+        // address_standardizer: postgis
+        // postgis_tiger: postgis
+    },
+    "extension_data": {
+        "pg_buffercache": {
+            "control_data": {
+                "pg_buffercache.control": "# pg_buffercache extension \ncomment = 'examine the shared buffer cache' \ndefault_version = '1.3' \nmodule_pathname = '$libdir/pg_buffercache' \nrelocatable = true \ntrusted=true"
+            },
+            "archive_path": "5670669815/v14/extensions/pg_buffercache.tar.zst"
+        },
+        "anon": {
+            "control_data": {
+                "anon.control": "# PostgreSQL Anonymizer (anon) extension \ncomment = 'Data anonymization tools' \ndefault_version = '1.1.0' \ndirectory='extension/anon' \nrelocatable = false \nrequires = 'pgcrypto' \nsuperuser = false \nmodule_pathname = '$libdir/anon' \ntrusted = true \n"
+            },
+            "archive_path": "5670669815/v14/extensions/anon.tar.zst"
+        }
+    }
+}
+```
+
+### How to add new extension to the Extension Storage?
+
+Simply upload build artifacts to the S3 bucket.
+Implement a CI step for that. Splitting it from compute-node-image build.
+
+### How do we deal with extension versions and updates?
+
+Currently, we rebuild extensions on every compute-node-image build and store them in the <build-version> prefix.
+This is needed to ensure that `/share` and `/lib` files are in sync.
+
+For extension updates, we rely on the PostgreSQL extension versioning mechanism (sql update scripts) and extension authors to not break backwards compatibility within one major version of PostgreSQL.
+
+### Alternatives
+
+For extensions written in trusted languages we can also adopt
+`dbdev` PostgreSQL Package Manager based on `pg_tle` by Supabase.
+This will increase the number of supported extensions and decrease the amount of work required to support them.
--- a/docs/rfcs/025-sk-peer-recovery.md
+++ b/docs/rfcs/025-sk-peer-recovery.md
@@ -0,0 +1,71 @@
+# Safekeeper peer recovery
+
+## A problem and proposed solution
+
+Currently, on start walproposer preserves WAL for all safekeepers: on start it
+determines the horizon beyond which all safekeepers received WAL, downloads
+missing part to pg_wal and holds WAL since this horizon ever since. This is
+problematic, because
+
+1. If one safekeeper is down and/or lagging, pg_wal eventually explodes -- we intentionally don't have much space on computes.
+2. If one safekeeper is down and/or lagging it makes compute start longer.
+
+Proposed solution is to teach safekeepers to fetch WAL directly from peers,
+respecting consensus rules. Namely,
+- On start, walproposer won't download WAL at all -- it will have it only since
+  writing position. As WAL grows it should also keep some fixed number of
+  latest segments (~20) to provide gradual switch from peer recovery to walproposer
+  streaming under load; it can be achieved by setting wal_keep_size or
+  implemented separately.
+- Whenever safekeeper through peer communication discovers that 1) it lacks WAL compared to some 
+  peer and 2) walproposer streaming is not active, it starts recovery. Recovery ends when either there 
+  is nothing more to fetch or streaming walproposer is discovered.
+
+## Details
+
+### Correctness
+
+The idea is simple: recovery process imitates actions of donor's last_log_term
+== donor's term leader. That is, sk A will fetch WAL from sk B if
+1) B's (last_log_term, LSN) is higher than A's (last_log_term, LSN) *and* 
+2) A's term <= B's term -- otherwise append request can't be accepted. 
+3) B's term == B's last_log_term -- to ensure that such a leader was ever elected in 
+   the first place.
+
+Note that such a configuration is not always possible, e.g. in the scenario
+A 1.1
+B 1.1 2.1
+C 1.1 3.1
+
+where (x.y) is (term, LSN) pair if A voted for term 4 and B and C haven't (their
+terms are 3 and 2 respectively), then A can't pull from B nor from C. IOW, we
+need an elected authoritative leader to determine the correct log sequence. However,
+such scenario is unlikely and will be fixed by walproposer voting once it
+appears, so we can ignore it for now, and add elections on safekeepers side
+later if needed.
+
+Just like a normal leader, recovery would first truncate WAL and only then start
+inserting.
+
+### Start/stop criterion
+
+Recovery shouldn't prevent actively streaming compute -- we don't skip records,
+so if recovery inserts something after walproposer push, next will error out.
+OTOH, for better availability recovery should finish its job aligning all
+safekeepers even if compute is missing. So I propose to track on safekeeper
+existence of streaming compute. Recovery should kick in if 1) there is something
+to pull and 2) streaming compute doesn't exist. On each insert, compute presence
+is checked and recovery is terminated if it appeared. It also terminates if
+there is nothing more to pull.
+
+This should be good enough, though not bullet proof: in theory we can imagine
+recovery starting regularly before streaming started and inserting something
+after. Such loop is very unlikely though, we can add more heuristics if it shows
+up.
+
+## Alternatives
+
+An entirely different direction would be more granular WAL managing on computes
+-- don't hold a lot, but download and pass on demand to stale safekeepers. It
+seems of comparable complexity, but writing rust is more pleasant and less
+postgres version dependent.
--- a/libs/compute_api/src/responses.rs
+++ b/libs/compute_api/src/responses.rs
@@ -76,6 +76,11 @@ pub struct ComputeMetrics {
    pub start_postgres_ms: u64,
    pub config_ms: u64,
    pub total_startup_ms: u64,
+    pub load_ext_ms: u64,
+    pub num_ext_downloaded: u64,
+    pub largest_ext_size: u64, // these are measured in bytes
+    pub total_ext_download_size: u64,
+    pub prep_extensions_ms: u64,
 }

 /// Response of the `/computes/{compute_id}/spec` control-plane API.
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -60,6 +60,9 @@ pub struct ComputeSpec {
    /// If set, 'storage_auth_token' is used as the password to authenticate to
    /// the pageserver and safekeepers.
    pub storage_auth_token: Option<String>,
+
+    // list of prefixes to search for custom extensions in remote extension storage
+    pub custom_extensions: Option<Vec<String>>,
 }

 #[serde_as]
--- a/libs/consumption_metrics/src/lib.rs
+++ b/libs/consumption_metrics/src/lib.rs
@@ -5,7 +5,7 @@ use chrono::{DateTime, Utc};
 use rand::Rng;
 use serde::Serialize;

-#[derive(Serialize, Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
+#[derive(Serialize, Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)]
 #[serde(tag = "type")]
 pub enum EventType {
    #[serde(rename = "absolute")]
@@ -17,6 +17,32 @@ pub enum EventType {
    },
 }

+impl EventType {
+    /// Returns the timestamp of an `Absolute` event, `None` for any other variant.
+    pub fn absolute_time(&self) -> Option<&DateTime<Utc>> {
+        if let Self::Absolute { time } = self {
+            return Some(time);
+        }
+        None
+    }
+
+    /// Returns `start_time..stop_time` of an `Incremental` event, `None` for
+    /// any other variant.
+    pub fn incremental_timerange(&self) -> Option<std::ops::Range<&DateTime<Utc>>> {
+        // these can most likely be thought of as Range or RangeFull
+        let Self::Incremental {
+            start_time,
+            stop_time,
+        } = self
+        else {
+            return None;
+        };
+        Some(std::ops::Range {
+            start: start_time,
+            end: stop_time,
+        })
+    }
+
+    /// Returns `true` when this is an `Incremental` event.
+    pub fn is_incremental(&self) -> bool {
+        match self {
+            Self::Incremental { .. } => true,
+            _ => false,
+        }
+    }
+}
+
 #[derive(Serialize, Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
 pub struct Event<Extra> {
    #[serde(flatten)]
@@ -31,7 +57,7 @@ pub struct Event<Extra> {
    pub extra: Extra,
 }

-pub fn idempotency_key(node_id: String) -> String {
+pub fn idempotency_key(node_id: &str) -> String {
    format!(
        "{}-{}-{:04}",
        Utc::now(),
@@ -45,6 +71,6 @@ pub const CHUNK_SIZE: usize = 1000;
 // Just a wrapper around a slice of events
 // to serialize it as `{"events" : [ ] }
 #[derive(serde::Serialize)]
-pub struct EventChunk<'a, T> {
-    pub events: &'a [T],
+pub struct EventChunk<'a, T: Clone> {
+    pub events: std::borrow::Cow<'a, [T]>,
 }
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -65,6 +65,10 @@ impl RemotePath {
        Ok(Self(relative_path.to_path_buf()))
    }

+    /// Builds a `RemotePath` from a string slice by delegating to [`Self::new`].
+    ///
+    /// Returns whatever error [`Self::new`] reports for the given path.
+    pub fn from_string(relative_path: &str) -> anyhow::Result<Self> {
+        let as_path: &Path = relative_path.as_ref();
+        Self::new(as_path)
+    }
+
    pub fn with_base(&self, base_path: &Path) -> PathBuf {
        base_path.join(&self.0)
    }
@@ -190,6 +194,20 @@ pub enum GenericRemoteStorage {
 }

 impl GenericRemoteStorage {
+    /// Lists all the files in a "directory".
+    ///
+    /// Forwards the call to the `list_files` method of whichever storage
+    /// backend this value wraps.
+    ///
+    /// Example:
+    /// `list_files("foo/bar") = ["foo/bar/a.txt", "foo/bar/b.txt"]`
+    pub async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
+        match self {
+            Self::LocalFs(s) => s.list_files(folder).await,
+            Self::AwsS3(s) => s.list_files(folder).await,
+            Self::Unreliable(s) => s.list_files(folder).await,
+        }
+    }
+
+    // lists common *prefixes*, if any of files
+    // Example:
+    // list_prefixes("foo123","foo567","bar123","bar432") = ["foo", "bar"]
    pub async fn list_prefixes(
        &self,
        prefix: Option<&RemotePath>,
@@ -201,14 +219,6 @@ impl GenericRemoteStorage {
        }
    }

-    pub async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
-        match self {
-            Self::LocalFs(s) => s.list_files(folder).await,
-            Self::AwsS3(s) => s.list_files(folder).await,
-            Self::Unreliable(s) => s.list_files(folder).await,
-        }
-    }
-
    pub async fn upload(
        &self,
        from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
--- a/pageserver/ctl/src/layers.rs
+++ b/pageserver/ctl/src/layers.rs
@@ -68,7 +68,7 @@ async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
            true
        },
    )?;
-    let mut cursor = BlockCursor::new(&file);
+    let cursor = BlockCursor::new(&file);
    for (k, v) in all {
        let value = cursor.read_blob(v.pos())?;
        println!("key:{} value_len:{}", k, value.len());
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -7,27 +7,23 @@ use crate::context::{DownloadBehavior, RequestContext};
 use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
 use crate::tenant::{mgr, LogicalSizeCalculationCause};
 use anyhow;
-use chrono::Utc;
+use chrono::{DateTime, Utc};
 use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE};
 use pageserver_api::models::TenantState;
 use reqwest::Url;
 use serde::Serialize;
 use serde_with::{serde_as, DisplayFromStr};
 use std::collections::HashMap;
-use std::time::Duration;
+use std::sync::Arc;
+use std::time::{Duration, SystemTime};
 use tracing::*;
 use utils::id::{NodeId, TenantId, TimelineId};
-
-const WRITTEN_SIZE: &str = "written_size";
-const SYNTHETIC_STORAGE_SIZE: &str = "synthetic_storage_size";
-const RESIDENT_SIZE: &str = "resident_size";
-const REMOTE_STORAGE_SIZE: &str = "remote_storage_size";
-const TIMELINE_LOGICAL_SIZE: &str = "timeline_logical_size";
+use utils::lsn::Lsn;

 const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60);

 #[serde_as]
-#[derive(Serialize, Debug)]
+#[derive(Serialize, Debug, Clone, Copy)]
 struct Ids {
    #[serde_as(as = "DisplayFromStr")]
    tenant_id: TenantId,
@@ -38,10 +34,142 @@ struct Ids {

 /// Key that uniquely identifies the object, this metric describes.
 #[derive(Debug, Clone, PartialEq, Eq, Hash)]
-pub struct PageserverConsumptionMetricsKey {
-    pub tenant_id: TenantId,
-    pub timeline_id: Option<TimelineId>,
-    pub metric: &'static str,
+struct MetricsKey {
+    tenant_id: TenantId,
+    timeline_id: Option<TimelineId>,
+    metric: &'static str,
+}
+
+impl MetricsKey {
+    const fn absolute_values(self) -> AbsoluteValueFactory {
+        AbsoluteValueFactory(self)
+    }
+    const fn incremental_values(self) -> IncrementalValueFactory {
+        IncrementalValueFactory(self)
+    }
+}
+
+/// Helper type which each individual metric kind can return to produce only absolute values.
+struct AbsoluteValueFactory(MetricsKey);
+
+impl AbsoluteValueFactory {
+    fn at(self, time: DateTime<Utc>, val: u64) -> (MetricsKey, (EventType, u64)) {
+        let key = self.0;
+        (key, (EventType::Absolute { time }, val))
+    }
+}
+
+/// Helper type which each individual metric kind can return to produce only incremental values.
+struct IncrementalValueFactory(MetricsKey);
+
+impl IncrementalValueFactory {
+    #[allow(clippy::wrong_self_convention)]
+    fn from_previous_up_to(
+        self,
+        prev_end: DateTime<Utc>,
+        up_to: DateTime<Utc>,
+        val: u64,
+    ) -> (MetricsKey, (EventType, u64)) {
+        let key = self.0;
+        // cannot assert prev_end < up_to because these are realtime clock based
+        (
+            key,
+            (
+                EventType::Incremental {
+                    start_time: prev_end,
+                    stop_time: up_to,
+                },
+                val,
+            ),
+        )
+    }
+
+    fn key(&self) -> &MetricsKey {
+        &self.0
+    }
+}
+
+// the static part of a MetricsKey
+impl MetricsKey {
+    /// Absolute value of [`Timeline::get_last_record_lsn`].
+    ///
+    /// [`Timeline::get_last_record_lsn`]: crate::tenant::Timeline::get_last_record_lsn
+    const fn written_size(tenant_id: TenantId, timeline_id: TimelineId) -> AbsoluteValueFactory {
+        MetricsKey {
+            tenant_id,
+            timeline_id: Some(timeline_id),
+            metric: "written_size",
+        }
+        .absolute_values()
+    }
+
+    /// Values will be the difference of the latest [`MetricsKey::written_size`] to what we
+    /// previously sent, starting from the previously sent incremental time range ending at the
+    /// latest absolute measurement.
+    const fn written_size_delta(
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+    ) -> IncrementalValueFactory {
+        MetricsKey {
+            tenant_id,
+            timeline_id: Some(timeline_id),
+            // the name here is correctly about data not size, because that is what is wanted by
+            // downstream pipeline
+            metric: "written_data_bytes_delta",
+        }
+        .incremental_values()
+    }
+
+    /// Exact [`Timeline::get_current_logical_size`].
+    ///
+    /// [`Timeline::get_current_logical_size`]: crate::tenant::Timeline::get_current_logical_size
+    const fn timeline_logical_size(
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+    ) -> AbsoluteValueFactory {
+        MetricsKey {
+            tenant_id,
+            timeline_id: Some(timeline_id),
+            metric: "timeline_logical_size",
+        }
+        .absolute_values()
+    }
+
+    /// [`Tenant::remote_size`]
+    ///
+    /// [`Tenant::remote_size`]: crate::tenant::Tenant::remote_size
+    const fn remote_storage_size(tenant_id: TenantId) -> AbsoluteValueFactory {
+        MetricsKey {
+            tenant_id,
+            timeline_id: None,
+            metric: "remote_storage_size",
+        }
+        .absolute_values()
+    }
+
+    /// Sum of [`Timeline::resident_physical_size`] for each `Tenant`.
+    ///
+    /// [`Timeline::resident_physical_size`]: crate::tenant::Timeline::resident_physical_size
+    const fn resident_size(tenant_id: TenantId) -> AbsoluteValueFactory {
+        MetricsKey {
+            tenant_id,
+            timeline_id: None,
+            metric: "resident_size",
+        }
+        .absolute_values()
+    }
+
+    /// [`Tenant::cached_synthetic_size`] as refreshed by [`calculate_synthetic_size_worker`].
+    ///
+    /// [`Tenant::cached_synthetic_size`]: crate::tenant::Tenant::cached_synthetic_size
+    const fn synthetic_size(tenant_id: TenantId) -> AbsoluteValueFactory {
+        MetricsKey {
+            tenant_id,
+            timeline_id: None,
+            metric: "synthetic_storage_size",
+        }
+        .absolute_values()
+    }
 }

 /// Main thread that serves metrics collection
@@ -79,7 +207,7 @@ pub async fn collect_metrics(
        .timeout(DEFAULT_HTTP_REPORTING_TIMEOUT)
        .build()
        .expect("Failed to create http client with timeout");
-    let mut cached_metrics: HashMap<PageserverConsumptionMetricsKey, u64> = HashMap::new();
+    let mut cached_metrics = HashMap::new();
    let mut prev_iteration_time: std::time::Instant = std::time::Instant::now();

    loop {
@@ -119,15 +247,15 @@ pub async fn collect_metrics(
 ///
 /// TODO
 /// - refactor this function (chunking+sending part) to reuse it in proxy module;
-pub async fn collect_metrics_iteration(
+async fn collect_metrics_iteration(
    client: &reqwest::Client,
-    cached_metrics: &mut HashMap<PageserverConsumptionMetricsKey, u64>,
+    cached_metrics: &mut HashMap<MetricsKey, (EventType, u64)>,
    metric_collection_endpoint: &reqwest::Url,
    node_id: NodeId,
    ctx: &RequestContext,
    send_cached: bool,
 ) {
-    let mut current_metrics: Vec<(PageserverConsumptionMetricsKey, u64)> = Vec::new();
+    let mut current_metrics: Vec<(MetricsKey, (EventType, u64))> = Vec::new();
    trace!(
        "starting collect_metrics_iteration. metric_collection_endpoint: {}",
        metric_collection_endpoint
@@ -161,99 +289,65 @@ pub async fn collect_metrics_iteration(
        let mut tenant_resident_size = 0;

        // iterate through list of timelines in tenant
-        for timeline in tenant.list_timelines().iter() {
+        for timeline in tenant.list_timelines() {
            // collect per-timeline metrics only for active timelines
-            if timeline.is_active() {
-                let timeline_written_size = u64::from(timeline.get_last_record_lsn());

-                current_metrics.push((
-                    PageserverConsumptionMetricsKey {
+            let timeline_id = timeline.timeline_id;
+
+            match TimelineSnapshot::collect(&timeline, ctx) {
+                Ok(Some(snap)) => {
+                    snap.to_metrics(
                        tenant_id,
-                        timeline_id: Some(timeline.timeline_id),
-                        metric: WRITTEN_SIZE,
-                    },
-                    timeline_written_size,
-                ));
-
-                let span = info_span!("collect_metrics_iteration", tenant_id = %timeline.tenant_id, timeline_id = %timeline.timeline_id);
-                match span.in_scope(|| timeline.get_current_logical_size(ctx)) {
-                    // Only send timeline logical size when it is fully calculated.
-                    Ok((size, is_exact)) if is_exact => {
-                        current_metrics.push((
-                            PageserverConsumptionMetricsKey {
-                                tenant_id,
-                                timeline_id: Some(timeline.timeline_id),
-                                metric: TIMELINE_LOGICAL_SIZE,
-                            },
-                            size,
-                        ));
-                    }
-                    Ok((_, _)) => {}
-                    Err(err) => {
-                        error!(
-                            "failed to get current logical size for timeline {}: {err:?}",
-                            timeline.timeline_id
-                        );
-                        continue;
-                    }
-                };
+                        timeline_id,
+                        Utc::now(),
+                        &mut current_metrics,
+                        cached_metrics,
+                    );
+                }
+                Ok(None) => {}
+                Err(e) => {
+                    error!(
+                        "failed to get metrics values for tenant {tenant_id} timeline {}: {e:#?}",
+                        timeline.timeline_id
+                    );
+                    continue;
+                }
            }

-            let timeline_resident_size = timeline.get_resident_physical_size();
-            tenant_resident_size += timeline_resident_size;
+            tenant_resident_size += timeline.resident_physical_size();
        }

-        match tenant.get_remote_size().await {
-            Ok(tenant_remote_size) => {
-                current_metrics.push((
-                    PageserverConsumptionMetricsKey {
-                        tenant_id,
-                        timeline_id: None,
-                        metric: REMOTE_STORAGE_SIZE,
-                    },
-                    tenant_remote_size,
-                ));
-            }
-            Err(err) => {
-                error!(
-                    "failed to get remote size for tenant {}: {err:?}",
-                    tenant_id
-                );
-            }
-        }
+        current_metrics
+            .push(MetricsKey::remote_storage_size(tenant_id).at(Utc::now(), tenant.remote_size()));

-        current_metrics.push((
-            PageserverConsumptionMetricsKey {
-                tenant_id,
-                timeline_id: None,
-                metric: RESIDENT_SIZE,
-            },
-            tenant_resident_size,
-        ));
+        current_metrics
+            .push(MetricsKey::resident_size(tenant_id).at(Utc::now(), tenant_resident_size));

        // Note that this metric is calculated in a separate bgworker
        // Here we only use cached value, which may lag behind the real latest one
-        let tenant_synthetic_size = tenant.get_cached_synthetic_size();
+        let synthetic_size = tenant.cached_synthetic_size();

-        if tenant_synthetic_size != 0 {
+        if synthetic_size != 0 {
            // only send non-zeroes because otherwise these show up as errors in logs
-            current_metrics.push((
-                PageserverConsumptionMetricsKey {
-                    tenant_id,
-                    timeline_id: None,
-                    metric: SYNTHETIC_STORAGE_SIZE,
-                },
-                tenant_synthetic_size,
-            ));
+            current_metrics
+                .push(MetricsKey::synthetic_size(tenant_id).at(Utc::now(), synthetic_size));
        }
    }

    // Filter metrics, unless we want to send all metrics, including cached ones.
    // See: https://github.com/neondatabase/neon/issues/3485
    if !send_cached {
-        current_metrics.retain(|(curr_key, curr_val)| match cached_metrics.get(curr_key) {
-            Some(val) => val != curr_val,
-            None => true,
+        current_metrics.retain(|(curr_key, (kind, curr_val))| {
+            if kind.is_incremental() {
+                // incremental values (currently only written_size_delta) should not get any cache
+                // deduplication because they will be used by upstream for "is still alive."
+                true
+            } else {
+                match cached_metrics.get(curr_key) {
+                    Some((_, val)) => val != curr_val,
+                    None => true,
+                }
+            }
        });
    }

@@ -268,14 +362,16 @@ pub async fn collect_metrics_iteration(

    let mut chunk_to_send: Vec<Event<Ids>> = Vec::with_capacity(CHUNK_SIZE);

+    let node_id = node_id.to_string();
+
    for chunk in chunks {
        chunk_to_send.clear();

        // enrich metrics with type,timestamp and idempotency key before sending
-        chunk_to_send.extend(chunk.iter().map(|(curr_key, curr_val)| Event {
-            kind: EventType::Absolute { time: Utc::now() },
+        chunk_to_send.extend(chunk.iter().map(|(curr_key, (when, curr_val))| Event {
+            kind: *when,
            metric: curr_key.metric,
-            idempotency_key: idempotency_key(node_id.to_string()),
+            idempotency_key: idempotency_key(&node_id),
            value: *curr_val,
            extra: Ids {
                tenant_id: curr_key.tenant_id,
@@ -283,17 +379,14 @@ pub async fn collect_metrics_iteration(
            },
        }));

-        let chunk_json = serde_json::value::to_raw_value(&EventChunk {
-            events: &chunk_to_send,
-        })
-        .expect("PageserverConsumptionMetric should not fail serialization");
-
        const MAX_RETRIES: u32 = 3;

        for attempt in 0..MAX_RETRIES {
            let res = client
                .post(metric_collection_endpoint.clone())
-                .json(&chunk_json)
+                .json(&EventChunk {
+                    events: (&chunk_to_send).into(),
+                })
                .send()
                .await;

@@ -329,6 +422,130 @@ pub async fn collect_metrics_iteration(
    }
 }

+/// Internal type to make timeline metric production testable.
+///
+/// As this value type contains all of the information needed from a timeline to produce the
+/// metrics, it can easily be created with different values in test.
+struct TimelineSnapshot {
+    loaded_at: (Lsn, SystemTime),
+    last_record_lsn: Lsn,
+    current_exact_logical_size: Option<u64>,
+}
+
+impl TimelineSnapshot {
+    /// Collect the metrics from an actual timeline.
+    ///
+    /// Fails currently only when [`Timeline::get_current_logical_size`] fails.
+    ///
+    /// [`Timeline::get_current_logical_size`]: crate::tenant::Timeline::get_current_logical_size
+    fn collect(
+        t: &Arc<crate::tenant::Timeline>,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<Option<Self>> {
+        use anyhow::Context;
+
+        if !t.is_active() {
+            // no collection is needed for broken or stopping timelines; the caller will
+            // still keep the cached values.
+            Ok(None)
+        } else {
+            let loaded_at = t.loaded_at;
+            let last_record_lsn = t.get_last_record_lsn();
+
+            let current_exact_logical_size = {
+                let span = info_span!("collect_metrics_iteration", tenant_id = %t.tenant_id, timeline_id = %t.timeline_id);
+                let res = span
+                    .in_scope(|| t.get_current_logical_size(ctx))
+                    .context("get_current_logical_size");
+                match res? {
+                    // Only send timeline logical size when it is fully calculated.
+                    (size, is_exact) if is_exact => Some(size),
+                    (_, _) => None,
+                }
+            };
+
+            Ok(Some(TimelineSnapshot {
+                loaded_at,
+                last_record_lsn,
+                current_exact_logical_size,
+            }))
+        }
+    }
+
+    /// Produce the timeline consumption metrics into the `metrics` argument.
+    fn to_metrics(
+        &self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        now: DateTime<Utc>,
+        metrics: &mut Vec<(MetricsKey, (EventType, u64))>,
+        cache: &HashMap<MetricsKey, (EventType, u64)>,
+    ) {
+        let timeline_written_size = u64::from(self.last_record_lsn);
+
+        let (key, written_size_now) =
+            MetricsKey::written_size(tenant_id, timeline_id).at(now, timeline_written_size);
+
+        // last_record_lsn can only go up, right now at least, TODO: #2592 or related
+        // features might change this.
+
+        let written_size_delta_key = MetricsKey::written_size_delta(tenant_id, timeline_id);
+
+        // use this when available, because in a stream of incremental values, it will be
+        // accurate, whereas when last_record_lsn stops moving, we will only cache the last
+        // one of those.
+        let last_stop_time = cache
+            .get(written_size_delta_key.key())
+            .map(|(until, _val)| {
+                until
+                    .incremental_timerange()
+                    .expect("never create EventType::Absolute for written_size_delta")
+                    .end
+            });
+
+        // by default, use the last sent written_size as the basis for
+        // calculating the delta. if we don't yet have one, use the load time value.
+        let prev = cache
+            .get(&key)
+            .map(|(prev_at, prev)| {
+                // use the prev time from our last incremental update, or default to latest
+                // absolute update on the first round.
+                let prev_at = prev_at
+                    .absolute_time()
+                    .expect("never create EventType::Incremental for written_size");
+                let prev_at = last_stop_time.unwrap_or(prev_at);
+                (*prev_at, *prev)
+            })
+            .unwrap_or_else(|| {
+                // if we don't have a previous point of comparison, compare to the load time
+                // lsn.
+                let (disk_consistent_lsn, loaded_at) = &self.loaded_at;
+                (DateTime::from(*loaded_at), disk_consistent_lsn.0)
+            });
+
+        // written_data_bytes_delta
+        metrics.extend(
+            if let Some(delta) = written_size_now.1.checked_sub(prev.1) {
+                let up_to = written_size_now
+                    .0
+                    .absolute_time()
+                    .expect("never create EventType::Incremental for written_size");
+                let key_value = written_size_delta_key.from_previous_up_to(prev.0, *up_to, delta);
+                Some(key_value)
+            } else {
+                None
+            },
+        );
+
+        // written_size
+        metrics.push((key, written_size_now));
+
+        if let Some(size) = self.current_exact_logical_size {
+            metrics.push(MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, size));
+        }
+    }
+}
+
 /// Caclculate synthetic size for each active tenant
 pub async fn calculate_synthetic_size_worker(
    synthetic_size_calculation_interval: Duration,
@@ -343,7 +560,7 @@ pub async fn calculate_synthetic_size_worker(
            _ = task_mgr::shutdown_watcher() => {
                return Ok(());
            },
-        tick_at = ticker.tick() => {
+            tick_at = ticker.tick() => {

                let tenants = match mgr::list_tenants().await {
                    Ok(tenants) => tenants,
@@ -379,3 +596,149 @@ pub async fn calculate_synthetic_size_worker(
        }
    }
 }
+
+#[cfg(test)]
+mod tests {
+    use std::collections::HashMap;
+
+    use std::time::SystemTime;
+    use utils::{
+        id::{TenantId, TimelineId},
+        lsn::Lsn,
+    };
+
+    use crate::consumption_metrics::MetricsKey;
+
+    use super::TimelineSnapshot;
+    use chrono::{DateTime, Utc};
+
+    #[test]
+    fn startup_collected_timeline_metrics_before_advancing() {
+        let tenant_id = TenantId::generate();
+        let timeline_id = TimelineId::generate();
+
+        let mut metrics = Vec::new();
+        let cache = HashMap::new();
+
+        let initdb_lsn = Lsn(0x10000);
+        let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
+
+        let snap = TimelineSnapshot {
+            loaded_at: (disk_consistent_lsn, SystemTime::now()),
+            last_record_lsn: disk_consistent_lsn,
+            current_exact_logical_size: Some(0x42000),
+        };
+
+        let now = DateTime::<Utc>::from(SystemTime::now());
+
+        snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
+
+        assert_eq!(
+            metrics,
+            &[
+                MetricsKey::written_size_delta(tenant_id, timeline_id).from_previous_up_to(
+                    snap.loaded_at.1.into(),
+                    now,
+                    0
+                ),
+                MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
+                MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
+            ]
+        );
+    }
+
+    #[test]
+    fn startup_collected_timeline_metrics_second_round() {
+        let tenant_id = TenantId::generate();
+        let timeline_id = TimelineId::generate();
+
+        let [now, before, init] = time_backwards();
+
+        let now = DateTime::<Utc>::from(now);
+        let before = DateTime::<Utc>::from(before);
+
+        let initdb_lsn = Lsn(0x10000);
+        let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
+
+        let mut metrics = Vec::new();
+        let cache = HashMap::from([
+            MetricsKey::written_size(tenant_id, timeline_id).at(before, disk_consistent_lsn.0)
+        ]);
+
+        let snap = TimelineSnapshot {
+            loaded_at: (disk_consistent_lsn, init),
+            last_record_lsn: disk_consistent_lsn,
+            current_exact_logical_size: Some(0x42000),
+        };
+
+        snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
+
+        assert_eq!(
+            metrics,
+            &[
+                MetricsKey::written_size_delta(tenant_id, timeline_id)
+                    .from_previous_up_to(before, now, 0),
+                MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
+                MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
+            ]
+        );
+    }
+
+    #[test]
+    fn startup_collected_timeline_metrics_nth_round_at_same_lsn() {
+        let tenant_id = TenantId::generate();
+        let timeline_id = TimelineId::generate();
+
+        let [now, just_before, before, init] = time_backwards();
+
+        let now = DateTime::<Utc>::from(now);
+        let just_before = DateTime::<Utc>::from(just_before);
+        let before = DateTime::<Utc>::from(before);
+
+        let initdb_lsn = Lsn(0x10000);
+        let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
+
+        let mut metrics = Vec::new();
+        let cache = HashMap::from([
+            // at t=before was the last time the last_record_lsn changed
+            MetricsKey::written_size(tenant_id, timeline_id).at(before, disk_consistent_lsn.0),
+            // end time of this event is used for the next ones
+            MetricsKey::written_size_delta(tenant_id, timeline_id).from_previous_up_to(
+                before,
+                just_before,
+                0,
+            ),
+        ]);
+
+        let snap = TimelineSnapshot {
+            loaded_at: (disk_consistent_lsn, init),
+            last_record_lsn: disk_consistent_lsn,
+            current_exact_logical_size: Some(0x42000),
+        };
+
+        snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
+
+        assert_eq!(
+            metrics,
+            &[
+                MetricsKey::written_size_delta(tenant_id, timeline_id).from_previous_up_to(
+                    just_before,
+                    now,
+                    0
+                ),
+                MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
+                MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
+            ]
+        );
+    }
+
+    fn time_backwards<const N: usize>() -> [std::time::SystemTime; N] {
+        let mut times = [std::time::SystemTime::UNIX_EPOCH; N];
+        times[0] = std::time::SystemTime::now();
+        for behind in 1..N {
+            times[behind] = times[0] - std::time::Duration::from_secs(behind as u64);
+        }
+
+        times
+    }
+}
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -644,20 +644,19 @@ impl Tenant {
        Ok(())
    }

-    /// get size of all remote timelines
+    /// Get the sum of all remote timeline sizes
    ///
    /// This function relies on the index_part instead of listing the remote storage
-    ///
-    pub async fn get_remote_size(&self) -> anyhow::Result<u64> {
+    pub fn remote_size(&self) -> u64 {
        let mut size = 0;

-        for timeline in self.list_timelines().iter() {
+        for timeline in self.list_timelines() {
            if let Some(remote_client) = &timeline.remote_client {
                size += remote_client.get_remote_physical_size();
            }
        }

-        Ok(size)
+        size
    }

    #[instrument(skip_all, fields(timeline_id=%timeline_id))]
@@ -2889,7 +2888,7 @@ impl Tenant {
            .set(size);
    }

-    pub fn get_cached_synthetic_size(&self) -> u64 {
+    pub fn cached_synthetic_size(&self) -> u64 {
        self.cached_synthetic_tenant_size.load(Ordering::Relaxed)
    }
 }
--- a/pageserver/src/tenant/blob_io.rs
+++ b/pageserver/src/tenant/blob_io.rs
@@ -21,7 +21,7 @@ where
    R: BlockReader,
 {
    /// Read a blob into a new buffer.
-    pub fn read_blob(&mut self, offset: u64) -> Result<Vec<u8>, std::io::Error> {
+    pub fn read_blob(&self, offset: u64) -> Result<Vec<u8>, std::io::Error> {
        let mut buf = Vec::new();
        self.read_blob_into_buf(offset, &mut buf)?;
        Ok(buf)
@@ -29,7 +29,7 @@ where
    /// Read blob into the given buffer. Any previous contents in the buffer
    /// are overwritten.
    pub fn read_blob_into_buf(
-        &mut self,
+        &self,
        offset: u64,
        dstbuf: &mut Vec<u8>,
    ) -> Result<(), std::io::Error> {
--- a/pageserver/src/tenant/block_io.rs
+++ b/pageserver/src/tenant/block_io.rs
@@ -80,7 +80,7 @@ where
        BlockCursor { reader }
    }

-    pub fn read_blk(&mut self, blknum: u32) -> Result<R::BlockLease, std::io::Error> {
+    pub fn read_blk(&self, blknum: u32) -> Result<R::BlockLease, std::io::Error> {
        self.reader.read_blk(blknum)
    }
 }
--- a/pageserver/src/tenant/disk_btree.rs
+++ b/pageserver/src/tenant/disk_btree.rs
@@ -390,39 +390,42 @@ where
    }

    #[allow(dead_code)]
-    pub fn dump(&self) -> Result<()> {
-        self.dump_recurse(self.root_blk, &[], 0)
-    }
+    pub async fn dump(&self) -> Result<()> {
+        let mut stack = Vec::new();

-    fn dump_recurse(&self, blknum: u32, path: &[u8], depth: usize) -> Result<()> {
-        let blk = self.reader.read_blk(self.start_blk + blknum)?;
-        let buf: &[u8] = blk.as_ref();
+        stack.push((self.root_blk, String::new(), 0, 0, 0));

-        let node = OnDiskNode::<L>::deparse(buf)?;
+        while let Some((blknum, path, depth, child_idx, key_off)) = stack.pop() {
+            let blk = self.reader.read_blk(self.start_blk + blknum)?;
+            let buf: &[u8] = blk.as_ref();
+            let node = OnDiskNode::<L>::deparse(buf)?;

-        print!("{:indent$}", "", indent = depth * 2);
-        println!(
-            "blk #{}: path {}: prefix {}, suffix_len {}",
-            blknum,
-            hex::encode(path),
-            hex::encode(node.prefix),
-            node.suffix_len
-        );
+            if child_idx == 0 {
+                print!("{:indent$}", "", indent = depth * 2);
+                let path_prefix = stack
+                    .iter()
+                    .map(|(_blknum, path, ..)| path.as_str())
+                    .collect::<String>();
+                println!(
+                    "blk #{blknum}: path {path_prefix}{path}: prefix {}, suffix_len {}",
+                    hex::encode(node.prefix),
+                    node.suffix_len
+                );
+            }

-        let mut idx = 0;
-        let mut key_off = 0;
-        while idx < node.num_children {
+            if child_idx + 1 < node.num_children {
+                let key_off = key_off + node.suffix_len as usize;
+                stack.push((blknum, path.clone(), depth, child_idx + 1, key_off));
+            }
            let key = &node.keys[key_off..key_off + node.suffix_len as usize];
-            let val = node.value(idx as usize);
+            let val = node.value(child_idx as usize);
+
            print!("{:indent$}", "", indent = depth * 2 + 2);
            println!("{}: {}", hex::encode(key), hex::encode(val.0));

            if node.level > 0 {
-                let child_path = [path, node.prefix].concat();
-                self.dump_recurse(val.to_blknum(), &child_path, depth + 1)?;
+                stack.push((val.to_blknum(), hex::encode(node.prefix), depth + 1, 0, 0));
            }
-            idx += 1;
-            key_off += node.suffix_len as usize;
        }
        Ok(())
    }
@@ -754,8 +757,8 @@ mod tests {
        }
    }

-    #[test]
-    fn basic() -> Result<()> {
+    #[tokio::test]
+    async fn basic() -> Result<()> {
        let mut disk = TestDisk::new();
        let mut writer = DiskBtreeBuilder::<_, 6>::new(&mut disk);

@@ -775,7 +778,7 @@ mod tests {

        let reader = DiskBtreeReader::new(0, root_offset, disk);

-        reader.dump()?;
+        reader.dump().await?;

        // Test the `get` function on all the keys.
        for (key, val) in all_data.iter() {
@@ -835,8 +838,8 @@ mod tests {
        Ok(())
    }

-    #[test]
-    fn lots_of_keys() -> Result<()> {
+    #[tokio::test]
+    async fn lots_of_keys() -> Result<()> {
        let mut disk = TestDisk::new();
        let mut writer = DiskBtreeBuilder::<_, 8>::new(&mut disk);

@@ -856,7 +859,7 @@ mod tests {

        let reader = DiskBtreeReader::new(0, root_offset, disk);

-        reader.dump()?;
+        reader.dump().await?;

        use std::sync::Mutex;

@@ -994,8 +997,8 @@ mod tests {
    ///
    /// This test contains a particular data set, see disk_btree_test_data.rs
    ///
-    #[test]
-    fn particular_data() -> Result<()> {
+    #[tokio::test]
+    async fn particular_data() -> Result<()> {
        // Build a tree from it
        let mut disk = TestDisk::new();
        let mut writer = DiskBtreeBuilder::<_, 26>::new(&mut disk);
@@ -1022,7 +1025,7 @@ mod tests {
        })?;
        assert_eq!(count, disk_btree_test_data::TEST_DATA.len());

-        reader.dump()?;
+        reader.dump().await?;

        Ok(())
    }
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -420,7 +420,7 @@ mod tests {
            blobs.push((pos, data));
        }

-        let mut cursor = BlockCursor::new(&file);
+        let cursor = BlockCursor::new(&file);
        for (pos, expected) in blobs {
            let actual = cursor.read_blob(pos)?;
            assert_eq!(actual, expected);
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -223,6 +223,45 @@ mod tests {
        assert_eq!(part, expected);
    }

+    #[test]
+    fn v2_indexpart_is_parsed_with_deleted_at() {
+        let example = r#"{
+            "version":2,
+            "timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"],
+            "missing_layers":["This shouldn't fail deserialization"],
+            "layer_metadata":{
+                "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
+                "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 }
+            },
+            "disk_consistent_lsn":"0/16960E8",
+            "metadata_bytes":[112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
+            "deleted_at": "2023-07-31T09:00:00.123"
+        }"#;
+
+        let expected = IndexPart {
+            // note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
+            version: 2,
+            timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap()]),
+            layer_metadata: HashMap::from([
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
+                    file_size: 25600000,
+                }),
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
+                    // 2^53 + 9: serde_json always parses this exactly, but tools that use doubles
+                    // (jq, for example) may not preserve it.
+                    file_size: 9007199254741001,
+                })
+            ]),
+            disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
+            metadata_bytes: [112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
+            deleted_at: Some(chrono::NaiveDateTime::parse_from_str(
+                "2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap())
+        };
+
+        let part = serde_json::from_str::<IndexPart>(example).unwrap();
+        assert_eq!(part, expected);
+    }
+
    #[test]
    fn empty_layers_are_parsed() {
        let empty_layers_json = r#"{
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -9,7 +9,7 @@ mod remote_layer;

 use crate::config::PageServerConf;
 use crate::context::RequestContext;
-use crate::repository::{Key, Value};
+use crate::repository::Key;
 use crate::task_mgr::TaskKind;
 use crate::walrecord::NeonWalRecord;
 use anyhow::Result;
@@ -34,7 +34,7 @@ use utils::{
    lsn::Lsn,
 };

-pub use delta_layer::{DeltaLayer, DeltaLayerWriter};
+pub use delta_layer::{DeltaLayer, DeltaLayerWriter, ValueRef};
 pub use filename::{DeltaFileName, ImageFileName, LayerFileName};
 pub use image_layer::{ImageLayer, ImageLayerWriter};
 pub use inmemory_layer::InMemoryLayer;
@@ -381,12 +381,6 @@ pub trait Layer: std::fmt::Debug + std::fmt::Display + Send + Sync + 'static {
    async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()>;
 }

-/// Returned by [`PersistentLayer::iter`]
-pub type LayerIter<'i> = Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>> + 'i + Send>;
-
-/// Returned by [`PersistentLayer::key_iter`]
-pub type LayerKeyIter<'i> = Box<dyn Iterator<Item = (Key, Lsn, u64)> + 'i + Send>;
-
 /// Get a layer descriptor from a layer.
 pub trait AsLayerDesc {
    /// Get the layer descriptor.
@@ -427,15 +421,6 @@ pub trait PersistentLayer: Layer + AsLayerDesc {
    // `None` for `RemoteLayer`.
    fn local_path(&self) -> Option<PathBuf>;

-    /// Iterate through all keys and values stored in the layer
-    fn iter(&self, ctx: &RequestContext) -> Result<LayerIter<'_>>;
-
-    /// Iterate through all keys stored in the layer. Returns key, lsn and value size
-    /// It is used only for compaction and so is currently implemented only for DeltaLayer
-    fn key_iter(&self, _ctx: &RequestContext) -> Result<LayerKeyIter<'_>> {
-        panic!("Not implemented")
-    }
-
    /// Permanently remove this layer from disk.
    fn delete_resident_layer_file(&self) -> Result<()>;

--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -61,8 +61,8 @@ use utils::{
 };

 use super::{
-    AsLayerDesc, DeltaFileName, Layer, LayerAccessStats, LayerAccessStatsReset, LayerIter,
-    LayerKeyIter, PathOrConf, PersistentLayerDesc,
+    AsLayerDesc, DeltaFileName, Layer, LayerAccessStats, LayerAccessStatsReset, PathOrConf,
+    PersistentLayerDesc,
 };

 ///
@@ -189,7 +189,7 @@ pub struct DeltaLayer {

    access_stats: LayerAccessStats,

-    inner: OnceCell<DeltaLayerInner>,
+    inner: OnceCell<Arc<DeltaLayerInner>>,
 }

 impl std::fmt::Debug for DeltaLayer {
@@ -256,12 +256,12 @@ impl Layer for DeltaLayer {
            file,
        );

-        tree_reader.dump()?;
+        tree_reader.dump().await?;

-        let mut cursor = file.block_cursor();
+        let cursor = file.block_cursor();

        // A subroutine to dump a single blob
-        let mut dump_blob = |blob_ref: BlobRef| -> anyhow::Result<String> {
+        let dump_blob = |blob_ref: BlobRef| -> anyhow::Result<String> {
            let buf = cursor.read_blob(blob_ref.pos())?;
            let val = Value::des(&buf)?;
            let desc = match val {
@@ -343,7 +343,7 @@ impl Layer for DeltaLayer {
            })?;

            // Ok, 'offsets' now contains the offsets of all the entries we need to read
-            let mut cursor = file.block_cursor();
+            let cursor = file.block_cursor();
            let mut buf = Vec::new();
            for (entry_lsn, pos) in offsets {
                cursor.read_blob_into_buf(pos, &mut buf).with_context(|| {
@@ -424,23 +424,6 @@ impl PersistentLayer for DeltaLayer {
        Some(self.path())
    }

-    fn iter(&self, ctx: &RequestContext) -> Result<LayerIter<'_>> {
-        let inner = self
-            .load(LayerAccessKind::KeyIter, ctx)
-            .context("load delta layer")?;
-        Ok(match DeltaValueIter::new(inner) {
-            Ok(iter) => Box::new(iter),
-            Err(err) => Box::new(std::iter::once(Err(err))),
-        })
-    }
-
-    fn key_iter(&self, ctx: &RequestContext) -> Result<LayerKeyIter<'_>> {
-        let inner = self.load(LayerAccessKind::KeyIter, ctx)?;
-        Ok(Box::new(
-            DeltaKeyIter::new(inner).context("Layer index is corrupted")?,
-        ))
-    }
-
    fn delete_resident_layer_file(&self) -> Result<()> {
        // delete underlying file
        fs::remove_file(self.path())?;
@@ -510,7 +493,11 @@ impl DeltaLayer {
    /// Open the underlying file and read the metadata into memory, if it's
    /// not loaded already.
    ///
-    fn load(&self, access_kind: LayerAccessKind, ctx: &RequestContext) -> Result<&DeltaLayerInner> {
+    fn load(
+        &self,
+        access_kind: LayerAccessKind,
+        ctx: &RequestContext,
+    ) -> Result<&Arc<DeltaLayerInner>> {
        self.access_stats
            .record_access(access_kind, ctx.task_kind());
        // Quick exit if already loaded
@@ -519,7 +506,7 @@ impl DeltaLayer {
            .with_context(|| format!("Failed to load delta layer {}", self.path().display()))
    }

-    fn load_inner(&self) -> Result<DeltaLayerInner> {
+    fn load_inner(&self) -> Result<Arc<DeltaLayerInner>> {
        let path = self.path();

        let file = VirtualFile::open(&path)
@@ -554,11 +541,11 @@ impl DeltaLayer {

        debug!("loaded from {}", &path.display());

-        Ok(DeltaLayerInner {
+        Ok(Arc::new(DeltaLayerInner {
            file,
            index_start_blk: actual_summary.index_start_blk,
            index_root_blk: actual_summary.index_root_blk,
-        })
+        }))
    }

    /// Create a DeltaLayer struct representing an existing file on disk.
@@ -623,6 +610,24 @@ impl DeltaLayer {
            &self.layer_name(),
        )
    }
+
+    /// Obtains all keys and value references stored in the layer
+    ///
+    /// The value can be obtained via the [`ValueRef::load`] function.
+    pub fn load_val_refs(&self, ctx: &RequestContext) -> Result<Vec<(Key, Lsn, ValueRef)>> {
+        let inner = self
+            .load(LayerAccessKind::KeyIter, ctx)
+            .context("load delta layer")?;
+        DeltaLayerInner::load_val_refs(inner).context("Layer index is corrupted")
+    }
+
+    /// Loads all keys stored in the layer. Returns key, lsn and value size.
+    pub fn load_keys(&self, ctx: &RequestContext) -> Result<Vec<(Key, Lsn, u64)>> {
+        let inner = self
+            .load(LayerAccessKind::KeyIter, ctx)
+            .context("load delta layer keys")?;
+        inner.load_keys().context("Layer index is corrupted")
+    }
 }

 /// A builder object for constructing a new delta layer.
@@ -893,121 +898,41 @@ impl Drop for DeltaLayerWriter {
    }
 }

-///
-/// Iterator over all key-value pairse stored in a delta layer
-///
-/// FIXME: This creates a Vector to hold the offsets of all key value pairs.
-/// That takes up quite a lot of memory. Should do this in a more streaming
-/// fashion.
-///
-struct DeltaValueIter<'a> {
-    all_offsets: Vec<(DeltaKey, BlobRef)>,
-    next_idx: usize,
-    reader: BlockCursor<Adapter<'a>>,
-}
-
-struct Adapter<'a>(&'a DeltaLayerInner);
-
-impl<'a> BlockReader for Adapter<'a> {
-    type BlockLease = PageReadGuard<'static>;
-
-    fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, std::io::Error> {
-        self.0.file.read_blk(blknum)
-    }
-}
-
-impl<'a> Iterator for DeltaValueIter<'a> {
-    type Item = Result<(Key, Lsn, Value)>;
-
-    fn next(&mut self) -> Option<Self::Item> {
-        self.next_res().transpose()
-    }
-}
-
-impl<'a> DeltaValueIter<'a> {
-    fn new(inner: &'a DeltaLayerInner) -> Result<Self> {
-        let file = &inner.file;
+impl DeltaLayerInner {
+    fn load_val_refs(this: &Arc<DeltaLayerInner>) -> Result<Vec<(Key, Lsn, ValueRef)>> {
+        let file = &this.file;
        let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
-            inner.index_start_blk,
-            inner.index_root_blk,
+            this.index_start_blk,
+            this.index_root_blk,
            file,
        );

-        let mut all_offsets: Vec<(DeltaKey, BlobRef)> = Vec::new();
+        let mut all_offsets = Vec::<(Key, Lsn, ValueRef)>::new();
        tree_reader.visit(
            &[0u8; DELTA_KEY_SIZE],
            VisitDirection::Forwards,
            |key, value| {
-                all_offsets.push((DeltaKey::from_slice(key), BlobRef(value)));
+                let delta_key = DeltaKey::from_slice(key);
+                let val_ref = ValueRef {
+                    blob_ref: BlobRef(value),
+                    reader: BlockCursor::new(Adapter(this.clone())),
+                };
+                all_offsets.push((delta_key.key(), delta_key.lsn(), val_ref));
                true
            },
        )?;

-        let iter = DeltaValueIter {
-            all_offsets,
-            next_idx: 0,
-            reader: BlockCursor::new(Adapter(inner)),
-        };
-
-        Ok(iter)
+        Ok(all_offsets)
    }
-
-    fn next_res(&mut self) -> Result<Option<(Key, Lsn, Value)>> {
-        if self.next_idx < self.all_offsets.len() {
-            let (delta_key, blob_ref) = &self.all_offsets[self.next_idx];
-
-            let key = delta_key.key();
-            let lsn = delta_key.lsn();
-
-            let buf = self.reader.read_blob(blob_ref.pos())?;
-            let val = Value::des(&buf)?;
-            self.next_idx += 1;
-            Ok(Some((key, lsn, val)))
-        } else {
-            Ok(None)
-        }
-    }
-}
-///
-/// Iterator over all keys stored in a delta layer
-///
-/// FIXME: This creates a Vector to hold all keys.
-/// That takes up quite a lot of memory. Should do this in a more streaming
-/// fashion.
-///
-struct DeltaKeyIter {
-    all_keys: Vec<(DeltaKey, u64)>,
-    next_idx: usize,
-}
-
-impl Iterator for DeltaKeyIter {
-    type Item = (Key, Lsn, u64);
-
-    fn next(&mut self) -> Option<Self::Item> {
-        if self.next_idx < self.all_keys.len() {
-            let (delta_key, size) = &self.all_keys[self.next_idx];
-
-            let key = delta_key.key();
-            let lsn = delta_key.lsn();
-
-            self.next_idx += 1;
-            Some((key, lsn, *size))
-        } else {
-            None
-        }
-    }
-}
-
-impl<'a> DeltaKeyIter {
-    fn new(inner: &'a DeltaLayerInner) -> Result<Self> {
-        let file = &inner.file;
+    fn load_keys(&self) -> Result<Vec<(Key, Lsn, u64)>> {
+        let file = &self.file;
        let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
-            inner.index_start_blk,
-            inner.index_root_blk,
+            self.index_start_blk,
+            self.index_root_blk,
            file,
        );

-        let mut all_keys: Vec<(DeltaKey, u64)> = Vec::new();
+        let mut all_keys: Vec<(Key, Lsn, u64)> = Vec::new();
        tree_reader.visit(
            &[0u8; DELTA_KEY_SIZE],
            VisitDirection::Forwards,
@@ -1015,46 +940,48 @@ impl<'a> DeltaKeyIter {
                let delta_key = DeltaKey::from_slice(key);
                let pos = BlobRef(value).pos();
                if let Some(last) = all_keys.last_mut() {
-                    if last.0.key() == delta_key.key() {
+                    if last.0 == delta_key.key() {
                        return true;
                    } else {
                        // subtract offset of new key BLOB and first blob of this key
                        // to get total size if values associated with this key
-                        let first_pos = last.1;
-                        last.1 = pos - first_pos;
+                        let first_pos = last.2;
+                        last.2 = pos - first_pos;
                    }
                }
-                all_keys.push((delta_key, pos));
+                all_keys.push((delta_key.key(), delta_key.lsn(), pos));
                true
            },
        )?;
        if let Some(last) = all_keys.last_mut() {
            // Last key occupies all space till end of layer
-            last.1 = std::fs::metadata(&file.file.path)?.len() - last.1;
+            last.2 = std::fs::metadata(&file.file.path)?.len() - last.2;
        }
-        let iter = DeltaKeyIter {
-            all_keys,
-            next_idx: 0,
-        };
-
-        Ok(iter)
+        Ok(all_keys)
    }
 }

-#[cfg(test)]
-mod test {
-    use super::DeltaKeyIter;
-    use super::DeltaLayer;
-    use super::DeltaValueIter;
+/// Reference to an on-disk value
+pub struct ValueRef {
+    blob_ref: BlobRef,
+    reader: BlockCursor<Adapter>,
+}

-    // We will soon need the iters to be send in the compaction code.
-    // Cf https://github.com/neondatabase/neon/pull/4462#issuecomment-1587398883
-    // Cf https://github.com/neondatabase/neon/issues/4471
-    #[test]
-    fn is_send() {
-        fn assert_send<T: Send>() {}
-        assert_send::<DeltaLayer>();
-        assert_send::<DeltaValueIter>();
-        assert_send::<DeltaKeyIter>();
+impl ValueRef {
+    /// Loads the value from disk
+    pub fn load(&self) -> Result<Value> {
+        let buf = self.reader.read_blob(self.blob_ref.pos())?;
+        let val = Value::des(&buf)?;
+        Ok(val)
+    }
+}
+
+struct Adapter(Arc<DeltaLayerInner>);
+
+impl BlockReader for Adapter {
+    type BlockLease = PageReadGuard<'static>;
+
+    fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, std::io::Error> {
+        self.0.file.read_blk(blknum)
    }
 }
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -57,9 +57,7 @@ use utils::{
 };

 use super::filename::ImageFileName;
-use super::{
-    AsLayerDesc, Layer, LayerAccessStatsReset, LayerIter, PathOrConf, PersistentLayerDesc,
-};
+use super::{AsLayerDesc, Layer, LayerAccessStatsReset, PathOrConf, PersistentLayerDesc};

 ///
 /// Header stored in the beginning of the file
@@ -175,7 +173,7 @@ impl Layer for ImageLayer {
        let tree_reader =
            DiskBtreeReader::<_, KEY_SIZE>::new(inner.index_start_blk, inner.index_root_blk, file);

-        tree_reader.dump()?;
+        tree_reader.dump().await?;

        tree_reader.visit(&[0u8; KEY_SIZE], VisitDirection::Forwards, |key, value| {
            println!("key: {} offset {}", hex::encode(key), value);
@@ -255,10 +253,6 @@ impl PersistentLayer for ImageLayer {
        Some(self.path())
    }

-    fn iter(&self, _ctx: &RequestContext) -> Result<LayerIter<'_>> {
-        unimplemented!();
-    }
-
    fn delete_resident_layer_file(&self) -> Result<()> {
        // delete underlying file
        fs::remove_file(self.path())?;
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -151,7 +151,7 @@ impl Layer for InMemoryLayer {
            return Ok(());
        }

-        let mut cursor = inner.file.block_cursor();
+        let cursor = inner.file.block_cursor();
        let mut buf = Vec::new();
        for (key, vec_map) in inner.index.iter() {
            for (lsn, pos) in vec_map.as_slice() {
@@ -196,7 +196,7 @@ impl Layer for InMemoryLayer {

        let inner = self.inner.read().unwrap();

-        let mut reader = inner.file.block_cursor();
+        let reader = inner.file.block_cursor();

        // Scan the page versions backwards, starting from `lsn`.
        if let Some(vec_map) = inner.index.get(&key) {
@@ -354,7 +354,7 @@ impl InMemoryLayer {

        let mut buf = Vec::new();

-        let mut cursor = inner.file.block_cursor();
+        let cursor = inner.file.block_cursor();

        let mut keys: Vec<(&Key, &VecMap<Lsn, u64>)> = inner.index.iter().collect();
        keys.sort_by_key(|k| k.0);
--- a/pageserver/src/tenant/storage_layer/remote_layer.rs
+++ b/pageserver/src/tenant/storage_layer/remote_layer.rs
@@ -20,8 +20,8 @@ use utils::{

 use super::filename::{DeltaFileName, ImageFileName};
 use super::{
-    AsLayerDesc, DeltaLayer, ImageLayer, LayerAccessStats, LayerAccessStatsReset, LayerIter,
-    LayerKeyIter, LayerResidenceStatus, PersistentLayer, PersistentLayerDesc,
+    AsLayerDesc, DeltaLayer, ImageLayer, LayerAccessStats, LayerAccessStatsReset,
+    LayerResidenceStatus, PersistentLayer, PersistentLayerDesc,
 };

 /// RemoteLayer is a not yet downloaded [`ImageLayer`] or
@@ -129,14 +129,6 @@ impl PersistentLayer for RemoteLayer {
        None
    }

-    fn iter(&self, _ctx: &RequestContext) -> Result<LayerIter<'_>> {
-        bail!("cannot iterate a remote layer");
-    }
-
-    fn key_iter(&self, _ctx: &RequestContext) -> Result<LayerKeyIter<'_>> {
-        bail!("cannot iterate a remote layer");
-    }
-
    fn delete_resident_layer_file(&self) -> Result<()> {
        bail!("remote layer has no layer file");
    }
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -294,6 +294,10 @@ pub struct Timeline {
    /// Completion shared between all timelines loaded during startup; used to delay heavier
    /// background tasks until some logical sizes have been calculated.
    initial_logical_size_attempt: Mutex<Option<completion::Completion>>,
+
+    /// The `disk_consistent_lsn` this timeline was loaded or created with, and the wall-clock
+    /// time at which that happened. Used for consumption metrics.
+    pub(crate) loaded_at: (Lsn, SystemTime),
 }

 pub struct WalReceiverInfo {
@@ -524,7 +528,7 @@ impl Timeline {
        size
    }

-    pub fn get_resident_physical_size(&self) -> u64 {
+    pub fn resident_physical_size(&self) -> u64 {
        self.metrics.resident_physical_size_gauge.get()
    }

@@ -1404,6 +1408,8 @@ impl Timeline {
                last_freeze_at: AtomicLsn::new(disk_consistent_lsn.0),
                last_freeze_ts: RwLock::new(Instant::now()),

+                loaded_at: (disk_consistent_lsn, SystemTime::now()),
+
                ancestor_timeline: ancestor,
                ancestor_lsn: metadata.ancestor_lsn(),

@@ -1600,7 +1606,7 @@ impl Timeline {
            if let Some(imgfilename) = ImageFileName::parse_str(&fname) {
                // create an ImageLayer struct for each image file.
                if imgfilename.lsn > disk_consistent_lsn {
-                    warn!(
+                    info!(
                        "found future image layer {} on timeline {} disk_consistent_lsn is {}",
                        imgfilename, self.timeline_id, disk_consistent_lsn
                    );
@@ -1632,7 +1638,7 @@ impl Timeline {
                // is 102, then it might not have been fully flushed to disk
                // before crash.
                if deltafilename.lsn_range.end > disk_consistent_lsn + 1 {
-                    warn!(
+                    info!(
                        "found future delta layer {} on timeline {} disk_consistent_lsn is {}",
                        deltafilename, self.timeline_id, disk_consistent_lsn
                    );
@@ -1774,7 +1780,7 @@ impl Timeline {
            match remote_layer_name {
                LayerFileName::Image(imgfilename) => {
                    if imgfilename.lsn > up_to_date_disk_consistent_lsn {
-                        warn!(
+                        info!(
                        "found future image layer {} on timeline {} remote_consistent_lsn is {}",
                        imgfilename, self.timeline_id, up_to_date_disk_consistent_lsn
                    );
@@ -1799,7 +1805,7 @@ impl Timeline {
                    // is 102, then it might not have been fully flushed to disk
                    // before crash.
                    if deltafilename.lsn_range.end > up_to_date_disk_consistent_lsn + 1 {
-                        warn!(
+                        info!(
                            "found future delta layer {} on timeline {} remote_consistent_lsn is {}",
                            deltafilename, self.timeline_id, up_to_date_disk_consistent_lsn
                        );
@@ -3507,7 +3513,13 @@ impl Timeline {
        let mut heap: BinaryHeap<Hole> = BinaryHeap::with_capacity(max_holes + 1);
        let mut prev: Option<Key> = None;
        for (next_key, _next_lsn, _size) in itertools::process_results(
-            deltas_to_compact.iter().map(|l| l.key_iter(ctx)),
+            deltas_to_compact.iter().map(|l| -> Result<_> {
+                Ok(l.clone()
+                    .downcast_delta_layer()
+                    .expect("delta layer")
+                    .load_keys(ctx)?
+                    .into_iter())
+            }),
            |iter_iter| iter_iter.kmerge_by(|a, b| a.0 < b.0),
        )? {
            if let Some(prev_key) = prev {
@@ -3543,25 +3555,31 @@ impl Timeline {
        // This iterator walks through all key-value pairs from all the layers
        // we're compacting, in key, LSN order.
        let all_values_iter = itertools::process_results(
-            deltas_to_compact.iter().map(|l| l.iter(ctx)),
+            deltas_to_compact.iter().map(|l| -> Result<_> {
+                Ok(l.clone()
+                    .downcast_delta_layer()
+                    .expect("delta layer")
+                    .load_val_refs(ctx)?
+                    .into_iter())
+            }),
            |iter_iter| {
                iter_iter.kmerge_by(|a, b| {
-                    if let Ok((a_key, a_lsn, _)) = a {
-                        if let Ok((b_key, b_lsn, _)) = b {
-                            (a_key, a_lsn) < (b_key, b_lsn)
-                        } else {
-                            false
-                        }
-                    } else {
-                        true
-                    }
+                    let (a_key, a_lsn, _) = a;
+                    let (b_key, b_lsn, _) = b;
+                    (a_key, a_lsn) < (b_key, b_lsn)
                })
            },
        )?;

        // This iterator walks through all keys and is needed to calculate size used by each key
        let mut all_keys_iter = itertools::process_results(
-            deltas_to_compact.iter().map(|l| l.key_iter(ctx)),
+            deltas_to_compact.iter().map(|l| -> Result<_> {
+                Ok(l.clone()
+                    .downcast_delta_layer()
+                    .expect("delta layer")
+                    .load_keys(ctx)?
+                    .into_iter())
+            }),
            |iter_iter| {
                iter_iter.kmerge_by(|a, b| {
                    let (a_key, a_lsn, _) = a;
@@ -3623,8 +3641,8 @@ impl Timeline {
        let mut key_values_total_size = 0u64;
        let mut dup_start_lsn: Lsn = Lsn::INVALID; // start LSN of layer containing values of the single key
        let mut dup_end_lsn: Lsn = Lsn::INVALID; // end LSN of layer containing values of the single key
-        for x in all_values_iter {
-            let (key, lsn, value) = x?;
+        for (key, lsn, value_ref) in all_values_iter {
+            let value = value_ref.load()?;
            let same_key = prev_key.map_or(false, |prev_key| prev_key == key);
            // We need to check key boundaries once we reach next key or end of layer with the same key
            if !same_key || lsn == dup_end_lsn {
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -308,8 +308,13 @@ impl Timeline {
        ctx: &RequestContext,
    ) -> ControlFlow<()> {
        let mut state = self.eviction_task_timeline_state.lock().await;
+
+        // Only do the imitate_layer accesses approximately as often as the threshold.  A little
+        // more frequently, to avoid this period racing with the threshold/period-th eviction iteration.
+        let inter_imitate_period = p.threshold.checked_sub(p.period).unwrap_or(p.threshold);
+
        match state.last_layer_access_imitation {
-            Some(ts) if ts.elapsed() < p.threshold => { /* no need to run */ }
+            Some(ts) if ts.elapsed() < inter_imitate_period => { /* no need to run */ }
            _ => {
                self.imitate_timeline_cached_layer_accesses(cancel, ctx)
                    .await;
@@ -332,7 +337,7 @@ impl Timeline {
        };
        let mut state = tenant.eviction_task_tenant_state.lock().await;
        match state.last_layer_access_imitation {
-            Some(ts) if ts.elapsed() < p.threshold => { /* no need to run */ }
+            Some(ts) if ts.elapsed() < inter_imitate_period => { /* no need to run */ }
            _ => {
                self.imitate_synthetic_size_calculation_worker(&tenant, ctx, cancel)
                    .await;
--- a/pgxn/neon/Makefile
+++ b/pgxn/neon/Makefile
@@ -4,6 +4,7 @@
 MODULE_big = neon
 OBJS = \
 	$(WIN32RES) \
+	extension_server.o \
 	file_cache.o \
 	libpagestore.o \
 	libpqwalproposer.o \
--- a/pgxn/neon/extension_server.c
+++ b/pgxn/neon/extension_server.c
@@ -0,0 +1,103 @@
+
+/*-------------------------------------------------------------------------
+ *
+ * extension_server.c
+ *	  Request compute_ctl to download extension files.
+ *
+ * IDENTIFICATION
+ *	 contrib/neon/extension_server.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+#include "tcop/pquery.h"
+#include "tcop/utility.h"
+#include "access/xact.h"
+#include "utils/hsearch.h"
+#include "utils/memutils.h"
+#include "commands/defrem.h"
+#include "miscadmin.h"
+#include "utils/acl.h"
+#include "fmgr.h"
+#include "utils/guc.h"
+#include "port.h"
+#include "fmgr.h"
+
+#include <curl/curl.h>
+
+static int extension_server_port = 0;
+
+static download_extension_file_hook_type prev_download_extension_file_hook = NULL;
+
+// Ask compute_ctl to download extension files; returns true if the HTTP request completed.
+//
+// to download all SQL (and data) files for an extension:
+// curl -X POST http://localhost:8080/extension_server/postgis
+// it covers two possible extension files layouts:
+// 1. extension_name--version--platform.sql
+// 2. extension_name/extension_name--version.sql
+//    extension_name/extra_files.csv
+//
+// to download specific library file:
+// curl -X POST http://localhost:8080/extension_server/postgis-3.so?is_library=true
+static bool
+neon_download_extension_file_http(const char *filename, bool is_library)
+{
+    CURL *curl;
+    CURLcode res;
+    char *compute_ctl_url;
+    bool ret = false;
+
+    /* elog(ERROR) does not return, so curl is known to be non-NULL below */
+    if ((curl = curl_easy_init()) == NULL)
+    {
+        elog(ERROR, "Failed to initialize curl handle");
+    }
+
+    compute_ctl_url = psprintf("http://localhost:%d/extension_server/%s%s",
+                               extension_server_port, filename, is_library ? "?is_library=true" : "");
+
+    elog(LOG, "Sending request to compute_ctl: %s", compute_ctl_url);
+
+    curl_easy_setopt(curl, CURLOPT_CUSTOMREQUEST, "POST");
+    curl_easy_setopt(curl, CURLOPT_URL, compute_ctl_url);
+    curl_easy_setopt(curl, CURLOPT_TIMEOUT, 3L /* seconds */);
+
+    /* Perform the request, res will get the return code */
+    res = curl_easy_perform(curl);
+    /* NOTE(review): HTTP error statuses still yield CURLE_OK without CURLOPT_FAILONERROR - confirm intended */
+    if (res == CURLE_OK)
+    {
+        ret = true;
+    }
+    else
+    {
+        // Don't error here because postgres will try to find the file
+        // and will fail with some proper error message if it's not found.
+        elog(WARNING, "neon_download_extension_file_http failed: %s", curl_easy_strerror(res));
+    }
+
+    /* always cleanup */
+    curl_easy_cleanup(curl);
+    pfree(compute_ctl_url);
+
+    return ret;
+}
+
+// Registers the neon.extension_server_port GUC and installs the extension download hook.
+void pg_init_extension_server(void)
+{
+    // Port to connect to compute_ctl on localhost to request extension files.
+    DefineCustomIntVariable("neon.extension_server_port",
+                            "port of the compute_ctl extension server on localhost",
+                            NULL,
+                            &extension_server_port,
+                            0, 0, INT_MAX,
+                            PGC_POSTMASTER,
+                            0, /* no flags required */
+                            NULL, NULL, NULL);
+
+    // NOTE(review): the previous hook is saved but not chained in this file - confirm intended
+    prev_download_extension_file_hook = download_extension_file_hook;
+    download_extension_file_hook = neon_download_extension_file_http;
+}
--- a/pgxn/neon/file_cache.c
+++ b/pgxn/neon/file_cache.c
@@ -88,50 +88,16 @@ static LWLockId lfc_lock;
 static int   lfc_max_size;
 static int   lfc_size_limit;
 static int   lfc_free_space_watermark;
-static int   lfc_free_memory_watermark;
 static char* lfc_path;
 static  FileCacheControl* lfc_ctl;
 static shmem_startup_hook_type prev_shmem_startup_hook;
 #if PG_VERSION_NUM>=150000
 static shmem_request_hook_type prev_shmem_request_hook;
 #endif
-static int   lfc_shrinking_factor; /* power of two by which local cache size will be shrinked when lfc_free_space_watermark or lfc_free_memory_watermak are reached */
+static int   lfc_shrinking_factor; /* power of two by which local cache size will be shrinked when lfc_free_space_watermark is reached */

 void FileCacheMonitorMain(Datum main_arg);

-#ifdef __APPLE__
-
-#include <sys/types.h>
-#include <sys/sysctl.h>
-
-static size_t
-get_available_memory(void)
-{
-	size_t total;
-	size_t sizeof_total = sizeof(total);
-	if (sysctlbyname("hw.memsize", &total, &sizeof_total, NULL, 0) < 0)
-		elog(ERROR, "Failed to get amount of RAM: %m");
-
-	return total;
-}
-
-#else
-
-#include <sys/sysinfo.h>
-
-static size_t
-get_available_memory(void)
-{
-	struct sysinfo si;
-	if (sysinfo(&si) < 0)
-		elog(ERROR, "Failed to get amount of RAM: %m");
-
-	return si.totalram*si.mem_unit;
-}
-
-#endif
-
-
 static void
 lfc_shmem_startup(void)
 {
@@ -229,11 +195,10 @@ lfc_change_limit_hook(int newval, void *extra)
 }

 /*
- * Local file system state monitor check available free space and memory.
- * If available disk space is lower than lfc_free_space_watermark or
- * available memory is lower than lfc_free_memory_watermark then we shrink size of local cache
+ * Local file system state monitor checks available free space.
+ * If it is lower than lfc_free_space_watermark then we shrink the size of the local cache
 * but throwing away least recently accessed chunks.
- * First time the watermark is reached cache size is divided by two,
+ * First time low space watermark is reached cache size is divided by two,
 * second time by four,... Finally we remove all chunks from local cache.
 *
 * Please notice that we are not changing lfc_cache_size: it is used to be adjusted by autoscaler.
@@ -263,27 +228,23 @@ FileCacheMonitorMain(Datum main_arg)
 	{
 		if (lfc_size_limit != 0)
 		{
-			bool shrink_cache = false;
-			if (lfc_free_space_watermark != 0)
+			struct statvfs sfs;
+			if (statvfs(lfc_path, &sfs) < 0)
 			{
-				struct statvfs sfs;
-				if (statvfs(lfc_path, &sfs) < 0)
-					elog(WARNING, "Failed to obtain status of %s: %m", lfc_path);
-				else
-					shrink_cache |= sfs.f_bavail*sfs.f_bsize < lfc_free_space_watermark*MB;
-			}
-			if (lfc_free_memory_watermark != 0)
-				shrink_cache |= get_available_memory() < lfc_free_memory_watermark*MB;
-
-			if (shrink_cache)
-			{
-				if (lfc_shrinking_factor < 31) {
-					lfc_shrinking_factor += 1;
-				}
-				lfc_change_limit_hook(lfc_size_limit >> lfc_shrinking_factor, NULL);
+				elog(WARNING, "Failed to obtain status of %s: %m", lfc_path);
 			}
 			else
-				lfc_shrinking_factor = 0; /* reset to initial value */
+			{
+				if (sfs.f_bavail*sfs.f_bsize < lfc_free_space_watermark*MB)
+				{
+					if (lfc_shrinking_factor < 31) {
+						lfc_shrinking_factor += 1;
+					}
+					lfc_change_limit_hook(lfc_size_limit >> lfc_shrinking_factor, NULL);
+				}
+				else
+					lfc_shrinking_factor = 0; /* reset to initial value */
+			}
 		}
 		pg_usleep(monitor_interval);
 	}
@@ -356,19 +317,6 @@ lfc_init(void)
 							NULL,
 							NULL);

-	DefineCustomIntVariable("neon.free_memory_watermark",
-							"Minimal free memory in system after reaching which local file cache will be truncated",
-							NULL,
-							&lfc_free_memory_watermark,
-							0, /* disabled by default, because iurt makes sense only when local file cache is located i tmpfs  */
-							0,
-							INT_MAX,
-							PGC_SIGHUP,
-							GUC_UNIT_MB,
-							NULL,
-							NULL,
-							NULL);
-
 	DefineCustomStringVariable("neon.file_cache_path",
 							   "Path to local file cache (can be raw device)",
 							   NULL,
--- a/pgxn/neon/neon.c
+++ b/pgxn/neon/neon.c
@@ -35,8 +35,11 @@ _PG_init(void)
 {
 	pg_init_libpagestore();
 	pg_init_walproposer();
+
 	InitControlPlaneConnector();

+	pg_init_extension_server();
+
        // Important: This must happen after other parts of the extension
        // are loaded, otherwise any settings to GUCs that were set before
        // the extension was loaded will be removed.
--- a/pgxn/neon/neon.h
+++ b/pgxn/neon/neon.h
@@ -21,6 +21,8 @@ extern char *neon_tenant;
 extern void pg_init_libpagestore(void);
 extern void pg_init_walproposer(void);

+extern void pg_init_extension_server(void);
+
 /*
 * Returns true if we shouldn't do REDO on that block in record indicated by
 * block_id; false otherwise.
--- a/poetry.lock
+++ b/poetry.lock
@@ -887,34 +887,34 @@ files = [

 [[package]]
 name = "cryptography"
-version = "41.0.2"
+version = "41.0.3"
 description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers."
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "cryptography-41.0.2-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:01f1d9e537f9a15b037d5d9ee442b8c22e3ae11ce65ea1f3316a41c78756b711"},
-    {file = "cryptography-41.0.2-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:079347de771f9282fbfe0e0236c716686950c19dee1b76240ab09ce1624d76d7"},
-    {file = "cryptography-41.0.2-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:439c3cc4c0d42fa999b83ded80a9a1fb54d53c58d6e59234cfe97f241e6c781d"},
-    {file = "cryptography-41.0.2-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f14ad275364c8b4e525d018f6716537ae7b6d369c094805cae45300847e0894f"},
-    {file = "cryptography-41.0.2-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:84609ade00a6ec59a89729e87a503c6e36af98ddcd566d5f3be52e29ba993182"},
-    {file = "cryptography-41.0.2-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:49c3222bb8f8e800aead2e376cbef687bc9e3cb9b58b29a261210456a7783d83"},
-    {file = "cryptography-41.0.2-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:d73f419a56d74fef257955f51b18d046f3506270a5fd2ac5febbfa259d6c0fa5"},
-    {file = "cryptography-41.0.2-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:2a034bf7d9ca894720f2ec1d8b7b5832d7e363571828037f9e0c4f18c1b58a58"},
-    {file = "cryptography-41.0.2-cp37-abi3-win32.whl", hash = "sha256:d124682c7a23c9764e54ca9ab5b308b14b18eba02722b8659fb238546de83a76"},
-    {file = "cryptography-41.0.2-cp37-abi3-win_amd64.whl", hash = "sha256:9c3fe6534d59d071ee82081ca3d71eed3210f76ebd0361798c74abc2bcf347d4"},
-    {file = "cryptography-41.0.2-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:a719399b99377b218dac6cf547b6ec54e6ef20207b6165126a280b0ce97e0d2a"},
-    {file = "cryptography-41.0.2-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:182be4171f9332b6741ee818ec27daff9fb00349f706629f5cbf417bd50e66fd"},
-    {file = "cryptography-41.0.2-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:7a9a3bced53b7f09da251685224d6a260c3cb291768f54954e28f03ef14e3766"},
-    {file = "cryptography-41.0.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:f0dc40e6f7aa37af01aba07277d3d64d5a03dc66d682097541ec4da03cc140ee"},
-    {file = "cryptography-41.0.2-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:674b669d5daa64206c38e507808aae49904c988fa0a71c935e7006a3e1e83831"},
-    {file = "cryptography-41.0.2-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:7af244b012711a26196450d34f483357e42aeddb04128885d95a69bd8b14b69b"},
-    {file = "cryptography-41.0.2-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:9b6d717393dbae53d4e52684ef4f022444fc1cce3c48c38cb74fca29e1f08eaa"},
-    {file = "cryptography-41.0.2-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:192255f539d7a89f2102d07d7375b1e0a81f7478925b3bc2e0549ebf739dae0e"},
-    {file = "cryptography-41.0.2-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:f772610fe364372de33d76edcd313636a25684edb94cee53fd790195f5989d14"},
-    {file = "cryptography-41.0.2-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:b332cba64d99a70c1e0836902720887fb4529ea49ea7f5462cf6640e095e11d2"},
-    {file = "cryptography-41.0.2-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:9a6673c1828db6270b76b22cc696f40cde9043eb90373da5c2f8f2158957f42f"},
-    {file = "cryptography-41.0.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:342f3767e25876751e14f8459ad85e77e660537ca0a066e10e75df9c9e9099f0"},
-    {file = "cryptography-41.0.2.tar.gz", hash = "sha256:7d230bf856164de164ecb615ccc14c7fc6de6906ddd5b491f3af90d3514c925c"},
+    {file = "cryptography-41.0.3-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:652627a055cb52a84f8c448185922241dd5217443ca194d5739b44612c5e6507"},
+    {file = "cryptography-41.0.3-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:8f09daa483aedea50d249ef98ed500569841d6498aa9c9f4b0531b9964658922"},
+    {file = "cryptography-41.0.3-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4fd871184321100fb400d759ad0cddddf284c4b696568204d281c902fc7b0d81"},
+    {file = "cryptography-41.0.3-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:84537453d57f55a50a5b6835622ee405816999a7113267739a1b4581f83535bd"},
+    {file = "cryptography-41.0.3-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:3fb248989b6363906827284cd20cca63bb1a757e0a2864d4c1682a985e3dca47"},
+    {file = "cryptography-41.0.3-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:42cb413e01a5d36da9929baa9d70ca90d90b969269e5a12d39c1e0d475010116"},
+    {file = "cryptography-41.0.3-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:aeb57c421b34af8f9fe830e1955bf493a86a7996cc1338fe41b30047d16e962c"},
+    {file = "cryptography-41.0.3-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:6af1c6387c531cd364b72c28daa29232162010d952ceb7e5ca8e2827526aceae"},
+    {file = "cryptography-41.0.3-cp37-abi3-win32.whl", hash = "sha256:0d09fb5356f975974dbcb595ad2d178305e5050656affb7890a1583f5e02a306"},
+    {file = "cryptography-41.0.3-cp37-abi3-win_amd64.whl", hash = "sha256:a983e441a00a9d57a4d7c91b3116a37ae602907a7618b882c8013b5762e80574"},
+    {file = "cryptography-41.0.3-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:5259cb659aa43005eb55a0e4ff2c825ca111a0da1814202c64d28a985d33b087"},
+    {file = "cryptography-41.0.3-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:67e120e9a577c64fe1f611e53b30b3e69744e5910ff3b6e97e935aeb96005858"},
+    {file = "cryptography-41.0.3-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:7efe8041897fe7a50863e51b77789b657a133c75c3b094e51b5e4b5cec7bf906"},
+    {file = "cryptography-41.0.3-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:ce785cf81a7bdade534297ef9e490ddff800d956625020ab2ec2780a556c313e"},
+    {file = "cryptography-41.0.3-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:57a51b89f954f216a81c9d057bf1a24e2f36e764a1ca9a501a6964eb4a6800dd"},
+    {file = "cryptography-41.0.3-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:4c2f0d35703d61002a2bbdcf15548ebb701cfdd83cdc12471d2bae80878a4207"},
+    {file = "cryptography-41.0.3-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:23c2d778cf829f7d0ae180600b17e9fceea3c2ef8b31a99e3c694cbbf3a24b84"},
+    {file = "cryptography-41.0.3-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:95dd7f261bb76948b52a5330ba5202b91a26fbac13ad0e9fc8a3ac04752058c7"},
+    {file = "cryptography-41.0.3-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:41d7aa7cdfded09b3d73a47f429c298e80796c8e825ddfadc84c8a7f12df212d"},
+    {file = "cryptography-41.0.3-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:d0d651aa754ef58d75cec6edfbd21259d93810b73f6ec246436a21b7841908de"},
+    {file = "cryptography-41.0.3-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:ab8de0d091acbf778f74286f4989cf3d1528336af1b59f3e5d2ebca8b5fe49e1"},
+    {file = "cryptography-41.0.3-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:a74fbcdb2a0d46fe00504f571a2a540532f4c188e6ccf26f1f178480117b33c4"},
+    {file = "cryptography-41.0.3.tar.gz", hash = "sha256:6d192741113ef5e30d89dcb5b956ef4e1578f304708701b8b73d38e3e1461f34"},
 ]

 [package.dependencies]
--- a/proxy/src/auth/backend/classic.rs
+++ b/proxy/src/auth/backend/classic.rs
@@ -5,7 +5,7 @@ use crate::{
    auth::{self, AuthFlow, ClientCredentials},
    compute,
    console::{self, AuthInfo, CachedNodeInfo, ConsoleReqExtra},
-    proxy::{try_wake, NUM_RETRIES_CONNECT},
+    proxy::handle_try_wake,
    sasl, scram,
    stream::PqStream,
 };
@@ -51,14 +51,15 @@ pub(super) async fn authenticate(
        }
    };

+    info!("compute node's state has likely changed; requesting a wake-up");
    let mut num_retries = 0;
    let mut node = loop {
-        num_retries += 1;
-        match try_wake(api, extra, creds).await? {
+        let wake_res = api.wake_compute(extra, creds).await;
+        match handle_try_wake(wake_res, num_retries)? {
+            ControlFlow::Continue(_) => num_retries += 1,
            ControlFlow::Break(n) => break n,
-            ControlFlow::Continue(_) if num_retries < NUM_RETRIES_CONNECT => continue,
-            ControlFlow::Continue(e) => return Err(e.into()),
        }
+        info!(num_retries, "retrying wake compute");
    };
    if let Some(keys) = scram_keys {
        use tokio_postgres::config::AuthKeys;
--- a/proxy/src/http/sql_over_http.rs
+++ b/proxy/src/http/sql_over_http.rs
@@ -1,7 +1,9 @@
 use std::sync::Arc;

+use anyhow::bail;
 use futures::pin_mut;
 use futures::StreamExt;
+use hashbrown::HashMap;
 use hyper::body::HttpBody;
 use hyper::http::HeaderName;
 use hyper::http::HeaderValue;
@@ -12,6 +14,7 @@ use serde_json::Value;
 use tokio_postgres::types::Kind;
 use tokio_postgres::types::Type;
 use tokio_postgres::GenericClient;
+use tokio_postgres::IsolationLevel;
 use tokio_postgres::Row;
 use url::Url;

@@ -37,6 +40,8 @@ const MAX_REQUEST_SIZE: u64 = 1024 * 1024; // 1 MB
 static RAW_TEXT_OUTPUT: HeaderName = HeaderName::from_static("neon-raw-text-output");
 static ARRAY_MODE: HeaderName = HeaderName::from_static("neon-array-mode");
 static ALLOW_POOL: HeaderName = HeaderName::from_static("neon-pool-opt-in");
+static TXN_ISOLATION_LEVEL: HeaderName = HeaderName::from_static("neon-batch-isolation-level");
+static TXN_READ_ONLY: HeaderName = HeaderName::from_static("neon-batch-read-only");

 static HEADER_VALUE_TRUE: HeaderValue = HeaderValue::from_static("true");

@@ -170,7 +175,7 @@ pub async fn handle(
    request: Request<Body>,
    sni_hostname: Option<String>,
    conn_pool: Arc<GlobalConnPool>,
-) -> anyhow::Result<Value> {
+) -> anyhow::Result<(Value, HashMap<HeaderName, HeaderValue>)> {
    //
    // Determine the destination and connection params
    //
@@ -185,6 +190,23 @@ pub async fn handle(
    // Allow connection pooling only if explicitly requested
    let allow_pool = headers.get(&ALLOW_POOL) == Some(&HEADER_VALUE_TRUE);

+    // isolation level and read only
+
+    let txn_isolation_level_raw = headers.get(&TXN_ISOLATION_LEVEL).cloned();
+    let txn_isolation_level = match txn_isolation_level_raw {
+        Some(ref x) => Some(match x.as_bytes() {
+            b"Serializable" => IsolationLevel::Serializable,
+            b"ReadUncommitted" => IsolationLevel::ReadUncommitted,
+            b"ReadCommitted" => IsolationLevel::ReadCommitted,
+            b"RepeatableRead" => IsolationLevel::RepeatableRead,
+            _ => bail!("invalid isolation level"),
+        }),
+        None => None,
+    };
+
+    let txn_read_only_raw = headers.get(&TXN_READ_ONLY).cloned();
+    let txn_read_only = txn_read_only_raw.as_ref() == Some(&HEADER_VALUE_TRUE);
+
    let request_content_length = match request.body().size_hint().upper() {
        Some(v) => v,
        None => MAX_REQUEST_SIZE + 1,
@@ -208,10 +230,19 @@ pub async fn handle(
    // Now execute the query and return the result
    //
    let result = match payload {
-        Payload::Single(query) => query_to_json(&client, query, raw_output, array_mode).await,
+        Payload::Single(query) => query_to_json(&client, query, raw_output, array_mode)
+            .await
+            .map(|x| (x, HashMap::default())),
        Payload::Batch(queries) => {
            let mut results = Vec::new();
-            let transaction = client.transaction().await?;
+            let mut builder = client.build_transaction();
+            if let Some(isolation_level) = txn_isolation_level {
+                builder = builder.isolation_level(isolation_level);
+            }
+            if txn_read_only {
+                builder = builder.read_only(true);
+            }
+            let transaction = builder.start().await?;
            for query in queries {
                let result = query_to_json(&transaction, query, raw_output, array_mode).await;
                match result {
@@ -223,7 +254,15 @@ pub async fn handle(
                }
            }
            transaction.commit().await?;
-            Ok(json!({ "results": results }))
+            let mut headers = HashMap::default();
+            headers.insert(
+                TXN_READ_ONLY.clone(),
+                HeaderValue::try_from(txn_read_only.to_string())?,
+            );
+            if let Some(txn_isolation_level_raw) = txn_isolation_level_raw {
+                headers.insert(TXN_ISOLATION_LEVEL.clone(), txn_isolation_level_raw);
+            }
+            Ok((json!({ "results": results }), headers))
        }
    };

--- a/proxy/src/http/websocket.rs
+++ b/proxy/src/http/websocket.rs
@@ -6,6 +6,7 @@ use crate::{
 };
 use bytes::{Buf, Bytes};
 use futures::{Sink, Stream, StreamExt};
+use hashbrown::HashMap;
 use hyper::{
    server::{
        accept,
@@ -205,7 +206,7 @@ async fn ws_handler(
            Ok(_) => StatusCode::OK,
            Err(_) => StatusCode::BAD_REQUEST,
        };
-        let json = match result {
+        let (json, headers) = match result {
            Ok(r) => r,
            Err(e) => {
                let message = format!("{:?}", e);
@@ -216,7 +217,10 @@ async fn ws_handler(
                    },
                    None => Value::Null,
                };
-                json!({ "message": message, "code": code })
+                (
+                    json!({ "message": message, "code": code }),
+                    HashMap::default(),
+                )
            }
        };
        json_response(status_code, json).map(|mut r| {
@@ -224,6 +228,9 @@ async fn ws_handler(
                "Access-Control-Allow-Origin",
                hyper::http::HeaderValue::from_static("*"),
            );
+            for (k, v) in headers {
+                r.headers_mut().insert(k, v);
+            }
            r
        })
    } else if request.uri().path() == "/sql" && request.method() == Method::OPTIONS {
--- a/proxy/src/metrics.rs
+++ b/proxy/src/metrics.rs
@@ -11,7 +11,6 @@ const PROXY_IO_BYTES_PER_CLIENT: &str = "proxy_io_bytes_per_client";

 const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60);

-///
 /// Key that uniquely identifies the object, this metric describes.
 /// Currently, endpoint_id is enough, but this may change later,
 /// so keep it in a named struct.
@@ -19,8 +18,7 @@ const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60);
 /// Both the proxy and the ingestion endpoint will live in the same region (or cell)
 /// so while the project-id is unique across regions the whole pipeline will work correctly
 /// because we enrich the event with project_id in the control-plane endpoint.
-///
-#[derive(Eq, Hash, PartialEq, Serialize, Debug)]
+#[derive(Eq, Hash, PartialEq, Serialize, Debug, Clone)]
 pub struct Ids {
    pub endpoint_id: String,
    pub branch_id: String,
@@ -149,7 +147,7 @@ async fn collect_metrics_iteration(
                    stop_time: *curr_time,
                },
                metric: PROXY_IO_BYTES_PER_CLIENT,
-                idempotency_key: idempotency_key(hostname.to_owned()),
+                idempotency_key: idempotency_key(hostname),
                value,
                extra: Ids {
                    endpoint_id: curr_key.endpoint_id.clone(),
@@ -167,12 +165,11 @@ async fn collect_metrics_iteration(
    // Send metrics.
    // Split into chunks of 1000 metrics to avoid exceeding the max request size
    for chunk in metrics_to_send.chunks(CHUNK_SIZE) {
-        let chunk_json = serde_json::value::to_raw_value(&EventChunk { events: chunk })
-            .expect("ProxyConsumptionMetric should not fail serialization");
-
        let res = client
            .post(metric_collection_endpoint.clone())
-            .json(&chunk_json)
+            .json(&EventChunk {
+                events: chunk.into(),
+            })
            .send()
            .await;

--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -347,11 +347,6 @@ async fn connect_to_compute_once(
        .await
 }

-enum ConnectionState<E> {
-    Cached(console::CachedNodeInfo),
-    Invalid(compute::ConnCfg, E),
-}
-
 #[async_trait]
 pub trait ConnectMechanism {
    type Connection;
@@ -407,70 +402,67 @@ where

    mechanism.update_connect_config(&mut node_info.config);

-    let mut num_retries = 0;
-    let mut state = ConnectionState::<M::ConnectError>::Cached(node_info);
+    // try once
+    let (config, err) = match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await {
+        Ok(res) => return Ok(res),
+        Err(e) => {
+            error!(error = ?e, "could not connect to compute node");
+            (invalidate_cache(node_info), e)
+        }
+    };

-    loop {
-        match state {
-            ConnectionState::Invalid(config, err) => {
-                info!("compute node's state has likely changed; requesting a wake-up");
+    let mut num_retries = 1;

-                let wake_res = match creds {
-                    auth::BackendType::Console(api, creds) => api.wake_compute(extra, creds).await,
-                    auth::BackendType::Postgres(api, creds) => api.wake_compute(extra, creds).await,
-                    // nothing to do?
-                    auth::BackendType::Link(_) => return Err(err.into()),
-                    // test backend
-                    auth::BackendType::Test(x) => x.wake_compute(),
-                };
+    // if we failed to connect, it's likely that the compute node was suspended, wake a new compute node
+    info!("compute node's state has likely changed; requesting a wake-up");
+    let node_info = loop {
+        let wake_res = match creds {
+            auth::BackendType::Console(api, creds) => api.wake_compute(extra, creds).await,
+            auth::BackendType::Postgres(api, creds) => api.wake_compute(extra, creds).await,
+            // nothing to do?
+            auth::BackendType::Link(_) => return Err(err.into()),
+            // test backend
+            auth::BackendType::Test(x) => x.wake_compute(),
+        };

-                match handle_try_wake(wake_res) {
-                    // there was an error communicating with the control plane
-                    Err(e) => return Err(e.into()),
-                    // failed to wake up but we can continue to retry
-                    Ok(ControlFlow::Continue(_)) => {
-                        state = ConnectionState::Invalid(config, err);
-                        let wait_duration = retry_after(num_retries);
-                        num_retries += 1;
-
-                        info!(num_retries, "retrying wake compute");
-                        time::sleep(wait_duration).await;
-                        continue;
-                    }
-                    // successfully woke up a compute node and can break the wakeup loop
-                    Ok(ControlFlow::Break(mut node_info)) => {
-                        node_info.config.reuse_password(&config);
-                        mechanism.update_connect_config(&mut node_info.config);
-                        state = ConnectionState::Cached(node_info)
-                    }
-                }
+        match handle_try_wake(wake_res, num_retries)? {
+            // failed to wake up but we can continue to retry
+            ControlFlow::Continue(_) => {}
+            // successfully woke up a compute node and can break the wakeup loop
+            ControlFlow::Break(mut node_info) => {
+                node_info.config.reuse_password(&config);
+                mechanism.update_connect_config(&mut node_info.config);
+                break node_info;
            }
-            ConnectionState::Cached(node_info) => {
-                match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await {
-                    Ok(res) => return Ok(res),
-                    Err(e) => {
-                        error!(error = ?e, "could not connect to compute node");
-                        if !e.should_retry(num_retries) {
-                            return Err(e.into());
-                        }
+        }

-                        // after the first connect failure,
-                        // we should invalidate the cache and wake up a new compute node
-                        if num_retries == 0 {
-                            state = ConnectionState::Invalid(invalidate_cache(node_info), e);
-                        } else {
-                            state = ConnectionState::Cached(node_info);
-                        }
+        let wait_duration = retry_after(num_retries);
+        num_retries += 1;

-                        let wait_duration = retry_after(num_retries);
-                        num_retries += 1;
+        time::sleep(wait_duration).await;
+        info!(num_retries, "retrying wake compute");
+    };

-                        info!(num_retries, "retrying wake compute");
-                        time::sleep(wait_duration).await;
-                    }
+    // now that we have a new node, try connect to it repeatedly.
+    // this can error for a few reasons, for instance:
+    // * DNS connection settings haven't quite propagated yet
+    info!("wake_compute success. attempting to connect");
+    loop {
+        match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await {
+            Ok(res) => return Ok(res),
+            Err(e) => {
+                error!(error = ?e, "could not connect to compute node");
+                if !e.should_retry(num_retries) {
+                    return Err(e.into());
                }
            }
        }
+
+        let wait_duration = retry_after(num_retries);
+        num_retries += 1;
+
+        time::sleep(wait_duration).await;
+        info!(num_retries, "retrying connect_once");
    }
 }

@@ -478,12 +470,15 @@ where
 /// * Returns Ok(Continue(e)) if there was an error waking but retries are acceptable
 /// * Returns Ok(Break(node)) if the wakeup succeeded
 /// * Returns Err(e) if there was an error
-fn handle_try_wake(
+pub fn handle_try_wake(
    result: Result<console::CachedNodeInfo, WakeComputeError>,
+    num_retries: u32,
 ) -> Result<ControlFlow<console::CachedNodeInfo, WakeComputeError>, WakeComputeError> {
    match result {
        Err(err) => match &err {
-            WakeComputeError::ApiError(api) if api.could_retry() => Ok(ControlFlow::Continue(err)),
+            WakeComputeError::ApiError(api) if api.should_retry(num_retries) => {
+                Ok(ControlFlow::Continue(err))
+            }
            _ => Err(err),
        },
        // Ready to try again.
@@ -491,22 +486,10 @@ fn handle_try_wake(
    }
 }

-/// Attempts to wake up the compute node.
-pub async fn try_wake(
-    api: &impl console::Api,
-    extra: &console::ConsoleReqExtra<'_>,
-    creds: &auth::ClientCredentials<'_>,
-) -> Result<ControlFlow<console::CachedNodeInfo, WakeComputeError>, WakeComputeError> {
-    info!("compute node's state has likely changed; requesting a wake-up");
-    handle_try_wake(api.wake_compute(extra, creds).await)
-}
-
 pub trait ShouldRetry {
    fn could_retry(&self) -> bool;
    fn should_retry(&self, num_retries: u32) -> bool {
        match self {
-            // retry all errors at least once
-            _ if num_retries == 0 => true,
            _ if num_retries >= NUM_RETRIES_CONNECT => false,
            err => err.could_retry(),
        }
@@ -558,14 +541,9 @@ impl ShouldRetry for compute::ConnectionError {
    }
 }

-pub fn retry_after(num_retries: u32) -> time::Duration {
-    match num_retries {
-        0 => time::Duration::ZERO,
-        _ => {
-            // 3/2 = 1.5 which seems to be an ok growth factor heuristic
-            BASE_RETRY_WAIT_DURATION * 3_u32.pow(num_retries) / 2_u32.pow(num_retries)
-        }
-    }
+fn retry_after(num_retries: u32) -> time::Duration {
+    // exponential backoff: BASE_RETRY_WAIT_DURATION * 1.5^num_retries (1.5 is a heuristic)
+    BASE_RETRY_WAIT_DURATION.mul_f64(1.5_f64.powi(num_retries as i32))
+}

 /// Finish client connection initialization: confirm auth success, send params, etc.
--- a/proxy/src/proxy/tests.rs
+++ b/proxy/src/proxy/tests.rs
@@ -302,7 +302,7 @@ async fn scram_auth_mock() -> anyhow::Result<()> {
 #[test]
 fn connect_compute_total_wait() {
    let mut total_wait = tokio::time::Duration::ZERO;
-    for num_retries in 0..10 {
+    for num_retries in 1..10 {
        total_wait += retry_after(num_retries);
    }
    assert!(total_wait < tokio::time::Duration::from_secs(12));
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -234,7 +234,10 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
                listen_pg_addr_tenant_only
            );
            let listener = tcp_listener::bind(listen_pg_addr_tenant_only.clone()).map_err(|e| {
-                error!("failed to bind to address {}: {}", conf.listen_pg_addr, e);
+                error!(
+                    "failed to bind to address {}: {}",
+                    listen_pg_addr_tenant_only, e
+                );
                e
            })?;
            Some(listener)
--- a/scripts/combine_control_files.py
+++ b/scripts/combine_control_files.py
@@ -15,16 +15,12 @@ from pathlib import Path
    ],
    "library_index": {
        "anon": "anon",
-        "kq_imcx": "kq_imcx"
-        // would be more complicated for something like postgis where multiple library names all map to postgis
+        // for more complex extensions like postgis
+        // we might have something like:
+        // address_standardizer: postgis
+        // postgis_tiger: postgis
    },
    "extension_data": {
-        "kq_imcx": {
-            "control_data": {
-                "kq_imcx.control": "# This file is generated content from add_postgresql_extension.\n# No point in modifying it, it will be overwritten anyway.\n\n# Default version, always set\ndefault_version = '0.1'\n\n# Module pathname generated from target shared library name. Use\n# MODULE_PATHNAME in script file.\nmodule_pathname = '$libdir/kq_imcx.so'\n\n# Comment for extension. Set using COMMENT option. Can be set in\n# script file as well.\ncomment = 'ketteQ In-Memory Calendar Extension (IMCX)'\n\n# Encoding for script file. Set using ENCODING option.\n#encoding = ''\n\n# Required extensions. Set using REQUIRES option (multi-valued).\n#requires = ''\ntrusted = true\n"
-            },
-            "archive_path": "5648391853/v15/extensions/kq_imcx.tar.zst"
-        },
        "anon": {
            "control_data": {
                "anon.control": "# PostgreSQL Anonymizer (anon) extension \ncomment = 'Data anonymization tools' \ndefault_version = '1.1.0' \ndirectory='extension/anon' \nrelocatable = false \nrequires = 'pgcrypto' \nsuperuser = false \nmodule_pathname = '$libdir/anon' \ntrusted = true \n"
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -530,6 +530,16 @@ def available_remote_storages() -> List[RemoteStorageKind]:
    return remote_storages


+def available_s3_storages() -> List[RemoteStorageKind]:
+    remote_storages = [RemoteStorageKind.MOCK_S3]
+    if os.getenv("ENABLE_REAL_S3_REMOTE_STORAGE") is not None:
+        remote_storages.append(RemoteStorageKind.REAL_S3)
+        log.info("Enabling real s3 storage for tests")
+    else:
+        log.info("Using mock implementations to test remote storage")
+    return remote_storages
+
+
@dataclass
 class LocalFsStorage:
    root: Path
@@ -550,6 +560,16 @@ class S3Storage:
            "AWS_SECRET_ACCESS_KEY": self.secret_key,
        }

+    def to_string(self) -> str:
+        return json.dumps(
+            {
+                "bucket": self.bucket_name,
+                "region": self.bucket_region,
+                "endpoint": self.endpoint,
+                "prefix": self.prefix_in_bucket,
+            }
+        )
+

 RemoteStorage = Union[LocalFsStorage, S3Storage]

@@ -616,10 +636,12 @@ class NeonEnvBuilder:
        self.rust_log_override = rust_log_override
        self.port_distributor = port_distributor
        self.remote_storage = remote_storage
+        self.ext_remote_storage: Optional[S3Storage] = None
+        self.remote_storage_client: Optional[Any] = None
        self.remote_storage_users = remote_storage_users
        self.broker = broker
        self.run_id = run_id
-        self.mock_s3_server = mock_s3_server
+        self.mock_s3_server: MockS3Server = mock_s3_server
        self.pageserver_config_override = pageserver_config_override
        self.num_safekeepers = num_safekeepers
        self.safekeepers_id_start = safekeepers_id_start
@@ -667,15 +689,24 @@ class NeonEnvBuilder:
        remote_storage_kind: RemoteStorageKind,
        test_name: str,
        force_enable: bool = True,
+        enable_remote_extensions: bool = False,
    ):
        if remote_storage_kind == RemoteStorageKind.NOOP:
            return
        elif remote_storage_kind == RemoteStorageKind.LOCAL_FS:
            self.enable_local_fs_remote_storage(force_enable=force_enable)
        elif remote_storage_kind == RemoteStorageKind.MOCK_S3:
-            self.enable_mock_s3_remote_storage(bucket_name=test_name, force_enable=force_enable)
+            self.enable_mock_s3_remote_storage(
+                bucket_name=test_name,
+                force_enable=force_enable,
+                enable_remote_extensions=enable_remote_extensions,
+            )
        elif remote_storage_kind == RemoteStorageKind.REAL_S3:
-            self.enable_real_s3_remote_storage(test_name=test_name, force_enable=force_enable)
+            self.enable_real_s3_remote_storage(
+                test_name=test_name,
+                force_enable=force_enable,
+                enable_remote_extensions=enable_remote_extensions,
+            )
        else:
            raise RuntimeError(f"Unknown storage type: {remote_storage_kind}")

@@ -689,11 +720,18 @@ class NeonEnvBuilder:
        assert force_enable or self.remote_storage is None, "remote storage is enabled already"
        self.remote_storage = LocalFsStorage(Path(self.repo_dir / "local_fs_remote_storage"))

-    def enable_mock_s3_remote_storage(self, bucket_name: str, force_enable: bool = True):
+    def enable_mock_s3_remote_storage(
+        self,
+        bucket_name: str,
+        force_enable: bool = True,
+        enable_remote_extensions: bool = False,
+    ):
        """
        Sets up the pageserver to use the S3 mock server, creates the bucket, if it's not present already.
        Starts up the mock server, if that does not run yet.
        Errors, if the pageserver has some remote storage configuration already, unless `force_enable` is not set to `True`.
+
+        With enable_remote_extensions, also sets self.ext_remote_storage (same bucket, "ext" prefix).
        """
        assert force_enable or self.remote_storage is None, "remote storage is enabled already"
        mock_endpoint = self.mock_s3_server.endpoint()
@@ -714,9 +752,25 @@ class NeonEnvBuilder:
            bucket_region=mock_region,
            access_key=self.mock_s3_server.access_key(),
            secret_key=self.mock_s3_server.secret_key(),
+            prefix_in_bucket="pageserver",
        )

-    def enable_real_s3_remote_storage(self, test_name: str, force_enable: bool = True):
+        if enable_remote_extensions:
+            self.ext_remote_storage = S3Storage(
+                bucket_name=bucket_name,
+                endpoint=mock_endpoint,
+                bucket_region=mock_region,
+                access_key=self.mock_s3_server.access_key(),
+                secret_key=self.mock_s3_server.secret_key(),
+                prefix_in_bucket="ext",
+            )
+
+    def enable_real_s3_remote_storage(
+        self,
+        test_name: str,
+        force_enable: bool = True,
+        enable_remote_extensions: bool = False,
+    ):
        """
        Sets up configuration to use real s3 endpoint without mock server
        """
@@ -756,6 +810,15 @@ class NeonEnvBuilder:
            prefix_in_bucket=self.remote_storage_prefix,
        )

+        if enable_remote_extensions:
+            self.ext_remote_storage = S3Storage(
+                bucket_name="neon-dev-extensions-eu-central-1",
+                bucket_region="eu-central-1",
+                access_key=access_key,
+                secret_key=secret_key,
+                prefix_in_bucket=None,
+            )
+
    def cleanup_local_storage(self):
        if self.preserve_database_files:
            return
@@ -789,6 +852,7 @@ class NeonEnvBuilder:
        # `self.remote_storage_prefix` is coupled with `S3Storage` storage type,
        # so this line effectively a no-op
        assert isinstance(self.remote_storage, S3Storage)
+        assert self.remote_storage_client is not None

        if self.keep_remote_storage_contents:
            log.info("keep_remote_storage_contents skipping remote storage cleanup")
@@ -918,6 +982,8 @@ class NeonEnv:
        self.neon_binpath = config.neon_binpath
        self.pg_distrib_dir = config.pg_distrib_dir
        self.endpoint_counter = 0
+        self.remote_storage_client = config.remote_storage_client
+        self.ext_remote_storage = config.ext_remote_storage

        # generate initial tenant ID here instead of letting 'neon init' generate it,
        # so that we don't need to dig it out of the config file afterwards.
@@ -1505,6 +1571,7 @@ class NeonCli(AbstractNeonCli):
        tenant_id: Optional[TenantId] = None,
        lsn: Optional[Lsn] = None,
        branch_name: Optional[str] = None,
+        remote_ext_config: Optional[str] = None,
    ) -> "subprocess.CompletedProcess[str]":
        args = [
            "endpoint",
@@ -1514,6 +1581,8 @@ class NeonCli(AbstractNeonCli):
            "--pg-version",
            self.env.pg_version,
        ]
+        if remote_ext_config is not None:
+            args.extend(["--remote-ext-config", remote_ext_config])
        if lsn is not None:
            args.append(f"--lsn={lsn}")
        args.extend(["--pg-port", str(pg_port)])
@@ -2375,7 +2444,7 @@ class Endpoint(PgProtocol):

        return self

-    def start(self) -> "Endpoint":
+    def start(self, remote_ext_config: Optional[str] = None) -> "Endpoint":
        """
        Start the Postgres instance.
        Returns self.
@@ -2391,6 +2460,7 @@ class Endpoint(PgProtocol):
            http_port=self.http_port,
            tenant_id=self.tenant_id,
            safekeepers=self.active_safekeepers,
+            remote_ext_config=remote_ext_config,
        )
        self.running = True

@@ -2480,6 +2550,7 @@ class Endpoint(PgProtocol):
        hot_standby: bool = False,
        lsn: Optional[Lsn] = None,
        config_lines: Optional[List[str]] = None,
+        remote_ext_config: Optional[str] = None,
    ) -> "Endpoint":
        """
        Create an endpoint, apply config, and start Postgres.
@@ -2494,7 +2565,7 @@ class Endpoint(PgProtocol):
            config_lines=config_lines,
            hot_standby=hot_standby,
            lsn=lsn,
-        ).start()
+        ).start(remote_ext_config=remote_ext_config)

        log.info(f"Postgres startup took {time.time() - started_at} seconds")

@@ -2528,6 +2599,7 @@ class EndpointFactory:
        lsn: Optional[Lsn] = None,
        hot_standby: bool = False,
        config_lines: Optional[List[str]] = None,
+        remote_ext_config: Optional[str] = None,
    ) -> Endpoint:
        ep = Endpoint(
            self.env,
@@ -2544,6 +2616,7 @@ class EndpointFactory:
            hot_standby=hot_standby,
            config_lines=config_lines,
            lsn=lsn,
+            remote_ext_config=remote_ext_config,
        )

    def create(
--- a/test_runner/fixtures/pageserver/utils.py
+++ b/test_runner/fixtures/pageserver/utils.py
@@ -197,10 +197,9 @@ def wait_timeline_detail_404(
    pageserver_http: PageserverHttpClient,
    tenant_id: TenantId,
    timeline_id: TimelineId,
-    wait_longer: bool = False,
+    iterations: int,
 ):
    last_exc = None
-    iterations = 10 if wait_longer else 2
    for _ in range(iterations):
        time.sleep(0.250)
        try:
@@ -220,8 +219,8 @@ def timeline_delete_wait_completed(
    pageserver_http: PageserverHttpClient,
    tenant_id: TenantId,
    timeline_id: TimelineId,
-    wait_longer: bool = False,  # Use when running with RemoteStorageKind.REAL_S3
+    iterations: int = 20,
    **delete_args,
 ):
    pageserver_http.timeline_delete(tenant_id=tenant_id, timeline_id=timeline_id, **delete_args)
-    wait_timeline_detail_404(pageserver_http, tenant_id, timeline_id, wait_longer)
+    wait_timeline_detail_404(pageserver_http, tenant_id, timeline_id, iterations)
--- a/test_runner/fixtures/types.py
+++ b/test_runner/fixtures/types.py
@@ -89,6 +89,9 @@ class TenantId(Id):
    def __repr__(self) -> str:
        return f'`TenantId("{self.id.hex()}")'

+    def __str__(self) -> str:
+        return self.id.hex()
+

 class TimelineId(Id):
    def __repr__(self) -> str:
--- a/test_runner/regress/data/extension_test/5670669815/v14/ext_index.json
+++ b/test_runner/regress/data/extension_test/5670669815/v14/ext_index.json
@@ -0,0 +1,24 @@
+{
+    "public_extensions": [
+        "anon",
+        "pg_buffercache"
+    ],
+    "library_index": {
+        "anon": "anon",
+        "pg_buffercache": "pg_buffercache"
+    },
+    "extension_data": {
+        "pg_buffercache": {
+            "control_data": {
+                "pg_buffercache.control": "# pg_buffercache extension \ncomment = 'examine the shared buffer cache' \ndefault_version = '1.3' \nmodule_pathname = '$libdir/pg_buffercache' \nrelocatable = true \ntrusted=true"
+            },
+            "archive_path": "5670669815/v14/extensions/pg_buffercache.tar.zst"
+        },
+        "anon": {
+            "control_data": {
+                "anon.control": "# PostgreSQL Anonymizer (anon) extension \ncomment = 'Data anonymization tools' \ndefault_version = '1.1.0' \ndirectory='extension/anon' \nrelocatable = false \nrequires = 'pgcrypto' \nsuperuser = false \nmodule_pathname = '$libdir/anon' \ntrusted = true \n"
+            },
+            "archive_path": "5670669815/v14/extensions/anon.tar.zst"
+        }
+    }
+}
--- a/test_runner/regress/data/extension_test/5670669815/v14/extensions/anon.tar.zst
+++ b/test_runner/regress/data/extension_test/5670669815/v14/extensions/anon.tar.zst
--- a/test_runner/regress/data/extension_test/5670669815/v14/extensions/pg_buffercache.tar.zst
+++ b/test_runner/regress/data/extension_test/5670669815/v14/extensions/pg_buffercache.tar.zst
--- a/test_runner/regress/data/extension_test/5670669815/v15/ext_index.json
+++ b/test_runner/regress/data/extension_test/5670669815/v15/ext_index.json
@@ -0,0 +1,17 @@
+{
+    "public_extensions": [
+        "anon"
+    ],
+    "library_index": {
+        "anon": "anon"
+    },
+    "extension_data": {
+        "anon": {
+            "control_data": {
+                "anon.control": "# PostgreSQL Anonymizer (anon) extension \ncomment = 'Data anonymization tools' \ndefault_version = '1.1.0' \ndirectory='extension/anon' \nrelocatable = false \nrequires = 'pgcrypto' \nsuperuser = false \nmodule_pathname = '$libdir/anon' \ntrusted = true \n"
+            },
+            "archive_path": "5670669815/v15/extensions/anon.tar.zst"
+        }
+    }
+}
+
--- a/test_runner/regress/data/extension_test/5670669815/v15/extensions/anon.tar.zst
+++ b/test_runner/regress/data/extension_test/5670669815/v15/extensions/anon.tar.zst
--- a/test_runner/regress/test_compatibility.py
+++ b/test_runner/regress/test_compatibility.py
@@ -257,28 +257,15 @@ def prepare_snapshot(
        shutil.rmtree(repo_dir / "pgdatadirs")
    os.mkdir(repo_dir / "endpoints")

-    # Remove wal-redo temp directory if it exists. Newer pageserver versions don't create
-    # them anymore, but old versions did.
-    for tenant in (repo_dir / "tenants").glob("*"):
-        wal_redo_dir = tenant / "wal-redo-datadir.___temp"
-        if wal_redo_dir.exists() and wal_redo_dir.is_dir():
-            shutil.rmtree(wal_redo_dir)
-
    # Update paths and ports in config files
    pageserver_toml = repo_dir / "pageserver.toml"
    pageserver_config = toml.load(pageserver_toml)
    pageserver_config["remote_storage"]["local_path"] = str(repo_dir / "local_fs_remote_storage")
-    pageserver_config["listen_http_addr"] = port_distributor.replace_with_new_port(
-        pageserver_config["listen_http_addr"]
-    )
-    pageserver_config["listen_pg_addr"] = port_distributor.replace_with_new_port(
-        pageserver_config["listen_pg_addr"]
-    )
+    for param in ("listen_http_addr", "listen_pg_addr", "broker_endpoint"):
+        pageserver_config[param] = port_distributor.replace_with_new_port(pageserver_config[param])

-    # Older pageserver versions had just one `auth_type` setting. Now there
-    # are separate settings for pg and http ports. We don't use authentication
-    # in compatibility tests so just remove authentication related settings.
-    pageserver_config.pop("auth_type", None)
+    # We don't use authentication in compatibility tests
+    # so just remove authentication related settings.
    pageserver_config.pop("pg_auth_type", None)
    pageserver_config.pop("http_auth_type", None)

@@ -290,19 +277,16 @@ def prepare_snapshot(

    snapshot_config_toml = repo_dir / "config"
    snapshot_config = toml.load(snapshot_config_toml)
-
-    broker_listen_addr = f"127.0.0.1:{port_distributor.get_port()}"
-    snapshot_config["broker"] = {"listen_addr": broker_listen_addr}
-
-    snapshot_config["pageserver"]["listen_http_addr"] = port_distributor.replace_with_new_port(
-        snapshot_config["pageserver"]["listen_http_addr"]
-    )
-    snapshot_config["pageserver"]["listen_pg_addr"] = port_distributor.replace_with_new_port(
-        snapshot_config["pageserver"]["listen_pg_addr"]
+    for param in ("listen_http_addr", "listen_pg_addr"):
+        snapshot_config["pageserver"][param] = port_distributor.replace_with_new_port(
+            snapshot_config["pageserver"][param]
+        )
+    snapshot_config["broker"]["listen_addr"] = port_distributor.replace_with_new_port(
+        snapshot_config["broker"]["listen_addr"]
    )
    for sk in snapshot_config["safekeepers"]:
-        sk["http_port"] = port_distributor.replace_with_new_port(sk["http_port"])
-        sk["pg_port"] = port_distributor.replace_with_new_port(sk["pg_port"])
+        for param in ("http_port", "pg_port", "pg_tenant_only_port"):
+            sk[param] = port_distributor.replace_with_new_port(sk[param])

    if pg_distrib_dir:
        snapshot_config["pg_distrib_dir"] = str(pg_distrib_dir)
--- a/test_runner/regress/test_download_extensions.py
+++ b/test_runner/regress/test_download_extensions.py
@@ -0,0 +1,324 @@
+import os
+import shutil
+import threading
+from contextlib import closing
+from pathlib import Path
+
+import pytest
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import (
+    NeonEnvBuilder,
+    RemoteStorageKind,
+    available_s3_storages,
+)
+from fixtures.pg_version import PgVersion
+
+
+# Cleaning up downloaded files is important for local tests
+# or else one test could reuse the files from another test or another test run
+def cleanup(pg_version):
+    PGDIR = Path(f"pg_install/v{pg_version}")
+
+    LIB_DIR = PGDIR / Path("lib/postgresql")
+    cleanup_lib_globs = ["anon*", "postgis*", "pg_buffercache*"]
+    cleanup_lib_glob_paths = [LIB_DIR.glob(x) for x in cleanup_lib_globs]
+
+    SHARE_DIR = PGDIR / Path("share/postgresql/extension")
+    cleanup_ext_globs = [
+        "anon*",
+        "address_standardizer*",
+        "postgis*",
+        "pageinspect*",
+        "pg_buffercache*",
+        "pgrouting*",
+    ]
+    cleanup_ext_glob_paths = [SHARE_DIR.glob(x) for x in cleanup_ext_globs]
+
+    all_glob_paths = cleanup_lib_glob_paths + cleanup_ext_glob_paths
+    all_cleanup_files = []
+    for file_glob in all_glob_paths:
+        for file in file_glob:
+            all_cleanup_files.append(file)
+
+    for file in all_cleanup_files:
+        try:
+            os.remove(file)
+            log.info(f"removed file {file}")
+        except Exception as err:
+            log.info(
+                f"skipping remove of file {file} because it doesn't exist.\
+                      this may be expected or unexpected depending on the test {err}"
+            )
+
+    cleanup_folders = [SHARE_DIR / Path("anon"), PGDIR / Path("download_extensions")]
+    for folder in cleanup_folders:
+        try:
+            shutil.rmtree(folder)
+            log.info(f"removed folder {folder}")
+        except Exception as err:
+            log.info(
+                f"skipping remove of folder {folder} because it doesn't exist.\
+                      this may be expected or unexpected depending on the test {err}"
+            )
+
+
+def upload_files(env):
+    log.info("Uploading test files to mock bucket")
+    os.chdir("test_runner/regress/data/extension_test")
+    for path in os.walk("."):
+        prefix, _, files = path
+        for file in files:
+            # the [2:] is to remove the leading "./"
+            full_path = os.path.join(prefix, file)[2:]
+
+            with open(full_path, "rb") as f:
+                log.info(f"UPLOAD {full_path} to ext/{full_path}")
+                env.remote_storage_client.upload_fileobj(
+                    f,
+                    env.ext_remote_storage.bucket_name,
+                    f"ext/{full_path}",
+                )
+    os.chdir("../../../..")
+
+
+# Test downloading remote extension.
+@pytest.mark.parametrize("remote_storage_kind", available_s3_storages())
+def test_remote_extensions(
+    neon_env_builder: NeonEnvBuilder,
+    remote_storage_kind: RemoteStorageKind,
+    pg_version: PgVersion,
+):
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=remote_storage_kind,
+        test_name="test_remote_extensions",
+        enable_remote_extensions=True,
+    )
+    env = neon_env_builder.init_start()
+    tenant_id, _ = env.neon_cli.create_tenant()
+    env.neon_cli.create_timeline("test_remote_extensions", tenant_id=tenant_id)
+
+    assert env.ext_remote_storage is not None  # satisfy mypy
+    assert env.remote_storage_client is not None  # satisfy mypy
+
+    # For MOCK_S3 we upload test files.
+    # For REAL_S3 we use the files already in the bucket
+    if remote_storage_kind == RemoteStorageKind.MOCK_S3:
+        upload_files(env)
+
+    # Start a compute node and check that it can download the extensions
+    # and use them to CREATE EXTENSION and LOAD
+    endpoint = env.endpoints.create_start(
+        "test_remote_extensions",
+        tenant_id=tenant_id,
+        remote_ext_config=env.ext_remote_storage.to_string(),
+        # config_lines=["log_min_messages=debug3"],
+    )
+    try:
+        with closing(endpoint.connect()) as conn:
+            with conn.cursor() as cur:
+                # Check that appropriate control files were downloaded
+                cur.execute("SELECT * FROM pg_available_extensions")
+                all_extensions = [x[0] for x in cur.fetchall()]
+                log.info(all_extensions)
+                assert "anon" in all_extensions
+
+                # postgis is on real s3 but not mock s3.
+                # it's kind of a big file, would rather not upload to github
+                if remote_storage_kind == RemoteStorageKind.REAL_S3:
+                    assert "postgis" in all_extensions
+                    # this may fail locally if dependency is missing
+                    # we don't really care about the error,
+                    # we just want to make sure it downloaded
+                    try:
+                        cur.execute("CREATE EXTENSION postgis")
+                    except Exception as err:
+                        log.info(f"(expected) error creating postgis extension: {err}")
+                        # we do not check the error, so this is basically a NO-OP
+                        # however checking the log you can make sure that it worked
+                        # and also get valuable information about how long loading the extension took
+
+                # this may fail locally if the pgcrypto extension (required by anon) is not installed
+                try:
+                    cur.execute("CREATE EXTENSION anon")
+                except Exception as err:
+                    log.info("error creating anon extension")
+                    assert "pgcrypto" in str(err), "unexpected error creating anon extension"
+    finally:
+        cleanup(pg_version)
+
+
+# Test downloading remote library.
+@pytest.mark.parametrize("remote_storage_kind", available_s3_storages())
+def test_remote_library(
+    neon_env_builder: NeonEnvBuilder,
+    remote_storage_kind: RemoteStorageKind,
+    pg_version: PgVersion,
+):
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=remote_storage_kind,
+        test_name="test_remote_library",
+        enable_remote_extensions=True,
+    )
+    env = neon_env_builder.init_start()
+    tenant_id, _ = env.neon_cli.create_tenant()
+    env.neon_cli.create_timeline("test_remote_library", tenant_id=tenant_id)
+
+    assert env.ext_remote_storage is not None  # satisfy mypy
+    assert env.remote_storage_client is not None  # satisfy mypy
+
+    # For MOCK_S3 we upload test files.
+    # For REAL_S3 we use the files already in the bucket
+    if remote_storage_kind == RemoteStorageKind.MOCK_S3:
+        upload_files(env)
+
+    # Start a compute node and use the downloaded files to run LOAD library
+    endpoint = env.endpoints.create_start(
+        "test_remote_library",
+        tenant_id=tenant_id,
+        remote_ext_config=env.ext_remote_storage.to_string(),
+        # config_lines=["log_min_messages=debug3"],
+    )
+    try:
+        with closing(endpoint.connect()) as conn:
+            with conn.cursor() as cur:
+                # try to load library
+                try:
+                    cur.execute("LOAD 'anon'")
+                except Exception as err:
+                    log.info(f"error loading anon library: {err}")
+                    raise AssertionError("unexpected error loading anon library") from err
+
+                # test a library whose name is different from the extension name
+                # this may fail locally if dependency is missing
+                # however, it does successfully download the postgis archive
+                if remote_storage_kind == RemoteStorageKind.REAL_S3:
+                    try:
+                        cur.execute("LOAD 'postgis_topology-3'")
+                    except Exception as err:
+                        log.info("error loading postgis_topology-3")
+                        assert "No such file or directory" in str(
+                            err
+                        ), "unexpected error loading postgis_topology-3"
+    finally:
+        cleanup(pg_version)
+
+
+# Here we test a complex extension
+# which has multiple extensions in one archive
+# using postgis as an example
+@pytest.mark.skipif(
+    RemoteStorageKind.REAL_S3 not in available_s3_storages(),
+    reason="skipping test because real s3 not enabled",
+)
+def test_multiple_extensions_one_archive(
+    neon_env_builder: NeonEnvBuilder,
+    pg_version: PgVersion,
+):
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=RemoteStorageKind.REAL_S3,
+        test_name="test_multiple_extensions_one_archive",
+        enable_remote_extensions=True,
+    )
+    env = neon_env_builder.init_start()
+    tenant_id, _ = env.neon_cli.create_tenant()
+    env.neon_cli.create_timeline("test_multiple_extensions_one_archive", tenant_id=tenant_id)
+
+    assert env.ext_remote_storage is not None  # satisfy mypy
+    assert env.remote_storage_client is not None  # satisfy mypy
+
+    endpoint = env.endpoints.create_start(
+        "test_multiple_extensions_one_archive",
+        tenant_id=tenant_id,
+        remote_ext_config=env.ext_remote_storage.to_string(),
+    )
+    with closing(endpoint.connect()) as conn:
+        with conn.cursor() as cur:
+            cur.execute("CREATE EXTENSION address_standardizer;")
+            cur.execute("CREATE EXTENSION address_standardizer_data_us;")
+            # execute query to ensure that it works
+            cur.execute(
+                "SELECT house_num, name, suftype, city, country, state, unit \
+                        FROM standardize_address('us_lex', 'us_gaz', 'us_rules', \
+                        'One Rust Place, Boston, MA 02109');"
+            )
+            res = cur.fetchall()
+            log.info(res)
+            assert len(res) > 0
+
+    cleanup(pg_version)
+
+
+# Test that extension is downloaded after endpoint restart,
+# when the library is used in the query.
+#
+# Run the test with multiple simultaneous connections to an endpoint
+# to ensure that the extension is downloaded only once.
+#
+def test_extension_download_after_restart(
+    neon_env_builder: NeonEnvBuilder,
+    pg_version: PgVersion,
+):
+    if "15" in pg_version:  # SKIP v15 for now because test set only has extension built for v14
+        return None
+
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=RemoteStorageKind.MOCK_S3,
+        test_name="test_extension_download_after_restart",
+        enable_remote_extensions=True,
+    )
+    env = neon_env_builder.init_start()
+    tenant_id, _ = env.neon_cli.create_tenant()
+    env.neon_cli.create_timeline("test_extension_download_after_restart", tenant_id=tenant_id)
+
+    assert env.ext_remote_storage is not None  # satisfy mypy
+    assert env.remote_storage_client is not None  # satisfy mypy
+
+    # For MOCK_S3 we upload test files.
+    upload_files(env)
+
+    endpoint = env.endpoints.create_start(
+        "test_extension_download_after_restart",
+        tenant_id=tenant_id,
+        remote_ext_config=env.ext_remote_storage.to_string(),
+        config_lines=["log_min_messages=debug3"],
+    )
+    with closing(endpoint.connect()) as conn:
+        with conn.cursor() as cur:
+            cur.execute("CREATE extension pg_buffercache;")
+            cur.execute("SELECT * from pg_buffercache;")
+            res = cur.fetchall()
+            assert len(res) > 0
+            log.info(res)
+
+    # shutdown compute node
+    endpoint.stop()
+    # remove extension files locally
+    cleanup(pg_version)
+
+    # spin up compute node again (there are no extension files available, because compute is stateless)
+    endpoint = env.endpoints.create_start(
+        "test_extension_download_after_restart",
+        tenant_id=tenant_id,
+        remote_ext_config=env.ext_remote_storage.to_string(),
+        config_lines=["log_min_messages=debug3"],
+    )
+
+    # connect to compute node and run the query
+    # that will trigger the download of the extension
+    def run_query(endpoint, thread_id: int):
+        log.info("thread_id {%d} starting", thread_id)
+        with closing(endpoint.connect()) as conn:
+            with conn.cursor() as cur:
+                cur.execute("SELECT * from pg_buffercache;")
+                res = cur.fetchall()
+                assert len(res) > 0
+                log.info("thread_id {%d}, res = %s", thread_id, res)
+
+    threads = [threading.Thread(target=run_query, args=(endpoint, i)) for i in range(2)]
+
+    for thread in threads:
+        thread.start()
+    for thread in threads:
+        thread.join()
+
+    cleanup(pg_version)
--- a/test_runner/regress/test_gc_cutoff.py
+++ b/test_runner/regress/test_gc_cutoff.py
@@ -14,10 +14,6 @@ from fixtures.neon_fixtures import NeonEnvBuilder, PgBin
 def test_gc_cutoff(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
    env = neon_env_builder.init_start()

-    # These warnings are expected, when the pageserver is restarted abruptly
-    env.pageserver.allowed_errors.append(".*found future image layer.*")
-    env.pageserver.allowed_errors.append(".*found future delta layer.*")
-
    pageserver_http = env.pageserver.http_client()

    # Use aggressive GC and checkpoint settings, so that we also exercise GC during the test
--- a/test_runner/regress/test_pageserver_restart.py
+++ b/test_runner/regress/test_pageserver_restart.py
@@ -72,10 +72,6 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder):
 def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder):
    env = neon_env_builder.init_start()

-    # These warnings are expected, when the pageserver is restarted abruptly
-    env.pageserver.allowed_errors.append(".*found future image layer.*")
-    env.pageserver.allowed_errors.append(".*found future delta layer.*")
-
    # Use a tiny checkpoint distance, to create a lot of layers quickly.
    # That allows us to stress the compaction and layer flushing logic more.
    tenant, _ = env.neon_cli.create_tenant(
--- a/test_runner/regress/test_proxy.py
+++ b/test_runner/regress/test_proxy.py
@@ -265,18 +265,23 @@ def test_sql_over_http_output_options(static_proxy: NeonProxy):
 def test_sql_over_http_batch(static_proxy: NeonProxy):
    static_proxy.safe_psql("create role http with login password 'http' superuser")

-    def qq(queries: List[Tuple[str, Optional[List[Any]]]]) -> Any:
+    def qq(queries: List[Tuple[str, Optional[List[Any]]]], read_only: bool = False) -> Any:
        connstr = f"postgresql://http:http@{static_proxy.domain}:{static_proxy.proxy_port}/postgres"
        response = requests.post(
            f"https://{static_proxy.domain}:{static_proxy.external_http_port}/sql",
            data=json.dumps(list(map(lambda x: {"query": x[0], "params": x[1] or []}, queries))),
-            headers={"Content-Type": "application/sql", "Neon-Connection-String": connstr},
+            headers={
+                "Content-Type": "application/sql",
+                "Neon-Connection-String": connstr,
+                "Neon-Batch-Isolation-Level": "Serializable",
+                "Neon-Batch-Read-Only": "true" if read_only else "false",
+            },
            verify=str(static_proxy.test_output_dir / "proxy.crt"),
        )
        assert response.status_code == 200
-        return response.json()["results"]
+        return response.json()["results"], response.headers

-    result = qq(
+    result, headers = qq(
        [
            ("select 42 as answer", None),
            ("select $1 as answer", [42]),
@@ -291,6 +296,9 @@ def test_sql_over_http_batch(static_proxy: NeonProxy):
        ]
    )

+    assert headers["Neon-Batch-Isolation-Level"] == "Serializable"
+    assert headers["Neon-Batch-Read-Only"] == "false"
+
    assert result[0]["rows"] == [{"answer": 42}]
    assert result[1]["rows"] == [{"answer": "42"}]
    assert result[2]["rows"] == [{"answer": 42}]
@@ -311,3 +319,14 @@ def test_sql_over_http_batch(static_proxy: NeonProxy):
    assert res["command"] == "DROP"
    assert res["rowCount"] is None
    assert len(result) == 10
+
+    result, headers = qq(
+        [
+            ("select 42 as answer", None),
+        ],
+        True,
+    )
+    assert headers["Neon-Batch-Isolation-Level"] == "Serializable"
+    assert headers["Neon-Batch-Read-Only"] == "true"
+
+    assert result[0]["rows"] == [{"answer": 42}]
--- a/test_runner/regress/test_recovery.py
+++ b/test_runner/regress/test_recovery.py
@@ -15,10 +15,6 @@ def test_pageserver_recovery(neon_env_builder: NeonEnvBuilder):
    env = neon_env_builder.init_start()
    env.pageserver.is_testing_enabled_or_skip()

-    # These warnings are expected, when the pageserver is restarted abruptly
-    env.pageserver.allowed_errors.append(".*found future delta layer.*")
-    env.pageserver.allowed_errors.append(".*found future image layer.*")
-
    # Create a branch for us
    env.neon_cli.create_branch("test_pageserver_recovery", "main")

--- a/test_runner/regress/test_remote_storage.py
+++ b/test_runner/regress/test_remote_storage.py
@@ -348,9 +348,6 @@ def test_remote_storage_upload_queue_retries(
    # XXX: should vary this test to selectively fail just layer uploads, index uploads, deletions
    #      but how do we validate the result after restore?

-    # these are always possible when we do an immediate stop. perhaps something with compacting has changed since.
-    env.pageserver.allowed_errors.append(r".*found future (delta|image) layer.*")
-
    env.pageserver.stop(immediate=True)
    env.endpoints.stop_all()

--- a/test_runner/regress/test_timeline_delete.py
+++ b/test_runner/regress/test_timeline_delete.py
@@ -229,6 +229,8 @@ def test_delete_timeline_exercise_crash_safety_failpoints(

    ps_http.configure_failpoints((failpoint, "return"))

+    iterations = 20 if remote_storage_kind is RemoteStorageKind.REAL_S3 else 4
+
    # These failpoints are earlier than background task is spawned.
    # so they result in api request failure.
    if failpoint in (
@@ -245,7 +247,7 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
            tenant_id=env.initial_tenant,
            timeline_id=timeline_id,
            expected_state="Broken",
-            iterations=2,  # effectively try immediately and retry once in one second
+            iterations=iterations,
        )

        reason = timeline_info["state"]["Broken"]["reason"]
@@ -254,21 +256,19 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
        # failpoint may not be the only error in the stack
        assert reason.endswith(f"failpoint: {failpoint}"), reason

-    wait_longer = remote_storage_kind is RemoteStorageKind.REAL_S3
    if check is Check.RETRY_WITH_RESTART:
        env.pageserver.stop()
        env.pageserver.start()
+
+        wait_until_tenant_active(ps_http, env.initial_tenant, iterations=iterations)
+
        if failpoint == "timeline-delete-before-index-deleted-at":
            # We crashed before persisting this to remote storage, need to retry delete request
-
-            # Wait till tenant is loaded. Shouldnt take longer than 2 seconds (we shouldnt block tenant loading)
-            wait_until_tenant_active(ps_http, env.initial_tenant, iterations=2)
-
            timeline_delete_wait_completed(ps_http, env.initial_tenant, timeline_id)
        else:
            # Pageserver should've resumed deletion after restart.
            wait_timeline_detail_404(
-                ps_http, env.initial_tenant, timeline_id, wait_longer=wait_longer
+                ps_http, env.initial_tenant, timeline_id, iterations=iterations
            )
    elif check is Check.RETRY_WITHOUT_RESTART:
        # this should succeed
@@ -276,7 +276,7 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
        ps_http.configure_failpoints((failpoint, "off"))

        timeline_delete_wait_completed(
-            ps_http, env.initial_tenant, timeline_id, wait_longer=wait_longer
+            ps_http, env.initial_tenant, timeline_id, iterations=iterations
        )

    # Check remote is impty
@@ -404,6 +404,7 @@ def assert_prefix_empty(neon_env_builder: NeonEnvBuilder, prefix: Optional[str]
    assert isinstance(neon_env_builder.remote_storage, S3Storage)

    # Note that this doesnt use pagination, so list is not guaranteed to be exhaustive.
+    assert neon_env_builder.remote_storage_client is not None
    response = neon_env_builder.remote_storage_client.list_objects_v2(
        Bucket=neon_env_builder.remote_storage.bucket_name,
        Prefix=prefix or neon_env_builder.remote_storage.prefix_in_bucket or "",
@@ -569,7 +570,7 @@ def test_concurrent_timeline_delete_stuck_on(
        try:
            log.info("first call start")
            timeline_delete_wait_completed(
-                ps_http, env.initial_tenant, child_timeline_id, timeout=10
+                ps_http, env.initial_tenant, child_timeline_id, timeout=20
            )
            log.info("first call success")
            result_queue.put("success")
@@ -683,7 +684,7 @@ def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder):
    wait_until(50, 0.1, first_request_finished)

    # check that the timeline is gone
-    wait_timeline_detail_404(ps_http, env.initial_tenant, child_timeline_id)
+    wait_timeline_detail_404(ps_http, env.initial_tenant, child_timeline_id, iterations=2)


@pytest.mark.parametrize(
@@ -758,7 +759,7 @@ def test_timeline_delete_works_for_remote_smoke(
        )

    # for some reason the check above doesnt immediately take effect for the below.
-    # Assume it is mock server incosistency and check twice.
+    # Assume it is mock server inconsistency and check twice.
    wait_until(
        2,
        0.5,
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,4 +1,4 @@
 {
-    "postgres-v15": "1220c8a63f00101829f9222a5821fc084b4384c7",
-    "postgres-v14": "ebedb34d01c8ac9c31e8ea4628b9854103a1dc8f"
+    "postgres-v15": "553f2d3618a6d4893bde67f1c065926ee8a3a118",
+    "postgres-v14": "28bf5ccfa2fda9677566a25abd450e714d9ed055"
 }
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -60,6 +60,7 @@ url = { version = "2", features = ["serde"] }
 [build-dependencies]
 anyhow = { version = "1", features = ["backtrace"] }
 bytes = { version = "1", features = ["serde"] }
+cc = { version = "1", default-features = false, features = ["parallel"] }
 either = { version = "1" }
 itertools = { version = "0.10" }
 libc = { version = "0.2", features = ["extra_traits"] }
Author	SHA1	Message	Date
Arseny Sher	5ac3f86c96	Safekeeper peer recovery rfc.	2023-08-03 14:15:14 +03:00
Alexander Bayandin	381f41e685	Bump cryptography from 41.0.2 to 41.0.3 (#4870 )	2023-08-02 14:10:36 +03:00
Alek Westover	d005c77ea3	Tar Remote Extensions (#4715 ) Add infrastructure to dynamically load postgres extensions and shared libraries from remote extension storage. Before postgres start downloads list of available remote extensions and libraries, and also downloads 'shared_preload_libraries'. After postgres is running, 'compute_ctl' listens for HTTP requests to load files. Postgres has new GUC 'extension_server_port' to specify port on which 'compute_ctl' listens for requests. When PostgreSQL requests a file, 'compute_ctl' downloads it. See more details about feature design and remote extension storage layout in docs/rfcs/024-extension-loading.md --------- Co-authored-by: Anastasia Lubennikova <anastasia@neon.tech> Co-authored-by: Alek Westover <alek.westover@gmail.com>	2023-08-02 12:38:12 +03:00
Joonas Koivunen	04776ade6c	fix(consumption): rename _size_ => _data_ (#4866 ) I failed at renaming the metric middle part while managing to do a great job with the suffix. Fix the middle part as well.	2023-08-01 19:18:25 +03:00
Dmitry Rodionov	c3fe335eaf	wait for tenant to be active before polling for timeline absence (#4856 ) ## Problem https://neon-github-public-dev.s3.amazonaws.com/reports/main/5692829577/index.html#suites/f588e0a787c49e67b29490359c589fae/4c50937643d68a66 ## Summary of changes wait for tenant to be active after restart before polling for timeline absence	2023-08-01 18:28:18 +03:00
Joonas Koivunen	3a00a5deb2	refactor: tidy consumption metrics (#4860 ) Tidying up I've been wanting to do for some time. Follow-up to #4857.	2023-08-01 18:14:16 +03:00
Joonas Koivunen	78fa2b13e5	test: written_size_bytes_delta (#4857 ) Two stabs at this, by mocking a http receiver and the globals out (now reverted) and then by separating the timeline dependency and just testing what kind of events certain timelines produce. I think this pattern could work for some of our problems. Follow-up to #4822.	2023-08-01 15:30:36 +03:00
John Spray	7c076edeea	pageserver: tweak period of imitate_layer_accesses (#4859 ) ## Problem When the eviction threshold is an integer multiple of the eviction period, it is unreliable to skip imitating accesses based on whether the last imitation was more recent than the threshold. This is because a finite amount of time passes between the time used for the periodic execution, and the 'now' time used for updating last_layer_access_imitation. When this is just a few milliseconds, and everything else is on-time, then a 5 second threshold with a 1 second period will end up entering its 5th iteration slightly _less than_ 5 second since last_layer_access_imitation, and thereby skipping instead of running the imitation. If a few milliseconds then pass before we check the access time of a file that _should_ have been bumped by the imitation pass, then we end up evicting something we shouldn't have evicted. ## Summary of changes We can make this race far less likely by using the threshold minus one interval as the period for re-executing the imitate_layer_accesses: that way we're not vulnerable to racing by just a few millis, and there would have to be a delay of the order `period` to cause us to wrongly evict a layer. This is not a complete solution: it would be good to revisit this and use a non-walltime mechanism for pinning these layers into local storage, rather than relying on bumping access times.	2023-08-01 13:17:49 +01:00
Arpad Müller	69528b7c30	Prepare k-merge in compaction for async I/O (#4836 ) ## Problem The k-merge in pageserver compaction currently relies on iterators over the keys and also over the values. This approach does not support async code because we are using iterators and those don't support async in general. Also, the k-merge implementation we use doesn't support async either. Instead, as we already load all the keys into memory, the plan is to just do the sorting in-memory for now, switch to async, and then once we want to support workloads that don't have all keys stored in memory, we can look into switching to a k-merge implementation that supports async instead. ## Summary of changes The core of this PR is the move from functions on the `PersistentLayer` trait to return custom iterator types to inherent functions on `DeltaLayer` that return buffers with all keys or value references. Value references are a type we created in this PR, containing a `BlobRef` as well as an `Arc` pointer to the `DeltaLayerInner`, so that we can lazily load the values during compaction. This preserves the property of the current code. This PR does not switch us to doing the k-merge via sort on slices, but with this PR, doing such a switch is relatively easy and only requires changes of the compaction code itself. Part of https://github.com/neondatabase/neon/issues/4743	2023-08-01 13:38:35 +02:00
Konstantin Knizhnik	a98a80abc2	Define NEON_SMGR to make it possible for extensions to use Neon SMG API (#4840 ) ## Problem See https://neondb.slack.com/archives/C036U0GRMRB/p1689148023067319 ## Summary of changes Define NEON_SMGR in smgr.h ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>	2023-08-01 10:04:45 +03:00
Alex Chi Z	7b6c849456	support isolation level + read only for http batch sql (#4830 ) We will retrieve `neon-batch-isolation-level` and `neon-batch-read-only` from the http header, which sets the txn properties. https://github.com/neondatabase/serverless/pull/38#issuecomment-1653130981 --------- Signed-off-by: Alex Chi Z <chi@neon.tech>	2023-08-01 02:59:11 +03:00
Joonas Koivunen	326189d950	consumption_metrics: send timeline_written_size_delta (#4822 ) We want to have timeline_written_size_delta which is defined as difference to the previously sent `timeline_written_size` from the current `timeline_written_size`. Solution is to send it. On the first round `disk_consistent_lsn` is used which is captured during `load` time. After that an incremental "event" is sent on every collection. Incremental "events" are not part of deduplication. I've added some infrastructure to allow somewhat typesafe `EventType::Absolute` and `EventType::Incremental` factories per metrics, now that we have our first `EventType::Incremental` usage.	2023-07-31 22:10:19 +03:00
bojanserafimov	ddbe170454	Prewarm compute nodes (#4828 )	2023-07-31 14:13:32 -04:00
Alexander Bayandin	39e458f049	test_compatibility: fix pg_tenant_only_port port collision (#4850 ) ## Problem Compatibility tests fail from time to time due to `pg_tenant_only_port` port collision (added in https://github.com/neondatabase/neon/pull/4731) ## Summary of changes - replace `pg_tenant_only_port` value in config with new port - remove old logic that we don't need anymore - unify config overrides	2023-07-31 20:49:46 +03:00
Vadim Kharitonov	e1424647a0	Update pg_embedding to 0.3.1 version (#4811 )	2023-07-31 20:23:18 +03:00
Yinnan Yao	705ae2dce9	Fix error message for listen_pg_addr_tenant_only binding (#4787 ) ## Problem Wrong use of `conf.listen_pg_addr` in `error!()`. ## Summary of changes Use `listen_pg_addr_tenant_only` instead of `conf.listen_pg_addr`. Signed-off-by: yaoyinnan <35447132+yaoyinnan@users.noreply.github.com>	2023-07-31 14:40:52 +01:00
Conrad Ludgate	eb78603121	proxy: div by zero (#4845 ) ## Problem 1. In the CacheInvalid state loop, we weren't checking the `num_retries`. If this managed to get up to `32`, the retry_after procedure would compute 2^32 which would overflow to 0 and trigger a div by zero 2. When fixing the above, I started working on a flow diagram for the state machine logic and realised it was more complex than it had to be: a. We start in a `Cached` state b. `Cached`: call `connect_once`. After the first connect_once error, we always move to the `CacheInvalid` state, otherwise, we return the connection. c. `CacheInvalid`: we attempt to `wake_compute` and we either switch to Cached or we retry this step (or we error). d. `Cached`: call `connect_once`. We either retry this step or we have a connection (or we error) - After num_retries > 1 we never switch back to `CacheInvalid`. ## Summary of changes 1. Insert a `num_retries` check in the `handle_try_wake` procedure. Also using floats in the retry_after procedure to prevent the overflow entirely 2. Refactor connect_to_compute to be more linear in design.	2023-07-31 09:30:24 -04:00
John Spray	f0ad603693	pageserver: add unit test for deleted_at in IndexPart (#4844 ) ## Problem Existing IndexPart unit tests only exercised the version 1 format (i.e. without deleted_at set). ## Summary of changes Add a test that sets version to 2, and sets a value for deleted_at. Closes https://github.com/neondatabase/neon/issues/4162	2023-07-31 12:51:18 +01:00
Arpad Müller	e5183f85dc	Make DiskBtreeReader::dump async (#4838 ) ## Problem `DiskBtreeReader::dump` calls `read_blk` internally, which we want to make async in the future. As it is currently relying on recursion, and async doesn't like recursion, we want to find an alternative to that and instead traverse the tree using a loop and a manual stack. ## Summary of changes * Make `DiskBtreeReader::dump` and all the places calling it async * Make `DiskBtreeReader::dump` non-recursive internally and use a stack instead. It now deparses the node in each iteration, which isn't optimal, but on the other hand it's hard to store the node as it is referencing the buffer. Self referential data are hard in Rust. For a dumping function, speed isn't a priority so we deparse the node multiple times now (up to branching factor many times). Part of https://github.com/neondatabase/neon/issues/4743 I have verified that output is unchanged by comparing the output of this command both before and after this patch: ``` cargo test -p pageserver -- particular_data --nocapture ```	2023-07-31 12:52:29 +02:00
Joonas Koivunen	89ee8f2028	fix: demote warnings, fix flakiness (#4837 ) `WARN ... found future (image\|delta) layer` are not actionable log lines. They don't need to be warnings. `info!` is enough. This also fixes some known but not tracked flakiness in [`test_remote_timeline_client_calls_started_metric`][evidence]. [evidence]: https://neon-github-public-dev.s3.amazonaws.com/reports/pr-4829/5683495367/index.html#/testresult/34fe79e24729618b Closes #3369. Closes #4473.	2023-07-31 07:43:12 +00:00