pageserver: cargo fmt

This is in a separate commit to make the previous ones more readable.
pageserver: respect task_mgr cancellation in metrics task
2026-05-20 06:30:43 +00:00 · 2023-08-03 14:12:46 +01:00 · 2023-08-03 10:01:12 +01:00 · 2023-08-03 09:53:38 +01:00 · 2023-08-02 19:31:09 +01:00 · 2023-08-02 14:19:06 -04:00
83 changed files with 3172 additions and 989 deletions
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -209,4 +209,4 @@ runs:
      uses: ./.github/actions/allure-report-store
      with:
        report-dir: /tmp/test_output/allure/results
-        unique-key: ${{ inputs.build_type }}
+        unique-key: ${{ inputs.build_type }}-${{ inputs.pg_version }}
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -740,6 +740,9 @@ name = "cc"
 version = "1.0.79"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f"
+dependencies = [
+ "jobserver",
+]

 [[package]]
 name = "cexpr"
@@ -907,12 +910,14 @@ dependencies = [
 "opentelemetry",
 "postgres",
 "regex",
+ "remote_storage",
 "reqwest",
 "serde",
 "serde_json",
 "tar",
 "tokio",
 "tokio-postgres",
+ "toml_edit",
 "tracing",
 "tracing-opentelemetry",
 "tracing-subscriber",
@@ -920,6 +925,7 @@ dependencies = [
 "url",
 "utils",
 "workspace_hack",
+ "zstd",
 ]

 [[package]]
@@ -980,6 +986,7 @@ dependencies = [
 "tar",
 "thiserror",
 "toml",
+ "tracing",
 "url",
 "utils",
 "workspace_hack",
@@ -1972,6 +1979,15 @@ version = "1.0.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "453ad9f582a441959e5f0d088b02ce04cfe8d51a8eaf077f12ac6d3e94164ca6"

+[[package]]
+name = "jobserver"
+version = "0.1.26"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "936cfd212a0155903bcbc060e316fb6cc7cbf2e1907329391ebadc1fe0ce77c2"
+dependencies = [
+ "libc",
+]
+
 [[package]]
 name = "js-sys"
 version = "0.3.63"
@@ -5296,6 +5312,7 @@ version = "0.1.0"
 dependencies = [
 "anyhow",
 "bytes",
+ "cc",
 "chrono",
 "clap",
 "clap_builder",
@@ -5396,3 +5413,33 @@ name = "zeroize"
 version = "1.6.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2a0956f1ba7c7909bfb66c2e9e4124ab6f6482560f6628b5aaeba39207c9aad9"
+
+[[package]]
+name = "zstd"
+version = "0.12.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1a27595e173641171fc74a1232b7b1c7a7cb6e18222c11e9dfb9888fa424c53c"
+dependencies = [
+ "zstd-safe",
+]
+
+[[package]]
+name = "zstd-safe"
+version = "6.0.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ee98ffd0b48ee95e6c5168188e44a54550b1564d9d530ee21d5f0eaed1069581"
+dependencies = [
+ "libc",
+ "zstd-sys",
+]
+
+[[package]]
+name = "zstd-sys"
+version = "2.0.8+zstd.1.5.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5556e6ee25d32df2586c098bbfa278803692a20d0ab9565e049480d52707ec8c"
+dependencies = [
+ "cc",
+ "libc",
+ "pkg-config",
+]
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -551,10 +551,8 @@ FROM build-deps AS pg-embedding-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 ENV PATH "/usr/local/pgsql/bin/:$PATH"
-# eeb3ba7c3a60c95b2604dd543c64b2f1bb4a3703 made on 15/07/2023
-# There is no release tag yet
-RUN wget https://github.com/neondatabase/pg_embedding/archive/eeb3ba7c3a60c95b2604dd543c64b2f1bb4a3703.tar.gz -O pg_embedding.tar.gz && \
-    echo "030846df723652f99a8689ce63b66fa0c23477a7fd723533ab8a6b28ab70730f pg_embedding.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/neondatabase/pg_embedding/archive/refs/tags/0.3.5.tar.gz -O pg_embedding.tar.gz && \
+    echo "0e95b27b8b6196e2cf0a0c9ec143fe2219b82e54c5bb4ee064e76398cbe69ae9 pg_embedding.tar.gz" | sha256sum --check && \
    mkdir pg_embedding-src && cd pg_embedding-src && tar xvzf ../pg_embedding.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
@@ -774,12 +772,14 @@ RUN rm /usr/local/pgsql/lib/lib*.a
 FROM python:3.9-slim-bullseye AS generate-ext-index
 ARG PG_VERSION
 ARG BUILD_TAG
+RUN apt update && apt install -y zstd
+
 # copy the control files here
 COPY --from=kq-imcx-pg-build /extensions/ /extensions/
 COPY --from=pg-anon-pg-build /extensions/ /extensions/
 COPY --from=postgis-build /extensions/ /extensions/
 COPY scripts/combine_control_files.py ./combine_control_files.py
-RUN python3 ./combine_control_files.py ${PG_VERSION} ${BUILD_TAG}
+RUN python3 ./combine_control_files.py ${PG_VERSION} ${BUILD_TAG} --public_extensions="anon,postgis"

 FROM scratch AS postgres-extensions
 # After the transition this layer will include all extensitons.
@@ -816,6 +816,7 @@ COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-deb
 # libxml2, libxslt1.1 for xml2
 # libzstd1 for zstd
 # libboost*, libfreetype6, and zlib1g for rdkit
+# ca-certificates for communicating with s3 by compute_ctl
 RUN apt update &&  \
    apt install --no-install-recommends -y \
        gdb \
@@ -839,7 +840,8 @@ RUN apt update &&  \
        libcurl4-openssl-dev \
        locales \
        procps \
-        zlib1g && \
+        zlib1g \
+        ca-certificates && \
    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
    localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8

--- a/2
+++ b/2
@@ -108,6 +108,8 @@ postgres-%: postgres-configure-% \
 	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_buffercache install
 	+@echo "Compiling pageinspect $*"
 	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pageinspect install
+	+@echo "Compiling amcheck $*"
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/amcheck install

 .PHONY: postgres-clean-%
 postgres-clean-%:
--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -32,3 +32,6 @@ url.workspace = true
 compute_api.workspace = true
 utils.workspace = true
 workspace_hack.workspace = true
+toml_edit.workspace = true
+remote_storage = { version = "0.1", path = "../libs/remote_storage/" }
+zstd = "0.12.4"
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -5,6 +5,8 @@
 //! - `compute_ctl` accepts cluster (compute node) specification as a JSON file.
 //! - Every start is a fresh start, so the data directory is removed and
 //!   initialized again on each run.
+//! - If `--remote-ext-config` is provided, it is used to fetch the list of available
+//!   extensions and to download `shared_preload_libraries` from remote storage.
 //! - Next it will put configuration files into the `PGDATA` directory.
 //! - Sync safekeepers and get commit LSN.
 //! - Get `basebackup` from pageserver using the returned on the previous step LSN.
@@ -27,7 +29,8 @@
 //! compute_ctl -D /var/db/postgres/compute \
 //!             -C 'postgresql://cloud_admin@localhost/postgres' \
 //!             -S /var/db/postgres/specs/current.json \
-//!             -b /usr/local/bin/postgres
+//!             -b /usr/local/bin/postgres \
+//!             -r '{"bucket": "neon-dev-extensions-eu-central-1", "region": "eu-central-1"}'
 //! ```
 //!
 use std::collections::HashMap;
@@ -35,7 +38,7 @@ use std::fs::File;
 use std::panic;
 use std::path::Path;
 use std::process::exit;
-use std::sync::{mpsc, Arc, Condvar, Mutex};
+use std::sync::{mpsc, Arc, Condvar, Mutex, OnceLock, RwLock};
 use std::{thread, time::Duration};

 use anyhow::{Context, Result};
@@ -48,22 +51,33 @@ use compute_api::responses::ComputeStatus;

 use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec};
 use compute_tools::configurator::launch_configurator;
+use compute_tools::extension_server::{get_pg_version, init_remote_storage};
 use compute_tools::http::api::launch_http_server;
 use compute_tools::logger::*;
 use compute_tools::monitor::launch_monitor;
 use compute_tools::params::*;
 use compute_tools::spec::*;

-const BUILD_TAG_DEFAULT: &str = "local";
+// An arbitrary build tag, used as a default (e.g. for testing) when the
+// `BUILD_TAG` environment variable is not set at compile time.
+const BUILD_TAG_DEFAULT: &str = "5670669815";

 fn main() -> Result<()> {
    init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;

-    let build_tag = option_env!("BUILD_TAG").unwrap_or(BUILD_TAG_DEFAULT);
-
+    let build_tag = option_env!("BUILD_TAG")
+        .unwrap_or(BUILD_TAG_DEFAULT)
+        .to_string();
    info!("build_tag: {build_tag}");

    let matches = cli().get_matches();
+    let pgbin_default = String::from("postgres");
+    let pgbin = matches.get_one::<String>("pgbin").unwrap_or(&pgbin_default);
+
+    let remote_ext_config = matches.get_one::<String>("remote-ext-config");
+    let ext_remote_storage = remote_ext_config.map(|x| {
+        init_remote_storage(x).expect("cannot initialize remote extension storage from config")
+    });

    let http_port = *matches
        .get_one::<u16>("http-port")
@@ -128,9 +142,6 @@ fn main() -> Result<()> {
    let compute_id = matches.get_one::<String>("compute-id");
    let control_plane_uri = matches.get_one::<String>("control-plane-uri");

-    // Try to use just 'postgres' if no path is provided
-    let pgbin = matches.get_one::<String>("pgbin").unwrap();
-
    let spec;
    let mut live_config_allowed = false;
    match spec_json {
@@ -168,6 +179,7 @@ fn main() -> Result<()> {

    let mut new_state = ComputeState::new();
    let spec_set;
+
    if let Some(spec) = spec {
        let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow::anyhow!(msg))?;
        new_state.pspec = Some(pspec);
@@ -179,20 +191,37 @@ fn main() -> Result<()> {
        connstr: Url::parse(connstr).context("cannot parse connstr as a URL")?,
        pgdata: pgdata.to_string(),
        pgbin: pgbin.to_string(),
+        pgversion: get_pg_version(pgbin),
        live_config_allowed,
        state: Mutex::new(new_state),
        state_changed: Condvar::new(),
+        ext_remote_storage,
+        ext_remote_paths: OnceLock::new(),
+        ext_download_progress: RwLock::new(HashMap::new()),
+        library_index: OnceLock::new(),
+        build_tag,
    };
    let compute = Arc::new(compute_node);

+    // If this is a pooled VM, prewarm before starting HTTP server and becoming
+    // available for binding. Prewarming helps postgres start quicker later,
+    // because QEMU will already have its memory allocated from the host, and
+    // the necessary binaries will already be cached.
+    if !spec_set {
+        compute.prewarm_postgres()?;
+    }
+
    // Launch http service first, so we were able to serve control-plane
    // requests, while configuration is still in progress.
    let _http_handle =
-        launch_http_server(http_port, compute.clone()).expect("cannot launch http endpoint thread");
+        launch_http_server(http_port, &compute).expect("cannot launch http endpoint thread");
+
+    let extension_server_port: u16 = http_port;

    if !spec_set {
        // No spec provided, hang waiting for it.
        info!("no compute spec provided, waiting");
+
        let mut state = compute.state.lock().unwrap();
        while state.status != ComputeStatus::ConfigurationPending {
            state = compute.state_changed.wait(state).unwrap();
@@ -223,13 +252,13 @@ fn main() -> Result<()> {
    drop(state);

    // Launch remaining service threads
-    let _monitor_handle = launch_monitor(compute.clone());
-    let _configurator_handle = launch_configurator(compute.clone());
+    let _monitor_handle = launch_monitor(&compute);
+    let _configurator_handle = launch_configurator(&compute);

    // Start Postgres
    let mut delay_exit = false;
    let mut exit_code = None;
-    let pg = match compute.start_compute() {
+    let pg = match compute.start_compute(extension_server_port) {
        Ok(pg) => Some(pg),
        Err(err) => {
            error!("could not start the compute node: {:?}", err);
@@ -358,6 +387,12 @@ fn cli() -> clap::Command {
                .long("control-plane-uri")
                .value_name("CONTROL_PLANE_API_BASE_URI"),
        )
+        .arg(
+            Arg::new("remote-ext-config")
+                .short('r')
+                .long("remote-ext-config")
+                .value_name("REMOTE_EXT_CONFIG"),
+        )
 }

 #[test]
--- a/compute_tools/src/checker.rs
+++ b/compute_tools/src/checker.rs
@@ -1,5 +1,3 @@
-use std::sync::Arc;
-
 use anyhow::{anyhow, Result};
 use tokio_postgres::NoTls;
 use tracing::{error, instrument};
@@ -10,7 +8,7 @@ use crate::compute::ComputeNode;
 /// that we can actually write some data in this particular timeline.
 /// Create table if it's missing.
 #[instrument(skip_all)]
-pub async fn check_writability(compute: Arc<ComputeNode>) -> Result<()> {
+pub async fn check_writability(compute: &ComputeNode) -> Result<()> {
    // Connect to the database.
    let (client, connection) = tokio_postgres::connect(compute.connstr.as_str(), NoTls).await?;
    if client.is_closed() {
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -1,16 +1,20 @@
+use std::collections::HashMap;
 use std::fs;
 use std::io::BufRead;
 use std::os::unix::fs::PermissionsExt;
 use std::path::Path;
 use std::process::{Command, Stdio};
 use std::str::FromStr;
-use std::sync::{Condvar, Mutex};
+use std::sync::{Condvar, Mutex, OnceLock, RwLock};

 use anyhow::{Context, Result};
 use chrono::{DateTime, Utc};
+use futures::future::join_all;
 use futures::stream::FuturesUnordered;
 use futures::StreamExt;
 use postgres::{Client, NoTls};
+use regex::Regex;
+use tokio;
 use tokio_postgres;
 use tracing::{error, info, instrument, warn};
 use utils::id::{TenantId, TimelineId};
@@ -20,10 +24,12 @@ use compute_api::responses::{ComputeMetrics, ComputeStatus};
 use compute_api::spec::{ComputeMode, ComputeSpec};
 use utils::measured_stream::MeasuredReader;

-use crate::config;
+use remote_storage::{GenericRemoteStorage, RemotePath};
+
 use crate::pg_helpers::*;
 use crate::spec::*;
 use crate::sync_sk::{check_if_synced, ping_safekeeper};
+use crate::{config, extension_server};

 /// Compute node info shared across several `compute_ctl` threads.
 pub struct ComputeNode {
@@ -31,6 +37,7 @@ pub struct ComputeNode {
    pub connstr: url::Url,
    pub pgdata: String,
    pub pgbin: String,
+    pub pgversion: String,
    /// We should only allow live re- / configuration of the compute node if
    /// it uses 'pull model', i.e. it can go to control-plane and fetch
    /// the latest configuration. Otherwise, there could be a case:
@@ -50,6 +57,24 @@ pub struct ComputeNode {
    pub state: Mutex<ComputeState>,
    /// `Condvar` to allow notifying waiters about state changes.
    pub state_changed: Condvar,
+    /// The S3 bucket that we search for extensions in (`None` if not configured)
+    pub ext_remote_storage: Option<GenericRemoteStorage>,
+    // (key: extension name, value: path to extension archive in remote storage)
+    pub ext_remote_paths: OnceLock<HashMap<String, RemotePath>>,
+    // (key: library name, value: name of extension containing this library)
+    pub library_index: OnceLock<HashMap<String, String>>,
+    // (key: extension archive name, value: (download start time, whether the download completed))
+    pub ext_download_progress: RwLock<HashMap<String, (DateTime<Utc>, bool)>>,
+    pub build_tag: String,
+}
+
+// store some metrics about download size that might impact startup time
+#[derive(Clone, Debug)]
+pub struct RemoteExtensionMetrics {
+    num_ext_downloaded: u64,
+    largest_ext_size: u64,
+    total_ext_download_size: u64,
+    prep_extensions_ms: u64,
 }

 #[derive(Clone, Debug)]
@@ -473,14 +498,22 @@ impl ComputeNode {
    /// Do all the preparations like PGDATA directory creation, configuration,
    /// safekeepers sync, basebackup, etc.
    #[instrument(skip_all)]
-    pub fn prepare_pgdata(&self, compute_state: &ComputeState) -> Result<()> {
+    pub fn prepare_pgdata(
+        &self,
+        compute_state: &ComputeState,
+        extension_server_port: u16,
+    ) -> Result<()> {
        let pspec = compute_state.pspec.as_ref().expect("spec must be set");
        let spec = &pspec.spec;
        let pgdata_path = Path::new(&self.pgdata);

        // Remove/create an empty pgdata directory and put configuration there.
        self.create_pgdata()?;
-        config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &pspec.spec)?;
+        config::write_postgres_conf(
+            &pgdata_path.join("postgresql.conf"),
+            &pspec.spec,
+            Some(extension_server_port),
+        )?;

        // Syncing safekeepers is only safe with primary nodes: if a primary
        // is already connected it will be kicked out, so a secondary (standby)
@@ -532,6 +565,50 @@ impl ComputeNode {
        Ok(())
    }

+    /// Start and stop a postgres process to warm up the VM for startup.
+    pub fn prewarm_postgres(&self) -> Result<()> {
+        info!("prewarming");
+
+        // Create pgdata
+        let pgdata = &format!("{}.warmup", self.pgdata);
+        create_pgdata(pgdata)?;
+
+        // Run initdb to completion
+        info!("running initdb");
+        let initdb_bin = Path::new(&self.pgbin).parent().unwrap().join("initdb");
+        Command::new(initdb_bin)
+            .args(["-D", pgdata])
+            .output()
+            .expect("cannot start initdb process");
+
+        // Write conf
+        use std::io::Write;
+        let conf_path = Path::new(pgdata).join("postgresql.conf");
+        let mut file = std::fs::File::create(conf_path)?;
+        writeln!(file, "shared_buffers=65536")?;
+        writeln!(file, "port=51055")?; // Nobody should be connecting
+        writeln!(file, "shared_preload_libraries = 'neon'")?;
+
+        // Start postgres
+        info!("starting postgres");
+        let mut pg = Command::new(&self.pgbin)
+            .args(["-D", pgdata])
+            .spawn()
+            .expect("cannot start postgres process");
+
+        // Stop it when it's ready
+        info!("waiting for postgres");
+        wait_for_postgres(&mut pg, Path::new(pgdata))?;
+        pg.kill()?;
+        info!("sent kill signal");
+        pg.wait()?;
+        info!("done prewarming");
+
+        // clean up
+        let _ok = fs::remove_dir_all(pgdata);
+        Ok(())
+    }
+
    /// Start Postgres as a child process and manage DBs/roles.
    /// After that this will hang waiting on the postmaster process to exit.
    #[instrument(skip_all)]
@@ -626,7 +703,7 @@ impl ComputeNode {

        // Write new config
        let pgdata_path = Path::new(&self.pgdata);
-        config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &spec)?;
+        config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &spec, None)?;

        let mut client = Client::connect(self.connstr.as_str(), NoTls)?;
        self.pg_reload_conf(&mut client)?;
@@ -656,7 +733,7 @@ impl ComputeNode {
    }

    #[instrument(skip_all)]
-    pub fn start_compute(&self) -> Result<std::process::Child> {
+    pub fn start_compute(&self, extension_server_port: u16) -> Result<std::process::Child> {
        let compute_state = self.state.lock().unwrap().clone();
        let pspec = compute_state.pspec.as_ref().expect("spec must be set");
        info!(
@@ -667,7 +744,31 @@ impl ComputeNode {
            pspec.timeline_id,
        );

-        self.prepare_pgdata(&compute_state)?;
+        // This part is synchronous, because any remote shared_preload_libraries
+        // must be downloaded before postgres starts.
+        {
+            let library_load_start_time = Utc::now();
+            let remote_ext_metrics = self.prepare_preload_libraries(&compute_state)?;
+
+            let library_load_time = Utc::now()
+                .signed_duration_since(library_load_start_time)
+                .to_std()
+                .unwrap()
+                .as_millis() as u64;
+            let mut state = self.state.lock().unwrap();
+            state.metrics.load_ext_ms = library_load_time;
+            state.metrics.num_ext_downloaded = remote_ext_metrics.num_ext_downloaded;
+            state.metrics.largest_ext_size = remote_ext_metrics.largest_ext_size;
+            state.metrics.total_ext_download_size = remote_ext_metrics.total_ext_download_size;
+            state.metrics.prep_extensions_ms = remote_ext_metrics.prep_extensions_ms;
+            info!(
+                "Loading shared_preload_libraries took {:?}ms",
+                library_load_time
+            );
+            info!("{:?}", remote_ext_metrics);
+        }
+
+        self.prepare_pgdata(&compute_state, extension_server_port)?;

        let start_time = Utc::now();
        let pg = self.start_postgres(pspec.storage_auth_token.clone())?;
@@ -815,4 +916,200 @@ LIMIT 100",
            "{{\"pg_stat_statements\": []}}".to_string()
        }
    }
+
+    // If remote extension storage is configured,
+    // download extension control files
+    pub async fn prepare_external_extensions(&self, compute_state: &ComputeState) -> Result<()> {
+        if let Some(ref ext_remote_storage) = self.ext_remote_storage {
+            let pspec = compute_state.pspec.as_ref().expect("spec must be set");
+            let spec = &pspec.spec;
+            let custom_ext = spec.custom_extensions.clone().unwrap_or(Vec::new());
+            info!("custom extensions: {:?}", &custom_ext);
+            let (ext_remote_paths, library_index) = extension_server::get_available_extensions(
+                ext_remote_storage,
+                &self.pgbin,
+                &self.pgversion,
+                &custom_ext,
+                &self.build_tag,
+            )
+            .await?;
+            self.ext_remote_paths
+                .set(ext_remote_paths)
+                .expect("this is the only time we set ext_remote_paths");
+            self.library_index
+                .set(library_index)
+                .expect("this is the only time we set library_index");
+        }
+        Ok(())
+    }
+
+    // download an archive, unzip and place files in correct locations
+    pub async fn download_extension(&self, ext_name: &str, is_library: bool) -> Result<u64> {
+        match &self.ext_remote_storage {
+            None => anyhow::bail!("No remote extension storage"),
+            Some(remote_storage) => {
+                let mut real_ext_name = ext_name.to_string();
+                if is_library {
+                    // sometimes library names might have a suffix like
+                    // library.so or library.so.3. We strip this off
+                    // because library_index is based on the name without the file extension
+                    let strip_lib_suffix = Regex::new(r"\.so.*").unwrap();
+                    let lib_raw_name = strip_lib_suffix.replace(&real_ext_name, "").to_string();
+                    real_ext_name = self
+                        .library_index
+                        .get()
+                        .expect("must have already downloaded the library_index")[&lib_raw_name]
+                        .clone();
+                }
+
+                let ext_path = &self
+                    .ext_remote_paths
+                    .get()
+                    .expect("error accessing ext_remote_paths")[&real_ext_name];
+                let ext_archive_name = ext_path.object_name().expect("bad path");
+
+                let mut first_try = false;
+                if !self
+                    .ext_download_progress
+                    .read()
+                    .expect("lock err")
+                    .contains_key(ext_archive_name)
+                {
+                    self.ext_download_progress
+                        .write()
+                        .expect("lock err")
+                        .insert(ext_archive_name.to_string(), (Utc::now(), false));
+                    first_try = true;
+                }
+                let (download_start, download_completed) =
+                    self.ext_download_progress.read().expect("lock err")[ext_archive_name];
+                let start_time_delta = Utc::now()
+                    .signed_duration_since(download_start)
+                    .to_std()
+                    .unwrap()
+                    .as_millis() as u64;
+
+                // how long to wait for extension download if it was started by another process
+                const HANG_TIMEOUT: u64 = 3000; // milliseconds
+
+                if download_completed {
+                    info!("extension already downloaded, skipping re-download");
+                    return Ok(0);
+                } else if start_time_delta < HANG_TIMEOUT && !first_try {
+                    info!("download {ext_archive_name} already started by another process, hanging untill completion or timeout");
+                    let mut interval =
+                        tokio::time::interval(tokio::time::Duration::from_millis(500));
+                    loop {
+                        info!("waiting for download");
+                        interval.tick().await;
+                        let (_, download_completed_now) =
+                            self.ext_download_progress.read().expect("lock")[ext_archive_name];
+                        if download_completed_now {
+                            info!("download finished by whoever else downloaded it");
+                            return Ok(0);
+                        }
+                    }
+                    // NOTE: the above loop will get terminated
+                    // based on the timeout of the download function
+                }
+
+                // if extension hasn't been downloaded before or the previous
+                // attempt to download was at least HANG_TIMEOUT ms ago
+                // then we try to download it here
+                info!("downloading new extension {ext_archive_name}");
+
+                let download_size = extension_server::download_extension(
+                    &real_ext_name,
+                    ext_path,
+                    remote_storage,
+                    &self.pgbin,
+                )
+                .await;
+                self.ext_download_progress
+                    .write()
+                    .expect("bad lock")
+                    .insert(ext_archive_name.to_string(), (download_start, true));
+                download_size
+            }
+        }
+    }
+
+    #[tokio::main]
+    pub async fn prepare_preload_libraries(
+        &self,
+        compute_state: &ComputeState,
+    ) -> Result<RemoteExtensionMetrics> {
+        if self.ext_remote_storage.is_none() {
+            return Ok(RemoteExtensionMetrics {
+                num_ext_downloaded: 0,
+                largest_ext_size: 0,
+                total_ext_download_size: 0,
+                prep_extensions_ms: 0,
+            });
+        }
+        let pspec = compute_state.pspec.as_ref().expect("spec must be set");
+        let spec = &pspec.spec;
+
+        info!("parse shared_preload_libraries from spec.cluster.settings");
+        let mut libs_vec = Vec::new();
+        if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") {
+            libs_vec = libs
+                .split(&[',', '\'', ' '])
+                .filter(|s| *s != "neon" && !s.is_empty())
+                .map(str::to_string)
+                .collect();
+        }
+        info!("parse shared_preload_libraries from provided postgresql.conf");
+        // (a provided postgresql.conf is used by neon_local and the python tests)
+        if let Some(conf) = &spec.cluster.postgresql_conf {
+            let conf_lines = conf.split('\n').collect::<Vec<&str>>();
+            let mut shared_preload_libraries_line = "";
+            for line in conf_lines {
+                if line.starts_with("shared_preload_libraries") {
+                    shared_preload_libraries_line = line;
+                }
+            }
+            let mut preload_libs_vec = Vec::new();
+            if let Some(libs) = shared_preload_libraries_line.split("='").nth(1) {
+                preload_libs_vec = libs
+                    .split(&[',', '\'', ' '])
+                    .filter(|s| *s != "neon" && !s.is_empty())
+                    .map(str::to_string)
+                    .collect();
+            }
+            libs_vec.extend(preload_libs_vec);
+        }
+
+        info!("Download ext_index.json, find the extension paths");
+        let prep_ext_start_time = Utc::now();
+        self.prepare_external_extensions(compute_state).await?;
+        let prep_ext_time_delta = Utc::now()
+            .signed_duration_since(prep_ext_start_time)
+            .to_std()
+            .unwrap()
+            .as_millis() as u64;
+        info!("Prepare extensions took {prep_ext_time_delta}ms");
+
+        info!("Downloading to shared preload libraries: {:?}", &libs_vec);
+        let mut download_tasks = Vec::new();
+        for library in &libs_vec {
+            download_tasks.push(self.download_extension(library, true));
+        }
+        let results = join_all(download_tasks).await;
+
+        let mut remote_ext_metrics = RemoteExtensionMetrics {
+            num_ext_downloaded: 0,
+            largest_ext_size: 0,
+            total_ext_download_size: 0,
+            prep_extensions_ms: prep_ext_time_delta,
+        };
+        for result in results {
+            let download_size = result?;
+            remote_ext_metrics.num_ext_downloaded += 1;
+            remote_ext_metrics.largest_ext_size =
+                std::cmp::max(remote_ext_metrics.largest_ext_size, download_size);
+            remote_ext_metrics.total_ext_download_size += download_size;
+        }
+        Ok(remote_ext_metrics)
+    }
 }
--- a/compute_tools/src/config.rs
+++ b/compute_tools/src/config.rs
@@ -33,7 +33,11 @@ pub fn line_in_file(path: &Path, line: &str) -> Result<bool> {
 }

 /// Create or completely rewrite configuration file specified by `path`
-pub fn write_postgres_conf(path: &Path, spec: &ComputeSpec) -> Result<()> {
+pub fn write_postgres_conf(
+    path: &Path,
+    spec: &ComputeSpec,
+    extension_server_port: Option<u16>,
+) -> Result<()> {
    // File::create() destroys the file content if it exists.
    let mut file = File::create(path)?;

@@ -87,5 +91,9 @@ pub fn write_postgres_conf(path: &Path, spec: &ComputeSpec) -> Result<()> {
        writeln!(file, "# Managed by compute_ctl: end")?;
    }

+    if let Some(port) = extension_server_port {
+        writeln!(file, "neon.extension_server_port={}", port)?;
+    }
+
    Ok(())
 }
--- a/compute_tools/src/configurator.rs
+++ b/compute_tools/src/configurator.rs
@@ -8,46 +8,46 @@ use compute_api::responses::ComputeStatus;
 use crate::compute::ComputeNode;

 #[instrument(skip_all)]
-fn configurator_main_loop(compute: Arc<ComputeNode>) {
+fn configurator_main_loop(compute: &Arc<ComputeNode>) {
    info!("waiting for reconfiguration requests");
    loop {
        let state = compute.state.lock().unwrap();
        let mut state = compute.state_changed.wait(state).unwrap();

-        match state.status {
-            ComputeStatus::ConfigurationPending => {
-                info!("got configuration request");
-                state.status = ComputeStatus::Configuration;
-                compute.state_changed.notify_all();
-                drop(state);
+        if state.status == ComputeStatus::ConfigurationPending {
+            info!("got configuration request");
+            state.status = ComputeStatus::Configuration;
+            compute.state_changed.notify_all();
+            drop(state);

-                let new_status = if let Err(e) = compute.reconfigure() {
-                    error!("could not configure compute node: {}", e);
-                    ComputeStatus::Failed
-                } else {
-                    info!("compute node configured");
-                    ComputeStatus::Running
-                };
-
-                // XXX: used to test that API is blocking
-                // std::thread::sleep(std::time::Duration::from_millis(10000));
-
-                compute.set_status(new_status);
+            let mut new_status = ComputeStatus::Failed;
+            if let Err(e) = compute.reconfigure() {
+                error!("could not configure compute node: {}", e);
+            } else {
+                new_status = ComputeStatus::Running;
+                info!("compute node configured");
            }
-            ComputeStatus::Failed => {
-                info!("compute node is in Failed state, exiting");
-                break;
-            }
-            _ => info!("woken up for compute status: {:?}, sleeping", state.status),
+
+            // XXX: used to test that API is blocking
+            // std::thread::sleep(std::time::Duration::from_millis(10000));
+
+            compute.set_status(new_status);
+        } else if state.status == ComputeStatus::Failed {
+            info!("compute node is now in Failed state, exiting");
+            break;
+        } else {
+            info!("woken up for compute status: {:?}, sleeping", state.status);
        }
    }
 }

-pub fn launch_configurator(compute: Arc<ComputeNode>) -> thread::JoinHandle<()> {
+pub fn launch_configurator(compute: &Arc<ComputeNode>) -> thread::JoinHandle<()> {
+    let compute = Arc::clone(compute);
+
    thread::Builder::new()
        .name("compute-configurator".into())
        .spawn(move || {
-            configurator_main_loop(compute);
+            configurator_main_loop(&compute);
            info!("configurator thread is exited");
        })
        .expect("cannot launch configurator thread")
--- a/compute_tools/src/extension_server.rs
+++ b/compute_tools/src/extension_server.rs
@@ -0,0 +1,275 @@
+// Download extension files from the extension store
+// and put them in the right place in the postgres directory (share / lib)
+/*
+The layout of the S3 bucket is as follows:
+5615610098 // this is an extension build number
+├── v14
+│   ├── extensions
+│   │   ├── anon.tar.zst
+│   │   └── embedding.tar.zst
+│   └── ext_index.json
+└── v15
+    ├── extensions
+    │   ├── anon.tar.zst
+    │   └── embedding.tar.zst
+    └── ext_index.json
+5615261079
+├── v14
+│   ├── extensions
+│   │   └── anon.tar.zst
+│   └── ext_index.json
+└── v15
+    ├── extensions
+    │   └── anon.tar.zst
+    └── ext_index.json
+5623261088
+├── v14
+│   ├── extensions
+│   │   └── embedding.tar.zst
+│   └── ext_index.json
+└── v15
+    ├── extensions
+    │   └── embedding.tar.zst
+    └── ext_index.json
+
+Note that build number cannot be part of prefix because we might need extensions
+from other build numbers.
+
+ext_index.json stores the control files and location of extension archives
+It also stores a list of public extensions and a library_index
+
+We don't need to duplicate extension.tar.zst files.
+We only need to upload a new one if it is updated.
+(Although currently we just upload every time anyways, hopefully will change
+this sometime)
+
+*access* is controlled by spec
+
+More specifically, here is an example ext_index.json
+{
+    "public_extensions": [
+        "anon",
+        "pg_buffercache"
+    ],
+    "library_index": {
+        "anon": "anon",
+        "pg_buffercache": "pg_buffercache"
+    },
+    "extension_data": {
+        "pg_buffercache": {
+            "control_data": {
+                "pg_buffercache.control": "# pg_buffercache extension \ncomment = 'examine the shared buffer cache' \ndefault_version = '1.3' \nmodule_pathname = '$libdir/pg_buffercache' \nrelocatable = true \ntrusted=true"
+            },
+            "archive_path": "5670669815/v14/extensions/pg_buffercache.tar.zst"
+        },
+        "anon": {
+            "control_data": {
+                "anon.control": "# PostgreSQL Anonymizer (anon) extension \ncomment = 'Data anonymization tools' \ndefault_version = '1.1.0' \ndirectory='extension/anon' \nrelocatable = false \nrequires = 'pgcrypto' \nsuperuser = false \nmodule_pathname = '$libdir/anon' \ntrusted = true \n"
+            },
+            "archive_path": "5670669815/v14/extensions/anon.tar.zst"
+        }
+    }
+}
+*/
+use anyhow::Context;
+use anyhow::{self, Result};
+use futures::future::join_all;
+use remote_storage::*;
+use serde_json;
+use std::collections::HashMap;
+use std::io::Read;
+use std::num::{NonZeroU32, NonZeroUsize};
+use std::path::Path;
+use std::str;
+use tar::Archive;
+use tokio::io::AsyncReadExt;
+use tracing::info;
+use tracing::log::warn;
+use zstd::stream::read::Decoder;
+
+fn get_pg_config(argument: &str, pgbin: &str) -> String {
+    // Runs the `pg_config` binary found next to `pgbin` with a single flag
+    // such as `--version` or `--sharedir` and returns its trimmed stdout.
+    //
+    // Panics if `pgbin` does not end in "postgres", if `pg_config` cannot be
+    // run, or if its stdout is not valid UTF-8.
+
+    let bin_dir = pgbin.strip_suffix("postgres").expect("bad pgbin");
+    let pgconfig = format!("{bin_dir}/pg_config");
+    // the exit status is not inspected, only stdout is used
+    let out = std::process::Command::new(pgconfig)
+        .arg(argument)
+        .output()
+        .expect("pg_config error");
+    let stdout = std::str::from_utf8(&out.stdout).expect("pg_config error");
+    stdout.trim().to_string()
+}
+
+pub fn get_pg_version(pgbin: &str) -> String {
+    // `pg_config --version` prints e.g. "PostgreSQL 15.4"; match only the major
+    // version, since a substring search misreads "PostgreSQL 14.15" as v15.
+    let human_version = get_pg_config("--version", pgbin);
+    let mut digit_runs = human_version.split(|c: char| !c.is_ascii_digit());
+    match digit_runs.find(|run| !run.is_empty()) {
+        Some("15") => "v15".to_string(),
+        Some("14") => "v14".to_string(),
+        _ => panic!("Unsuported postgres version {human_version}"),
+    }
+}
+
+/// Downloads `{build_tag}/{pg_version}/ext_index.json` and writes the control files of
+/// every public and custom extension into the local `<sharedir>/extension` directory.
+/// Returns `(extension name -> remote archive path, library name -> extension name)`;
+/// fails, instead of panicking, when an enabled extension is missing from the index.
+pub async fn get_available_extensions(
+    remote_storage: &GenericRemoteStorage,
+    pgbin: &str,
+    pg_version: &str,
+    custom_extensions: &[String],
+    build_tag: &str,
+) -> Result<(HashMap<String, RemotePath>, HashMap<String, String>)> {
+    let local_sharedir = Path::new(&get_pg_config("--sharedir", pgbin)).join("extension");
+    let index_path = format!("{build_tag}/{pg_version}/ext_index.json");
+    let index_path = RemotePath::new(Path::new(&index_path)).context("error forming path")?;
+    info!("download ext_index.json from: {:?}", &index_path);
+
+    let mut download = remote_storage.download(&index_path).await?;
+    let mut ext_idx_buffer = Vec::new();
+    download
+        .download_stream
+        .read_to_end(&mut ext_idx_buffer)
+        .await?;
+    info!("ext_index downloaded");
+
+    #[derive(Debug, serde::Deserialize)]
+    struct Index {
+        public_extensions: Vec<String>,
+        library_index: HashMap<String, String>,
+        extension_data: HashMap<String, ExtensionData>,
+    }
+
+    #[derive(Debug, serde::Deserialize)]
+    struct ExtensionData {
+        control_data: HashMap<String, String>,
+        archive_path: String,
+    }
+
+    let ext_index_full = serde_json::from_slice::<Index>(&ext_idx_buffer)?;
+    let mut enabled_extensions = ext_index_full.public_extensions;
+    enabled_extensions.extend_from_slice(custom_extensions);
+    let library_index = ext_index_full.library_index;
+    let all_extension_data = ext_index_full.extension_data;
+    info!("library_index: {:?}", library_index);
+
+    info!("enabled_extensions: {:?}", enabled_extensions);
+    let mut ext_remote_paths = HashMap::new();
+    let mut file_create_tasks = Vec::new();
+    for extension in enabled_extensions {
+        let ext_data = all_extension_data
+            .get(&extension)
+            .with_context(|| format!("extension {extension:?} missing from ext_index.json"))?;
+        for (control_file, control_contents) in &ext_data.control_data {
+            let extension_name = control_file
+                .strip_suffix(".control")
+                .context("control files must end in .control")?;
+            let archive_path = RemotePath::from_string(&ext_data.archive_path)?;
+            ext_remote_paths.insert(extension_name.to_string(), archive_path);
+            let control_path = local_sharedir.join(control_file);
+            info!("writing file {:?}{:?}", control_path, control_contents);
+            file_create_tasks.push(tokio::fs::write(control_path, control_contents));
+        }
+    }
+    for result in join_all(file_create_tasks).await {
+        result?;
+    }
+    info!("ext_remote_paths {:?}", ext_remote_paths);
+    Ok((ext_remote_paths, library_index))
+}
+
+/// Downloads the archive at `ext_path`, decompresses (zstd) and unpacks (tar) it, moves its
+/// share/extension and lib files into the local pg dirs; returns the downloaded size in bytes.
+pub async fn download_extension(
+    ext_name: &str,
+    ext_path: &RemotePath,
+    remote_storage: &GenericRemoteStorage,
+    pgbin: &str,
+) -> Result<u64> {
+    info!("Download extension {:?} from {:?}", ext_name, ext_path);
+    let mut download = remote_storage.download(ext_path).await?;
+    let mut download_buffer = Vec::new();
+    download
+        .download_stream
+        .read_to_end(&mut download_buffer)
+        .await?;
+    let download_size = download_buffer.len() as u64;
+    // The whole archive is decompressed into memory before unpacking; it's unclear whether
+    // that is more performant. TODO: decompressing into memory can be avoided
+    let mut decoder = Decoder::new(download_buffer.as_slice())?;
+    let mut decompress_buffer = Vec::new();
+    decoder.read_to_end(&mut decompress_buffer)?;
+    let mut archive = Archive::new(decompress_buffer.as_slice());
+    let unzip_dest = pgbin
+        .strip_suffix("/bin/postgres")
+        .expect("bad pgbin")
+        .to_string()
+        + "/download_extensions";
+    archive.unpack(&unzip_dest)?;
+    info!("Download + unzip {:?} completed successfully", &ext_path);
+
+    let sharedir_paths = (
+        unzip_dest.to_string() + "/share/extension",
+        Path::new(&get_pg_config("--sharedir", pgbin)).join("extension"),
+    );
+    let libdir_paths = (
+        unzip_dest.to_string() + "/lib",
+        Path::new(&get_pg_config("--libdir", pgbin)).join("postgresql"),
+    );
+    // move the unpacked share/extension and lib contents to the paths reported by pg_config
+    for paths in [sharedir_paths, libdir_paths] {
+        let (zip_dir, real_dir) = paths;
+        info!("mv {zip_dir:?}/*  {real_dir:?}");
+        for file in std::fs::read_dir(zip_dir)? {
+            let old_file = file?.path();
+            let new_file =
+                Path::new(&real_dir).join(old_file.file_name().context("error parsing file")?);
+            info!("moving {old_file:?} to {new_file:?}");
+
+            // Best effort: a failed rename (seen: "Directory not empty (os error 39)") is only logged.
+            match std::fs::rename(old_file, new_file) {
+                Ok(()) => info!("move succeeded"),
+                Err(e) => {
+                    warn!("move failed, probably because the extension already exists: {e}")
+                }
+            }
+        }
+    }
+    info!("done moving extension {ext_name}");
+    Ok(download_size)
+}
+
+/// Builds an S3-backed `GenericRemoteStorage` from the JSON text in `remote_ext_config`.
+pub fn init_remote_storage(remote_ext_config: &str) -> anyhow::Result<GenericRemoteStorage> {
+    #[derive(Debug, serde::Deserialize)]
+    struct RemoteExtJson {
+        bucket: String,
+        region: String,
+        endpoint: Option<String>,
+        prefix: Option<String>,
+    }
+    let parsed: RemoteExtJson = serde_json::from_str(remote_ext_config)?;
+    let hundred = NonZeroUsize::new(100).expect("100 != 0");
+    let s3_config = S3Config {
+        bucket_name: parsed.bucket,
+        bucket_region: parsed.region,
+        prefix_in_bucket: parsed.prefix,
+        endpoint: parsed.endpoint,
+        concurrency_limit: hundred,
+        max_keys_per_list_response: None,
+    };
+    let storage_config = RemoteStorageConfig {
+        max_concurrent_syncs: hundred,
+        max_sync_errors: NonZeroU32::new(100).expect("100 != 0"),
+        storage: RemoteStorageKind::AwsS3(s3_config),
+    };
+    GenericRemoteStorage::from_config(&storage_config)
+}
--- a/compute_tools/src/http/api.rs
+++ b/compute_tools/src/http/api.rs
@@ -34,7 +34,7 @@ fn status_response_from_state(state: &ComputeState) -> ComputeStatusResponse {
 }

 // Service function to handle all available routes.
-async fn routes(req: Request<Body>, compute: Arc<ComputeNode>) -> Response<Body> {
+async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body> {
    //
    // NOTE: The URI path is currently included in traces. That's OK because
    // it doesn't contain any variable parts or sensitive information. But
@@ -121,6 +121,37 @@ async fn routes(req: Request<Body>, compute: Arc<ComputeNode>) -> Response<Body>
            }
        }

+        // download extension files from S3 on demand
+        (&Method::POST, route) if route.starts_with("/extension_server/") => {
+            info!("serving {:?} POST request", route);
+            info!("req.uri {:?}", req.uri());
+
+            let mut is_library = false;
+            if let Some(params) = req.uri().query() {
+                info!("serving {:?} POST request with params: {}", route, params);
+                if params == "is_library=true" {
+                    is_library = true;
+                } else {
+                    let mut resp = Response::new(Body::from("Wrong request parameters"));
+                    *resp.status_mut() = StatusCode::BAD_REQUEST;
+                    return resp;
+                }
+            }
+
+            let filename = route.split('/').last().unwrap().to_string();
+            info!("serving /extension_server POST request, filename: {filename:?} is_library: {is_library}");
+
+            match compute.download_extension(&filename, is_library).await {
+                Ok(_) => Response::new(Body::from("OK")),
+                Err(e) => {
+                    error!("extension download failed: {}", e);
+                    let mut resp = Response::new(Body::from(e.to_string()));
+                    *resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
+                    resp
+                }
+            }
+        }
+
        // Return the `404 Not Found` for any other routes.
        _ => {
            let mut not_found = Response::new(Body::from("404 Not Found"));
@@ -132,7 +163,7 @@ async fn routes(req: Request<Body>, compute: Arc<ComputeNode>) -> Response<Body>

 async fn handle_configure_request(
    req: Request<Body>,
-    compute: Arc<ComputeNode>,
+    compute: &Arc<ComputeNode>,
 ) -> Result<String, (String, StatusCode)> {
    if !compute.live_config_allowed {
        return Err((
@@ -142,7 +173,8 @@ async fn handle_configure_request(
    }

    let body_bytes = hyper::body::to_bytes(req.into_body()).await.unwrap();
-    if let Ok(request) = serde_json::from_slice::<ConfigurationRequest>(&body_bytes) {
+    let spec_raw = String::from_utf8(body_bytes.to_vec()).unwrap();
+    if let Ok(request) = serde_json::from_str::<ConfigurationRequest>(&spec_raw) {
        let spec = request.spec;

        let parsed_spec = match ParsedSpec::try_from(spec) {
@@ -176,29 +208,27 @@ async fn handle_configure_request(
        // This is needed to do not block the main pool of workers and
        // be able to serve other requests while some particular request
        // is waiting for compute to finish configuration.
-        {
-            let compute = compute.clone();
-            task::spawn_blocking(move || {
-                let mut state = compute.state.lock().unwrap();
-                while state.status != ComputeStatus::Running {
-                    state = compute.state_changed.wait(state).unwrap();
-                    info!(
-                        "waiting for compute to become Running, current status: {:?}",
-                        state.status
-                    );
+        let c = compute.clone();
+        task::spawn_blocking(move || {
+            let mut state = c.state.lock().unwrap();
+            while state.status != ComputeStatus::Running {
+                state = c.state_changed.wait(state).unwrap();
+                info!(
+                    "waiting for compute to become Running, current status: {:?}",
+                    state.status
+                );

-                    if state.status == ComputeStatus::Failed {
-                        let err = state.error.as_ref().map_or("unknown error", |x| x);
-                        let msg = format!("compute configuration failed: {:?}", err);
-                        return Err((msg, StatusCode::INTERNAL_SERVER_ERROR));
-                    }
+                if state.status == ComputeStatus::Failed {
+                    let err = state.error.as_ref().map_or("unknown error", |x| x);
+                    let msg = format!("compute configuration failed: {:?}", err);
+                    return Err((msg, StatusCode::INTERNAL_SERVER_ERROR));
                }
+            }

-                Ok(())
-            })
-            .await
-            .unwrap()?;
-        }
+            Ok(())
+        })
+        .await
+        .unwrap()?;

        // Return current compute state if everything went well.
        let state = compute.state.lock().unwrap().clone();
@@ -236,7 +266,7 @@ async fn serve(port: u16, state: Arc<ComputeNode>) {
                        // information in this API.
                        tracing_utils::http::tracing_handler(
                            req,
-                            |req| routes(req, state.clone()),
+                            |req| routes(req, &state),
                            OtelName::UriPath,
                        )
                        .await,
@@ -257,7 +287,9 @@ async fn serve(port: u16, state: Arc<ComputeNode>) {
 }

 /// Launch a separate Hyper HTTP API server thread and return its `JoinHandle`.
-pub fn launch_http_server(port: u16, state: Arc<ComputeNode>) -> Result<thread::JoinHandle<()>> {
+pub fn launch_http_server(port: u16, state: &Arc<ComputeNode>) -> Result<thread::JoinHandle<()>> {
+    let state = Arc::clone(state);
+
    Ok(thread::Builder::new()
        .name("http-endpoint".into())
        .spawn(move || serve(port, state))?)
--- a/compute_tools/src/http/openapi_spec.yaml
+++ b/compute_tools/src/http/openapi_spec.yaml
@@ -139,6 +139,34 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/GenericError"
+  /extension_server:
+    post:
+      tags:
+      - Extension
+      summary: Download extension from S3 to local folder.
+      description: "Downloads the file named by the last path segment; the only accepted query string is `is_library=true`."
+      operationId: downloadExtension
+      responses:
+        200:
+          description: Extension downloaded
+          content:
+            text/plain:
+              schema:
+                type: string
+                description: Error text or 'OK' if download succeeded.
+                example: "OK"
+        400:
+          description: Request is invalid.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/GenericError"
+        500:
+          description: Extension download request failed.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/GenericError"

 components:
  securitySchemes:
--- a/compute_tools/src/lib.rs
+++ b/compute_tools/src/lib.rs
@@ -9,6 +9,7 @@ pub mod http;
 #[macro_use]
 pub mod logger;
 pub mod compute;
+pub mod extension_server;
 pub mod monitor;
 pub mod params;
 pub mod pg_helpers;
--- a/compute_tools/src/monitor.rs
+++ b/compute_tools/src/monitor.rs
@@ -12,7 +12,7 @@ const MONITOR_CHECK_INTERVAL: u64 = 500; // milliseconds
 // Spin in a loop and figure out the last activity time in the Postgres.
 // Then update it in the shared state. This function never errors out.
 // XXX: the only expected panic is at `RwLock` unwrap().
-fn watch_compute_activity(compute: Arc<ComputeNode>) {
+fn watch_compute_activity(compute: &ComputeNode) {
    // Suppose that `connstr` doesn't change
    let connstr = compute.connstr.as_str();
    // Define `client` outside of the loop to reuse existing connection if it's active.
@@ -104,9 +104,11 @@ fn watch_compute_activity(compute: Arc<ComputeNode>) {
 }

 /// Launch a separate compute monitor thread and return its `JoinHandle`.
-pub fn launch_monitor(state: Arc<ComputeNode>) -> thread::JoinHandle<()> {
+pub fn launch_monitor(state: &Arc<ComputeNode>) -> thread::JoinHandle<()> {
+    let state = Arc::clone(state);
+
    thread::Builder::new()
        .name("compute-monitor".into())
-        .spawn(move || watch_compute_activity(state))
+        .spawn(move || watch_compute_activity(&state))
        .expect("cannot launch compute monitor thread")
 }
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -124,7 +124,7 @@ pub fn get_spec_from_control_plane(
 pub fn handle_configuration(spec: &ComputeSpec, pgdata_path: &Path) -> Result<()> {
    // File `postgresql.conf` is no longer included into `basebackup`, so just
    // always write all config into it creating new file.
-    config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec)?;
+    config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec, None)?;

    update_pg_hba(pgdata_path)?;

--- a/control_plane/Cargo.toml
+++ b/control_plane/Cargo.toml
@@ -32,3 +32,4 @@ utils.workspace = true

 compute_api.workspace = true
 workspace_hack.workspace = true
+tracing.workspace = true
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -658,6 +658,8 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                .get_one::<String>("endpoint_id")
                .ok_or_else(|| anyhow!("No endpoint ID was provided to start"))?;

+            let remote_ext_config = sub_args.get_one::<String>("remote-ext-config");
+
            // If --safekeepers argument is given, use only the listed safekeeper nodes.
            let safekeepers =
                if let Some(safekeepers_str) = sub_args.get_one::<String>("safekeepers") {
@@ -699,7 +701,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                    _ => {}
                }
                println!("Starting existing endpoint {endpoint_id}...");
-                endpoint.start(&auth_token, safekeepers)?;
+                endpoint.start(&auth_token, safekeepers, remote_ext_config)?;
            } else {
                let branch_name = sub_args
                    .get_one::<String>("branch-name")
@@ -743,7 +745,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                    pg_version,
                    mode,
                )?;
-                ep.start(&auth_token, safekeepers)?;
+                ep.start(&auth_token, safekeepers, remote_ext_config)?;
            }
        }
        "stop" => {
@@ -1003,6 +1005,12 @@ fn cli() -> Command {
        .help("Additional pageserver's configuration options or overrides, refer to pageserver's 'config-override' CLI parameter docs for more")
        .required(false);

+    let remote_ext_config_args = Arg::new("remote-ext-config")
+        .long("remote-ext-config")
+        .num_args(1)
+        .help("Configure the S3 bucket that we search for extensions in.")
+        .required(false);
+
    let lsn_arg = Arg::new("lsn")
        .long("lsn")
        .help("Specify Lsn on the timeline to start from. By default, end of the timeline would be used.")
@@ -1161,6 +1169,7 @@ fn cli() -> Command {
                    .arg(pg_version_arg)
                    .arg(hot_standby_arg)
                    .arg(safekeepers_arg)
+                    .arg(remote_ext_config_args)
                )
                .subcommand(
                    Command::new("stop")
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -313,7 +313,7 @@ impl Endpoint {

                // TODO: use future host field from safekeeper spec
                // Pass the list of safekeepers to the replica so that it can connect to any of them,
-                // whichever is availiable.
+                // whichever is available.
                let sk_ports = self
                    .env
                    .safekeepers
@@ -420,7 +420,12 @@ impl Endpoint {
        Ok(())
    }

-    pub fn start(&self, auth_token: &Option<String>, safekeepers: Vec<NodeId>) -> Result<()> {
+    pub fn start(
+        &self,
+        auth_token: &Option<String>,
+        safekeepers: Vec<NodeId>,
+        remote_ext_config: Option<&String>,
+    ) -> Result<()> {
        if self.status() == "running" {
            anyhow::bail!("The endpoint is already running");
        }
@@ -488,6 +493,7 @@ impl Endpoint {
            pageserver_connstring: Some(pageserver_connstring),
            safekeeper_connstrings,
            storage_auth_token: auth_token.clone(),
+            custom_extensions: Some(vec![]),
        };
        let spec_path = self.endpoint_path().join("spec.json");
        std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
@@ -519,6 +525,11 @@ impl Endpoint {
            .stdin(std::process::Stdio::null())
            .stderr(logfile.try_clone()?)
            .stdout(logfile);
+
+        if let Some(remote_ext_config) = remote_ext_config {
+            cmd.args(["--remote-ext-config", remote_ext_config]);
+        }
+
        let child = cmd.spawn()?;

        // Write down the pid so we can wait for it when we want to stop
@@ -564,9 +575,7 @@ impl Endpoint {
                }
                Err(e) => {
                    if attempt == MAX_ATTEMPTS {
-                        return Err(e).context(
-                            "timed out waiting to connect to compute_ctl HTTP; last error: {e}",
-                        );
+                        return Err(e).context("timed out waiting to connect to compute_ctl HTTP");
                    }
                }
            }
--- a/docs/rfcs/024-extension-loading.md
+++ b/docs/rfcs/024-extension-loading.md
@@ -0,0 +1,236 @@
+# Supporting custom user Extensions (Dynamic Extension Loading)
+Created 2023-05-03
+
+## Motivation
+
+There are many extensions in the PostgreSQL ecosystem, and not all extensions
+are of a quality that we can confidently support them. Additionally, our
+current extension inclusion mechanism has several problems because we build all
+extensions into the primary Compute image: We build the extensions every time
+we build the compute image regardless of whether we actually need to rebuild
+the image, and the inclusion of these extensions in the image adds a hard
+dependency on all supported extensions - thus increasing the image size, and
+with it the time it takes to download that image - increasing first start
+latency.
+
+This RFC proposes a dynamic loading mechanism that solves most of these
+problems.
+
+## Summary
+
+`compute_ctl` is made responsible for loading extensions on-demand into
+the container's file system for dynamically loaded extensions, and will also
+make sure that the extensions in `shared_preload_libraries` are downloaded
+before the compute node starts.
+
+## Components
+
+compute_ctl, PostgreSQL, neon (extension), Compute Host Node, Extension Store
+
+## Requirements
+
+Compute nodes with no extra extensions should not be negatively impacted by
+the existence of support for many extensions.
+
+Installing an extension into PostgreSQL should be easy.
+
+Non-preloaded extensions shouldn't impact startup latency.
+
+Uninstalled extensions shouldn't impact query latency.
+
+A small latency penalty for dynamically loaded extensions is acceptable in
+the first seconds of compute startup, but not in steady-state operations.
+
+## Proposed implementation
+
+### On-demand, JIT-loading of extensions
+
+Before postgres starts we download:
+- control files for all extensions available to that compute node;
+- all `shared_preload_libraries`;
+
+After postgres is running, `compute_ctl` listens for requests to load files.
+When PostgreSQL requests a file, `compute_ctl` downloads it.
+
+PostgreSQL requests files in the following cases:
+- When loading a preload library set in `local_preload_libraries`
+- When explicitly loading a library with `LOAD`
+- When creating an extension with `CREATE EXTENSION` (download SQL scripts, (optional) extension data files and (optional) library files)
+
+
+#### Summary
+
+Pros:
+ - Startup is only as slow as it takes to load all (shared_)preload_libraries
+ - Supports BYO Extension
+
+Cons:
+ - O(sizeof(extensions)) IO requirement for loading all extensions.
+
+### Alternative solutions
+
+1. Allow users to add their extensions to the base image
+   
+   Pros:
+    - Easy to deploy
+
+   Cons:
+    - Doesn't scale - first start size is dependent on image size;
+    - All extensions are shared across all users: It doesn't allow users to
+      bring their own restrictive-licensed extensions
+
+2. Bring Your Own compute image
+   
+   Pros:
+    - Still easy to deploy
+    - User can bring own patched version of PostgreSQL
+
+   Cons:
+    - First start latency is O(sizeof(extensions image))
+    - Warm instance pool for skipping pod schedule latency is not feasible with
+      O(n) custom images
+    - Support channels are difficult to manage
+
+3. Download all user extensions in bulk on compute start
+   
+   Pros:
+    - Easy to deploy
+    - No startup latency issues for "clean" users.
+    - Warm instance pool for skipping pod schedule latency is possible
+
+   Cons:
+    - Downloading all extensions in advance takes a lot of time, thus startup
+      latency issues
+
+4. Store user's extensions in persistent storage
+   
+   Pros:
+    - Easy to deploy
+    - No startup latency issues
+    - Warm instance pool for skipping pod schedule latency is possible
+
+   Cons:
+    - EC2 instances have only limited number of attachments shared between EBS
+      volumes, direct-attached NVMe drives, and ENIs.
+    - Compute instance migration isn't trivially solved for EBS mounts (e.g.
+      the device is unavailable whilst moving the mount between instances).
+    - EBS can only mount on one instance at a time (except the expensive IO2
+      device type).
+
+5. Store user's extensions in network drive
+   
+   Pros:
+    - Easy to deploy
+    - Few startup latency issues
+    - Warm instance pool for skipping pod schedule latency is possible
+
+   Cons:
+    - We'd need networked drives, and a lot of them, which would store many
+      duplicate extensions.
+    - **UNCHECKED:** Compute instance migration may not work nicely with
+      networked IOs
+
+
+### Idea extensions
+
+The extension store does not have to be S3 directly, but could be a Node-local
+caching service on top of S3. This would reduce the load on the network for
+popular extensions.
+
+## Extension Storage implementation
+
+The layout of the S3 bucket is as follows:
+```
+5615610098 // this is an extension build number
+├── v14
+│   ├── extensions
+│   │   ├── anon.tar.zst
+│   │   └── embedding.tar.zst
+│   └── ext_index.json
+└── v15
+    ├── extensions
+    │   ├── anon.tar.zst
+    │   └── embedding.tar.zst
+    └── ext_index.json
+5615261079
+├── v14
+│   ├── extensions
+│   │   └── anon.tar.zst
+│   └── ext_index.json
+└── v15
+    ├── extensions
+    │   └── anon.tar.zst
+    └── ext_index.json
+5623261088
+├── v14
+│   ├── extensions
+│   │   └── embedding.tar.zst
+│   └── ext_index.json
+└── v15
+    ├── extensions
+    │   └── embedding.tar.zst
+    └── ext_index.json
+```
+
+Note that build number cannot be part of prefix because we might need extensions
+from other build numbers.
+
+`ext_index.json` stores the control files and the location of the extension archives.
+It also stores a list of public extensions and a `library_index`.
+
+We don't need to duplicate `extension.tar.zst` files.
+We only need to upload a new one if it is updated.
+(Although currently we just upload every time anyways, hopefully will change
+this sometime)
+
+*access* is controlled by spec
+
+More specifically, here is an example ext_index.json
+```
+{
+    "public_extensions": [
+        "anon",
+        "pg_buffercache"
+    ],
+    "library_index": {
+        "anon": "anon",
+        "pg_buffercache": "pg_buffercache"
+        // for more complex extensions like postgis
+        // we might have something like:
+        // address_standardizer: postgis
+        // postgis_tiger: postgis
+    },
+    "extension_data": {
+        "pg_buffercache": {
+            "control_data": {
+                "pg_buffercache.control": "# pg_buffercache extension \ncomment = 'examine the shared buffer cache' \ndefault_version = '1.3' \nmodule_pathname = '$libdir/pg_buffercache' \nrelocatable = true \ntrusted=true"
+            },
+            "archive_path": "5670669815/v14/extensions/pg_buffercache.tar.zst"
+        },
+        "anon": {
+            "control_data": {
+                "anon.control": "# PostgreSQL Anonymizer (anon) extension \ncomment = 'Data anonymization tools' \ndefault_version = '1.1.0' \ndirectory='extension/anon' \nrelocatable = false \nrequires = 'pgcrypto' \nsuperuser = false \nmodule_pathname = '$libdir/anon' \ntrusted = true \n"
+            },
+            "archive_path": "5670669815/v14/extensions/anon.tar.zst"
+        }
+    }
+}
+```
+
+### How to add new extension to the Extension Storage?
+
+Simply upload build artifacts to the S3 bucket.
+Implement a CI step for that, separate from the compute-node-image build.
+
+### How do we deal with extension versions and updates?
+
+Currently, we rebuild extensions on every compute-node-image build and store them in the <build-version> prefix.
+This is needed to ensure that `/share` and `/lib` files are in sync.
+
+For extension updates, we rely on the PostgreSQL extension versioning mechanism (sql update scripts) and extension authors to not break backwards compatibility within one major version of PostgreSQL.
+
+### Alternatives
+
+For extensions written in trusted languages we can also adopt
+`dbdev` PostgreSQL Package Manager based on `pg_tle` by Supabase.
+This will increase the number of supported extensions and decrease the amount of work required to support them.
--- a/libs/compute_api/src/responses.rs
+++ b/libs/compute_api/src/responses.rs
@@ -76,6 +76,11 @@ pub struct ComputeMetrics {
    pub start_postgres_ms: u64,
    pub config_ms: u64,
    pub total_startup_ms: u64,
+    pub load_ext_ms: u64,
+    pub num_ext_downloaded: u64,
+    pub largest_ext_size: u64, // these are measured in bytes
+    pub total_ext_download_size: u64,
+    pub prep_extensions_ms: u64,
 }

 /// Response of the `/computes/{compute_id}/spec` control-plane API.
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -60,6 +60,9 @@ pub struct ComputeSpec {
    /// If set, 'storage_auth_token' is used as the password to authenticate to
    /// the pageserver and safekeepers.
    pub storage_auth_token: Option<String>,
+
+    // list of prefixes to search for custom extensions in remote extension storage
+    pub custom_extensions: Option<Vec<String>>,
 }

 #[serde_as]
--- a/libs/consumption_metrics/src/lib.rs
+++ b/libs/consumption_metrics/src/lib.rs
@@ -5,7 +5,7 @@ use chrono::{DateTime, Utc};
 use rand::Rng;
 use serde::Serialize;

-#[derive(Serialize, Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
+#[derive(Serialize, Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)]
 #[serde(tag = "type")]
 pub enum EventType {
    #[serde(rename = "absolute")]
@@ -17,6 +17,32 @@ pub enum EventType {
    },
 }

+impl EventType {
+    pub fn absolute_time(&self) -> Option<&DateTime<Utc>> {
+        use EventType::*;
+        match self {
+            Absolute { time } => Some(time),
+            _ => None,
+        }
+    }
+
+    pub fn incremental_timerange(&self) -> Option<std::ops::Range<&DateTime<Utc>>> {
+        // these can most likely be thought of as Range or RangeFull
+        use EventType::*;
+        match self {
+            Incremental {
+                start_time,
+                stop_time,
+            } => Some(start_time..stop_time),
+            _ => None,
+        }
+    }
+
+    pub fn is_incremental(&self) -> bool {
+        matches!(self, EventType::Incremental { .. })
+    }
+}
+
 #[derive(Serialize, Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
 pub struct Event<Extra> {
    #[serde(flatten)]
@@ -31,7 +57,7 @@ pub struct Event<Extra> {
    pub extra: Extra,
 }

-pub fn idempotency_key(node_id: String) -> String {
+pub fn idempotency_key(node_id: &str) -> String {
    format!(
        "{}-{}-{:04}",
        Utc::now(),
@@ -45,6 +71,6 @@ pub const CHUNK_SIZE: usize = 1000;
 // Just a wrapper around a slice of events
 // to serialize it as `{"events" : [ ] }
 #[derive(serde::Serialize)]
-pub struct EventChunk<'a, T> {
-    pub events: &'a [T],
+pub struct EventChunk<'a, T: Clone> {
+    pub events: std::borrow::Cow<'a, [T]>,
 }
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -65,6 +65,10 @@ impl RemotePath {
        Ok(Self(relative_path.to_path_buf()))
    }

+    pub fn from_string(relative_path: &str) -> anyhow::Result<Self> {
+        Self::new(Path::new(relative_path))
+    }
+
    pub fn with_base(&self, base_path: &Path) -> PathBuf {
        base_path.join(&self.0)
    }
@@ -190,6 +194,20 @@ pub enum GenericRemoteStorage {
 }

 impl GenericRemoteStorage {
+    // A function for listing all the files in a "directory"
+    // Example:
+    // list_files("foo/bar") = ["foo/bar/a.txt", "foo/bar/b.txt"]
+    pub async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
+        match self {
+            Self::LocalFs(s) => s.list_files(folder).await,
+            Self::AwsS3(s) => s.list_files(folder).await,
+            Self::Unreliable(s) => s.list_files(folder).await,
+        }
+    }
+
+    // lists the common *prefixes* of the files, if any
+    // Example:
+    // list_prefixes("foo123","foo567","bar123","bar432") = ["foo", "bar"]
    pub async fn list_prefixes(
        &self,
        prefix: Option<&RemotePath>,
@@ -201,14 +219,6 @@ impl GenericRemoteStorage {
        }
    }

-    pub async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
-        match self {
-            Self::LocalFs(s) => s.list_files(folder).await,
-            Self::AwsS3(s) => s.list_files(folder).await,
-            Self::Unreliable(s) => s.list_files(folder).await,
-        }
-    }
-
    pub async fn upload(
        &self,
        from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -200,13 +200,17 @@ impl S3Bucket {
        )
    }

-    fn relative_path_to_s3_object(&self, path: &RemotePath) -> String {
-        let mut full_path = self.prefix_in_bucket.clone().unwrap_or_default();
-        for segment in path.0.iter() {
-            full_path.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
-            full_path.push_str(segment.to_str().unwrap_or_default());
+    pub fn relative_path_to_s3_object(&self, path: &RemotePath) -> String {
+        assert_eq!(std::path::MAIN_SEPARATOR, REMOTE_STORAGE_PREFIX_SEPARATOR);
+        let path_string = path
+            .get_path()
+            .to_string_lossy()
+            .trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR)
+            .to_string();
+        match &self.prefix_in_bucket {
+            Some(prefix) => prefix.clone() + "/" + &path_string,
+            None => path_string,
        }
-        full_path
    }

    async fn download_object(&self, request: GetObjectRequest) -> Result<Download, DownloadError> {
@@ -427,10 +431,12 @@ impl RemoteStorage for S3Bucket {
    }

    async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
+        // if prefix is not none then download file `prefix/from`
+        // if prefix is none then download file `from`
        self.download_object(GetObjectRequest {
            bucket: self.bucket_name.clone(),
            key: self.relative_path_to_s3_object(from),
-            ..GetObjectRequest::default()
+            range: None,
        })
        .await
    }
@@ -523,3 +529,63 @@ impl RemoteStorage for S3Bucket {
        Ok(())
    }
 }
+
+#[cfg(test)]
+mod tests {
+    use std::num::NonZeroUsize;
+    use std::path::Path;
+
+    use crate::{RemotePath, S3Bucket, S3Config};
+
+    #[test]
+    fn relative_path() {
+        let all_paths = vec!["", "some/path", "some/path/"];
+        let all_paths: Vec<RemotePath> = all_paths
+            .iter()
+            .map(|x| RemotePath::new(Path::new(x)).expect("bad path"))
+            .collect();
+        let prefixes = [
+            None,
+            Some(""),
+            Some("test/prefix"),
+            Some("test/prefix/"),
+            Some("/test/prefix/"),
+        ];
+        let expected_outputs = vec![
+            vec!["", "some/path", "some/path"],
+            vec!["/", "/some/path", "/some/path"],
+            vec![
+                "test/prefix/",
+                "test/prefix/some/path",
+                "test/prefix/some/path",
+            ],
+            vec![
+                "test/prefix/",
+                "test/prefix/some/path",
+                "test/prefix/some/path",
+            ],
+            vec![
+                "test/prefix/",
+                "test/prefix/some/path",
+                "test/prefix/some/path",
+            ],
+        ];
+
+        for (prefix_idx, prefix) in prefixes.iter().enumerate() {
+            let config = S3Config {
+                bucket_name: "bucket".to_owned(),
+                bucket_region: "region".to_owned(),
+                prefix_in_bucket: prefix.map(str::to_string),
+                endpoint: None,
+                concurrency_limit: NonZeroUsize::new(100).unwrap(),
+                max_keys_per_list_response: Some(5),
+            };
+            let storage = S3Bucket::new(&config).expect("remote storage init");
+            for (test_path_idx, test_path) in all_paths.iter().enumerate() {
+                let result = storage.relative_path_to_s3_object(test_path);
+                let expected = expected_outputs[prefix_idx][test_path_idx];
+                assert_eq!(result, expected);
+            }
+        }
+    }
+}
--- a/libs/remote_storage/tests/test_real_s3.rs
+++ b/libs/remote_storage/tests/test_real_s3.rs
@@ -19,7 +19,7 @@ static LOGGING_DONE: OnceCell<()> = OnceCell::new();

 const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_STORAGE";

-const BASE_PREFIX: &str = "test/";
+const BASE_PREFIX: &str = "test";

 /// Tests that S3 client can list all prefixes, even if the response come paginated and requires multiple S3 queries.
 /// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified.
--- a/pageserver/ctl/src/layers.rs
+++ b/pageserver/ctl/src/layers.rs
@@ -68,7 +68,7 @@ async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
            true
        },
    )?;
-    let mut cursor = BlockCursor::new(&file);
+    let cursor = BlockCursor::new(&file);
    for (k, v) in all {
        let value = cursor.read_blob(v.pos())?;
        println!("key:{} value_len:{}", k, value.len());
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -7,27 +7,23 @@ use crate::context::{DownloadBehavior, RequestContext};
 use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
 use crate::tenant::{mgr, LogicalSizeCalculationCause};
 use anyhow;
-use chrono::Utc;
+use chrono::{DateTime, Utc};
 use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE};
 use pageserver_api::models::TenantState;
 use reqwest::Url;
 use serde::Serialize;
 use serde_with::{serde_as, DisplayFromStr};
 use std::collections::HashMap;
-use std::time::Duration;
+use std::sync::Arc;
+use std::time::{Duration, SystemTime};
 use tracing::*;
 use utils::id::{NodeId, TenantId, TimelineId};
-
-const WRITTEN_SIZE: &str = "written_size";
-const SYNTHETIC_STORAGE_SIZE: &str = "synthetic_storage_size";
-const RESIDENT_SIZE: &str = "resident_size";
-const REMOTE_STORAGE_SIZE: &str = "remote_storage_size";
-const TIMELINE_LOGICAL_SIZE: &str = "timeline_logical_size";
+use utils::lsn::Lsn;

 const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60);

 #[serde_as]
-#[derive(Serialize, Debug)]
+#[derive(Serialize, Debug, Clone, Copy)]
 struct Ids {
    #[serde_as(as = "DisplayFromStr")]
    tenant_id: TenantId,
@@ -38,10 +34,142 @@ struct Ids {

 /// Key that uniquely identifies the object, this metric describes.
 #[derive(Debug, Clone, PartialEq, Eq, Hash)]
-pub struct PageserverConsumptionMetricsKey {
-    pub tenant_id: TenantId,
-    pub timeline_id: Option<TimelineId>,
-    pub metric: &'static str,
+struct MetricsKey {
+    tenant_id: TenantId,
+    timeline_id: Option<TimelineId>,
+    metric: &'static str,
+}
+
+impl MetricsKey {
+    const fn absolute_values(self) -> AbsoluteValueFactory {
+        AbsoluteValueFactory(self)
+    }
+    const fn incremental_values(self) -> IncrementalValueFactory {
+        IncrementalValueFactory(self)
+    }
+}
+
+/// Helper type which each individual metric kind can return to produce only absolute values.
+struct AbsoluteValueFactory(MetricsKey);
+
+impl AbsoluteValueFactory {
+    fn at(self, time: DateTime<Utc>, val: u64) -> (MetricsKey, (EventType, u64)) {
+        let key = self.0;
+        (key, (EventType::Absolute { time }, val))
+    }
+}
+
+/// Helper type which each individual metric kind can return to produce only incremental values.
+struct IncrementalValueFactory(MetricsKey);
+
+impl IncrementalValueFactory {
+    #[allow(clippy::wrong_self_convention)]
+    fn from_previous_up_to(
+        self,
+        prev_end: DateTime<Utc>,
+        up_to: DateTime<Utc>,
+        val: u64,
+    ) -> (MetricsKey, (EventType, u64)) {
+        let key = self.0;
+        // cannot assert prev_end < up_to because these are realtime clock based
+        (
+            key,
+            (
+                EventType::Incremental {
+                    start_time: prev_end,
+                    stop_time: up_to,
+                },
+                val,
+            ),
+        )
+    }
+
+    fn key(&self) -> &MetricsKey {
+        &self.0
+    }
+}
+
+// the static part of a MetricsKey
+impl MetricsKey {
+    /// Absolute value of [`Timeline::get_last_record_lsn`].
+    ///
+    /// [`Timeline::get_last_record_lsn`]: crate::tenant::Timeline::get_last_record_lsn
+    const fn written_size(tenant_id: TenantId, timeline_id: TimelineId) -> AbsoluteValueFactory {
+        MetricsKey {
+            tenant_id,
+            timeline_id: Some(timeline_id),
+            metric: "written_size",
+        }
+        .absolute_values()
+    }
+
+    /// Values will be the difference of the latest [`MetricsKey::written_size`] to what we
+    /// previously sent, starting from the previously sent incremental time range ending at the
+    /// latest absolute measurement.
+    const fn written_size_delta(
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+    ) -> IncrementalValueFactory {
+        MetricsKey {
+            tenant_id,
+            timeline_id: Some(timeline_id),
+            // the name here is correctly about data not size, because that is what is wanted by
+            // downstream pipeline
+            metric: "written_data_bytes_delta",
+        }
+        .incremental_values()
+    }
+
+    /// Exact [`Timeline::get_current_logical_size`].
+    ///
+    /// [`Timeline::get_current_logical_size`]: crate::tenant::Timeline::get_current_logical_size
+    const fn timeline_logical_size(
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+    ) -> AbsoluteValueFactory {
+        MetricsKey {
+            tenant_id,
+            timeline_id: Some(timeline_id),
+            metric: "timeline_logical_size",
+        }
+        .absolute_values()
+    }
+
+    /// [`Tenant::remote_size`]
+    ///
+    /// [`Tenant::remote_size`]: crate::tenant::Tenant::remote_size
+    const fn remote_storage_size(tenant_id: TenantId) -> AbsoluteValueFactory {
+        MetricsKey {
+            tenant_id,
+            timeline_id: None,
+            metric: "remote_storage_size",
+        }
+        .absolute_values()
+    }
+
+    /// Sum of [`Timeline::resident_physical_size`] for each `Tenant`.
+    ///
+    /// [`Timeline::resident_physical_size`]: crate::tenant::Timeline::resident_physical_size
+    const fn resident_size(tenant_id: TenantId) -> AbsoluteValueFactory {
+        MetricsKey {
+            tenant_id,
+            timeline_id: None,
+            metric: "resident_size",
+        }
+        .absolute_values()
+    }
+
+    /// [`Tenant::cached_synthetic_size`] as refreshed by [`calculate_synthetic_size_worker`].
+    ///
+    /// [`Tenant::cached_synthetic_size`]: crate::tenant::Tenant::cached_synthetic_size
+    const fn synthetic_size(tenant_id: TenantId) -> AbsoluteValueFactory {
+        MetricsKey {
+            tenant_id,
+            timeline_id: None,
+            metric: "synthetic_storage_size",
+        }
+        .absolute_values()
+    }
 }

 /// Main thread that serves metrics collection
@@ -79,7 +207,7 @@ pub async fn collect_metrics(
        .timeout(DEFAULT_HTTP_REPORTING_TIMEOUT)
        .build()
        .expect("Failed to create http client with timeout");
-    let mut cached_metrics: HashMap<PageserverConsumptionMetricsKey, u64> = HashMap::new();
+    let mut cached_metrics = HashMap::new();
    let mut prev_iteration_time: std::time::Instant = std::time::Instant::now();

    loop {
@@ -119,15 +247,15 @@ pub async fn collect_metrics(
 ///
 /// TODO
 /// - refactor this function (chunking+sending part) to reuse it in proxy module;
-pub async fn collect_metrics_iteration(
+async fn collect_metrics_iteration(
    client: &reqwest::Client,
-    cached_metrics: &mut HashMap<PageserverConsumptionMetricsKey, u64>,
+    cached_metrics: &mut HashMap<MetricsKey, (EventType, u64)>,
    metric_collection_endpoint: &reqwest::Url,
    node_id: NodeId,
    ctx: &RequestContext,
    send_cached: bool,
 ) {
-    let mut current_metrics: Vec<(PageserverConsumptionMetricsKey, u64)> = Vec::new();
+    let mut current_metrics: Vec<(MetricsKey, (EventType, u64))> = Vec::new();
    trace!(
        "starting collect_metrics_iteration. metric_collection_endpoint: {}",
        metric_collection_endpoint
@@ -161,99 +289,65 @@ pub async fn collect_metrics_iteration(
        let mut tenant_resident_size = 0;

        // iterate through list of timelines in tenant
-        for timeline in tenant.list_timelines().iter() {
+        for timeline in tenant.list_timelines() {
            // collect per-timeline metrics only for active timelines
-            if timeline.is_active() {
-                let timeline_written_size = u64::from(timeline.get_last_record_lsn());

-                current_metrics.push((
-                    PageserverConsumptionMetricsKey {
+            let timeline_id = timeline.timeline_id;
+
+            match TimelineSnapshot::collect(&timeline, ctx) {
+                Ok(Some(snap)) => {
+                    snap.to_metrics(
                        tenant_id,
-                        timeline_id: Some(timeline.timeline_id),
-                        metric: WRITTEN_SIZE,
-                    },
-                    timeline_written_size,
-                ));
-
-                let span = info_span!("collect_metrics_iteration", tenant_id = %timeline.tenant_id, timeline_id = %timeline.timeline_id);
-                match span.in_scope(|| timeline.get_current_logical_size(ctx)) {
-                    // Only send timeline logical size when it is fully calculated.
-                    Ok((size, is_exact)) if is_exact => {
-                        current_metrics.push((
-                            PageserverConsumptionMetricsKey {
-                                tenant_id,
-                                timeline_id: Some(timeline.timeline_id),
-                                metric: TIMELINE_LOGICAL_SIZE,
-                            },
-                            size,
-                        ));
-                    }
-                    Ok((_, _)) => {}
-                    Err(err) => {
-                        error!(
-                            "failed to get current logical size for timeline {}: {err:?}",
-                            timeline.timeline_id
-                        );
-                        continue;
-                    }
-                };
+                        timeline_id,
+                        Utc::now(),
+                        &mut current_metrics,
+                        cached_metrics,
+                    );
+                }
+                Ok(None) => {}
+                Err(e) => {
+                    error!(
+                        "failed to get metrics values for tenant {tenant_id} timeline {}: {e:#?}",
+                        timeline.timeline_id
+                    );
+                    continue;
+                }
            }

-            let timeline_resident_size = timeline.get_resident_physical_size();
-            tenant_resident_size += timeline_resident_size;
+            tenant_resident_size += timeline.resident_physical_size();
        }

-        match tenant.get_remote_size().await {
-            Ok(tenant_remote_size) => {
-                current_metrics.push((
-                    PageserverConsumptionMetricsKey {
-                        tenant_id,
-                        timeline_id: None,
-                        metric: REMOTE_STORAGE_SIZE,
-                    },
-                    tenant_remote_size,
-                ));
-            }
-            Err(err) => {
-                error!(
-                    "failed to get remote size for tenant {}: {err:?}",
-                    tenant_id
-                );
-            }
-        }
+        current_metrics
+            .push(MetricsKey::remote_storage_size(tenant_id).at(Utc::now(), tenant.remote_size()));

-        current_metrics.push((
-            PageserverConsumptionMetricsKey {
-                tenant_id,
-                timeline_id: None,
-                metric: RESIDENT_SIZE,
-            },
-            tenant_resident_size,
-        ));
+        current_metrics
+            .push(MetricsKey::resident_size(tenant_id).at(Utc::now(), tenant_resident_size));

        // Note that this metric is calculated in a separate bgworker
        // Here we only use cached value, which may lag behind the real latest one
-        let tenant_synthetic_size = tenant.get_cached_synthetic_size();
+        let synthetic_size = tenant.cached_synthetic_size();

-        if tenant_synthetic_size != 0 {
+        if synthetic_size != 0 {
            // only send non-zeroes because otherwise these show up as errors in logs
-            current_metrics.push((
-                PageserverConsumptionMetricsKey {
-                    tenant_id,
-                    timeline_id: None,
-                    metric: SYNTHETIC_STORAGE_SIZE,
-                },
-                tenant_synthetic_size,
-            ));
+            current_metrics
+                .push(MetricsKey::synthetic_size(tenant_id).at(Utc::now(), synthetic_size));
        }
    }

    // Filter metrics, unless we want to send all metrics, including cached ones.
    // See: https://github.com/neondatabase/neon/issues/3485
    if !send_cached {
-        current_metrics.retain(|(curr_key, curr_val)| match cached_metrics.get(curr_key) {
-            Some(val) => val != curr_val,
-            None => true,
+        current_metrics.retain(|(curr_key, (kind, curr_val))| {
+            if kind.is_incremental() {
+                // incremental values (currently only written_size_delta) should not get any cache
+                // deduplication because they will be used by upstream for "is still alive."
+                true
+            } else {
+                match cached_metrics.get(curr_key) {
+                    Some((_, val)) => val != curr_val,
+                    None => true,
+                }
+            }
        });
    }

@@ -268,14 +362,16 @@ pub async fn collect_metrics_iteration(

    let mut chunk_to_send: Vec<Event<Ids>> = Vec::with_capacity(CHUNK_SIZE);

+    let node_id = node_id.to_string();
+
    for chunk in chunks {
        chunk_to_send.clear();

        // enrich metrics with type,timestamp and idempotency key before sending
-        chunk_to_send.extend(chunk.iter().map(|(curr_key, curr_val)| Event {
-            kind: EventType::Absolute { time: Utc::now() },
+        chunk_to_send.extend(chunk.iter().map(|(curr_key, (when, curr_val))| Event {
+            kind: *when,
            metric: curr_key.metric,
-            idempotency_key: idempotency_key(node_id.to_string()),
+            idempotency_key: idempotency_key(&node_id),
            value: *curr_val,
            extra: Ids {
                tenant_id: curr_key.tenant_id,
@@ -283,17 +379,14 @@ pub async fn collect_metrics_iteration(
            },
        }));

-        let chunk_json = serde_json::value::to_raw_value(&EventChunk {
-            events: &chunk_to_send,
-        })
-        .expect("PageserverConsumptionMetric should not fail serialization");
-
        const MAX_RETRIES: u32 = 3;

        for attempt in 0..MAX_RETRIES {
            let res = client
                .post(metric_collection_endpoint.clone())
-                .json(&chunk_json)
+                .json(&EventChunk {
+                    events: (&chunk_to_send).into(),
+                })
                .send()
                .await;

@@ -329,6 +422,130 @@ pub async fn collect_metrics_iteration(
    }
 }

+/// Internal type to make timeline metric production testable.
+///
+/// As this value type contains all of the information needed from a timeline to produce the
+/// metrics, it can easily be created with different values in test.
+struct TimelineSnapshot {
+    loaded_at: (Lsn, SystemTime),
+    last_record_lsn: Lsn,
+    current_exact_logical_size: Option<u64>,
+}
+
+impl TimelineSnapshot {
+    /// Collect the metrics from an actual timeline.
+    ///
+    /// Fails currently only when [`Timeline::get_current_logical_size`] fails.
+    ///
+    /// [`Timeline::get_current_logical_size`]: crate::tenant::Timeline::get_current_logical_size
+    fn collect(
+        t: &Arc<crate::tenant::Timeline>,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<Option<Self>> {
+        use anyhow::Context;
+
+        if !t.is_active() {
+            // no collection needed for broken or stopping timelines; the caller will still
+            // keep the cached values.
+            Ok(None)
+        } else {
+            let loaded_at = t.loaded_at;
+            let last_record_lsn = t.get_last_record_lsn();
+
+            let current_exact_logical_size = {
+                let span = info_span!("collect_metrics_iteration", tenant_id = %t.tenant_id, timeline_id = %t.timeline_id);
+                let res = span
+                    .in_scope(|| t.get_current_logical_size(ctx))
+                    .context("get_current_logical_size");
+                match res? {
+                    // Only send timeline logical size when it is fully calculated.
+                    (size, is_exact) if is_exact => Some(size),
+                    (_, _) => None,
+                }
+            };
+
+            Ok(Some(TimelineSnapshot {
+                loaded_at,
+                last_record_lsn,
+                current_exact_logical_size,
+            }))
+        }
+    }
+
+    /// Produce the timeline consumption metrics into the `metrics` argument.
+    fn to_metrics(
+        &self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        now: DateTime<Utc>,
+        metrics: &mut Vec<(MetricsKey, (EventType, u64))>,
+        cache: &HashMap<MetricsKey, (EventType, u64)>,
+    ) {
+        let timeline_written_size = u64::from(self.last_record_lsn);
+
+        let (key, written_size_now) =
+            MetricsKey::written_size(tenant_id, timeline_id).at(now, timeline_written_size);
+
+        // last_record_lsn can only go up, right now at least, TODO: #2592 or related
+        // features might change this.
+
+        let written_size_delta_key = MetricsKey::written_size_delta(tenant_id, timeline_id);
+
+        // use this when available, because in a stream of incremental values, it will be
+        // accurate, whereas when last_record_lsn stops moving, we will only cache the last
+        // one of those.
+        let last_stop_time = cache
+            .get(written_size_delta_key.key())
+            .map(|(until, _val)| {
+                until
+                    .incremental_timerange()
+                    .expect("never create EventType::Absolute for written_size_delta")
+                    .end
+            });
+
+        // by default, use the last sent written_size as the basis for
+        // calculating the delta. if we don't yet have one, use the load time value.
+        let prev = cache
+            .get(&key)
+            .map(|(prev_at, prev)| {
+                // use the prev time from our last incremental update, or default to latest
+                // absolute update on the first round.
+                let prev_at = prev_at
+                    .absolute_time()
+                    .expect("never create EventType::Incremental for written_size");
+                let prev_at = last_stop_time.unwrap_or(prev_at);
+                (*prev_at, *prev)
+            })
+            .unwrap_or_else(|| {
+                // if we don't have a previous point of comparison, compare to the load time
+                // lsn.
+                let (disk_consistent_lsn, loaded_at) = &self.loaded_at;
+                (DateTime::from(*loaded_at), disk_consistent_lsn.0)
+            });
+
+        // written_data_bytes_delta
+        metrics.extend(
+            if let Some(delta) = written_size_now.1.checked_sub(prev.1) {
+                let up_to = written_size_now
+                    .0
+                    .absolute_time()
+                    .expect("never create EventType::Incremental for written_size");
+                let key_value = written_size_delta_key.from_previous_up_to(prev.0, *up_to, delta);
+                Some(key_value)
+            } else {
+                None
+            },
+        );
+
+        // written_size
+        metrics.push((key, written_size_now));
+
+        if let Some(size) = self.current_exact_logical_size {
+            metrics.push(MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, size));
+        }
+    }
+}
+
 /// Caclculate synthetic size for each active tenant
 pub async fn calculate_synthetic_size_worker(
    synthetic_size_calculation_interval: Duration,
@@ -343,7 +560,7 @@ pub async fn calculate_synthetic_size_worker(
            _ = task_mgr::shutdown_watcher() => {
                return Ok(());
            },
-        tick_at = ticker.tick() => {
+            tick_at = ticker.tick() => {

                let tenants = match mgr::list_tenants().await {
                    Ok(tenants) => tenants,
@@ -379,3 +596,149 @@ pub async fn calculate_synthetic_size_worker(
        }
    }
 }
+
+#[cfg(test)]
+mod tests {
+    use std::collections::HashMap;
+
+    use std::time::SystemTime;
+    use utils::{
+        id::{TenantId, TimelineId},
+        lsn::Lsn,
+    };
+
+    use crate::consumption_metrics::MetricsKey;
+
+    use super::TimelineSnapshot;
+    use chrono::{DateTime, Utc};
+
+    #[test]
+    fn startup_collected_timeline_metrics_before_advancing() {
+        let tenant_id = TenantId::generate();
+        let timeline_id = TimelineId::generate();
+
+        let mut metrics = Vec::new();
+        let cache = HashMap::new();
+
+        let initdb_lsn = Lsn(0x10000);
+        let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
+
+        let snap = TimelineSnapshot {
+            loaded_at: (disk_consistent_lsn, SystemTime::now()),
+            last_record_lsn: disk_consistent_lsn,
+            current_exact_logical_size: Some(0x42000),
+        };
+
+        let now = DateTime::<Utc>::from(SystemTime::now());
+
+        snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
+
+        assert_eq!(
+            metrics,
+            &[
+                MetricsKey::written_size_delta(tenant_id, timeline_id).from_previous_up_to(
+                    snap.loaded_at.1.into(),
+                    now,
+                    0
+                ),
+                MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
+                MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
+            ]
+        );
+    }
+
+    #[test]
+    fn startup_collected_timeline_metrics_second_round() {
+        let tenant_id = TenantId::generate();
+        let timeline_id = TimelineId::generate();
+
+        let [now, before, init] = time_backwards();
+
+        let now = DateTime::<Utc>::from(now);
+        let before = DateTime::<Utc>::from(before);
+
+        let initdb_lsn = Lsn(0x10000);
+        let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
+
+        let mut metrics = Vec::new();
+        let cache = HashMap::from([
+            MetricsKey::written_size(tenant_id, timeline_id).at(before, disk_consistent_lsn.0)
+        ]);
+
+        let snap = TimelineSnapshot {
+            loaded_at: (disk_consistent_lsn, init),
+            last_record_lsn: disk_consistent_lsn,
+            current_exact_logical_size: Some(0x42000),
+        };
+
+        snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
+
+        assert_eq!(
+            metrics,
+            &[
+                MetricsKey::written_size_delta(tenant_id, timeline_id)
+                    .from_previous_up_to(before, now, 0),
+                MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
+                MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
+            ]
+        );
+    }
+
+    #[test]
+    fn startup_collected_timeline_metrics_nth_round_at_same_lsn() {
+        let tenant_id = TenantId::generate();
+        let timeline_id = TimelineId::generate();
+
+        let [now, just_before, before, init] = time_backwards();
+
+        let now = DateTime::<Utc>::from(now);
+        let just_before = DateTime::<Utc>::from(just_before);
+        let before = DateTime::<Utc>::from(before);
+
+        let initdb_lsn = Lsn(0x10000);
+        let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
+
+        let mut metrics = Vec::new();
+        let cache = HashMap::from([
+            // at t=before was the last time the last_record_lsn changed
+            MetricsKey::written_size(tenant_id, timeline_id).at(before, disk_consistent_lsn.0),
+            // end time of this event is used for the next ones
+            MetricsKey::written_size_delta(tenant_id, timeline_id).from_previous_up_to(
+                before,
+                just_before,
+                0,
+            ),
+        ]);
+
+        let snap = TimelineSnapshot {
+            loaded_at: (disk_consistent_lsn, init),
+            last_record_lsn: disk_consistent_lsn,
+            current_exact_logical_size: Some(0x42000),
+        };
+
+        snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
+
+        assert_eq!(
+            metrics,
+            &[
+                MetricsKey::written_size_delta(tenant_id, timeline_id).from_previous_up_to(
+                    just_before,
+                    now,
+                    0
+                ),
+                MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
+                MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
+            ]
+        );
+    }
+
+    fn time_backwards<const N: usize>() -> [std::time::SystemTime; N] {
+        let mut times = [std::time::SystemTime::UNIX_EPOCH; N];
+        times[0] = std::time::SystemTime::now();
+        for behind in 1..N {
+            times[behind] = times[0] - std::time::Duration::from_secs(behind as u64);
+        }
+
+        times
+    }
+}
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -73,7 +73,7 @@ pub static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy<IntCounterVec> = Lazy::new(|| {
 // Buckets for background operations like compaction, GC, size calculation
 const STORAGE_OP_BUCKETS: &[f64] = &[0.010, 0.100, 1.0, 10.0, 100.0, 1000.0];

-pub static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
+pub(crate) static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
    register_histogram_vec!(
        "pageserver_storage_operations_seconds_global",
        "Time spent on storage operations",
@@ -93,7 +93,7 @@ pub(crate) static READ_NUM_FS_LAYERS: Lazy<Histogram> = Lazy::new(|| {
 });

 // Metrics collected on operations on the storage repository.
-pub static RECONSTRUCT_TIME: Lazy<Histogram> = Lazy::new(|| {
+pub(crate) static RECONSTRUCT_TIME: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
        "pageserver_getpage_reconstruct_seconds",
        "Time spent in reconstruct_value (reconstruct a page from deltas)",
@@ -102,7 +102,7 @@ pub static RECONSTRUCT_TIME: Lazy<Histogram> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy<IntCounter> = Lazy::new(|| {
+pub(crate) static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "pageserver_materialized_cache_hits_direct_total",
        "Number of cache hits from materialized page cache without redo",
@@ -119,7 +119,7 @@ pub(crate) static GET_RECONSTRUCT_DATA_TIME: Lazy<Histogram> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {
+pub(crate) static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "pageserver_materialized_cache_hits_total",
        "Number of cache hits from materialized page cache",
@@ -280,7 +280,7 @@ static REMOTE_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub static REMOTE_ONDEMAND_DOWNLOADED_LAYERS: Lazy<IntCounter> = Lazy::new(|| {
+pub(crate) static REMOTE_ONDEMAND_DOWNLOADED_LAYERS: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "pageserver_remote_ondemand_downloaded_layers_total",
        "Total on-demand downloaded layers"
@@ -288,7 +288,7 @@ pub static REMOTE_ONDEMAND_DOWNLOADED_LAYERS: Lazy<IntCounter> = Lazy::new(|| {
    .unwrap()
 });

-pub static REMOTE_ONDEMAND_DOWNLOADED_BYTES: Lazy<IntCounter> = Lazy::new(|| {
+pub(crate) static REMOTE_ONDEMAND_DOWNLOADED_BYTES: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "pageserver_remote_ondemand_downloaded_bytes_total",
        "Total bytes of layers on-demand downloaded",
@@ -327,7 +327,7 @@ pub(crate) static BROKEN_TENANTS_SET: Lazy<UIntGaugeVec> = Lazy::new(|| {
    .expect("Failed to register pageserver_tenant_states_count metric")
 });

-pub static TENANT_SYNTHETIC_SIZE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
+pub(crate) static TENANT_SYNTHETIC_SIZE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
    register_uint_gauge_vec!(
        "pageserver_tenant_synthetic_cached_size_bytes",
        "Synthetic size of each tenant in bytes",
@@ -385,7 +385,7 @@ static EVICTIONS_WITH_LOW_RESIDENCE_DURATION: Lazy<IntCounterVec> = Lazy::new(||
    .expect("failed to define a metric")
 });

-pub static UNEXPECTED_ONDEMAND_DOWNLOADS: Lazy<IntCounter> = Lazy::new(|| {
+pub(crate) static UNEXPECTED_ONDEMAND_DOWNLOADS: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "pageserver_unexpected_ondemand_downloads_count",
        "Number of unexpected on-demand downloads. \
@@ -690,7 +690,7 @@ pub(crate) static REMOTE_OPERATION_TIME: Lazy<HistogramVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
+pub(crate) static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "pageserver_tenant_task_events",
        "Number of task start/stop/fail events.",
@@ -699,7 +699,7 @@ pub static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
    .expect("Failed to register tenant_task_events metric")
 });

-pub static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
+pub(crate) static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "pageserver_background_loop_period_overrun_count",
        "Incremented whenever warn_when_period_overrun() logs a warning.",
@@ -710,7 +710,7 @@ pub static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = Lazy::new

 // walreceiver metrics

-pub static WALRECEIVER_STARTED_CONNECTIONS: Lazy<IntCounter> = Lazy::new(|| {
+pub(crate) static WALRECEIVER_STARTED_CONNECTIONS: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "pageserver_walreceiver_started_connections_total",
        "Number of started walreceiver connections"
@@ -718,7 +718,7 @@ pub static WALRECEIVER_STARTED_CONNECTIONS: Lazy<IntCounter> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub static WALRECEIVER_ACTIVE_MANAGERS: Lazy<IntGauge> = Lazy::new(|| {
+pub(crate) static WALRECEIVER_ACTIVE_MANAGERS: Lazy<IntGauge> = Lazy::new(|| {
    register_int_gauge!(
        "pageserver_walreceiver_active_managers",
        "Number of active walreceiver managers"
@@ -726,7 +726,7 @@ pub static WALRECEIVER_ACTIVE_MANAGERS: Lazy<IntGauge> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub static WALRECEIVER_SWITCHES: Lazy<IntCounterVec> = Lazy::new(|| {
+pub(crate) static WALRECEIVER_SWITCHES: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "pageserver_walreceiver_switches_total",
        "Number of walreceiver manager change_connection calls",
@@ -735,7 +735,7 @@ pub static WALRECEIVER_SWITCHES: Lazy<IntCounterVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub static WALRECEIVER_BROKER_UPDATES: Lazy<IntCounter> = Lazy::new(|| {
+pub(crate) static WALRECEIVER_BROKER_UPDATES: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "pageserver_walreceiver_broker_updates_total",
        "Number of received broker updates in walreceiver"
@@ -743,7 +743,7 @@ pub static WALRECEIVER_BROKER_UPDATES: Lazy<IntCounter> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub static WALRECEIVER_CANDIDATES_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
+pub(crate) static WALRECEIVER_CANDIDATES_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "pageserver_walreceiver_candidates_events_total",
        "Number of walreceiver candidate events",
@@ -752,10 +752,10 @@ pub static WALRECEIVER_CANDIDATES_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub static WALRECEIVER_CANDIDATES_ADDED: Lazy<IntCounter> =
+pub(crate) static WALRECEIVER_CANDIDATES_ADDED: Lazy<IntCounter> =
    Lazy::new(|| WALRECEIVER_CANDIDATES_EVENTS.with_label_values(&["add"]));

-pub static WALRECEIVER_CANDIDATES_REMOVED: Lazy<IntCounter> =
+pub(crate) static WALRECEIVER_CANDIDATES_REMOVED: Lazy<IntCounter> =
    Lazy::new(|| WALRECEIVER_CANDIDATES_EVENTS.with_label_values(&["remove"]));

 // Metrics collected on WAL redo operations
@@ -802,7 +802,7 @@ macro_rules! redo_bytes_histogram_count_buckets {
    };
 }

-pub static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
+pub(crate) static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
        "pageserver_wal_redo_seconds",
        "Time spent on WAL redo",
@@ -811,7 +811,7 @@ pub static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub static WAL_REDO_WAIT_TIME: Lazy<Histogram> = Lazy::new(|| {
+pub(crate) static WAL_REDO_WAIT_TIME: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
        "pageserver_wal_redo_wait_seconds",
        "Time spent waiting for access to the Postgres WAL redo process",
@@ -820,7 +820,7 @@ pub static WAL_REDO_WAIT_TIME: Lazy<Histogram> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
+pub(crate) static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
        "pageserver_wal_redo_records_histogram",
        "Histogram of number of records replayed per redo in the Postgres WAL redo process",
@@ -829,7 +829,7 @@ pub static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub static WAL_REDO_BYTES_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
+pub(crate) static WAL_REDO_BYTES_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
        "pageserver_wal_redo_bytes_histogram",
        "Histogram of number of records replayed per redo sent to Postgres",
@@ -838,7 +838,8 @@ pub static WAL_REDO_BYTES_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub static WAL_REDO_RECORD_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
+// FIXME: isn't this already included by WAL_REDO_RECORDS_HISTOGRAM which has _count?
+pub(crate) static WAL_REDO_RECORD_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "pageserver_replayed_wal_records_total",
        "Number of WAL records replayed in WAL redo process"
@@ -1394,15 +1395,51 @@ impl<F: Future<Output = Result<O, E>>, O, E> Future for MeasuredRemoteOp<F> {
 }

 pub fn preinitialize_metrics() {
-    // We want to alert on this metric increasing.
-    // Initialize it eagerly, so that our alert rule can distinguish absence of the metric from metric value 0.
-    assert_eq!(UNEXPECTED_ONDEMAND_DOWNLOADS.get(), 0);
-    UNEXPECTED_ONDEMAND_DOWNLOADS.reset();
+    // Force the lazy metrics below up front: Python tests need them, and we alert on some.
+    //
+    // FIXME(4813): make it so that we have no top level metrics as this fn will easily fall out of
+    // order:
+    // - global metrics reside in a Lazy<PageserverMetrics>
+    //   - access via crate::metrics::PS_METRICS.materialized_page_cache_hit.inc()
+    // - could move the statics into TimelineMetrics::new()?

-    // Same as above for this metric, but, it's a Vec-type metric for which we don't know all the labels.
-    BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT.reset();
+    // counters
+    // (WALRECEIVER_CANDIDATES_ADDED/REMOVED are children of WALRECEIVER_CANDIDATES_EVENTS)
+    for counter in [
+        &MATERIALIZED_PAGE_CACHE_HIT,
+        &MATERIALIZED_PAGE_CACHE_HIT_DIRECT,
+        &UNEXPECTED_ONDEMAND_DOWNLOADS,
+        &WALRECEIVER_STARTED_CONNECTIONS,
+        &WALRECEIVER_BROKER_UPDATES,
+        &WALRECEIVER_CANDIDATES_ADDED,
+        &WALRECEIVER_CANDIDATES_REMOVED,
+    ] {
+        // only the side effect of running the initializer matters; the reference is unused
+        Lazy::force(counter);
+    }

-    // Python tests need these.
-    MATERIALIZED_PAGE_CACHE_HIT_DIRECT.get();
-    MATERIALIZED_PAGE_CACHE_HIT.get();
+    // countervecs
+    // NOTE(review): this forces the vec itself; no label values are instantiated here,
+    // so presumably nothing is exported for it until a label is first used. TODO confirm.
+    for counter_vec in [&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT] {
+        Lazy::force(counter_vec);
+    }
+
+    // gauges
+    // reading the value dereferences the `Lazy`, which runs its initializer
+    WALRECEIVER_ACTIVE_MANAGERS.get();
+
+    // histograms
+    for histogram in [
+        &READ_NUM_FS_LAYERS,
+        &RECONSTRUCT_TIME,
+        &WAIT_LSN_TIME,
+        &WAL_REDO_TIME,
+        &WAL_REDO_WAIT_TIME,
+        &WAL_REDO_RECORDS_HISTOGRAM,
+        &WAL_REDO_BYTES_HISTOGRAM,
+    ] {
+        // same as for the counters: forcing is done purely for its side effect
+        Lazy::force(histogram);
+    }
 }
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -23,7 +23,6 @@ use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::completion;
 use utils::crashsafe::path_with_suffix_extension;
-use utils::fs_ext;

 use std::cmp::min;
 use std::collections::hash_map::Entry;
@@ -32,6 +31,7 @@ use std::collections::HashMap;
 use std::fs;
 use std::fs::File;
 use std::fs::OpenOptions;
+use std::io;
 use std::io::Write;
 use std::ops::Bound::Included;
 use std::path::Path;
@@ -46,7 +46,7 @@ use std::sync::{Mutex, RwLock};
 use std::time::{Duration, Instant};

 use self::config::TenantConf;
-use self::delete::DeleteTimelineFlow;
+use self::metadata::LoadMetadataError;
 use self::metadata::TimelineMetadata;
 use self::remote_timeline_client::RemoteTimelineClient;
 use self::timeline::uninit::TimelineUninitMark;
@@ -69,6 +69,7 @@ use crate::tenant::storage_layer::ImageLayer;
 use crate::tenant::storage_layer::Layer;
 use crate::InitializationOrder;

+use crate::tenant::timeline::delete::DeleteTimelineFlow;
 use crate::tenant::timeline::uninit::cleanup_timeline_directory;
 use crate::virtual_file::VirtualFile;
 use crate::walredo::PostgresRedoManager;
@@ -116,7 +117,6 @@ mod remote_timeline_client;
 pub mod storage_layer;

 pub mod config;
-pub mod delete;
 pub mod mgr;
 pub mod tasks;
 pub mod upload_queue;
@@ -338,7 +338,7 @@ pub enum CreateTimelineError {

 struct TenantDirectoryScan {
    sorted_timelines_to_load: Vec<(TimelineId, TimelineMetadata)>,
-    timelines_to_resume_deletion: Vec<(TimelineId, TimelineMetadata)>,
+    timelines_to_resume_deletion: Vec<(TimelineId, Option<TimelineMetadata>)>,
 }

 enum CreateTimelineCause {
@@ -644,20 +644,19 @@ impl Tenant {
        Ok(())
    }

-    /// get size of all remote timelines
+    /// Get sum of all remote timelines sizes
    ///
    /// This function relies on the index_part instead of listing the remote storage
-    ///
-    pub async fn get_remote_size(&self) -> anyhow::Result<u64> {
+    pub fn remote_size(&self) -> u64 {
        let mut size = 0;

-        for timeline in self.list_timelines().iter() {
+        for timeline in self.list_timelines() {
            if let Some(remote_client) = &timeline.remote_client {
                size += remote_client.get_remote_physical_size();
            }
        }

-        Ok(size)
+        size
    }

    #[instrument(skip_all, fields(timeline_id=%timeline_id))]
@@ -818,7 +817,8 @@ impl Tenant {
        // Note timelines_to_resume_deletion needs to be separate because it can be not sortable
        // from the point of `tree_sort_timelines`. I e some parents can be missing because deletion
        // completed in non topological order (for example because parent has smaller number of layer files in it)
-        let mut timelines_to_resume_deletion: Vec<(TimelineId, TimelineMetadata)> = vec![];
+        let mut timelines_to_resume_deletion: Vec<(TimelineId, Option<TimelineMetadata>)> = vec![];
+
        let timelines_dir = self.conf.timelines_path(&self.tenant_id);

        for entry in
@@ -868,7 +868,6 @@ impl Tenant {
                }
            } else if crate::is_delete_mark(&timeline_dir) {
                // If metadata exists, load as usual, continue deletion
-                // If metadata doesnt exist remove timeline dir and delete mark
                let timeline_id =
                    TimelineId::try_from(timeline_dir.file_stem()).with_context(|| {
                        format!(
@@ -877,33 +876,37 @@ impl Tenant {
                        )
                    })?;

-                let metadata_path = self.conf.metadata_path(&self.tenant_id, &timeline_id);
-                if metadata_path.exists() {
-                    // Remote deletion did not finish. Need to resume.
-                    timelines_to_resume_deletion.push((
-                        timeline_id,
-                        load_metadata(self.conf, &self.tenant_id, &timeline_id)?,
-                    ));
-                    continue;
-                }
+                match load_metadata(self.conf, &self.tenant_id, &timeline_id) {
+                    Ok(metadata) => {
+                        timelines_to_resume_deletion.push((timeline_id, Some(metadata)))
+                    }
+                    Err(e) => match &e {
+                        LoadMetadataError::Read(r) => {
+                            if r.kind() != io::ErrorKind::NotFound {
+                                return Err(anyhow::anyhow!(e)).with_context(|| {
+                                    format!("Failed to load metadata for timeline_id {timeline_id}")
+                                });
+                            }

-                // Missing metadata means that timeline directory should be empty at this point.
-                // Remove delete mark afterwards.
-                // Note that failure during the process wont prevent tenant from successfully loading.
-                // TODO: this is very much similar to DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces
-                // but here we're inside spawn_blocking.
-                if let Err(e) = fs_ext::ignore_absent_files(|| {
-                    fs::remove_dir(self.conf.timeline_path(&self.tenant_id, &timeline_id))
-                })
-                .context("remove deleted timeline dir")
-                .and_then(|_| fs::remove_file(&timeline_dir).context("remove delete mark"))
-                {
-                    warn!(
-                        "cannot clean up deleted timeline dir at: {} error: {:#}",
-                        timeline_dir.display(),
-                        e
-                    );
-                };
+                            // If metadata doesn't exist it means that we've crashed without
+                            // completing cleanup_remaining_timeline_fs_traces in DeleteTimelineFlow.
+                            // So save timeline_id for a later call to `DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces`.
+                            // We can't do it here because the method is async so we'd need block_on
+                            // and here we're in spawn_blocking. cleanup_remaining_timeline_fs_traces uses fs operations
+                            // so that basically results in a cycle:
+                            // spawn_blocking
+                            // - block_on
+                            //   - spawn_blocking
+                            // which can lead to running out of threads in the blocking pool.
+                            timelines_to_resume_deletion.push((timeline_id, None));
+                        }
+                        _ => {
+                            return Err(anyhow::anyhow!(e)).with_context(|| {
+                                format!("Failed to load metadata for timeline_id {timeline_id}")
+                            })
+                        }
+                    },
+                }
            } else {
                if !timeline_dir.exists() {
                    warn!(
@@ -1022,21 +1025,37 @@ impl Tenant {
        }

        // Resume deletion ones with deleted_mark
-        for (timeline_id, local_metadata) in scan.timelines_to_resume_deletion {
-            if let Err(e) = self
-                .load_local_timeline(timeline_id, local_metadata, init_order, ctx, true)
-                .await
-            {
-                match e {
-                    LoadLocalTimelineError::Load(source) => {
-                        // We tried to load deleted timeline, this is a bug.
-                        return Err(anyhow::anyhow!(source).context(
-                            "This is a bug. We tried to load deleted timeline which is wrong and loading failed. Timeline: {timeline_id}"
-                        ));
+        for (timeline_id, maybe_local_metadata) in scan.timelines_to_resume_deletion {
+            match maybe_local_metadata {
+                None => {
+                    // See comment in `scan_and_sort_timelines_dir`.
+                    if let Err(e) =
+                        DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces(self, timeline_id)
+                            .await
+                    {
+                        warn!(
+                            "cannot clean up deleted timeline dir timeline_id: {} error: {:#}",
+                            timeline_id, e
+                        );
                    }
-                    LoadLocalTimelineError::ResumeDeletion(source) => {
-                        // Make sure resumed deletion wont fail loading for entire tenant.
-                        error!("Failed to resume timeline deletion: {source:#}")
+                }
+                Some(local_metadata) => {
+                    if let Err(e) = self
+                        .load_local_timeline(timeline_id, local_metadata, init_order, ctx, true)
+                        .await
+                    {
+                        match e {
+                            LoadLocalTimelineError::Load(source) => {
+                                // Loading a deleted timeline is a bug; `format!` is required so that {timeline_id} is interpolated.
+                                return Err(anyhow::anyhow!(source).context(format!(
+                                    "This is a bug. We tried to load deleted timeline which is wrong and loading failed. Timeline: {timeline_id}"
+                                )));
+                            }
+                            LoadLocalTimelineError::ResumeDeletion(source) => {
+                                // Make sure a failed resumed deletion won't fail loading for the entire tenant.
+                                error!("Failed to resume timeline deletion: {source:#}")
+                            }
+                        }
                    }
                }
            }
@@ -2057,57 +2076,74 @@ impl Tenant {
    ) -> Tenant {
        let (state, mut rx) = watch::channel(state);

-        tokio::spawn(async move {
-            let tid = tenant_id.to_string();
+        task_mgr::spawn(
+            task_mgr::BACKGROUND_RUNTIME.handle(),
+            TaskKind::MetricsCollection,
+            Some(tenant_id),
+            None,
+            &format!("state metrics collector for tenant {tenant_id}"),
+            false,
+            async move {
+                let cancel = task_mgr::shutdown_token();

-            fn inspect_state(state: &TenantState) -> ([&'static str; 1], bool) {
-                ([state.into()], matches!(state, TenantState::Broken { .. }))
-            }
+                let tid = tenant_id.to_string();

-            let mut tuple = inspect_state(&rx.borrow_and_update());
-
-            let is_broken = tuple.1;
-            let mut counted_broken = if !is_broken {
-                // the tenant might be ignored and reloaded, so first remove any previous set
-                // element. it most likely has already been scraped, as these are manual operations
-                // right now. most likely we will add it back very soon.
-                drop(crate::metrics::BROKEN_TENANTS_SET.remove_label_values(&[&tid]));
-                false
-            } else {
-                // add the id to the set right away, there should not be any updates on the channel
-                // after
-                crate::metrics::BROKEN_TENANTS_SET
-                    .with_label_values(&[&tid])
-                    .set(1);
-                true
-            };
-
-            loop {
-                let labels = &tuple.0;
-                let current = TENANT_STATE_METRIC.with_label_values(labels);
-                current.inc();
-
-                if rx.changed().await.is_err() {
-                    // tenant has been dropped; decrement the counter because a tenant with that
-                    // state is no longer in tenant map, but allow any broken set item to exist
-                    // still.
-                    current.dec();
-                    break;
+                fn inspect_state(state: &TenantState) -> ([&'static str; 1], bool) {
+                    ([state.into()], matches!(state, TenantState::Broken { .. }))
                }

-                current.dec();
-                tuple = inspect_state(&rx.borrow_and_update());
+                let mut tuple = inspect_state(&rx.borrow_and_update());

                let is_broken = tuple.1;
-                if is_broken && !counted_broken {
-                    counted_broken = true;
-                    // insert the tenant_id (back) into the set
+                let mut counted_broken = if !is_broken {
+                    // the tenant might be ignored and reloaded, so first remove any previous set
+                    // element. it most likely has already been scraped, as these are manual operations
+                    // right now. most likely we will add it back very soon.
+                    drop(crate::metrics::BROKEN_TENANTS_SET.remove_label_values(&[&tid]));
+                    false
+                } else {
+                    // add the id to the set right away, there should not be any updates on the channel
+                    // after
                    crate::metrics::BROKEN_TENANTS_SET
                        .with_label_values(&[&tid])
-                        .inc();
+                        .set(1);
+                    true
+                };
+
+                loop {
+                    let labels = &tuple.0;
+                    let current = TENANT_STATE_METRIC.with_label_values(labels);
+                    current.inc();
+
+                    let changed = tokio::select! {
+                        changed = rx.changed() => {changed},
+                        _ = cancel.cancelled() => {return Ok(())}
+                    };
+
+                    if changed.is_err() {
+                        // tenant has been dropped; decrement the counter because a tenant with that
+                        // state is no longer in tenant map, but allow any broken set item to exist
+                        // still.
+                        current.dec();
+                        break;
+                    }
+
+                    current.dec();
+                    tuple = inspect_state(&rx.borrow_and_update());
+
+                    let is_broken = tuple.1;
+                    if is_broken && !counted_broken {
+                        counted_broken = true;
+                        // insert the tenant_id (back) into the set
+                        crate::metrics::BROKEN_TENANTS_SET
+                            .with_label_values(&[&tid])
+                            .inc();
+                    }
                }
+                Ok(())
            }
-        });
+            .instrument(info_span!("state_metrics", tenant_id = %tenant_id)),
+        );

        Tenant {
            tenant_id,
@@ -2869,7 +2905,7 @@ impl Tenant {
            .set(size);
    }

-    pub fn get_cached_synthetic_size(&self) -> u64 {
+    pub fn cached_synthetic_size(&self) -> u64 {
        self.cached_synthetic_tenant_size.load(Ordering::Relaxed)
    }
 }
@@ -3814,9 +3850,9 @@ mod tests {
            .await
            .err()
            .expect("should fail");
-        // get all the stack with all .context, not tonly the last one
+        // get all the stack with all .context, not only the last one
        let message = format!("{err:#}");
-        let expected = "Failed to parse metadata bytes from path";
+        let expected = "failed to load metadata";
        assert!(
            message.contains(expected),
            "message '{message}' expected to contain {expected}"
@@ -3833,7 +3869,8 @@ mod tests {
        }
        assert!(
            found_error_message,
-            "didn't find the corrupted metadata error"
+            "didn't find the corrupted metadata error in {}",
+            message
        );

        Ok(())
--- a/pageserver/src/tenant/blob_io.rs
+++ b/pageserver/src/tenant/blob_io.rs
@@ -21,7 +21,7 @@ where
    R: BlockReader,
 {
    /// Read a blob into a new buffer.
-    pub fn read_blob(&mut self, offset: u64) -> Result<Vec<u8>, std::io::Error> {
+    pub fn read_blob(&self, offset: u64) -> Result<Vec<u8>, std::io::Error> {
        let mut buf = Vec::new();
        self.read_blob_into_buf(offset, &mut buf)?;
        Ok(buf)
@@ -29,7 +29,7 @@ where
    /// Read blob into the given buffer. Any previous contents in the buffer
    /// are overwritten.
    pub fn read_blob_into_buf(
-        &mut self,
+        &self,
        offset: u64,
        dstbuf: &mut Vec<u8>,
    ) -> Result<(), std::io::Error> {
--- a/pageserver/src/tenant/block_io.rs
+++ b/pageserver/src/tenant/block_io.rs
@@ -80,7 +80,7 @@ where
        BlockCursor { reader }
    }

-    pub fn read_blk(&mut self, blknum: u32) -> Result<R::BlockLease, std::io::Error> {
+    pub fn read_blk(&self, blknum: u32) -> Result<R::BlockLease, std::io::Error> {
        self.reader.read_blk(blknum)
    }
 }
--- a/pageserver/src/tenant/disk_btree.rs
+++ b/pageserver/src/tenant/disk_btree.rs
@@ -390,39 +390,42 @@ where
    }

    #[allow(dead_code)]
-    pub fn dump(&self) -> Result<()> {
-        self.dump_recurse(self.root_blk, &[], 0)
-    }
+    pub async fn dump(&self) -> Result<()> {
+        let mut stack = Vec::new();

-    fn dump_recurse(&self, blknum: u32, path: &[u8], depth: usize) -> Result<()> {
-        let blk = self.reader.read_blk(self.start_blk + blknum)?;
-        let buf: &[u8] = blk.as_ref();
+        stack.push((self.root_blk, String::new(), 0, 0, 0));

-        let node = OnDiskNode::<L>::deparse(buf)?;
+        while let Some((blknum, path, depth, child_idx, key_off)) = stack.pop() {
+            let blk = self.reader.read_blk(self.start_blk + blknum)?;
+            let buf: &[u8] = blk.as_ref();
+            let node = OnDiskNode::<L>::deparse(buf)?;

-        print!("{:indent$}", "", indent = depth * 2);
-        println!(
-            "blk #{}: path {}: prefix {}, suffix_len {}",
-            blknum,
-            hex::encode(path),
-            hex::encode(node.prefix),
-            node.suffix_len
-        );
+            if child_idx == 0 {
+                print!("{:indent$}", "", indent = depth * 2);
+                let path_prefix = stack
+                    .iter()
+                    .map(|(_blknum, path, ..)| path.as_str())
+                    .collect::<String>();
+                println!(
+                    "blk #{blknum}: path {path_prefix}{path}: prefix {}, suffix_len {}",
+                    hex::encode(node.prefix),
+                    node.suffix_len
+                );
+            }

-        let mut idx = 0;
-        let mut key_off = 0;
-        while idx < node.num_children {
+            if child_idx + 1 < node.num_children {
+                let key_off = key_off + node.suffix_len as usize;
+                stack.push((blknum, path.clone(), depth, child_idx + 1, key_off));
+            }
            let key = &node.keys[key_off..key_off + node.suffix_len as usize];
-            let val = node.value(idx as usize);
+            let val = node.value(child_idx as usize);
+
            print!("{:indent$}", "", indent = depth * 2 + 2);
            println!("{}: {}", hex::encode(key), hex::encode(val.0));

            if node.level > 0 {
-                let child_path = [path, node.prefix].concat();
-                self.dump_recurse(val.to_blknum(), &child_path, depth + 1)?;
+                stack.push((val.to_blknum(), hex::encode(node.prefix), depth + 1, 0, 0));
            }
-            idx += 1;
-            key_off += node.suffix_len as usize;
        }
        Ok(())
    }
@@ -754,8 +757,8 @@ mod tests {
        }
    }

-    #[test]
-    fn basic() -> Result<()> {
+    #[tokio::test]
+    async fn basic() -> Result<()> {
        let mut disk = TestDisk::new();
        let mut writer = DiskBtreeBuilder::<_, 6>::new(&mut disk);

@@ -775,7 +778,7 @@ mod tests {

        let reader = DiskBtreeReader::new(0, root_offset, disk);

-        reader.dump()?;
+        reader.dump().await?;

        // Test the `get` function on all the keys.
        for (key, val) in all_data.iter() {
@@ -835,8 +838,8 @@ mod tests {
        Ok(())
    }

-    #[test]
-    fn lots_of_keys() -> Result<()> {
+    #[tokio::test]
+    async fn lots_of_keys() -> Result<()> {
        let mut disk = TestDisk::new();
        let mut writer = DiskBtreeBuilder::<_, 8>::new(&mut disk);

@@ -856,7 +859,7 @@ mod tests {

        let reader = DiskBtreeReader::new(0, root_offset, disk);

-        reader.dump()?;
+        reader.dump().await?;

        use std::sync::Mutex;

@@ -994,8 +997,8 @@ mod tests {
    ///
    /// This test contains a particular data set, see disk_btree_test_data.rs
    ///
-    #[test]
-    fn particular_data() -> Result<()> {
+    #[tokio::test]
+    async fn particular_data() -> Result<()> {
        // Build a tree from it
        let mut disk = TestDisk::new();
        let mut writer = DiskBtreeBuilder::<_, 26>::new(&mut disk);
@@ -1022,7 +1025,7 @@ mod tests {
        })?;
        assert_eq!(count, disk_btree_test_data::TEST_DATA.len());

-        reader.dump()?;
+        reader.dump().await?;

        Ok(())
    }
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -420,7 +420,7 @@ mod tests {
            blobs.push((pos, data));
        }

-        let mut cursor = BlockCursor::new(&file);
+        let cursor = BlockCursor::new(&file);
        for (pos, expected) in blobs {
            let actual = cursor.read_blob(pos)?;
            assert_eq!(actual, expected);
--- a/pageserver/src/tenant/metadata.rs
+++ b/pageserver/src/tenant/metadata.rs
@@ -9,10 +9,11 @@
 //! [`remote_timeline_client`]: super::remote_timeline_client

 use std::fs::{File, OpenOptions};
-use std::io::Write;
+use std::io::{self, Write};

 use anyhow::{bail, ensure, Context};
 use serde::{Deserialize, Serialize};
+use thiserror::Error;
 use tracing::info_span;
 use utils::bin_ser::SerializeError;
 use utils::{
@@ -267,24 +268,24 @@ pub fn save_metadata(
    Ok(())
 }

+#[derive(Error, Debug)]
+pub enum LoadMetadataError {
+    #[error(transparent)]
+    Read(#[from] io::Error),
+
+    #[error(transparent)]
+    Decode(#[from] anyhow::Error),
+}
+
 pub fn load_metadata(
    conf: &'static PageServerConf,
    tenant_id: &TenantId,
    timeline_id: &TimelineId,
-) -> anyhow::Result<TimelineMetadata> {
+) -> Result<TimelineMetadata, LoadMetadataError> {
    let metadata_path = conf.metadata_path(tenant_id, timeline_id);
-    let metadata_bytes = std::fs::read(&metadata_path).with_context(|| {
-        format!(
-            "Failed to read metadata bytes from path {}",
-            metadata_path.display()
-        )
-    })?;
-    TimelineMetadata::from_bytes(&metadata_bytes).with_context(|| {
-        format!(
-            "Failed to parse metadata bytes from path {}",
-            metadata_path.display()
-        )
-    })
+    let metadata_bytes = std::fs::read(metadata_path)?;
+
+    Ok(TimelineMetadata::from_bytes(&metadata_bytes)?)
 }

 #[cfg(test)]
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -26,7 +26,7 @@ use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME};
 use utils::fs_ext::PathExt;
 use utils::id::{TenantId, TimelineId};

-use super::delete::DeleteTimelineFlow;
+use super::timeline::delete::DeleteTimelineFlow;

 /// The tenants known to the pageserver.
 /// The enum variants are used to distinguish the different states that the pageserver can be in.
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -223,6 +223,45 @@ mod tests {
        assert_eq!(part, expected);
    }

+    #[test]
+    fn v2_indexpart_is_parsed_with_deleted_at() {
+        let example = r#"{
+            "version":2,
+            "timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"],
+            "missing_layers":["This shouldn't fail deserialization"],
+            "layer_metadata":{
+                "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
+                "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 }
+            },
+            "disk_consistent_lsn":"0/16960E8",
+            "metadata_bytes":[112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
+            "deleted_at": "2023-07-31T09:00:00.123"
+        }"#;
+
+        let expected = IndexPart {
+            // note: this is not verified and could be anything, but it exists for humans debugging. Could it be the git version instead?
+            version: 2,
+            timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap()]),
+            layer_metadata: HashMap::from([
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
+                    file_size: 25600000,
+                }),
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
+                    // serde_json should always parse this, but with jq, for example, it might
+                    // be read as a double.
+                    file_size: 9007199254741001,
+                })
+            ]),
+            disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
+            metadata_bytes: [112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
+            deleted_at: Some(chrono::NaiveDateTime::parse_from_str(
+                "2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap())
+        };
+
+        let part = serde_json::from_str::<IndexPart>(example).unwrap();
+        assert_eq!(part, expected);
+    }
+
    #[test]
    fn empty_layers_are_parsed() {
        let empty_layers_json = r#"{
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -9,7 +9,7 @@ mod remote_layer;

 use crate::config::PageServerConf;
 use crate::context::RequestContext;
-use crate::repository::{Key, Value};
+use crate::repository::Key;
 use crate::task_mgr::TaskKind;
 use crate::walrecord::NeonWalRecord;
 use anyhow::Result;
@@ -34,7 +34,7 @@ use utils::{
    lsn::Lsn,
 };

-pub use delta_layer::{DeltaLayer, DeltaLayerWriter};
+pub use delta_layer::{DeltaLayer, DeltaLayerWriter, ValueRef};
 pub use filename::{DeltaFileName, ImageFileName, LayerFileName};
 pub use image_layer::{ImageLayer, ImageLayerWriter};
 pub use inmemory_layer::InMemoryLayer;
@@ -381,12 +381,6 @@ pub trait Layer: std::fmt::Debug + std::fmt::Display + Send + Sync + 'static {
    async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()>;
 }

-/// Returned by [`PersistentLayer::iter`]
-pub type LayerIter<'i> = Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>> + 'i + Send>;
-
-/// Returned by [`PersistentLayer::key_iter`]
-pub type LayerKeyIter<'i> = Box<dyn Iterator<Item = (Key, Lsn, u64)> + 'i + Send>;
-
 /// Get a layer descriptor from a layer.
 pub trait AsLayerDesc {
    /// Get the layer descriptor.
@@ -427,15 +421,6 @@ pub trait PersistentLayer: Layer + AsLayerDesc {
    // `None` for `RemoteLayer`.
    fn local_path(&self) -> Option<PathBuf>;

-    /// Iterate through all keys and values stored in the layer
-    fn iter(&self, ctx: &RequestContext) -> Result<LayerIter<'_>>;
-
-    /// Iterate through all keys stored in the layer. Returns key, lsn and value size
-    /// It is used only for compaction and so is currently implemented only for DeltaLayer
-    fn key_iter(&self, _ctx: &RequestContext) -> Result<LayerKeyIter<'_>> {
-        panic!("Not implemented")
-    }
-
    /// Permanently remove this layer from disk.
    fn delete_resident_layer_file(&self) -> Result<()>;

--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -61,8 +61,8 @@ use utils::{
 };

 use super::{
-    AsLayerDesc, DeltaFileName, Layer, LayerAccessStats, LayerAccessStatsReset, LayerIter,
-    LayerKeyIter, PathOrConf, PersistentLayerDesc,
+    AsLayerDesc, DeltaFileName, Layer, LayerAccessStats, LayerAccessStatsReset, PathOrConf,
+    PersistentLayerDesc,
 };

 ///
@@ -189,7 +189,7 @@ pub struct DeltaLayer {

    access_stats: LayerAccessStats,

-    inner: OnceCell<DeltaLayerInner>,
+    inner: OnceCell<Arc<DeltaLayerInner>>,
 }

 impl std::fmt::Debug for DeltaLayer {
@@ -256,12 +256,12 @@ impl Layer for DeltaLayer {
            file,
        );

-        tree_reader.dump()?;
+        tree_reader.dump().await?;

-        let mut cursor = file.block_cursor();
+        let cursor = file.block_cursor();

        // A subroutine to dump a single blob
-        let mut dump_blob = |blob_ref: BlobRef| -> anyhow::Result<String> {
+        let dump_blob = |blob_ref: BlobRef| -> anyhow::Result<String> {
            let buf = cursor.read_blob(blob_ref.pos())?;
            let val = Value::des(&buf)?;
            let desc = match val {
@@ -343,7 +343,7 @@ impl Layer for DeltaLayer {
            })?;

            // Ok, 'offsets' now contains the offsets of all the entries we need to read
-            let mut cursor = file.block_cursor();
+            let cursor = file.block_cursor();
            let mut buf = Vec::new();
            for (entry_lsn, pos) in offsets {
                cursor.read_blob_into_buf(pos, &mut buf).with_context(|| {
@@ -424,23 +424,6 @@ impl PersistentLayer for DeltaLayer {
        Some(self.path())
    }

-    fn iter(&self, ctx: &RequestContext) -> Result<LayerIter<'_>> {
-        let inner = self
-            .load(LayerAccessKind::KeyIter, ctx)
-            .context("load delta layer")?;
-        Ok(match DeltaValueIter::new(inner) {
-            Ok(iter) => Box::new(iter),
-            Err(err) => Box::new(std::iter::once(Err(err))),
-        })
-    }
-
-    fn key_iter(&self, ctx: &RequestContext) -> Result<LayerKeyIter<'_>> {
-        let inner = self.load(LayerAccessKind::KeyIter, ctx)?;
-        Ok(Box::new(
-            DeltaKeyIter::new(inner).context("Layer index is corrupted")?,
-        ))
-    }
-
    fn delete_resident_layer_file(&self) -> Result<()> {
        // delete underlying file
        fs::remove_file(self.path())?;
@@ -510,7 +493,11 @@ impl DeltaLayer {
    /// Open the underlying file and read the metadata into memory, if it's
    /// not loaded already.
    ///
-    fn load(&self, access_kind: LayerAccessKind, ctx: &RequestContext) -> Result<&DeltaLayerInner> {
+    fn load(
+        &self,
+        access_kind: LayerAccessKind,
+        ctx: &RequestContext,
+    ) -> Result<&Arc<DeltaLayerInner>> {
        self.access_stats
            .record_access(access_kind, ctx.task_kind());
        // Quick exit if already loaded
@@ -519,7 +506,7 @@ impl DeltaLayer {
            .with_context(|| format!("Failed to load delta layer {}", self.path().display()))
    }

-    fn load_inner(&self) -> Result<DeltaLayerInner> {
+    fn load_inner(&self) -> Result<Arc<DeltaLayerInner>> {
        let path = self.path();

        let file = VirtualFile::open(&path)
@@ -554,11 +541,11 @@ impl DeltaLayer {

        debug!("loaded from {}", &path.display());

-        Ok(DeltaLayerInner {
+        Ok(Arc::new(DeltaLayerInner {
            file,
            index_start_blk: actual_summary.index_start_blk,
            index_root_blk: actual_summary.index_root_blk,
-        })
+        }))
    }

    /// Create a DeltaLayer struct representing an existing file on disk.
@@ -623,6 +610,24 @@ impl DeltaLayer {
            &self.layer_name(),
        )
    }
+
+    /// Obtains all keys and value references stored in the layer
+    ///
+    /// The value can be obtained via the [`ValueRef::load`] function.
+    pub fn load_val_refs(&self, ctx: &RequestContext) -> Result<Vec<(Key, Lsn, ValueRef)>> {
+        let inner = self
+            .load(LayerAccessKind::KeyIter, ctx)
+            .context("load delta layer")?;
+        DeltaLayerInner::load_val_refs(inner).context("Layer index is corrupted")
+    }
+
+    /// Loads all keys stored in the layer. Returns key, lsn and value size.
+    pub fn load_keys(&self, ctx: &RequestContext) -> Result<Vec<(Key, Lsn, u64)>> {
+        let inner = self
+            .load(LayerAccessKind::KeyIter, ctx)
+            .context("load delta layer keys")?;
+        inner.load_keys().context("Layer index is corrupted")
+    }
 }

 /// A builder object for constructing a new delta layer.
@@ -893,121 +898,41 @@ impl Drop for DeltaLayerWriter {
    }
 }

-///
-/// Iterator over all key-value pairse stored in a delta layer
-///
-/// FIXME: This creates a Vector to hold the offsets of all key value pairs.
-/// That takes up quite a lot of memory. Should do this in a more streaming
-/// fashion.
-///
-struct DeltaValueIter<'a> {
-    all_offsets: Vec<(DeltaKey, BlobRef)>,
-    next_idx: usize,
-    reader: BlockCursor<Adapter<'a>>,
-}
-
-struct Adapter<'a>(&'a DeltaLayerInner);
-
-impl<'a> BlockReader for Adapter<'a> {
-    type BlockLease = PageReadGuard<'static>;
-
-    fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, std::io::Error> {
-        self.0.file.read_blk(blknum)
-    }
-}
-
-impl<'a> Iterator for DeltaValueIter<'a> {
-    type Item = Result<(Key, Lsn, Value)>;
-
-    fn next(&mut self) -> Option<Self::Item> {
-        self.next_res().transpose()
-    }
-}
-
-impl<'a> DeltaValueIter<'a> {
-    fn new(inner: &'a DeltaLayerInner) -> Result<Self> {
-        let file = &inner.file;
+impl DeltaLayerInner {
+    fn load_val_refs(this: &Arc<DeltaLayerInner>) -> Result<Vec<(Key, Lsn, ValueRef)>> {
+        let file = &this.file;
        let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
-            inner.index_start_blk,
-            inner.index_root_blk,
+            this.index_start_blk,
+            this.index_root_blk,
            file,
        );

-        let mut all_offsets: Vec<(DeltaKey, BlobRef)> = Vec::new();
+        let mut all_offsets = Vec::<(Key, Lsn, ValueRef)>::new();
        tree_reader.visit(
            &[0u8; DELTA_KEY_SIZE],
            VisitDirection::Forwards,
            |key, value| {
-                all_offsets.push((DeltaKey::from_slice(key), BlobRef(value)));
+                let delta_key = DeltaKey::from_slice(key);
+                let val_ref = ValueRef {
+                    blob_ref: BlobRef(value),
+                    reader: BlockCursor::new(Adapter(this.clone())),
+                };
+                all_offsets.push((delta_key.key(), delta_key.lsn(), val_ref));
                true
            },
        )?;

-        let iter = DeltaValueIter {
-            all_offsets,
-            next_idx: 0,
-            reader: BlockCursor::new(Adapter(inner)),
-        };
-
-        Ok(iter)
+        Ok(all_offsets)
    }
-
-    fn next_res(&mut self) -> Result<Option<(Key, Lsn, Value)>> {
-        if self.next_idx < self.all_offsets.len() {
-            let (delta_key, blob_ref) = &self.all_offsets[self.next_idx];
-
-            let key = delta_key.key();
-            let lsn = delta_key.lsn();
-
-            let buf = self.reader.read_blob(blob_ref.pos())?;
-            let val = Value::des(&buf)?;
-            self.next_idx += 1;
-            Ok(Some((key, lsn, val)))
-        } else {
-            Ok(None)
-        }
-    }
-}
-///
-/// Iterator over all keys stored in a delta layer
-///
-/// FIXME: This creates a Vector to hold all keys.
-/// That takes up quite a lot of memory. Should do this in a more streaming
-/// fashion.
-///
-struct DeltaKeyIter {
-    all_keys: Vec<(DeltaKey, u64)>,
-    next_idx: usize,
-}
-
-impl Iterator for DeltaKeyIter {
-    type Item = (Key, Lsn, u64);
-
-    fn next(&mut self) -> Option<Self::Item> {
-        if self.next_idx < self.all_keys.len() {
-            let (delta_key, size) = &self.all_keys[self.next_idx];
-
-            let key = delta_key.key();
-            let lsn = delta_key.lsn();
-
-            self.next_idx += 1;
-            Some((key, lsn, *size))
-        } else {
-            None
-        }
-    }
-}
-
-impl<'a> DeltaKeyIter {
-    fn new(inner: &'a DeltaLayerInner) -> Result<Self> {
-        let file = &inner.file;
+    fn load_keys(&self) -> Result<Vec<(Key, Lsn, u64)>> {
+        let file = &self.file;
        let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
-            inner.index_start_blk,
-            inner.index_root_blk,
+            self.index_start_blk,
+            self.index_root_blk,
            file,
        );

-        let mut all_keys: Vec<(DeltaKey, u64)> = Vec::new();
+        let mut all_keys: Vec<(Key, Lsn, u64)> = Vec::new();
        tree_reader.visit(
            &[0u8; DELTA_KEY_SIZE],
            VisitDirection::Forwards,
@@ -1015,46 +940,48 @@ impl<'a> DeltaKeyIter {
                let delta_key = DeltaKey::from_slice(key);
                let pos = BlobRef(value).pos();
                if let Some(last) = all_keys.last_mut() {
-                    if last.0.key() == delta_key.key() {
+                    if last.0 == delta_key.key() {
                        return true;
                    } else {
                        // subtract offset of new key BLOB and first blob of this key
                        // to get total size if values associated with this key
-                        let first_pos = last.1;
-                        last.1 = pos - first_pos;
+                        let first_pos = last.2;
+                        last.2 = pos - first_pos;
                    }
                }
-                all_keys.push((delta_key, pos));
+                all_keys.push((delta_key.key(), delta_key.lsn(), pos));
                true
            },
        )?;
        if let Some(last) = all_keys.last_mut() {
            // Last key occupies all space till end of layer
-            last.1 = std::fs::metadata(&file.file.path)?.len() - last.1;
+            last.2 = std::fs::metadata(&file.file.path)?.len() - last.2;
        }
-        let iter = DeltaKeyIter {
-            all_keys,
-            next_idx: 0,
-        };
-
-        Ok(iter)
+        Ok(all_keys)
    }
 }

-#[cfg(test)]
-mod test {
-    use super::DeltaKeyIter;
-    use super::DeltaLayer;
-    use super::DeltaValueIter;
+/// Reference to an on-disk value
+pub struct ValueRef {
+    blob_ref: BlobRef,
+    reader: BlockCursor<Adapter>,
+}

-    // We will soon need the iters to be send in the compaction code.
-    // Cf https://github.com/neondatabase/neon/pull/4462#issuecomment-1587398883
-    // Cf https://github.com/neondatabase/neon/issues/4471
-    #[test]
-    fn is_send() {
-        fn assert_send<T: Send>() {}
-        assert_send::<DeltaLayer>();
-        assert_send::<DeltaValueIter>();
-        assert_send::<DeltaKeyIter>();
+impl ValueRef {
+    /// Loads the value from disk
+    pub fn load(&self) -> Result<Value> {
+        let buf = self.reader.read_blob(self.blob_ref.pos())?;
+        let val = Value::des(&buf)?;
+        Ok(val)
+    }
+}
+
+struct Adapter(Arc<DeltaLayerInner>);
+
+impl BlockReader for Adapter {
+    type BlockLease = PageReadGuard<'static>;
+
+    fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, std::io::Error> {
+        self.0.file.read_blk(blknum)
    }
 }
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -38,6 +38,7 @@ use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
 use anyhow::{bail, ensure, Context, Result};
 use bytes::Bytes;
 use hex;
+use once_cell::sync::OnceCell;
 use pageserver_api::models::{HistoricLayerInfo, LayerAccessKind};
 use rand::{distributions::Alphanumeric, Rng};
 use serde::{Deserialize, Serialize};
@@ -47,7 +48,6 @@ use std::io::{Seek, SeekFrom};
 use std::ops::Range;
 use std::os::unix::prelude::FileExt;
 use std::path::{Path, PathBuf};
-use std::sync::{RwLock, RwLockReadGuard};
 use tracing::*;

 use utils::{
@@ -57,9 +57,7 @@ use utils::{
 };

 use super::filename::ImageFileName;
-use super::{
-    AsLayerDesc, Layer, LayerAccessStatsReset, LayerIter, PathOrConf, PersistentLayerDesc,
-};
+use super::{AsLayerDesc, Layer, LayerAccessStatsReset, PathOrConf, PersistentLayerDesc};

 ///
 /// Header stored in the beginning of the file
@@ -117,7 +115,7 @@ pub struct ImageLayer {

    access_stats: LayerAccessStats,

-    inner: RwLock<ImageLayerInner>,
+    inner: OnceCell<ImageLayerInner>,
 }

 impl std::fmt::Debug for ImageLayer {
@@ -134,21 +132,17 @@ impl std::fmt::Debug for ImageLayer {
 }

 pub struct ImageLayerInner {
-    /// If false, the 'index' has not been loaded into memory yet.
-    loaded: bool,
-
    // values copied from summary
    index_start_blk: u32,
    index_root_blk: u32,

-    /// Reader object for reading blocks from the file. (None if not loaded yet)
-    file: Option<FileBlockReader<VirtualFile>>,
+    /// Reader object for reading blocks from the file.
+    file: FileBlockReader<VirtualFile>,
 }

 impl std::fmt::Debug for ImageLayerInner {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("ImageLayerInner")
-            .field("loaded", &self.loaded)
            .field("index_start_blk", &self.index_start_blk)
            .field("index_root_blk", &self.index_root_blk)
            .finish()
@@ -175,11 +169,11 @@ impl Layer for ImageLayer {
        }

        let inner = self.load(LayerAccessKind::Dump, ctx)?;
-        let file = inner.file.as_ref().unwrap();
+        let file = &inner.file;
        let tree_reader =
            DiskBtreeReader::<_, KEY_SIZE>::new(inner.index_start_blk, inner.index_root_blk, file);

-        tree_reader.dump()?;
+        tree_reader.dump().await?;

        tree_reader.visit(&[0u8; KEY_SIZE], VisitDirection::Forwards, |key, value| {
            println!("key: {} offset {}", hex::encode(key), value);
@@ -203,7 +197,7 @@ impl Layer for ImageLayer {

        let inner = self.load(LayerAccessKind::GetValueReconstructData, ctx)?;

-        let file = inner.file.as_ref().unwrap();
+        let file = &inner.file;
        let tree_reader = DiskBtreeReader::new(inner.index_start_blk, inner.index_root_blk, file);

        let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
@@ -259,10 +253,6 @@ impl PersistentLayer for ImageLayer {
        Some(self.path())
    }

-    fn iter(&self, _ctx: &RequestContext) -> Result<LayerIter<'_>> {
-        unimplemented!();
-    }
-
    fn delete_resident_layer_file(&self) -> Result<()> {
        // delete underlying file
        fs::remove_file(self.path())?;
@@ -322,52 +312,26 @@ impl ImageLayer {
    /// Open the underlying file and read the metadata into memory, if it's
    /// not loaded already.
    ///
-    fn load(
-        &self,
-        access_kind: LayerAccessKind,
-        ctx: &RequestContext,
-    ) -> Result<RwLockReadGuard<ImageLayerInner>> {
+    fn load(&self, access_kind: LayerAccessKind, ctx: &RequestContext) -> Result<&ImageLayerInner> {
        self.access_stats
            .record_access(access_kind, ctx.task_kind());
        loop {
-            // Quick exit if already loaded
-            let inner = self.inner.read().unwrap();
-            if inner.loaded {
+            if let Some(inner) = self.inner.get() {
                return Ok(inner);
            }
-
-            // Need to open the file and load the metadata. Upgrade our lock to
-            // a write lock. (Or rather, release and re-lock in write mode.)
-            drop(inner);
-            let mut inner = self.inner.write().unwrap();
-            if !inner.loaded {
-                self.load_inner(&mut inner).with_context(|| {
-                    format!("Failed to load image layer {}", self.path().display())
-                })?
-            } else {
-                // Another thread loaded it while we were not holding the lock.
-            }
-
-            // We now have the file open and loaded. There's no function to do
-            // that in the std library RwLock, so we have to release and re-lock
-            // in read mode. (To be precise, the lock guard was moved in the
-            // above call to `load_inner`, so it's already been released). And
-            // while we do that, another thread could unload again, so we have
-            // to re-check and retry if that happens.
-            drop(inner);
+            self.inner
+                .get_or_try_init(|| self.load_inner())
+                .with_context(|| format!("Failed to load image layer {}", self.path().display()))?;
        }
    }

-    fn load_inner(&self, inner: &mut ImageLayerInner) -> Result<()> {
+    fn load_inner(&self) -> Result<ImageLayerInner> {
        let path = self.path();

        // Open the file if it's not open already.
-        if inner.file.is_none() {
-            let file = VirtualFile::open(&path)
-                .with_context(|| format!("Failed to open file '{}'", path.display()))?;
-            inner.file = Some(FileBlockReader::new(file));
-        }
-        let file = inner.file.as_mut().unwrap();
+        let file = VirtualFile::open(&path)
+            .with_context(|| format!("Failed to open file '{}'", path.display()))?;
+        let file = FileBlockReader::new(file);
        let summary_blk = file.read_blk(0)?;
        let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;

@@ -395,10 +359,11 @@ impl ImageLayer {
            }
        }

-        inner.index_start_blk = actual_summary.index_start_blk;
-        inner.index_root_blk = actual_summary.index_root_blk;
-        inner.loaded = true;
-        Ok(())
+        Ok(ImageLayerInner {
+            index_start_blk: actual_summary.index_start_blk,
+            index_root_blk: actual_summary.index_root_blk,
+            file,
+        })
    }

    /// Create an ImageLayer struct representing an existing file on disk
@@ -422,12 +387,7 @@ impl ImageLayer {
            ), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
            lsn: filename.lsn,
            access_stats,
-            inner: RwLock::new(ImageLayerInner {
-                loaded: false,
-                file: None,
-                index_start_blk: 0,
-                index_root_blk: 0,
-            }),
+            inner: OnceCell::new(),
        }
    }

@@ -454,12 +414,7 @@ impl ImageLayer {
            ), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
            lsn: summary.lsn,
            access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
-            inner: RwLock::new(ImageLayerInner {
-                file: None,
-                loaded: false,
-                index_start_blk: 0,
-                index_root_blk: 0,
-            }),
+            inner: OnceCell::new(),
        })
    }

@@ -620,12 +575,7 @@ impl ImageLayerWriterInner {
            desc,
            lsn: self.lsn,
            access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
-            inner: RwLock::new(ImageLayerInner {
-                loaded: false,
-                file: None,
-                index_start_blk,
-                index_root_blk,
-            }),
+            inner: OnceCell::new(),
        };

        // fsync the file
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -151,7 +151,7 @@ impl Layer for InMemoryLayer {
            return Ok(());
        }

-        let mut cursor = inner.file.block_cursor();
+        let cursor = inner.file.block_cursor();
        let mut buf = Vec::new();
        for (key, vec_map) in inner.index.iter() {
            for (lsn, pos) in vec_map.as_slice() {
@@ -196,7 +196,7 @@ impl Layer for InMemoryLayer {

        let inner = self.inner.read().unwrap();

-        let mut reader = inner.file.block_cursor();
+        let reader = inner.file.block_cursor();

        // Scan the page versions backwards, starting from `lsn`.
        if let Some(vec_map) = inner.index.get(&key) {
@@ -354,7 +354,7 @@ impl InMemoryLayer {

        let mut buf = Vec::new();

-        let mut cursor = inner.file.block_cursor();
+        let cursor = inner.file.block_cursor();

        let mut keys: Vec<(&Key, &VecMap<Lsn, u64>)> = inner.index.iter().collect();
        keys.sort_by_key(|k| k.0);
--- a/pageserver/src/tenant/storage_layer/remote_layer.rs
+++ b/pageserver/src/tenant/storage_layer/remote_layer.rs
@@ -20,8 +20,8 @@ use utils::{

 use super::filename::{DeltaFileName, ImageFileName};
 use super::{
-    AsLayerDesc, DeltaLayer, ImageLayer, LayerAccessStats, LayerAccessStatsReset, LayerIter,
-    LayerKeyIter, LayerResidenceStatus, PersistentLayer, PersistentLayerDesc,
+    AsLayerDesc, DeltaLayer, ImageLayer, LayerAccessStats, LayerAccessStatsReset,
+    LayerResidenceStatus, PersistentLayer, PersistentLayerDesc,
 };

 /// RemoteLayer is a not yet downloaded [`ImageLayer`] or
@@ -129,14 +129,6 @@ impl PersistentLayer for RemoteLayer {
        None
    }

-    fn iter(&self, _ctx: &RequestContext) -> Result<LayerIter<'_>> {
-        bail!("cannot iterate a remote layer");
-    }
-
-    fn key_iter(&self, _ctx: &RequestContext) -> Result<LayerKeyIter<'_>> {
-        bail!("cannot iterate a remote layer");
-    }
-
    fn delete_resident_layer_file(&self) -> Result<()> {
        bail!("remote layer has no layer file");
    }
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -103,6 +103,11 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                }
            }

+            if cancel.is_cancelled() {
+                info!("received cancellation request");
+                break;
+            }
+
            let started_at = Instant::now();

            let sleep_duration = if period == Duration::ZERO {
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1,3 +1,4 @@
+pub mod delete;
 mod eviction_task;
 pub mod layer_manager;
 mod logical_size;
@@ -79,6 +80,7 @@ use crate::METADATA_FILE_NAME;
 use crate::ZERO_PAGE;
 use crate::{is_temporary, task_mgr};

+use self::delete::DeleteTimelineFlow;
 pub(super) use self::eviction_task::EvictionTaskTenantState;
 use self::eviction_task::EvictionTaskTimelineState;
 use self::layer_manager::LayerManager;
@@ -86,7 +88,6 @@ use self::logical_size::LogicalSize;
 use self::walreceiver::{WalReceiver, WalReceiverConf};

 use super::config::TenantConf;
-use super::delete::DeleteTimelineFlow;
 use super::remote_timeline_client::index::IndexPart;
 use super::remote_timeline_client::RemoteTimelineClient;
 use super::storage_layer::{
@@ -293,6 +294,10 @@ pub struct Timeline {
    /// Completion shared between all timelines loaded during startup; used to delay heavier
    /// background tasks until some logical sizes have been calculated.
    initial_logical_size_attempt: Mutex<Option<completion::Completion>>,
+
+    /// Load or creation time information about the disk_consistent_lsn and when the loading
+    /// happened. Used for consumption metrics.
+    pub(crate) loaded_at: (Lsn, SystemTime),
 }

 pub struct WalReceiverInfo {
@@ -523,7 +528,7 @@ impl Timeline {
        size
    }

-    pub fn get_resident_physical_size(&self) -> u64 {
+    pub fn resident_physical_size(&self) -> u64 {
        self.metrics.resident_physical_size_gauge.get()
    }

@@ -692,6 +697,9 @@ impl Timeline {
                Err(CompactionError::DownloadRequired(rls)) => {
                    anyhow::bail!("Compaction requires downloading multiple times (last was {} layers), possibly battling against eviction", rls.len())
                }
+                Err(CompactionError::ShuttingDown) => {
+                    return Ok(());
+                }
                Err(CompactionError::Other(e)) => {
                    return Err(e);
                }
@@ -773,7 +781,8 @@ impl Timeline {
        let layer_removal_cs = Arc::new(self.layer_removal_cs.clone().lock_owned().await);
        // Is the timeline being deleted?
        if self.is_stopping() {
-            return Err(anyhow::anyhow!("timeline is Stopping").into());
+            trace!("Dropping out of compaction on timeline shutdown");
+            return Err(CompactionError::ShuttingDown);
        }

        let target_file_size = self.get_checkpoint_distance();
@@ -1403,6 +1412,8 @@ impl Timeline {
                last_freeze_at: AtomicLsn::new(disk_consistent_lsn.0),
                last_freeze_ts: RwLock::new(Instant::now()),

+                loaded_at: (disk_consistent_lsn, SystemTime::now()),
+
                ancestor_timeline: ancestor,
                ancestor_lsn: metadata.ancestor_lsn(),

@@ -1599,7 +1610,7 @@ impl Timeline {
            if let Some(imgfilename) = ImageFileName::parse_str(&fname) {
                // create an ImageLayer struct for each image file.
                if imgfilename.lsn > disk_consistent_lsn {
-                    warn!(
+                    info!(
                        "found future image layer {} on timeline {} disk_consistent_lsn is {}",
                        imgfilename, self.timeline_id, disk_consistent_lsn
                    );
@@ -1631,7 +1642,7 @@ impl Timeline {
                // is 102, then it might not have been fully flushed to disk
                // before crash.
                if deltafilename.lsn_range.end > disk_consistent_lsn + 1 {
-                    warn!(
+                    info!(
                        "found future delta layer {} on timeline {} disk_consistent_lsn is {}",
                        deltafilename, self.timeline_id, disk_consistent_lsn
                    );
@@ -1773,7 +1784,7 @@ impl Timeline {
            match remote_layer_name {
                LayerFileName::Image(imgfilename) => {
                    if imgfilename.lsn > up_to_date_disk_consistent_lsn {
-                        warn!(
+                        info!(
                        "found future image layer {} on timeline {} remote_consistent_lsn is {}",
                        imgfilename, self.timeline_id, up_to_date_disk_consistent_lsn
                    );
@@ -1798,7 +1809,7 @@ impl Timeline {
                    // is 102, then it might not have been fully flushed to disk
                    // before crash.
                    if deltafilename.lsn_range.end > up_to_date_disk_consistent_lsn + 1 {
-                        warn!(
+                        info!(
                            "found future delta layer {} on timeline {} remote_consistent_lsn is {}",
                            deltafilename, self.timeline_id, up_to_date_disk_consistent_lsn
                        );
@@ -3228,6 +3239,8 @@ enum CompactionError {
    /// This should not happen repeatedly, but will be retried once by top-level
    /// `Timeline::compact`.
    DownloadRequired(Vec<Arc<RemoteLayer>>),
+    /// The timeline or pageserver is shutting down
+    ShuttingDown,
    /// Compaction cannot be done right now; page reconstruction and so on.
    Other(anyhow::Error),
 }
@@ -3506,7 +3519,13 @@ impl Timeline {
        let mut heap: BinaryHeap<Hole> = BinaryHeap::with_capacity(max_holes + 1);
        let mut prev: Option<Key> = None;
        for (next_key, _next_lsn, _size) in itertools::process_results(
-            deltas_to_compact.iter().map(|l| l.key_iter(ctx)),
+            deltas_to_compact.iter().map(|l| -> Result<_> {
+                Ok(l.clone()
+                    .downcast_delta_layer()
+                    .expect("delta layer")
+                    .load_keys(ctx)?
+                    .into_iter())
+            }),
            |iter_iter| iter_iter.kmerge_by(|a, b| a.0 < b.0),
        )? {
            if let Some(prev_key) = prev {
@@ -3542,25 +3561,31 @@ impl Timeline {
        // This iterator walks through all key-value pairs from all the layers
        // we're compacting, in key, LSN order.
        let all_values_iter = itertools::process_results(
-            deltas_to_compact.iter().map(|l| l.iter(ctx)),
+            deltas_to_compact.iter().map(|l| -> Result<_> {
+                Ok(l.clone()
+                    .downcast_delta_layer()
+                    .expect("delta layer")
+                    .load_val_refs(ctx)?
+                    .into_iter())
+            }),
            |iter_iter| {
                iter_iter.kmerge_by(|a, b| {
-                    if let Ok((a_key, a_lsn, _)) = a {
-                        if let Ok((b_key, b_lsn, _)) = b {
-                            (a_key, a_lsn) < (b_key, b_lsn)
-                        } else {
-                            false
-                        }
-                    } else {
-                        true
-                    }
+                    let (a_key, a_lsn, _) = a;
+                    let (b_key, b_lsn, _) = b;
+                    (a_key, a_lsn) < (b_key, b_lsn)
                })
            },
        )?;

        // This iterator walks through all keys and is needed to calculate size used by each key
        let mut all_keys_iter = itertools::process_results(
-            deltas_to_compact.iter().map(|l| l.key_iter(ctx)),
+            deltas_to_compact.iter().map(|l| -> Result<_> {
+                Ok(l.clone()
+                    .downcast_delta_layer()
+                    .expect("delta layer")
+                    .load_keys(ctx)?
+                    .into_iter())
+            }),
            |iter_iter| {
                iter_iter.kmerge_by(|a, b| {
                    let (a_key, a_lsn, _) = a;
@@ -3622,8 +3647,8 @@ impl Timeline {
        let mut key_values_total_size = 0u64;
        let mut dup_start_lsn: Lsn = Lsn::INVALID; // start LSN of layer containing values of the single key
        let mut dup_end_lsn: Lsn = Lsn::INVALID; // end LSN of layer containing values of the single key
-        for x in all_values_iter {
-            let (key, lsn, value) = x?;
+        for (key, lsn, value_ref) in all_values_iter {
+            let value = value_ref.load()?;
            let same_key = prev_key.map_or(false, |prev_key| prev_key == key);
            // We need to check key boundaries once we reach next key or end of layer with the same key
            if !same_key || lsn == dup_end_lsn {
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -15,15 +15,17 @@ use utils::{
 use crate::{
    config::PageServerConf,
    task_mgr::{self, TaskKind},
-    tenant::{remote_timeline_client, DeleteTimelineError},
+    tenant::{
+        metadata::TimelineMetadata,
+        remote_timeline_client::{
+            self, PersistIndexPartWithDeletedFlagError, RemoteTimelineClient,
+        },
+        CreateTimelineCause, DeleteTimelineError, Tenant,
+    },
    InitializationOrder,
 };

-use super::{
-    metadata::TimelineMetadata,
-    remote_timeline_client::{PersistIndexPartWithDeletedFlagError, RemoteTimelineClient},
-    CreateTimelineCause, Tenant, Timeline,
-};
+use super::Timeline;

 /// Now that the Timeline is in Stopping state, request all the related tasks to shut down.
 async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -308,8 +308,13 @@ impl Timeline {
        ctx: &RequestContext,
    ) -> ControlFlow<()> {
        let mut state = self.eviction_task_timeline_state.lock().await;
+
+        // Only do the imitate_layer accesses approximately as often as the threshold.  A little
+        // more frequently, to avoid this period racing with the threshold/period-th eviction iteration.
+        let inter_imitate_period = p.threshold.checked_sub(p.period).unwrap_or(p.threshold);
+
        match state.last_layer_access_imitation {
-            Some(ts) if ts.elapsed() < p.threshold => { /* no need to run */ }
+            Some(ts) if ts.elapsed() < inter_imitate_period => { /* no need to run */ }
            _ => {
                self.imitate_timeline_cached_layer_accesses(cancel, ctx)
                    .await;
@@ -332,7 +337,7 @@ impl Timeline {
        };
        let mut state = tenant.eviction_task_tenant_state.lock().await;
        match state.last_layer_access_imitation {
-            Some(ts) if ts.elapsed() < p.threshold => { /* no need to run */ }
+            Some(ts) if ts.elapsed() < inter_imitate_period => { /* no need to run */ }
            _ => {
                self.imitate_synthetic_size_calculation_worker(&tenant, ctx, cancel)
                    .await;
--- a/pgxn/neon/Makefile
+++ b/pgxn/neon/Makefile
@@ -4,6 +4,7 @@
 MODULE_big = neon
 OBJS = \
 	$(WIN32RES) \
+	extension_server.o \
 	file_cache.o \
 	libpagestore.o \
 	libpqwalproposer.o \
--- a/pgxn/neon/extension_server.c
+++ b/pgxn/neon/extension_server.c
@@ -0,0 +1,103 @@
+
+/*-------------------------------------------------------------------------
+ *
+ * extension_server.c
+ *	  Request compute_ctl to download extension files.
+ *
+ * IDENTIFICATION
+ *	 pgxn/neon/extension_server.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+#include "tcop/pquery.h"
+#include "tcop/utility.h"
+#include "access/xact.h"
+#include "utils/hsearch.h"
+#include "utils/memutils.h"
+#include "commands/defrem.h"
+#include "miscadmin.h"
+#include "utils/acl.h"
+#include "fmgr.h"
+#include "utils/guc.h"
+#include "port.h"
+#include "fmgr.h"
+
+#include <curl/curl.h>
+
+static int extension_server_port = 0;
+
+static download_extension_file_hook_type prev_download_extension_file_hook = NULL;
+
+/*
+ * Ask compute_ctl (HTTP, on localhost) to download files for an extension.
+ *
+ * To download all SQL (and data) files for an extension:
+ *   curl -X POST http://localhost:8080/extension_server/postgis
+ * This covers two possible extension file layouts:
+ *   1. extension_name--version--platform.sql
+ *   2. extension_name/extension_name--version.sql
+ *      extension_name/extra_files.csv
+ *
+ * To download a specific library file:
+ *   curl -X POST http://localhost:8080/extension_server/postgis-3.so?is_library=true
+ *
+ * Returns true if curl reports CURLE_OK; otherwise emits a WARNING and returns false.
+ */
+static bool
+neon_download_extension_file_http(const char *filename, bool is_library)
+{
+    CURL *curl;
+    CURLcode res;
+    char *compute_ctl_url;
+    bool ret = false;
+
+    /* Build the URL first: if psprintf() raises ERROR, no curl handle exists yet to leak. */
+    compute_ctl_url = psprintf("http://localhost:%d/extension_server/%s%s",
+                               extension_server_port, filename, is_library ? "?is_library=true" : "");
+
+    if ((curl = curl_easy_init()) == NULL)
+        elog(ERROR, "Failed to initialize curl handle");
+
+    elog(LOG, "Sending request to compute_ctl: %s", compute_ctl_url);
+
+    curl_easy_setopt(curl, CURLOPT_CUSTOMREQUEST, "POST");
+    curl_easy_setopt(curl, CURLOPT_URL, compute_ctl_url);
+    curl_easy_setopt(curl, CURLOPT_TIMEOUT, 3L /* seconds */);
+
+    /* Perform the request; res receives curl's result code. */
+    res = curl_easy_perform(curl);
+
+    /* Always clean up; curl_easy_strerror() below only needs the result code. */
+    curl_easy_cleanup(curl);
+    pfree(compute_ctl_url);
+
+    if (res == CURLE_OK)
+        ret = true;
+    else
+    {
+        /* Don't error here because postgres will try to find the file */
+        /* and will fail with some proper error message if it's not found. */
+        elog(WARNING, "neon_download_extension_file_http failed: %s", curl_easy_strerror(res));
+    }
+
+    return ret;
+}
+
+void pg_init_extension_server(void)
+{
+    /* Register the GUC holding the localhost port on which compute_ctl */
+    /* serves extension file requests. */
+    DefineCustomIntVariable("neon.extension_server_port",
+                            "Port of the compute_ctl HTTP server on localhost",
+                            NULL,
+                            &extension_server_port,
+                            0, 0, INT_MAX,
+                            PGC_POSTMASTER,
+                            0, /* no flags required */
+                            NULL, NULL, NULL);
+
+    /* Remember the previously installed hook, then install ours. */
+    prev_download_extension_file_hook = download_extension_file_hook;
+    download_extension_file_hook = neon_download_extension_file_http;
+}
--- a/pgxn/neon/neon.c
+++ b/pgxn/neon/neon.c
@@ -35,8 +35,11 @@ _PG_init(void)
 {
 	pg_init_libpagestore();
 	pg_init_walproposer();
+
 	InitControlPlaneConnector();

+	pg_init_extension_server();
+
        // Important: This must happen after other parts of the extension
        // are loaded, otherwise any settings to GUCs that were set before
        // the extension was loaded will be removed.
--- a/pgxn/neon/neon.h
+++ b/pgxn/neon/neon.h
@@ -21,6 +21,8 @@ extern char *neon_tenant;
 extern void pg_init_libpagestore(void);
 extern void pg_init_walproposer(void);

+extern void pg_init_extension_server(void);
+
 /*
 * Returns true if we shouldn't do REDO on that block in record indicated by
 * block_id; false otherwise.
--- a/poetry.lock
+++ b/poetry.lock
@@ -740,13 +740,13 @@ typing-extensions = ">=4.1.0"

 [[package]]
 name = "certifi"
-version = "2022.12.7"
+version = "2023.7.22"
 description = "Python package for providing Mozilla's CA Bundle."
 optional = false
 python-versions = ">=3.6"
 files = [
-    {file = "certifi-2022.12.7-py3-none-any.whl", hash = "sha256:4ad3232f5e926d6718ec31cfc1fcadfde020920e278684144551c91769c7bc18"},
-    {file = "certifi-2022.12.7.tar.gz", hash = "sha256:35824b4c3a97115964b408844d64aa14db1cc518f6562e8d7261699d1350a9e3"},
+    {file = "certifi-2023.7.22-py3-none-any.whl", hash = "sha256:92d6037539857d8206b8f6ae472e8b77db8058fec5937a1ef3f54304089edbb9"},
+    {file = "certifi-2023.7.22.tar.gz", hash = "sha256:539cc1d13202e33ca466e88b2807e29f4c13049d6d87031a3c110744495cb082"},
 ]

 [[package]]
@@ -887,34 +887,34 @@ files = [

 [[package]]
 name = "cryptography"
-version = "41.0.2"
+version = "41.0.3"
 description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers."
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "cryptography-41.0.2-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:01f1d9e537f9a15b037d5d9ee442b8c22e3ae11ce65ea1f3316a41c78756b711"},
-    {file = "cryptography-41.0.2-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:079347de771f9282fbfe0e0236c716686950c19dee1b76240ab09ce1624d76d7"},
-    {file = "cryptography-41.0.2-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:439c3cc4c0d42fa999b83ded80a9a1fb54d53c58d6e59234cfe97f241e6c781d"},
-    {file = "cryptography-41.0.2-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f14ad275364c8b4e525d018f6716537ae7b6d369c094805cae45300847e0894f"},
-    {file = "cryptography-41.0.2-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:84609ade00a6ec59a89729e87a503c6e36af98ddcd566d5f3be52e29ba993182"},
-    {file = "cryptography-41.0.2-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:49c3222bb8f8e800aead2e376cbef687bc9e3cb9b58b29a261210456a7783d83"},
-    {file = "cryptography-41.0.2-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:d73f419a56d74fef257955f51b18d046f3506270a5fd2ac5febbfa259d6c0fa5"},
-    {file = "cryptography-41.0.2-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:2a034bf7d9ca894720f2ec1d8b7b5832d7e363571828037f9e0c4f18c1b58a58"},
-    {file = "cryptography-41.0.2-cp37-abi3-win32.whl", hash = "sha256:d124682c7a23c9764e54ca9ab5b308b14b18eba02722b8659fb238546de83a76"},
-    {file = "cryptography-41.0.2-cp37-abi3-win_amd64.whl", hash = "sha256:9c3fe6534d59d071ee82081ca3d71eed3210f76ebd0361798c74abc2bcf347d4"},
-    {file = "cryptography-41.0.2-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:a719399b99377b218dac6cf547b6ec54e6ef20207b6165126a280b0ce97e0d2a"},
-    {file = "cryptography-41.0.2-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:182be4171f9332b6741ee818ec27daff9fb00349f706629f5cbf417bd50e66fd"},
-    {file = "cryptography-41.0.2-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:7a9a3bced53b7f09da251685224d6a260c3cb291768f54954e28f03ef14e3766"},
-    {file = "cryptography-41.0.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:f0dc40e6f7aa37af01aba07277d3d64d5a03dc66d682097541ec4da03cc140ee"},
-    {file = "cryptography-41.0.2-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:674b669d5daa64206c38e507808aae49904c988fa0a71c935e7006a3e1e83831"},
-    {file = "cryptography-41.0.2-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:7af244b012711a26196450d34f483357e42aeddb04128885d95a69bd8b14b69b"},
-    {file = "cryptography-41.0.2-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:9b6d717393dbae53d4e52684ef4f022444fc1cce3c48c38cb74fca29e1f08eaa"},
-    {file = "cryptography-41.0.2-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:192255f539d7a89f2102d07d7375b1e0a81f7478925b3bc2e0549ebf739dae0e"},
-    {file = "cryptography-41.0.2-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:f772610fe364372de33d76edcd313636a25684edb94cee53fd790195f5989d14"},
-    {file = "cryptography-41.0.2-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:b332cba64d99a70c1e0836902720887fb4529ea49ea7f5462cf6640e095e11d2"},
-    {file = "cryptography-41.0.2-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:9a6673c1828db6270b76b22cc696f40cde9043eb90373da5c2f8f2158957f42f"},
-    {file = "cryptography-41.0.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:342f3767e25876751e14f8459ad85e77e660537ca0a066e10e75df9c9e9099f0"},
-    {file = "cryptography-41.0.2.tar.gz", hash = "sha256:7d230bf856164de164ecb615ccc14c7fc6de6906ddd5b491f3af90d3514c925c"},
+    {file = "cryptography-41.0.3-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:652627a055cb52a84f8c448185922241dd5217443ca194d5739b44612c5e6507"},
+    {file = "cryptography-41.0.3-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:8f09daa483aedea50d249ef98ed500569841d6498aa9c9f4b0531b9964658922"},
+    {file = "cryptography-41.0.3-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4fd871184321100fb400d759ad0cddddf284c4b696568204d281c902fc7b0d81"},
+    {file = "cryptography-41.0.3-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:84537453d57f55a50a5b6835622ee405816999a7113267739a1b4581f83535bd"},
+    {file = "cryptography-41.0.3-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:3fb248989b6363906827284cd20cca63bb1a757e0a2864d4c1682a985e3dca47"},
+    {file = "cryptography-41.0.3-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:42cb413e01a5d36da9929baa9d70ca90d90b969269e5a12d39c1e0d475010116"},
+    {file = "cryptography-41.0.3-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:aeb57c421b34af8f9fe830e1955bf493a86a7996cc1338fe41b30047d16e962c"},
+    {file = "cryptography-41.0.3-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:6af1c6387c531cd364b72c28daa29232162010d952ceb7e5ca8e2827526aceae"},
+    {file = "cryptography-41.0.3-cp37-abi3-win32.whl", hash = "sha256:0d09fb5356f975974dbcb595ad2d178305e5050656affb7890a1583f5e02a306"},
+    {file = "cryptography-41.0.3-cp37-abi3-win_amd64.whl", hash = "sha256:a983e441a00a9d57a4d7c91b3116a37ae602907a7618b882c8013b5762e80574"},
+    {file = "cryptography-41.0.3-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:5259cb659aa43005eb55a0e4ff2c825ca111a0da1814202c64d28a985d33b087"},
+    {file = "cryptography-41.0.3-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:67e120e9a577c64fe1f611e53b30b3e69744e5910ff3b6e97e935aeb96005858"},
+    {file = "cryptography-41.0.3-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:7efe8041897fe7a50863e51b77789b657a133c75c3b094e51b5e4b5cec7bf906"},
+    {file = "cryptography-41.0.3-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:ce785cf81a7bdade534297ef9e490ddff800d956625020ab2ec2780a556c313e"},
+    {file = "cryptography-41.0.3-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:57a51b89f954f216a81c9d057bf1a24e2f36e764a1ca9a501a6964eb4a6800dd"},
+    {file = "cryptography-41.0.3-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:4c2f0d35703d61002a2bbdcf15548ebb701cfdd83cdc12471d2bae80878a4207"},
+    {file = "cryptography-41.0.3-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:23c2d778cf829f7d0ae180600b17e9fceea3c2ef8b31a99e3c694cbbf3a24b84"},
+    {file = "cryptography-41.0.3-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:95dd7f261bb76948b52a5330ba5202b91a26fbac13ad0e9fc8a3ac04752058c7"},
+    {file = "cryptography-41.0.3-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:41d7aa7cdfded09b3d73a47f429c298e80796c8e825ddfadc84c8a7f12df212d"},
+    {file = "cryptography-41.0.3-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:d0d651aa754ef58d75cec6edfbd21259d93810b73f6ec246436a21b7841908de"},
+    {file = "cryptography-41.0.3-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:ab8de0d091acbf778f74286f4989cf3d1528336af1b59f3e5d2ebca8b5fe49e1"},
+    {file = "cryptography-41.0.3-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:a74fbcdb2a0d46fe00504f571a2a540532f4c188e6ccf26f1f178480117b33c4"},
+    {file = "cryptography-41.0.3.tar.gz", hash = "sha256:6d192741113ef5e30d89dcb5b956ef4e1578f304708701b8b73d38e3e1461f34"},
 ]

 [package.dependencies]
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -53,6 +53,12 @@ pub enum BackendType<'a, T> {
    Postgres(Cow<'a, console::provider::mock::Api>, T),
    /// Authentication via a web browser.
    Link(Cow<'a, url::ApiUrl>),
+    /// Test backend.
+    Test(&'a dyn TestBackend),
+}
+
+pub trait TestBackend: Send + Sync + 'static {
+    fn wake_compute(&self) -> Result<CachedNodeInfo, console::errors::WakeComputeError>;
 }

 impl std::fmt::Display for BackendType<'_, ()> {
@@ -62,6 +68,7 @@ impl std::fmt::Display for BackendType<'_, ()> {
            Console(endpoint, _) => fmt.debug_tuple("Console").field(&endpoint.url()).finish(),
            Postgres(endpoint, _) => fmt.debug_tuple("Postgres").field(&endpoint.url()).finish(),
            Link(url) => fmt.debug_tuple("Link").field(&url.as_str()).finish(),
+            Test(_) => fmt.debug_tuple("Test").finish(),
        }
    }
 }
@@ -75,6 +82,7 @@ impl<T> BackendType<'_, T> {
            Console(c, x) => Console(Cow::Borrowed(c), x),
            Postgres(c, x) => Postgres(Cow::Borrowed(c), x),
            Link(c) => Link(Cow::Borrowed(c)),
+            Test(x) => Test(*x),
        }
    }
 }
@@ -89,6 +97,7 @@ impl<'a, T> BackendType<'a, T> {
            Console(c, x) => Console(c, f(x)),
            Postgres(c, x) => Postgres(c, f(x)),
            Link(c) => Link(c),
+            Test(x) => Test(x),
        }
    }
 }
@@ -102,6 +111,7 @@ impl<'a, T, E> BackendType<'a, Result<T, E>> {
            Console(c, x) => x.map(|x| Console(c, x)),
            Postgres(c, x) => x.map(|x| Postgres(c, x)),
            Link(c) => Ok(Link(c)),
+            Test(x) => Ok(Test(x)),
        }
    }
 }
@@ -147,6 +157,7 @@ impl BackendType<'_, ClientCredentials<'_>> {
            Console(_, creds) => creds.project.clone(),
            Postgres(_, creds) => creds.project.clone(),
            Link(_) => Some("link".to_owned()),
+            Test(_) => Some("test".to_owned()),
        }
    }
    /// Authenticate the client via the requested backend, possibly using credentials.
@@ -188,6 +199,9 @@ impl BackendType<'_, ClientCredentials<'_>> {
                    .await?
                    .map(CachedNodeInfo::new_uncached)
            }
+            Test(_) => {
+                unreachable!("this function should never be called in the test backend")
+            }
        };

        info!("user successfully authenticated");
@@ -206,6 +220,7 @@ impl BackendType<'_, ClientCredentials<'_>> {
            Console(api, creds) => api.wake_compute(extra, creds).map_ok(Some).await,
            Postgres(api, creds) => api.wake_compute(extra, creds).map_ok(Some).await,
            Link(_) => Ok(None),
+            Test(x) => x.wake_compute().map(Some),
        }
    }
 }
--- a/proxy/src/auth/backend/classic.rs
+++ b/proxy/src/auth/backend/classic.rs
@@ -1,8 +1,11 @@
+use std::ops::ControlFlow;
+
 use super::AuthSuccess;
 use crate::{
    auth::{self, AuthFlow, ClientCredentials},
    compute,
    console::{self, AuthInfo, CachedNodeInfo, ConsoleReqExtra},
+    proxy::handle_try_wake,
    sasl, scram,
    stream::PqStream,
 };
@@ -48,7 +51,16 @@ pub(super) async fn authenticate(
        }
    };

-    let mut node = api.wake_compute(extra, creds).await?;
+    info!("compute node's state has likely changed; requesting a wake-up");
+    let mut num_retries = 0;
+    let mut node = loop {
+        let wake_res = api.wake_compute(extra, creds).await;
+        match handle_try_wake(wake_res, num_retries)? {
+            ControlFlow::Continue(_) => num_retries += 1,
+            ControlFlow::Break(n) => break n,
+        }
+        info!(num_retries, "retrying wake compute");
+    };
    if let Some(keys) = scram_keys {
        use tokio_postgres::config::AuthKeys;
        node.config.auth_keys(AuthKeys::ScramSha256(keys));
--- a/proxy/src/console/provider.rs
+++ b/proxy/src/console/provider.rs
@@ -14,6 +14,7 @@ pub mod errors {
    use crate::{
        error::{io_error, UserFacingError},
        http,
+        proxy::ShouldRetry,
    };
    use thiserror::Error;

@@ -72,6 +73,24 @@ pub mod errors {
        }
    }

+    impl ShouldRetry for ApiError {
+        fn could_retry(&self) -> bool {
+            match self {
+                // retry some transport errors
+                Self::Transport(io) => io.could_retry(),
+                // retry some temporary failures because the compute was in a bad state
+                // (bad request can be returned when the endpoint was in transition)
+                Self::Console {
+                    status: http::StatusCode::BAD_REQUEST | http::StatusCode::LOCKED,
+                    ..
+                } => true,
+                // retry server errors
+                Self::Console { status, .. } if status.is_server_error() => true,
+                _ => false,
+            }
+        }
+    }
+
    impl From<reqwest::Error> for ApiError {
        fn from(e: reqwest::Error) -> Self {
            io_error(e).into()
--- a/proxy/src/http/sql_over_http.rs
+++ b/proxy/src/http/sql_over_http.rs
@@ -1,7 +1,9 @@
 use std::sync::Arc;

+use anyhow::bail;
 use futures::pin_mut;
 use futures::StreamExt;
+use hashbrown::HashMap;
 use hyper::body::HttpBody;
 use hyper::http::HeaderName;
 use hyper::http::HeaderValue;
@@ -12,6 +14,7 @@ use serde_json::Value;
 use tokio_postgres::types::Kind;
 use tokio_postgres::types::Type;
 use tokio_postgres::GenericClient;
+use tokio_postgres::IsolationLevel;
 use tokio_postgres::Row;
 use url::Url;

@@ -37,6 +40,8 @@ const MAX_REQUEST_SIZE: u64 = 1024 * 1024; // 1 MB
 static RAW_TEXT_OUTPUT: HeaderName = HeaderName::from_static("neon-raw-text-output");
 static ARRAY_MODE: HeaderName = HeaderName::from_static("neon-array-mode");
 static ALLOW_POOL: HeaderName = HeaderName::from_static("neon-pool-opt-in");
+static TXN_ISOLATION_LEVEL: HeaderName = HeaderName::from_static("neon-batch-isolation-level");
+static TXN_READ_ONLY: HeaderName = HeaderName::from_static("neon-batch-read-only");

 static HEADER_VALUE_TRUE: HeaderValue = HeaderValue::from_static("true");

@@ -170,7 +175,7 @@ pub async fn handle(
    request: Request<Body>,
    sni_hostname: Option<String>,
    conn_pool: Arc<GlobalConnPool>,
-) -> anyhow::Result<Value> {
+) -> anyhow::Result<(Value, HashMap<HeaderName, HeaderValue>)> {
    //
    // Determine the destination and connection params
    //
@@ -185,6 +190,23 @@ pub async fn handle(
    // Allow connection pooling only if explicitly requested
    let allow_pool = headers.get(&ALLOW_POOL) == Some(&HEADER_VALUE_TRUE);

+    // isolation level and read only
+
+    let txn_isolation_level_raw = headers.get(&TXN_ISOLATION_LEVEL).cloned();
+    let txn_isolation_level = match txn_isolation_level_raw {
+        Some(ref x) => Some(match x.as_bytes() {
+            b"Serializable" => IsolationLevel::Serializable,
+            b"ReadUncommitted" => IsolationLevel::ReadUncommitted,
+            b"ReadCommitted" => IsolationLevel::ReadCommitted,
+            b"RepeatableRead" => IsolationLevel::RepeatableRead,
+            _ => bail!("invalid isolation level"),
+        }),
+        None => None,
+    };
+
+    let txn_read_only_raw = headers.get(&TXN_READ_ONLY).cloned();
+    let txn_read_only = txn_read_only_raw.as_ref() == Some(&HEADER_VALUE_TRUE);
+
    let request_content_length = match request.body().size_hint().upper() {
        Some(v) => v,
        None => MAX_REQUEST_SIZE + 1,
@@ -208,10 +230,19 @@ pub async fn handle(
    // Now execute the query and return the result
    //
    let result = match payload {
-        Payload::Single(query) => query_to_json(&client, query, raw_output, array_mode).await,
+        Payload::Single(query) => query_to_json(&client, query, raw_output, array_mode)
+            .await
+            .map(|x| (x, HashMap::default())),
        Payload::Batch(queries) => {
            let mut results = Vec::new();
-            let transaction = client.transaction().await?;
+            let mut builder = client.build_transaction();
+            if let Some(isolation_level) = txn_isolation_level {
+                builder = builder.isolation_level(isolation_level);
+            }
+            if txn_read_only {
+                builder = builder.read_only(true);
+            }
+            let transaction = builder.start().await?;
            for query in queries {
                let result = query_to_json(&transaction, query, raw_output, array_mode).await;
                match result {
@@ -223,7 +254,15 @@ pub async fn handle(
                }
            }
            transaction.commit().await?;
-            Ok(json!({ "results": results }))
+            let mut headers = HashMap::default();
+            headers.insert(
+                TXN_READ_ONLY.clone(),
+                HeaderValue::try_from(txn_read_only.to_string())?,
+            );
+            if let Some(txn_isolation_level_raw) = txn_isolation_level_raw {
+                headers.insert(TXN_ISOLATION_LEVEL.clone(), txn_isolation_level_raw);
+            }
+            Ok((json!({ "results": results }), headers))
        }
    };

--- a/proxy/src/http/websocket.rs
+++ b/proxy/src/http/websocket.rs
@@ -6,6 +6,7 @@ use crate::{
 };
 use bytes::{Buf, Bytes};
 use futures::{Sink, Stream, StreamExt};
+use hashbrown::HashMap;
 use hyper::{
    server::{
        accept,
@@ -181,13 +182,15 @@ async fn ws_handler(

    // Check if the request is a websocket upgrade request.
    if hyper_tungstenite::is_upgrade_request(&request) {
+        info!(session_id = ?session_id, "performing websocket upgrade");
+
        let (response, websocket) = hyper_tungstenite::upgrade(&mut request, None)
            .map_err(|e| ApiError::BadRequest(e.into()))?;

        tokio::spawn(async move {
            if let Err(e) = serve_websocket(websocket, config, &cancel_map, session_id, host).await
            {
-                error!("error in websocket connection: {e:?}");
+                error!(session_id = ?session_id, "error in websocket connection: {e:?}");
            }
        });

@@ -203,7 +206,7 @@ async fn ws_handler(
            Ok(_) => StatusCode::OK,
            Err(_) => StatusCode::BAD_REQUEST,
        };
-        let json = match result {
+        let (json, headers) = match result {
            Ok(r) => r,
            Err(e) => {
                let message = format!("{:?}", e);
@@ -214,7 +217,10 @@ async fn ws_handler(
                    },
                    None => Value::Null,
                };
-                json!({ "message": message, "code": code })
+                (
+                    json!({ "message": message, "code": code }),
+                    HashMap::default(),
+                )
            }
        };
        json_response(status_code, json).map(|mut r| {
@@ -222,6 +228,9 @@ async fn ws_handler(
                "Access-Control-Allow-Origin",
                hyper::http::HeaderValue::from_static("*"),
            );
+            for (k, v) in headers {
+                r.headers_mut().insert(k, v);
+            }
            r
        })
    } else if request.uri().path() == "/sql" && request.method() == Method::OPTIONS {
--- a/proxy/src/metrics.rs
+++ b/proxy/src/metrics.rs
@@ -11,7 +11,6 @@ const PROXY_IO_BYTES_PER_CLIENT: &str = "proxy_io_bytes_per_client";

 const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60);

-///
 /// Key that uniquely identifies the object, this metric describes.
 /// Currently, endpoint_id is enough, but this may change later,
 /// so keep it in a named struct.
@@ -19,8 +18,7 @@ const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60);
 /// Both the proxy and the ingestion endpoint will live in the same region (or cell)
 /// so while the project-id is unique across regions the whole pipeline will work correctly
 /// because we enrich the event with project_id in the control-plane endpoint.
-///
-#[derive(Eq, Hash, PartialEq, Serialize, Debug)]
+#[derive(Eq, Hash, PartialEq, Serialize, Debug, Clone)]
 pub struct Ids {
    pub endpoint_id: String,
    pub branch_id: String,
@@ -149,7 +147,7 @@ async fn collect_metrics_iteration(
                    stop_time: *curr_time,
                },
                metric: PROXY_IO_BYTES_PER_CLIENT,
-                idempotency_key: idempotency_key(hostname.to_owned()),
+                idempotency_key: idempotency_key(hostname),
                value,
                extra: Ids {
                    endpoint_id: curr_key.endpoint_id.clone(),
@@ -167,12 +165,11 @@ async fn collect_metrics_iteration(
    // Send metrics.
    // Split into chunks of 1000 metrics to avoid exceeding the max request size
    for chunk in metrics_to_send.chunks(CHUNK_SIZE) {
-        let chunk_json = serde_json::value::to_raw_value(&EventChunk { events: chunk })
-            .expect("ProxyConsumptionMetric should not fail serialization");
-
        let res = client
            .post(metric_collection_endpoint.clone())
-            .json(&chunk_json)
+            .json(&EventChunk {
+                events: chunk.into(),
+            })
            .send()
            .await;

--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -6,17 +6,12 @@ use crate::{
    cancellation::{self, CancelMap},
    compute::{self, PostgresConnection},
    config::{ProxyConfig, TlsConfig},
-    console::{
-        self,
-        errors::{ApiError, WakeComputeError},
-        messages::MetricsAuxInfo,
-    },
+    console::{self, errors::WakeComputeError, messages::MetricsAuxInfo, Api},
    stream::{PqStream, Stream},
 };
 use anyhow::{bail, Context};
 use async_trait::async_trait;
 use futures::TryFutureExt;
-use hyper::StatusCode;
 use metrics::{
    exponential_buckets, register_histogram, register_int_counter_vec, Histogram, IntCounterVec,
 };
@@ -33,7 +28,7 @@ use utils::measured_stream::MeasuredStream;

 /// Number of times we should retry the `/proxy_wake_compute` http request.
 /// Retry duration is BASE_RETRY_WAIT_DURATION * 1.5^n
-const NUM_RETRIES_CONNECT: u32 = 10;
+pub const NUM_RETRIES_CONNECT: u32 = 10;
 const CONNECT_TIMEOUT: time::Duration = time::Duration::from_secs(2);
 const BASE_RETRY_WAIT_DURATION: time::Duration = time::Duration::from_millis(100);

@@ -196,6 +191,11 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
    stream: S,
    mode: ClientMode,
 ) -> anyhow::Result<()> {
+    info!(
+        protocol = mode.protocol_label(),
+        "handling interactive connection from client"
+    );
+
    // The `closed` counter will increase when this future is destroyed.
    NUM_CONNECTIONS_ACCEPTED_COUNTER
        .with_label_values(&[mode.protocol_label()])
@@ -347,11 +347,6 @@ async fn connect_to_compute_once(
        .await
 }

-enum ConnectionState<E> {
-    Cached(console::CachedNodeInfo),
-    Invalid(compute::ConnCfg, E),
-}
-
 #[async_trait]
 pub trait ConnectMechanism {
    type Connection;
@@ -407,86 +402,87 @@ where

    mechanism.update_connect_config(&mut node_info.config);

-    let mut num_retries = 0;
-    let mut state = ConnectionState::<M::ConnectError>::Cached(node_info);
+    // try once
+    let (config, err) = match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await {
+        Ok(res) => return Ok(res),
+        Err(e) => {
+            error!(error = ?e, "could not connect to compute node");
+            (invalidate_cache(node_info), e)
+        }
+    };

-    loop {
-        match state {
-            ConnectionState::Invalid(config, err) => {
-                match try_wake(&config, extra, creds).await {
-                    // we can't wake up the compute node
-                    Ok(None) => return Err(err.into()),
-                    // there was an error communicating with the control plane
-                    Err(e) => return Err(e.into()),
-                    // failed to wake up but we can continue to retry
-                    Ok(Some(ControlFlow::Continue(()))) => {
-                        state = ConnectionState::Invalid(config, err);
-                        let wait_duration = retry_after(num_retries);
-                        num_retries += 1;
+    let mut num_retries = 1;

-                        info!(num_retries, "retrying wake compute");
-                        time::sleep(wait_duration).await;
-                        continue;
-                    }
-                    // successfully woke up a compute node and can break the wakeup loop
-                    Ok(Some(ControlFlow::Break(mut node_info))) => {
-                        mechanism.update_connect_config(&mut node_info.config);
-                        state = ConnectionState::Cached(node_info)
-                    }
-                }
+    // if we failed to connect, it's likely that the compute node was suspended, wake a new compute node
+    info!("compute node's state has likely changed; requesting a wake-up");
+    let node_info = loop {
+        let wake_res = match creds {
+            auth::BackendType::Console(api, creds) => api.wake_compute(extra, creds).await,
+            auth::BackendType::Postgres(api, creds) => api.wake_compute(extra, creds).await,
+            // nothing to do?
+            auth::BackendType::Link(_) => return Err(err.into()),
+            // test backend
+            auth::BackendType::Test(x) => x.wake_compute(),
+        };
+
+        match handle_try_wake(wake_res, num_retries)? {
+            // failed to wake up but we can continue to retry
+            ControlFlow::Continue(_) => {}
+            // successfully woke up a compute node and can break the wakeup loop
+            ControlFlow::Break(mut node_info) => {
+                node_info.config.reuse_password(&config);
+                mechanism.update_connect_config(&mut node_info.config);
+                break node_info;
            }
-            ConnectionState::Cached(node_info) => {
-                match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await {
-                    Ok(res) => return Ok(res),
-                    Err(e) => {
-                        error!(error = ?e, "could not connect to compute node");
-                        if !e.should_retry(num_retries) {
-                            return Err(e.into());
-                        }
+        }

-                        // after the first connect failure,
-                        // we should invalidate the cache and wake up a new compute node
-                        if num_retries == 0 {
-                            state = ConnectionState::Invalid(invalidate_cache(node_info), e);
-                        } else {
-                            state = ConnectionState::Cached(node_info);
-                        }
+        let wait_duration = retry_after(num_retries);
+        num_retries += 1;

-                        let wait_duration = retry_after(num_retries);
-                        num_retries += 1;
+        time::sleep(wait_duration).await;
+        info!(num_retries, "retrying wake compute");
+    };

-                        info!(num_retries, "retrying wake compute");
-                        time::sleep(wait_duration).await;
-                    }
+    // now that we have a new node, try to connect to it repeatedly.
+    // this can error for a few reasons, for instance:
+    // * DNS connection settings haven't quite propagated yet
+    info!("wake_compute success. attempting to connect");
+    loop {
+        match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await {
+            Ok(res) => return Ok(res),
+            Err(e) => {
+                error!(error = ?e, "could not connect to compute node");
+                if !e.should_retry(num_retries) {
+                    return Err(e.into());
                }
            }
        }
+
+        let wait_duration = retry_after(num_retries);
+        num_retries += 1;
+
+        time::sleep(wait_duration).await;
+        info!(num_retries, "retrying connect_once");
    }
 }

 /// Attempts to wake up the compute node.
-/// * Returns Ok(Some(true)) if there was an error waking but retries are acceptable
-/// * Returns Ok(Some(false)) if the wakeup succeeded
-/// * Returns Ok(None) or Err(e) if there was an error
-async fn try_wake(
-    config: &compute::ConnCfg,
-    extra: &console::ConsoleReqExtra<'_>,
-    creds: &auth::BackendType<'_, auth::ClientCredentials<'_>>,
-) -> Result<Option<ControlFlow<console::CachedNodeInfo>>, WakeComputeError> {
-    info!("compute node's state has likely changed; requesting a wake-up");
-    match creds.wake_compute(extra).await {
-        // retry wake if the compute was in an invalid state
-        Err(WakeComputeError::ApiError(ApiError::Console {
-            status: StatusCode::BAD_REQUEST,
-            ..
-        })) => Ok(Some(ControlFlow::Continue(()))),
-        // Update `node_info` and try again.
-        Ok(Some(mut new)) => {
-            new.config.reuse_password(config);
-            Ok(Some(ControlFlow::Break(new)))
-        }
-        Err(e) => Err(e),
-        Ok(None) => Ok(None),
+/// * Returns Ok(Continue(e)) if there was an error waking but retries are acceptable
+/// * Returns Ok(Break(node)) if the wakeup succeeded
+/// * Returns Err(e) if the error is not a retryable API error or retries are exhausted
+pub fn handle_try_wake(
+    result: Result<console::CachedNodeInfo, WakeComputeError>,
+    num_retries: u32,
+) -> Result<ControlFlow<console::CachedNodeInfo, WakeComputeError>, WakeComputeError> {
+    match result {
+        Err(err) => match &err {
+            WakeComputeError::ApiError(api) if api.should_retry(num_retries) => {
+                Ok(ControlFlow::Continue(err))
+            }
+            _ => Err(err),
+        },
+        // Wake-up succeeded; the caller can try connecting again.
+        Ok(new) => Ok(ControlFlow::Break(new)),
    }
 }

@@ -494,8 +490,6 @@ pub trait ShouldRetry {
    fn could_retry(&self) -> bool;
    fn should_retry(&self, num_retries: u32) -> bool {
        match self {
-            // retry all errors at least once
-            _ if num_retries == 0 => true,
            _ if num_retries >= NUM_RETRIES_CONNECT => false,
            err => err.could_retry(),
        }
@@ -547,14 +541,9 @@ impl ShouldRetry for compute::ConnectionError {
    }
 }

-pub fn retry_after(num_retries: u32) -> time::Duration {
-    match num_retries {
-        0 => time::Duration::ZERO,
-        _ => {
-            // 3/2 = 1.5 which seems to be an ok growth factor heuristic
-            BASE_RETRY_WAIT_DURATION * 3_u32.pow(num_retries) / 2_u32.pow(num_retries)
-        }
-    }
+fn retry_after(num_retries: u32) -> time::Duration {
+    // 1.5 seems to be an ok growth factor heuristic
+    BASE_RETRY_WAIT_DURATION.mul_f64(1.5_f64.powi(num_retries as i32))
 }

 /// Finish client connection initialization: confirm auth success, send params, etc.
--- a/proxy/src/proxy/tests.rs
+++ b/proxy/src/proxy/tests.rs
@@ -1,10 +1,10 @@
 //! A group of high-level tests for connection establishing logic and auth.
-use std::borrow::Cow;
-
+//!
 use super::*;
+use crate::auth::backend::TestBackend;
 use crate::auth::ClientCredentials;
 use crate::console::{CachedNodeInfo, NodeInfo};
-use crate::{auth, sasl, scram};
+use crate::{auth, http, sasl, scram};
 use async_trait::async_trait;
 use rstest::rstest;
 use tokio_postgres::config::SslMode;
@@ -302,15 +302,18 @@ async fn scram_auth_mock() -> anyhow::Result<()> {
 #[test]
 fn connect_compute_total_wait() {
    let mut total_wait = tokio::time::Duration::ZERO;
-    for num_retries in 0..10 {
+    for num_retries in 1..10 {
        total_wait += retry_after(num_retries);
    }
    assert!(total_wait < tokio::time::Duration::from_secs(12));
    assert!(total_wait > tokio::time::Duration::from_secs(10));
 }

-#[derive(Clone, Copy)]
+#[derive(Clone, Copy, Debug)]
 enum ConnectAction {
+    Wake,
+    WakeFail,
+    WakeRetry,
    Connect,
    Retry,
    Fail,
@@ -321,6 +324,17 @@ struct TestConnectMechanism {
    sequence: Vec<ConnectAction>,
 }

+impl TestConnectMechanism {
+    fn verify(&self) {
+        let counter = self.counter.lock().unwrap();
+        assert_eq!(
+            *counter,
+            self.sequence.len(),
+            "sequence does not proceed to the end"
+        );
+    }
+}
+
 impl TestConnectMechanism {
    fn new(sequence: Vec<ConnectAction>) -> Self {
        Self {
@@ -370,30 +384,63 @@ impl ConnectMechanism for TestConnectMechanism {
            ConnectAction::Connect => Ok(TestConnection),
            ConnectAction::Retry => Err(TestConnectError { retryable: true }),
            ConnectAction::Fail => Err(TestConnectError { retryable: false }),
+            x => panic!("expecting action {:?}, connect is called instead", x),
        }
    }

    fn update_connect_config(&self, _conf: &mut compute::ConnCfg) {}
 }

-fn helper_create_connect_info() -> (
-    CachedNodeInfo,
-    console::ConsoleReqExtra<'static>,
-    auth::BackendType<'static, ClientCredentials<'static>>,
-) {
+impl TestBackend for TestConnectMechanism {
+    fn wake_compute(&self) -> Result<CachedNodeInfo, console::errors::WakeComputeError> {
+        let mut counter = self.counter.lock().unwrap();
+        let action = self.sequence[*counter];
+        *counter += 1;
+        match action {
+            ConnectAction::Wake => Ok(helper_create_cached_node_info()),
+            ConnectAction::WakeFail => {
+                let err = console::errors::ApiError::Console {
+                    status: http::StatusCode::FORBIDDEN,
+                    text: "TEST".into(),
+                };
+                assert!(!err.could_retry());
+                Err(console::errors::WakeComputeError::ApiError(err))
+            }
+            ConnectAction::WakeRetry => {
+                let err = console::errors::ApiError::Console {
+                    status: http::StatusCode::INTERNAL_SERVER_ERROR,
+                    text: "TEST".into(),
+                };
+                assert!(err.could_retry());
+                Err(console::errors::WakeComputeError::ApiError(err))
+            }
+            x => panic!("expecting action {:?}, wake_compute is called instead", x),
+        }
+    }
+}
+
+fn helper_create_cached_node_info() -> CachedNodeInfo {
    let node = NodeInfo {
        config: compute::ConnCfg::new(),
        aux: Default::default(),
        allow_self_signed_compute: false,
    };
-    let cache = CachedNodeInfo::new_uncached(node);
+    CachedNodeInfo::new_uncached(node)
+}
+
+fn helper_create_connect_info(
+    mechanism: &TestConnectMechanism,
+) -> (
+    CachedNodeInfo,
+    console::ConsoleReqExtra<'static>,
+    auth::BackendType<'_, ClientCredentials<'static>>,
+) {
+    let cache = helper_create_cached_node_info();
    let extra = console::ConsoleReqExtra {
        session_id: uuid::Uuid::new_v4(),
        application_name: Some("TEST"),
    };
-    let url = "https://TEST_URL".parse().unwrap();
-    let api = console::provider::mock::Api::new(url);
-    let creds = auth::BackendType::Postgres(Cow::Owned(api), ClientCredentials::new_noop());
+    let creds = auth::BackendType::Test(mechanism);
    (cache, extra, creds)
 }

@@ -401,42 +448,46 @@ fn helper_create_connect_info() -> (
 async fn connect_to_compute_success() {
    use ConnectAction::*;
    let mechanism = TestConnectMechanism::new(vec![Connect]);
-    let (cache, extra, creds) = helper_create_connect_info();
+    let (cache, extra, creds) = helper_create_connect_info(&mechanism);
    connect_to_compute(&mechanism, cache, &extra, &creds)
        .await
        .unwrap();
+    mechanism.verify();
 }

 #[tokio::test]
 async fn connect_to_compute_retry() {
    use ConnectAction::*;
-    let mechanism = TestConnectMechanism::new(vec![Retry, Retry, Connect]);
-    let (cache, extra, creds) = helper_create_connect_info();
+    let mechanism = TestConnectMechanism::new(vec![Retry, Wake, Retry, Connect]);
+    let (cache, extra, creds) = helper_create_connect_info(&mechanism);
    connect_to_compute(&mechanism, cache, &extra, &creds)
        .await
        .unwrap();
+    mechanism.verify();
 }

 /// Test that we don't retry if the error is not retryable.
 #[tokio::test]
 async fn connect_to_compute_non_retry_1() {
    use ConnectAction::*;
-    let mechanism = TestConnectMechanism::new(vec![Retry, Retry, Fail]);
-    let (cache, extra, creds) = helper_create_connect_info();
+    let mechanism = TestConnectMechanism::new(vec![Retry, Wake, Retry, Fail]);
+    let (cache, extra, creds) = helper_create_connect_info(&mechanism);
    connect_to_compute(&mechanism, cache, &extra, &creds)
        .await
        .unwrap_err();
+    mechanism.verify();
 }

 /// Even for non-retryable errors, we should retry at least once.
 #[tokio::test]
 async fn connect_to_compute_non_retry_2() {
    use ConnectAction::*;
-    let mechanism = TestConnectMechanism::new(vec![Fail, Retry, Connect]);
-    let (cache, extra, creds) = helper_create_connect_info();
+    let mechanism = TestConnectMechanism::new(vec![Fail, Wake, Retry, Connect]);
+    let (cache, extra, creds) = helper_create_connect_info(&mechanism);
    connect_to_compute(&mechanism, cache, &extra, &creds)
        .await
        .unwrap();
+    mechanism.verify();
 }

 /// Retry for at most `NUM_RETRIES_CONNECT` times.
@@ -445,11 +496,36 @@ async fn connect_to_compute_non_retry_3() {
    assert_eq!(NUM_RETRIES_CONNECT, 10);
    use ConnectAction::*;
    let mechanism = TestConnectMechanism::new(vec![
-        Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry,
+        Retry, Wake, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry,
        /* the 11th time */ Retry,
    ]);
-    let (cache, extra, creds) = helper_create_connect_info();
+    let (cache, extra, creds) = helper_create_connect_info(&mechanism);
    connect_to_compute(&mechanism, cache, &extra, &creds)
        .await
        .unwrap_err();
+    mechanism.verify();
+}
+
+/// Should retry wake compute.
+#[tokio::test]
+async fn wake_retry() {
+    use ConnectAction::*;
+    let mechanism = TestConnectMechanism::new(vec![Retry, WakeRetry, Wake, Connect]);
+    let (cache, extra, creds) = helper_create_connect_info(&mechanism);
+    connect_to_compute(&mechanism, cache, &extra, &creds)
+        .await
+        .unwrap();
+    mechanism.verify();
+}
+
+/// Wake failed with a non-retryable error.
+#[tokio::test]
+async fn wake_non_retry() {
+    use ConnectAction::*;
+    let mechanism = TestConnectMechanism::new(vec![Retry, WakeFail]);
+    let (cache, extra, creds) = helper_create_connect_info(&mechanism);
+    connect_to_compute(&mechanism, cache, &extra, &creds)
+        .await
+        .unwrap_err();
+    mechanism.verify();
 }
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -234,7 +234,10 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
                listen_pg_addr_tenant_only
            );
            let listener = tcp_listener::bind(listen_pg_addr_tenant_only.clone()).map_err(|e| {
-                error!("failed to bind to address {}: {}", conf.listen_pg_addr, e);
+                error!(
+                    "failed to bind to address {}: {}",
+                    listen_pg_addr_tenant_only, e
+                );
                e
            })?;
            Some(listener)
--- a/scripts/combine_control_files.py
+++ b/scripts/combine_control_files.py
@@ -4,17 +4,45 @@
 # for all extensions in extensions subdir.
 import argparse
 import json
+import subprocess
 from pathlib import Path

+"""
+# ext_index.json example:
+{
+    "public_extensions": [
+        "anon"
+    ],
+    "library_index": {
+        "anon": "anon",
+        // for more complex extensions like postgis
+        // we might have something like:
+        // address_standardizer: postgis
+        // postgis_tiger: postgis
+    },
+    "extension_data": {
+        "anon": {
+            "control_data": {
+                "anon.control": "# PostgreSQL Anonymizer (anon) extension \ncomment = 'Data anonymization tools' \ndefault_version = '1.1.0' \ndirectory='extension/anon' \nrelocatable = false \nrequires = 'pgcrypto' \nsuperuser = false \nmodule_pathname = '$libdir/anon' \ntrusted = true \n"
+            },
+            "archive_path": "5648391853/v15/extensions/anon.tar.zst"
+        }
+    }
+}
+"""
+
 if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="generate ext_index.json")
    parser.add_argument("pg_version", type=str, choices=["v14", "v15"], help="pg_version")
    parser.add_argument("BUILD_TAG", type=str, help="BUILD_TAG for this compute image")
+    parser.add_argument("--public_extensions", type=str, help="list of public extensions")
    args = parser.parse_args()
    pg_version = args.pg_version
    BUILD_TAG = args.BUILD_TAG
+    public_ext_list = [e for e in (args.public_extensions or "").split(",") if e]

    ext_index = {}
+    library_index = {}
    EXT_PATH = Path("extensions")
    for extension in EXT_PATH.iterdir():
        if extension.is_dir():
@@ -28,6 +56,21 @@ if __name__ == "__main__":
                "control_data": control_data,
                "archive_path": f"{BUILD_TAG}/{pg_version}/extensions/{extension.name}.tar.zst",
            }
+        elif extension.suffix == ".zst":
+            file_list = (
+                str(subprocess.check_output(["tar", "tf", str(extension)]), "utf-8")
+                .strip()
+                .split("\n")
+            )
+            for file in file_list:
+                if file.endswith(".so") and file.startswith("lib/"):
+                    lib_name = file[4:-3]
+                    library_index[lib_name] = extension.name.replace(".tar.zst", "")

+    all_data = {
+        "public_extensions": public_ext_list,
+        "library_index": library_index,
+        "extension_data": ext_index,
+    }
    with open("ext_index.json", "w") as f:
-        json.dump(ext_index, f)
+        json.dump(all_data, f)
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -530,6 +530,16 @@ def available_remote_storages() -> List[RemoteStorageKind]:
    return remote_storages


+def available_s3_storages() -> List[RemoteStorageKind]:
+    remote_storages = [RemoteStorageKind.MOCK_S3]
+    if os.getenv("ENABLE_REAL_S3_REMOTE_STORAGE") is not None:
+        remote_storages.append(RemoteStorageKind.REAL_S3)
+        log.info("Enabling real s3 storage for tests")
+    else:
+        log.info("Using mock implementations to test remote storage")
+    return remote_storages
+
+
@dataclass
 class LocalFsStorage:
    root: Path
@@ -542,7 +552,7 @@ class S3Storage:
    access_key: str
    secret_key: str
    endpoint: Optional[str] = None
-    prefix_in_bucket: Optional[str] = None
+    prefix_in_bucket: Optional[str] = ""

    def access_env_vars(self) -> Dict[str, str]:
        return {
@@ -550,6 +560,16 @@ class S3Storage:
            "AWS_SECRET_ACCESS_KEY": self.secret_key,
        }

+    def to_string(self) -> str:
+        return json.dumps(
+            {
+                "bucket": self.bucket_name,
+                "region": self.bucket_region,
+                "endpoint": self.endpoint,
+                "prefix": self.prefix_in_bucket,
+            }
+        )
+

 RemoteStorage = Union[LocalFsStorage, S3Storage]

@@ -616,10 +636,12 @@ class NeonEnvBuilder:
        self.rust_log_override = rust_log_override
        self.port_distributor = port_distributor
        self.remote_storage = remote_storage
+        self.ext_remote_storage: Optional[S3Storage] = None
+        self.remote_storage_client: Optional[Any] = None
        self.remote_storage_users = remote_storage_users
        self.broker = broker
        self.run_id = run_id
-        self.mock_s3_server = mock_s3_server
+        self.mock_s3_server: MockS3Server = mock_s3_server
        self.pageserver_config_override = pageserver_config_override
        self.num_safekeepers = num_safekeepers
        self.safekeepers_id_start = safekeepers_id_start
@@ -667,15 +689,24 @@ class NeonEnvBuilder:
        remote_storage_kind: RemoteStorageKind,
        test_name: str,
        force_enable: bool = True,
+        enable_remote_extensions: bool = False,
    ):
        if remote_storage_kind == RemoteStorageKind.NOOP:
            return
        elif remote_storage_kind == RemoteStorageKind.LOCAL_FS:
            self.enable_local_fs_remote_storage(force_enable=force_enable)
        elif remote_storage_kind == RemoteStorageKind.MOCK_S3:
-            self.enable_mock_s3_remote_storage(bucket_name=test_name, force_enable=force_enable)
+            self.enable_mock_s3_remote_storage(
+                bucket_name=test_name,
+                force_enable=force_enable,
+                enable_remote_extensions=enable_remote_extensions,
+            )
        elif remote_storage_kind == RemoteStorageKind.REAL_S3:
-            self.enable_real_s3_remote_storage(test_name=test_name, force_enable=force_enable)
+            self.enable_real_s3_remote_storage(
+                test_name=test_name,
+                force_enable=force_enable,
+                enable_remote_extensions=enable_remote_extensions,
+            )
        else:
            raise RuntimeError(f"Unknown storage type: {remote_storage_kind}")

@@ -689,11 +720,18 @@ class NeonEnvBuilder:
        assert force_enable or self.remote_storage is None, "remote storage is enabled already"
        self.remote_storage = LocalFsStorage(Path(self.repo_dir / "local_fs_remote_storage"))

-    def enable_mock_s3_remote_storage(self, bucket_name: str, force_enable: bool = True):
+    def enable_mock_s3_remote_storage(
+        self,
+        bucket_name: str,
+        force_enable: bool = True,
+        enable_remote_extensions: bool = False,
+    ):
        """
        Sets up the pageserver to use the S3 mock server, creates the bucket, if it's not present already.
        Starts up the mock server, if that does not run yet.
        Errors, if the pageserver has some remote storage configuration already, unless `force_enable` is not set to `True`.
+
+        With `enable_remote_extensions`, also sets self.ext_remote_storage (same bucket, "ext" prefix).
        """
        assert force_enable or self.remote_storage is None, "remote storage is enabled already"
        mock_endpoint = self.mock_s3_server.endpoint()
@@ -714,9 +752,25 @@ class NeonEnvBuilder:
            bucket_region=mock_region,
            access_key=self.mock_s3_server.access_key(),
            secret_key=self.mock_s3_server.secret_key(),
+            prefix_in_bucket="pageserver",
        )

-    def enable_real_s3_remote_storage(self, test_name: str, force_enable: bool = True):
+        if enable_remote_extensions:
+            self.ext_remote_storage = S3Storage(
+                bucket_name=bucket_name,
+                endpoint=mock_endpoint,
+                bucket_region=mock_region,
+                access_key=self.mock_s3_server.access_key(),
+                secret_key=self.mock_s3_server.secret_key(),
+                prefix_in_bucket="ext",
+            )
+
+    def enable_real_s3_remote_storage(
+        self,
+        test_name: str,
+        force_enable: bool = True,
+        enable_remote_extensions: bool = False,
+    ):
        """
        Sets up configuration to use real s3 endpoint without mock server
        """
@@ -756,6 +810,15 @@ class NeonEnvBuilder:
            prefix_in_bucket=self.remote_storage_prefix,
        )

+        if enable_remote_extensions:
+            self.ext_remote_storage = S3Storage(
+                bucket_name="neon-dev-extensions-eu-central-1",
+                bucket_region="eu-central-1",
+                access_key=access_key,
+                secret_key=secret_key,
+                prefix_in_bucket=None,
+            )
+
    def cleanup_local_storage(self):
        if self.preserve_database_files:
            return
@@ -789,6 +852,7 @@ class NeonEnvBuilder:
        # `self.remote_storage_prefix` is coupled with `S3Storage` storage type,
        # so this line effectively a no-op
        assert isinstance(self.remote_storage, S3Storage)
+        assert self.remote_storage_client is not None

        if self.keep_remote_storage_contents:
            log.info("keep_remote_storage_contents skipping remote storage cleanup")
@@ -918,6 +982,8 @@ class NeonEnv:
        self.neon_binpath = config.neon_binpath
        self.pg_distrib_dir = config.pg_distrib_dir
        self.endpoint_counter = 0
+        self.remote_storage_client = config.remote_storage_client
+        self.ext_remote_storage = config.ext_remote_storage

        # generate initial tenant ID here instead of letting 'neon init' generate it,
        # so that we don't need to dig it out of the config file afterwards.
@@ -1504,6 +1570,8 @@ class NeonCli(AbstractNeonCli):
        safekeepers: Optional[List[int]] = None,
        tenant_id: Optional[TenantId] = None,
        lsn: Optional[Lsn] = None,
+        branch_name: Optional[str] = None,
+        remote_ext_config: Optional[str] = None,
    ) -> "subprocess.CompletedProcess[str]":
        args = [
            "endpoint",
@@ -1513,12 +1581,17 @@ class NeonCli(AbstractNeonCli):
            "--pg-version",
            self.env.pg_version,
        ]
+        if remote_ext_config is not None:
+            args.extend(["--remote-ext-config", remote_ext_config])
        if lsn is not None:
            args.append(f"--lsn={lsn}")
        args.extend(["--pg-port", str(pg_port)])
        args.extend(["--http-port", str(http_port)])
+
        if safekeepers is not None:
            args.extend(["--safekeepers", (",".join(map(str, safekeepers)))])
+        if branch_name is not None:
+            args.extend(["--branch-name", branch_name])
        if endpoint_id is not None:
            args.append(endpoint_id)

@@ -2371,7 +2444,7 @@ class Endpoint(PgProtocol):

        return self

-    def start(self) -> "Endpoint":
+    def start(self, remote_ext_config: Optional[str] = None) -> "Endpoint":
        """
        Start the Postgres instance.
        Returns self.
@@ -2387,6 +2460,7 @@ class Endpoint(PgProtocol):
            http_port=self.http_port,
            tenant_id=self.tenant_id,
            safekeepers=self.active_safekeepers,
+            remote_ext_config=remote_ext_config,
        )
        self.running = True

@@ -2476,6 +2550,7 @@ class Endpoint(PgProtocol):
        hot_standby: bool = False,
        lsn: Optional[Lsn] = None,
        config_lines: Optional[List[str]] = None,
+        remote_ext_config: Optional[str] = None,
    ) -> "Endpoint":
        """
        Create an endpoint, apply config, and start Postgres.
@@ -2490,7 +2565,7 @@ class Endpoint(PgProtocol):
            config_lines=config_lines,
            hot_standby=hot_standby,
            lsn=lsn,
-        ).start()
+        ).start(remote_ext_config=remote_ext_config)

        log.info(f"Postgres startup took {time.time() - started_at} seconds")

@@ -2524,6 +2599,7 @@ class EndpointFactory:
        lsn: Optional[Lsn] = None,
        hot_standby: bool = False,
        config_lines: Optional[List[str]] = None,
+        remote_ext_config: Optional[str] = None,
    ) -> Endpoint:
        ep = Endpoint(
            self.env,
@@ -2540,6 +2616,7 @@ class EndpointFactory:
            hot_standby=hot_standby,
            config_lines=config_lines,
            lsn=lsn,
+            remote_ext_config=remote_ext_config,
        )

    def create(
--- a/test_runner/fixtures/pageserver/utils.py
+++ b/test_runner/fixtures/pageserver/utils.py
@@ -197,10 +197,9 @@ def wait_timeline_detail_404(
    pageserver_http: PageserverHttpClient,
    tenant_id: TenantId,
    timeline_id: TimelineId,
-    wait_longer: bool = False,
+    iterations: int,
 ):
    last_exc = None
-    iterations = 10 if wait_longer else 2
    for _ in range(iterations):
        time.sleep(0.250)
        try:
@@ -220,8 +219,8 @@ def timeline_delete_wait_completed(
    pageserver_http: PageserverHttpClient,
    tenant_id: TenantId,
    timeline_id: TimelineId,
-    wait_longer: bool = False,  # Use when running with RemoteStorageKind.REAL_S3
+    iterations: int = 20,
    **delete_args,
 ):
    pageserver_http.timeline_delete(tenant_id=tenant_id, timeline_id=timeline_id, **delete_args)
-    wait_timeline_detail_404(pageserver_http, tenant_id, timeline_id, wait_longer)
+    wait_timeline_detail_404(pageserver_http, tenant_id, timeline_id, iterations)
--- a/test_runner/fixtures/types.py
+++ b/test_runner/fixtures/types.py
@@ -89,6 +89,9 @@ class TenantId(Id):
    def __repr__(self) -> str:
        return f'`TenantId("{self.id.hex()}")'

+    def __str__(self) -> str:
+        return self.id.hex()
+

 class TimelineId(Id):
    def __repr__(self) -> str:
--- a/test_runner/regress/data/extension_test/5670669815/v14/ext_index.json
+++ b/test_runner/regress/data/extension_test/5670669815/v14/ext_index.json
@@ -0,0 +1,24 @@
+{
+    "public_extensions": [
+        "anon",
+        "pg_buffercache"
+    ],
+    "library_index": {
+        "anon": "anon",
+        "pg_buffercache": "pg_buffercache"
+    },
+    "extension_data": {
+        "pg_buffercache": {
+            "control_data": {
+                "pg_buffercache.control": "# pg_buffercache extension \ncomment = 'examine the shared buffer cache' \ndefault_version = '1.3' \nmodule_pathname = '$libdir/pg_buffercache' \nrelocatable = true \ntrusted=true"
+            },
+            "archive_path": "5670669815/v14/extensions/pg_buffercache.tar.zst"
+        },
+        "anon": {
+            "control_data": {
+                "anon.control": "# PostgreSQL Anonymizer (anon) extension \ncomment = 'Data anonymization tools' \ndefault_version = '1.1.0' \ndirectory='extension/anon' \nrelocatable = false \nrequires = 'pgcrypto' \nsuperuser = false \nmodule_pathname = '$libdir/anon' \ntrusted = true \n"
+            },
+            "archive_path": "5670669815/v14/extensions/anon.tar.zst"
+        }
+    }
+}
--- a/test_runner/regress/data/extension_test/5670669815/v14/extensions/anon.tar.zst
+++ b/test_runner/regress/data/extension_test/5670669815/v14/extensions/anon.tar.zst
--- a/test_runner/regress/data/extension_test/5670669815/v14/extensions/pg_buffercache.tar.zst
+++ b/test_runner/regress/data/extension_test/5670669815/v14/extensions/pg_buffercache.tar.zst
--- a/test_runner/regress/data/extension_test/5670669815/v15/ext_index.json
+++ b/test_runner/regress/data/extension_test/5670669815/v15/ext_index.json
@@ -0,0 +1,17 @@
+{
+    "public_extensions": [
+        "anon"
+    ],
+    "library_index": {
+        "anon": "anon"
+    },
+    "extension_data": {
+        "anon": {
+            "control_data": {
+                "anon.control": "# PostgreSQL Anonymizer (anon) extension \ncomment = 'Data anonymization tools' \ndefault_version = '1.1.0' \ndirectory='extension/anon' \nrelocatable = false \nrequires = 'pgcrypto' \nsuperuser = false \nmodule_pathname = '$libdir/anon' \ntrusted = true \n"
+            },
+            "archive_path": "5670669815/v15/extensions/anon.tar.zst"
+        }
+    }
+}
+
--- a/test_runner/regress/data/extension_test/5670669815/v15/extensions/anon.tar.zst
+++ b/test_runner/regress/data/extension_test/5670669815/v15/extensions/anon.tar.zst
--- a/test_runner/regress/test_compatibility.py
+++ b/test_runner/regress/test_compatibility.py
@@ -4,7 +4,7 @@ import shutil
 import subprocess
 import tempfile
 from pathlib import Path
-from typing import Any, Optional
+from typing import Any, List, Optional

 import pytest
 import toml  # TODO: replace with tomllib for Python >= 3.11
@@ -14,7 +14,6 @@ from fixtures.neon_fixtures import (
    NeonEnvBuilder,
    PgBin,
    PortDistributor,
-    parse_project_git_version_output,
 )
 from fixtures.pageserver.http import PageserverHttpClient
 from fixtures.pageserver.utils import (
@@ -63,7 +62,6 @@ def test_create_snapshot(
    neon_env_builder.pg_version = pg_version
    neon_env_builder.num_safekeepers = 3
    neon_env_builder.enable_local_fs_remote_storage()
-    neon_env_builder.preserve_database_files = True

    env = neon_env_builder.init_start()
    endpoint = env.endpoints.create_start("main")
@@ -259,36 +257,15 @@ def prepare_snapshot(
        shutil.rmtree(repo_dir / "pgdatadirs")
    os.mkdir(repo_dir / "endpoints")

-    # Remove wal-redo temp directory if it exists. Newer pageserver versions don't create
-    # them anymore, but old versions did.
-    for tenant in (repo_dir / "tenants").glob("*"):
-        wal_redo_dir = tenant / "wal-redo-datadir.___temp"
-        if wal_redo_dir.exists() and wal_redo_dir.is_dir():
-            shutil.rmtree(wal_redo_dir)
-
    # Update paths and ports in config files
    pageserver_toml = repo_dir / "pageserver.toml"
    pageserver_config = toml.load(pageserver_toml)
    pageserver_config["remote_storage"]["local_path"] = str(repo_dir / "local_fs_remote_storage")
-    pageserver_config["listen_http_addr"] = port_distributor.replace_with_new_port(
-        pageserver_config["listen_http_addr"]
-    )
-    pageserver_config["listen_pg_addr"] = port_distributor.replace_with_new_port(
-        pageserver_config["listen_pg_addr"]
-    )
-    # since storage_broker these are overridden by neon_local during pageserver
-    # start; remove both to prevent unknown options during etcd ->
-    # storage_broker migration. TODO: remove once broker is released
-    pageserver_config.pop("broker_endpoint", None)
-    pageserver_config.pop("broker_endpoints", None)
-    etcd_broker_endpoints = [f"http://localhost:{port_distributor.get_port()}/"]
-    if get_neon_version(neon_binpath) == "49da498f651b9f3a53b56c7c0697636d880ddfe0":
-        pageserver_config["broker_endpoints"] = etcd_broker_endpoints  # old etcd version
+    for param in ("listen_http_addr", "listen_pg_addr", "broker_endpoint"):
+        pageserver_config[param] = port_distributor.replace_with_new_port(pageserver_config[param])

-    # Older pageserver versions had just one `auth_type` setting. Now there
-    # are separate settings for pg and http ports. We don't use authentication
-    # in compatibility tests so just remove authentication related settings.
-    pageserver_config.pop("auth_type", None)
+    # We don't use authentication in compatibility tests
+    # so just remove authentication related settings.
    pageserver_config.pop("pg_auth_type", None)
    pageserver_config.pop("http_auth_type", None)

@@ -300,31 +277,16 @@ def prepare_snapshot(

    snapshot_config_toml = repo_dir / "config"
    snapshot_config = toml.load(snapshot_config_toml)
-
-    # Provide up/downgrade etcd <-> storage_broker to make forward/backward
-    # compatibility test happy. TODO: leave only the new part once broker is released.
-    if get_neon_version(neon_binpath) == "49da498f651b9f3a53b56c7c0697636d880ddfe0":
-        # old etcd version
-        snapshot_config["etcd_broker"] = {
-            "etcd_binary_path": shutil.which("etcd"),
-            "broker_endpoints": etcd_broker_endpoints,
-        }
-        snapshot_config.pop("broker", None)
-    else:
-        # new storage_broker version
-        broker_listen_addr = f"127.0.0.1:{port_distributor.get_port()}"
-        snapshot_config["broker"] = {"listen_addr": broker_listen_addr}
-        snapshot_config.pop("etcd_broker", None)
-
-    snapshot_config["pageserver"]["listen_http_addr"] = port_distributor.replace_with_new_port(
-        snapshot_config["pageserver"]["listen_http_addr"]
-    )
-    snapshot_config["pageserver"]["listen_pg_addr"] = port_distributor.replace_with_new_port(
-        snapshot_config["pageserver"]["listen_pg_addr"]
+    for param in ("listen_http_addr", "listen_pg_addr"):
+        snapshot_config["pageserver"][param] = port_distributor.replace_with_new_port(
+            snapshot_config["pageserver"][param]
+        )
+    snapshot_config["broker"]["listen_addr"] = port_distributor.replace_with_new_port(
+        snapshot_config["broker"]["listen_addr"]
    )
    for sk in snapshot_config["safekeepers"]:
-        sk["http_port"] = port_distributor.replace_with_new_port(sk["http_port"])
-        sk["pg_port"] = port_distributor.replace_with_new_port(sk["pg_port"])
+        for param in ("http_port", "pg_port", "pg_tenant_only_port"):
+            sk[param] = port_distributor.replace_with_new_port(sk[param])

    if pg_distrib_dir:
        snapshot_config["pg_distrib_dir"] = str(pg_distrib_dir)
@@ -350,12 +312,6 @@ def prepare_snapshot(
    ), f"there're files referencing `test_create_snapshot/repo`, this path should be replaced with {repo_dir}:\n{rv.stdout}"


-# get git SHA of neon binary
-def get_neon_version(neon_binpath: Path):
-    out = subprocess.check_output([neon_binpath / "neon_local", "--version"]).decode("utf-8")
-    return parse_project_git_version_output(out)
-
-
 def check_neon_works(
    repo_dir: Path,
    neon_target_binpath: Path,
@@ -381,7 +337,6 @@ def check_neon_works(
    config.pg_version = pg_version
    config.initial_tenant = snapshot_config["default_tenant_id"]
    config.pg_distrib_dir = pg_distrib_dir
-    config.preserve_database_files = True

    # Use the "target" binaries to launch the storage nodes
    config_target = config
@@ -438,6 +393,14 @@ def check_neon_works(
        test_output_dir / "dump-from-wal.filediff",
    )

+    # TODO: Run pg_amcheck unconditionally after the next release
+    try:
+        pg_bin.run(["psql", connstr, "--command", "CREATE EXTENSION IF NOT EXISTS amcheck"])
+    except subprocess.CalledProcessError:
+        log.info("Extension amcheck is not available, skipping pg_amcheck")
+    else:
+        pg_bin.run_capture(["pg_amcheck", connstr, "--install-missing", "--verbose"])
+
    # Check that we can interract with the data
    pg_bin.run_capture(["pgbench", "--time=10", "--progress=2", connstr])

@@ -445,10 +408,15 @@ def check_neon_works(
    assert not initial_dump_differs, "initial dump differs"


-def dump_differs(first: Path, second: Path, output: Path) -> bool:
+def dump_differs(
+    first: Path, second: Path, output: Path, allowed_diffs: Optional[List[str]] = None
+) -> bool:
    """
    Runs diff(1) command on two SQL dumps and write the output to the given output file.
-    Returns True if the dumps differ, False otherwise.
+    If the produced diff matches one of the entries in allowed_diffs, it is not considered a difference.
+    See the example of it in https://github.com/neondatabase/neon/pull/4425/files#diff-15c5bfdd1d5cc1411b9221091511a60dd13a9edf672bdfbb57dd2ef8bb7815d6
+
+    Returns True if the dumps differ and produced diff is not allowed, False otherwise (in most cases we want it to return False).
    """

    with output.open("w") as stdout:
@@ -466,51 +434,30 @@ def dump_differs(first: Path, second: Path, output: Path) -> bool:

    differs = res.returncode != 0

-    # TODO: Remove after https://github.com/neondatabase/neon/pull/4425 is merged, and a couple of releases are made
-    if differs:
-        with tempfile.NamedTemporaryFile(mode="w") as tmp:
-            tmp.write(PR4425_ALLOWED_DIFF)
-            tmp.flush()
+    allowed_diffs = allowed_diffs or []
+    if differs and len(allowed_diffs) > 0:
+        for allowed_diff in allowed_diffs:
+            with tempfile.NamedTemporaryFile(mode="w") as tmp:
+                tmp.write(allowed_diff)
+                tmp.flush()

-            allowed = subprocess.run(
-                [
-                    "diff",
-                    "--unified",  # Make diff output more readable
-                    r"--ignore-matching-lines=^---",  # Ignore diff headers
-                    r"--ignore-matching-lines=^\+\+\+",  # Ignore diff headers
-                    "--ignore-matching-lines=^@@",  # Ignore diff blocks location
-                    "--ignore-matching-lines=^ *$",  # Ignore lines with only spaces
-                    "--ignore-matching-lines=^ --.*",  # Ignore the " --" lines for compatibility with PG14
-                    "--ignore-blank-lines",
-                    str(output),
-                    str(tmp.name),
-                ],
-            )
+                allowed = subprocess.run(
+                    [
+                        "diff",
+                        "--unified",  # Make diff output more readable
+                        r"--ignore-matching-lines=^---",  # Ignore diff headers
+                        r"--ignore-matching-lines=^\+\+\+",  # Ignore diff headers
+                        "--ignore-matching-lines=^@@",  # Ignore diff blocks location
+                        "--ignore-matching-lines=^ *$",  # Ignore lines with only spaces
+                        "--ignore-matching-lines=^ --.*",  # Ignore SQL comments in diff
+                        "--ignore-blank-lines",
+                        str(output),
+                        str(tmp.name),
+                    ],
+                )

-            differs = allowed.returncode != 0
+                differs = allowed.returncode != 0
+                if not differs:
+                    break

    return differs
-
-
-PR4425_ALLOWED_DIFF = """
--- /tmp/test_output/test_backward_compatibility[release-pg15]/compatibility_snapshot/dump.sql 2023-06-08 18:12:45.000000000 +0000
-+++ /tmp/test_output/test_backward_compatibility[release-pg15]/dump.sql        2023-06-13 07:25:35.211733653 +0000
-@@ -13,12 +13,20 @@
-
- CREATE ROLE cloud_admin;
- ALTER ROLE cloud_admin WITH SUPERUSER INHERIT CREATEROLE CREATEDB LOGIN REPLICATION BYPASSRLS;
-+CREATE ROLE neon_superuser;
-+ALTER ROLE neon_superuser WITH NOSUPERUSER INHERIT CREATEROLE CREATEDB NOLOGIN NOREPLICATION NOBYPASSRLS;
-
- --
- -- User Configurations
- --
-
-
-+--
-+-- Role memberships
-+--
-+
-+GRANT pg_read_all_data TO neon_superuser GRANTED BY cloud_admin;
-+GRANT pg_write_all_data TO neon_superuser GRANTED BY cloud_admin;
-"""
--- a/test_runner/regress/test_download_extensions.py
+++ b/test_runner/regress/test_download_extensions.py
@@ -0,0 +1,324 @@
+import os
+import shutil
+import threading
+from contextlib import closing
+from pathlib import Path
+
+import pytest
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import (
+    NeonEnvBuilder,
+    RemoteStorageKind,
+    available_s3_storages,
+)
+from fixtures.pg_version import PgVersion
+
+
+# Cleaning up downloaded files is important for local tests
+# or else one test could reuse the files from another test or another test run
+def cleanup(pg_version):
+    """Best-effort removal of extension files downloaded under pg_install/v<pg_version>."""
+    pg_dir = Path(f"pg_install/v{pg_version}")
+    lib_dir = pg_dir / Path("lib/postgresql")
+    share_dir = pg_dir / Path("share/postgresql/extension")
+
+    # (directory, glob patterns) pairs; all globs are expanded before anything is deleted
+    cleanup_targets = [
+        (lib_dir, ["anon*", "postgis*", "pg_buffercache*"]),
+        (
+            share_dir,
+            [
+                "anon*",
+                "address_standardizer*",
+                "postgis*",
+                "pageinspect*",
+                "pg_buffercache*",
+                "pgrouting*",
+            ],
+        ),
+    ]
+    stale_files = [
+        match for base, globs in cleanup_targets for glob in globs for match in base.glob(glob)
+    ]
+
+    for file in stale_files:
+        try:
+            os.remove(file)
+            log.info(f"removed file {file}")
+        except Exception as err:
+            log.info(
+                f"skipping remove of file {file} because it doesn't exist.\
+                      this may be expected or unexpected depending on the test {err}"
+            )
+
+    for folder in (share_dir / Path("anon"), pg_dir / Path("download_extensions")):
+        try:
+            shutil.rmtree(folder)
+            log.info(f"removed folder {folder}")
+        except Exception as err:
+            log.info(
+                f"skipping remove of folder {folder} because it doesn't exist.\
+                      this may be expected or unexpected depending on the test {err}"
+            )
+
+
+def upload_files(env):
+    """Upload every file of the extension test data set to the bucket under `ext/`."""
+    log.info("Uploading test files to mock bucket")
+    # Walk the data directory by path instead of os.chdir()-ing into it: a failed
+    # upload must not leave the test process in a different working directory.
+    data_dir = "test_runner/regress/data/extension_test"
+    for prefix, _, files in os.walk(data_dir):
+        for file in files:
+            local_path = os.path.join(prefix, file)
+            full_path = os.path.relpath(local_path, data_dir)
+            with open(local_path, "rb") as f:
+                log.info(f"UPLOAD {full_path} to ext/{full_path}")
+                env.remote_storage_client.upload_fileobj(
+                    f,
+                    env.ext_remote_storage.bucket_name,
+                    f"ext/{full_path}",
+                )
+
+
+# Test that a compute node can download remote extensions and list them as available.
+@pytest.mark.parametrize("remote_storage_kind", available_s3_storages())
+def test_remote_extensions(
+    neon_env_builder: NeonEnvBuilder,
+    remote_storage_kind: RemoteStorageKind,
+    pg_version: PgVersion,
+):
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=remote_storage_kind,
+        test_name="test_remote_extensions",
+        enable_remote_extensions=True,
+    )
+    env = neon_env_builder.init_start()
+    tenant_id, _ = env.neon_cli.create_tenant()
+    env.neon_cli.create_timeline("test_remote_extensions", tenant_id=tenant_id)
+
+    assert env.ext_remote_storage is not None  # satisfy mypy
+    assert env.remote_storage_client is not None  # satisfy mypy
+
+    # For MOCK_S3 we upload test files.
+    # For REAL_S3 we use the files already in the bucket
+    if remote_storage_kind == RemoteStorageKind.MOCK_S3:
+        upload_files(env)
+
+    # Start a compute node and check that it can download the extensions
+    # and use them to CREATE EXTENSION and LOAD
+    endpoint = env.endpoints.create_start(
+        "test_remote_extensions",
+        tenant_id=tenant_id,
+        remote_ext_config=env.ext_remote_storage.to_string(),
+        # config_lines=["log_min_messages=debug3"],
+    )
+    try:
+        with closing(endpoint.connect()) as conn:
+            with conn.cursor() as cur:
+                # Check that appropriate control files were downloaded
+                cur.execute("SELECT * FROM pg_available_extensions")
+                all_extensions = [x[0] for x in cur.fetchall()]
+                log.info(all_extensions)
+                assert "anon" in all_extensions
+
+                # postgis is on real s3 but not mock s3.
+                # it's kind of a big file, would rather not upload to github
+                if remote_storage_kind == RemoteStorageKind.REAL_S3:
+                    assert "postgis" in all_extensions
+                    # this may fail locally if dependency is missing
+                    # we don't really care about the error,
+                    # we just want to make sure it downloaded
+                    try:
+                        cur.execute("CREATE EXTENSION postgis")
+                    except Exception as err:
+                        log.info(f"(expected) error creating postgis extension: {err}")
+                        # we do not check the error, so this is basically a NO-OP
+                        # however checking the log you can make sure that it worked
+                        # and also get valuable information about how long loading the extension took
+
+                # this may fail locally when pgcrypto is missing; any other error fails the test
+                try:
+                    cur.execute("CREATE EXTENSION anon")
+                except Exception as err:
+                    log.info("error creating anon extension")
+                    assert "pgcrypto" in str(err), "unexpected error creating anon extension"
+    finally:
+        cleanup(pg_version)
+
+
+# Test that a compute node can download a remote library file and LOAD it.
+@pytest.mark.parametrize("remote_storage_kind", available_s3_storages())
+def test_remote_library(
+    neon_env_builder: NeonEnvBuilder,
+    remote_storage_kind: RemoteStorageKind,
+    pg_version: PgVersion,
+):
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=remote_storage_kind,
+        test_name="test_remote_library",
+        enable_remote_extensions=True,
+    )
+    env = neon_env_builder.init_start()
+    tenant_id, _ = env.neon_cli.create_tenant()
+    env.neon_cli.create_timeline("test_remote_library", tenant_id=tenant_id)
+
+    assert env.ext_remote_storage is not None  # satisfy mypy
+    assert env.remote_storage_client is not None  # satisfy mypy
+
+    # For MOCK_S3 we upload test files.
+    # For REAL_S3 we use the files already in the bucket
+    if remote_storage_kind == RemoteStorageKind.MOCK_S3:
+        upload_files(env)
+
+    # Start a compute node and check that it can download the library files and LOAD them
+    endpoint = env.endpoints.create_start(
+        "test_remote_library",
+        tenant_id=tenant_id,
+        remote_ext_config=env.ext_remote_storage.to_string(),
+        # config_lines=["log_min_messages=debug3"],
+    )
+    try:
+        with closing(endpoint.connect()) as conn:
+            with conn.cursor() as cur:
+                # try to load library
+                try:
+                    cur.execute("LOAD 'anon'")
+                except Exception as err:
+                    log.info(f"error loading anon library: {err}")
+                    raise AssertionError("unexpected error loading anon library") from err
+
+                # test a library whose name differs from the extension name
+                # this may fail locally if dependency is missing
+                # however, it does successfully download the postgis archive
+                if remote_storage_kind == RemoteStorageKind.REAL_S3:
+                    try:
+                        cur.execute("LOAD 'postgis_topology-3'")
+                    except Exception as err:
+                        log.info("error loading postgis_topology-3")
+                        assert "No such file or directory" in str(
+                            err
+                        ), "unexpected error loading postgis_topology-3"
+    finally:
+        cleanup(pg_version)
+
+
+# Here we test a complex extension archive that bundles multiple extensions,
+# using postgis as an example.
+# cleanup() runs in `finally` so that a failing run leaves no downloaded files behind.
+@pytest.mark.skipif(
+    RemoteStorageKind.REAL_S3 not in available_s3_storages(),
+    reason="skipping test because real s3 not enabled",
+)
+def test_multiple_extensions_one_archive(
+    neon_env_builder: NeonEnvBuilder,
+    pg_version: PgVersion,
+):
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=RemoteStorageKind.REAL_S3,
+        test_name="test_multiple_extensions_one_archive",
+        enable_remote_extensions=True,
+    )
+    env = neon_env_builder.init_start()
+    tenant_id, _ = env.neon_cli.create_tenant()
+    env.neon_cli.create_timeline("test_multiple_extensions_one_archive", tenant_id=tenant_id)
+
+    assert env.ext_remote_storage is not None  # satisfy mypy
+    assert env.remote_storage_client is not None  # satisfy mypy
+
+    endpoint = env.endpoints.create_start(
+        "test_multiple_extensions_one_archive",
+        tenant_id=tenant_id,
+        remote_ext_config=env.ext_remote_storage.to_string(),
+    )
+    try:
+        with closing(endpoint.connect()) as conn, conn.cursor() as cur:
+            cur.execute("CREATE EXTENSION address_standardizer;")
+            cur.execute("CREATE EXTENSION address_standardizer_data_us;")
+            # execute query to ensure that it works
+            cur.execute(
+                "SELECT house_num, name, suftype, city, country, state, unit \
+                        FROM standardize_address('us_lex', 'us_gaz', 'us_rules', \
+                        'One Rust Place, Boston, MA 02109');"
+            )
+            res = cur.fetchall()
+            log.info(res)
+            assert len(res) > 0
+    finally:
+        cleanup(pg_version)
+
+
+# Test that extension is downloaded after endpoint restart,
+# when the library is used in the query.
+#
+# Run the test with multiple simultaneous connections to an endpoint
+# to ensure that the extension is downloaded only once.
+def test_extension_download_after_restart(
+    neon_env_builder: NeonEnvBuilder,
+    pg_version: PgVersion,
+):
+    if "15" in pg_version:
+        pytest.skip("test data set only has the extension built for v14")
+
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=RemoteStorageKind.MOCK_S3,
+        test_name="test_extension_download_after_restart",
+        enable_remote_extensions=True,
+    )
+    env = neon_env_builder.init_start()
+    tenant_id, _ = env.neon_cli.create_tenant()
+    env.neon_cli.create_timeline("test_extension_download_after_restart", tenant_id=tenant_id)
+
+    assert env.ext_remote_storage is not None  # satisfy mypy
+    assert env.remote_storage_client is not None  # satisfy mypy
+
+    # For MOCK_S3 we upload test files.
+    upload_files(env)
+
+    endpoint = env.endpoints.create_start(
+        "test_extension_download_after_restart",
+        tenant_id=tenant_id,
+        remote_ext_config=env.ext_remote_storage.to_string(),
+        config_lines=["log_min_messages=debug3"],
+    )
+    with closing(endpoint.connect()) as conn, conn.cursor() as cur:
+        cur.execute("CREATE extension pg_buffercache;")
+        cur.execute("SELECT * from pg_buffercache;")
+        res = cur.fetchall()
+        assert len(res) > 0
+        log.info(res)
+
+    # shutdown compute node, then remove extension files locally
+    endpoint.stop()
+    cleanup(pg_version)
+
+    # spin up compute node again (there are no extension files available, because compute is stateless)
+    endpoint = env.endpoints.create_start(
+        "test_extension_download_after_restart",
+        tenant_id=tenant_id,
+        remote_ext_config=env.ext_remote_storage.to_string(),
+        config_lines=["log_min_messages=debug3"],
+    )
+
+    failures: list[Exception] = []  # raised in worker threads; pytest cannot see them otherwise
+
+    # connect to compute node and run the query that will trigger the extension download
+    def run_query(endpoint, thread_id: int):
+        log.info("thread_id {%d} starting", thread_id)
+        try:
+            with closing(endpoint.connect()) as conn, conn.cursor() as cur:
+                cur.execute("SELECT * from pg_buffercache;")
+                res = cur.fetchall()
+                assert len(res) > 0
+                log.info("thread_id {%d}, res = %s", thread_id, res)
+        except Exception as err:
+            failures.append(err)
+
+    threads = [threading.Thread(target=run_query, args=(endpoint, i)) for i in range(2)]
+    for thread in threads:
+        thread.start()
+    for thread in threads:
+        thread.join()
+    assert not failures, f"queries failed in worker threads: {failures}"
+
+    cleanup(pg_version)
--- a/test_runner/regress/test_gc_cutoff.py
+++ b/test_runner/regress/test_gc_cutoff.py
@@ -14,10 +14,6 @@ from fixtures.neon_fixtures import NeonEnvBuilder, PgBin
 def test_gc_cutoff(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
    env = neon_env_builder.init_start()

-    # These warnings are expected, when the pageserver is restarted abruptly
-    env.pageserver.allowed_errors.append(".*found future image layer.*")
-    env.pageserver.allowed_errors.append(".*found future delta layer.*")
-
    pageserver_http = env.pageserver.http_client()

    # Use aggressive GC and checkpoint settings, so that we also exercise GC during the test
--- a/test_runner/regress/test_neon_local_cli.py
+++ b/test_runner/regress/test_neon_local_cli.py
@@ -16,11 +16,13 @@ def test_neon_cli_basics(neon_env_builder: NeonEnvBuilder, port_distributor: Por
            endpoint_id="ep-basic-main", pg_port=pg_port, http_port=http_port
        )

-        env.neon_cli.create_branch(new_branch_name="migration_check")
+        branch_name = "migration-check"
+
+        env.neon_cli.create_branch(new_branch_name=branch_name)
        pg_port = port_distributor.get_port()
        http_port = port_distributor.get_port()
        env.neon_cli.endpoint_start(
-            endpoint_id="ep-migration_check", pg_port=pg_port, http_port=http_port
+            f"ep-{branch_name}", pg_port, http_port, branch_name=branch_name
        )
    finally:
        env.neon_cli.stop()
--- a/test_runner/regress/test_ondemand_download.py
+++ b/test_runner/regress/test_ondemand_download.py
@@ -690,10 +690,6 @@ def test_ondemand_download_failure_to_replace(

    pageserver_http = env.pageserver.http_client()

-    lsn = Lsn(pageserver_http.timeline_detail(tenant_id, timeline_id)["last_record_lsn"])
-
-    wait_for_upload(pageserver_http, tenant_id, timeline_id, lsn)
-
    # remove layers so that they will be redownloaded
    pageserver_http.tenant_detach(tenant_id)
    pageserver_http.tenant_attach(tenant_id)
@@ -704,8 +700,10 @@ def test_ondemand_download_failure_to_replace(
    # requesting details with non-incremental size should trigger a download of the only layer
    # this will need to be adjusted if an index for logical sizes is ever implemented
    with pytest.raises(PageserverApiException):
-        # error message is not useful
-        pageserver_http.timeline_detail(tenant_id, timeline_id, True, timeout=2)
+        # PageserverApiException is expected because of the failpoint (timeline_detail building does something)
+        # ReadTimeout can happen on our busy CI, but it should not, because there is no more busylooping
+        # but should it be added back, we would wait for 15s here.
+        pageserver_http.timeline_detail(tenant_id, timeline_id, True, timeout=15)

    actual_message = ".* ERROR .*layermap-replace-notfound"
    assert env.pageserver.log_contains(actual_message) is not None
--- a/test_runner/regress/test_pageserver_restart.py
+++ b/test_runner/regress/test_pageserver_restart.py
@@ -72,10 +72,6 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder):
 def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder):
    env = neon_env_builder.init_start()

-    # These warnings are expected, when the pageserver is restarted abruptly
-    env.pageserver.allowed_errors.append(".*found future image layer.*")
-    env.pageserver.allowed_errors.append(".*found future delta layer.*")
-
    # Use a tiny checkpoint distance, to create a lot of layers quickly.
    # That allows us to stress the compaction and layer flushing logic more.
    tenant, _ = env.neon_cli.create_tenant(
--- a/test_runner/regress/test_proxy.py
+++ b/test_runner/regress/test_proxy.py
@@ -265,18 +265,23 @@ def test_sql_over_http_output_options(static_proxy: NeonProxy):
 def test_sql_over_http_batch(static_proxy: NeonProxy):
    static_proxy.safe_psql("create role http with login password 'http' superuser")

-    def qq(queries: List[Tuple[str, Optional[List[Any]]]]) -> Any:
+    def qq(queries: List[Tuple[str, Optional[List[Any]]]], read_only: bool = False) -> Any:
        connstr = f"postgresql://http:http@{static_proxy.domain}:{static_proxy.proxy_port}/postgres"
        response = requests.post(
            f"https://{static_proxy.domain}:{static_proxy.external_http_port}/sql",
            data=json.dumps(list(map(lambda x: {"query": x[0], "params": x[1] or []}, queries))),
-            headers={"Content-Type": "application/sql", "Neon-Connection-String": connstr},
+            headers={
+                "Content-Type": "application/sql",
+                "Neon-Connection-String": connstr,
+                "Neon-Batch-Isolation-Level": "Serializable",
+                "Neon-Batch-Read-Only": "true" if read_only else "false",
+            },
            verify=str(static_proxy.test_output_dir / "proxy.crt"),
        )
        assert response.status_code == 200
-        return response.json()["results"]
+        return response.json()["results"], response.headers

-    result = qq(
+    result, headers = qq(
        [
            ("select 42 as answer", None),
            ("select $1 as answer", [42]),
@@ -291,6 +296,9 @@ def test_sql_over_http_batch(static_proxy: NeonProxy):
        ]
    )

+    assert headers["Neon-Batch-Isolation-Level"] == "Serializable"
+    assert headers["Neon-Batch-Read-Only"] == "false"
+
    assert result[0]["rows"] == [{"answer": 42}]
    assert result[1]["rows"] == [{"answer": "42"}]
    assert result[2]["rows"] == [{"answer": 42}]
@@ -311,3 +319,14 @@ def test_sql_over_http_batch(static_proxy: NeonProxy):
    assert res["command"] == "DROP"
    assert res["rowCount"] is None
    assert len(result) == 10
+
+    result, headers = qq(
+        [
+            ("select 42 as answer", None),
+        ],
+        True,
+    )
+    assert headers["Neon-Batch-Isolation-Level"] == "Serializable"
+    assert headers["Neon-Batch-Read-Only"] == "true"
+
+    assert result[0]["rows"] == [{"answer": 42}]
--- a/test_runner/regress/test_recovery.py
+++ b/test_runner/regress/test_recovery.py
@@ -15,10 +15,6 @@ def test_pageserver_recovery(neon_env_builder: NeonEnvBuilder):
    env = neon_env_builder.init_start()
    env.pageserver.is_testing_enabled_or_skip()

-    # These warnings are expected, when the pageserver is restarted abruptly
-    env.pageserver.allowed_errors.append(".*found future delta layer.*")
-    env.pageserver.allowed_errors.append(".*found future image layer.*")
-
    # Create a branch for us
    env.neon_cli.create_branch("test_pageserver_recovery", "main")

--- a/test_runner/regress/test_threshold_based_eviction.py
+++ b/test_runner/regress/test_threshold_based_eviction.py
@@ -38,6 +38,12 @@ def test_threshold_based_eviction(
    env = neon_env_builder.init_start()
    env.pageserver.allowed_errors.append(metrics_refused_log_line)

+    # these can happen whenever we run consumption metrics collection
+    env.pageserver.allowed_errors.append(r".*failed to calculate logical size at \S+: cancelled")
+    env.pageserver.allowed_errors.append(
+        r".*failed to calculate synthetic size for tenant \S+: failed to calculate some logical_sizes"
+    )
+
    tenant_id, timeline_id = env.initial_tenant, env.initial_timeline
    assert isinstance(timeline_id, TimelineId)

--- a/test_runner/regress/test_timeline_delete.py
+++ b/test_runner/regress/test_timeline_delete.py
@@ -229,6 +229,8 @@ def test_delete_timeline_exercise_crash_safety_failpoints(

    ps_http.configure_failpoints((failpoint, "return"))

+    iterations = 20 if remote_storage_kind is RemoteStorageKind.REAL_S3 else 4
+
    # These failpoints are earlier than background task is spawned.
    # so they result in api request failure.
    if failpoint in (
@@ -245,7 +247,7 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
            tenant_id=env.initial_tenant,
            timeline_id=timeline_id,
            expected_state="Broken",
-            iterations=2,  # effectively try immediately and retry once in one second
+            iterations=iterations,
        )

        reason = timeline_info["state"]["Broken"]["reason"]
@@ -254,21 +256,19 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
        # failpoint may not be the only error in the stack
        assert reason.endswith(f"failpoint: {failpoint}"), reason

-    wait_longer = remote_storage_kind is RemoteStorageKind.REAL_S3
    if check is Check.RETRY_WITH_RESTART:
        env.pageserver.stop()
        env.pageserver.start()
+
+        wait_until_tenant_active(ps_http, env.initial_tenant, iterations=iterations)
+
        if failpoint == "timeline-delete-before-index-deleted-at":
            # We crashed before persisting this to remote storage, need to retry delete request
-
-            # Wait till tenant is loaded. Shouldnt take longer than 2 seconds (we shouldnt block tenant loading)
-            wait_until_tenant_active(ps_http, env.initial_tenant, iterations=2)
-
            timeline_delete_wait_completed(ps_http, env.initial_tenant, timeline_id)
        else:
            # Pageserver should've resumed deletion after restart.
            wait_timeline_detail_404(
-                ps_http, env.initial_tenant, timeline_id, wait_longer=wait_longer
+                ps_http, env.initial_tenant, timeline_id, iterations=iterations
            )
    elif check is Check.RETRY_WITHOUT_RESTART:
        # this should succeed
@@ -276,7 +276,7 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
        ps_http.configure_failpoints((failpoint, "off"))

        timeline_delete_wait_completed(
-            ps_http, env.initial_tenant, timeline_id, wait_longer=wait_longer
+            ps_http, env.initial_tenant, timeline_id, iterations=iterations
        )

    # Check remote is impty
@@ -404,6 +404,7 @@ def assert_prefix_empty(neon_env_builder: NeonEnvBuilder, prefix: Optional[str]
    assert isinstance(neon_env_builder.remote_storage, S3Storage)

    # Note that this doesnt use pagination, so list is not guaranteed to be exhaustive.
+    assert neon_env_builder.remote_storage_client is not None
    response = neon_env_builder.remote_storage_client.list_objects_v2(
        Bucket=neon_env_builder.remote_storage.bucket_name,
        Prefix=prefix or neon_env_builder.remote_storage.prefix_in_bucket or "",
@@ -569,7 +570,7 @@ def test_concurrent_timeline_delete_stuck_on(
        try:
            log.info("first call start")
            timeline_delete_wait_completed(
-                ps_http, env.initial_tenant, child_timeline_id, timeout=10
+                ps_http, env.initial_tenant, child_timeline_id, timeout=20
            )
            log.info("first call success")
            result_queue.put("success")
@@ -683,7 +684,7 @@ def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder):
    wait_until(50, 0.1, first_request_finished)

    # check that the timeline is gone
-    wait_timeline_detail_404(ps_http, env.initial_tenant, child_timeline_id)
+    wait_timeline_detail_404(ps_http, env.initial_tenant, child_timeline_id, iterations=2)


@pytest.mark.parametrize(
@@ -758,7 +759,7 @@ def test_timeline_delete_works_for_remote_smoke(
        )

    # for some reason the check above doesnt immediately take effect for the below.
-    # Assume it is mock server incosistency and check twice.
+    # Assume it is mock server inconsistency and check twice.
    wait_until(
        2,
        0.5,
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,4 +1,4 @@
 {
-    "postgres-v15": "e3fbfc4d143b2d3c3c1813ce747f8af35aa9405e",
-    "postgres-v14": "12c5dc8281d20b5bd636e1097eea80a7bc609591"
+    "postgres-v15": "553f2d3618a6d4893bde67f1c065926ee8a3a118",
+    "postgres-v14": "28bf5ccfa2fda9677566a25abd450e714d9ed055"
 }
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -60,6 +60,7 @@ url = { version = "2", features = ["serde"] }
 [build-dependencies]
 anyhow = { version = "1", features = ["backtrace"] }
 bytes = { version = "1", features = ["serde"] }
+cc = { version = "1", default-features = false, features = ["parallel"] }
 either = { version = "1" }
 itertools = { version = "0.10" }
 libc = { version = "0.2", features = ["extra_traits"] }
Author	SHA1	Message	Date
John Spray	550eadc1d5	pageserver: cargo fmt This is in a separate commit to make the previous ones more readable.	2023-08-03 14:12:46 +01:00
John Spray	db0deb8457	pageserver: respect task_mgr cancellation in metrics task This previously relied on seeing a channel close, when Tenant is destroyed: this task ran beyond Tenant::shutdown, whereas the idea of that shutdown function is that all the per-tenant background tasks are joined when it completes. Instead, stop this task as soon as background tasks for the Tenant are cancelled, making the behavior of shutdown() much more obvious and bringing the task into line with how we do shutdown in other background tasks.	2023-08-03 10:01:12 +01:00
John Spray	a7ad080961	pageserver: remove bare tokio::spawn Commit `ddb9c2fe9` added this task, which is launched with tokio::spawn rather than task_mgr::spawn, and is not wrapped in a tracing Instrumented<>. While the task doesn't overtly do any logging, for consistency we should spawn all our tasks via the wrapper: in future this task might be extended to e.g. emit log lines that we would expect to have the proper tracing spans.	2023-08-03 09:53:38 +01:00
John Spray	df49a9b7aa	pagekeeper: suppress error logs in shutdown/detach (#4876 ) ## Problem Error messages like this coming up during normal operations: ``` Compaction failed, retrying in 2s: timeline is Stopping Compaction failed, retrying in 2s: Cannot run compaction iteration on inactive tenant ``` ## Summary of changes Add explicit handling for the shutdown case in these locations, to suppress error logs.	2023-08-02 19:31:09 +01:00
bojanserafimov	4ad0c8f960	compute_ctl: Prewarm before starting http server (#4867 )	2023-08-02 14:19:06 -04:00
Joonas Koivunen	e0b05ecafb	build: ca-certificates need to be present (#4880 ) as needed since #4715 or this will happen: ``` ERROR panic{thread=main location=.../hyper-rustls-0.23.2/src/config.rs:48:9}: no CA certificates found ```	2023-08-02 20:34:21 +03:00
Vadim Kharitonov	ca4d71a954	Upgrade pg_embedding to 0.3.5 (#4873 )	2023-08-02 18:18:33 +03:00
Alexander Bayandin	381f41e685	Bump cryptography from 41.0.2 to 41.0.3 (#4870 )	2023-08-02 14:10:36 +03:00
Alek Westover	d005c77ea3	Tar Remote Extensions (#4715 ) Add infrastructure to dynamically load postgres extensions and shared libraries from remote extension storage. Before postgres start downloads list of available remote extensions and libraries, and also downloads 'shared_preload_libraries'. After postgres is running, 'compute_ctl' listens for HTTP requests to load files. Postgres has new GUC 'extension_server_port' to specify port on which 'compute_ctl' listens for requests. When PostgreSQL requests a file, 'compute_ctl' downloads it. See more details about feature design and remote extension storage layout in docs/rfcs/024-extension-loading.md --------- Co-authored-by: Anastasia Lubennikova <anastasia@neon.tech> Co-authored-by: Alek Westover <alek.westover@gmail.com>	2023-08-02 12:38:12 +03:00
Joonas Koivunen	04776ade6c	fix(consumption): rename _size_ => _data_ (#4866 ) I failed at renaming the metric middle part while managing to do a great job with the suffix. Fix the middle part as well.	2023-08-01 19:18:25 +03:00
Dmitry Rodionov	c3fe335eaf	wait for tenant to be active before polling for timeline absence (#4856 ) ## Problem https://neon-github-public-dev.s3.amazonaws.com/reports/main/5692829577/index.html#suites/f588e0a787c49e67b29490359c589fae/4c50937643d68a66 ## Summary of changes wait for tenant to be active after restart before polling for timeline absence	2023-08-01 18:28:18 +03:00
Joonas Koivunen	3a00a5deb2	refactor: tidy consumption metrics (#4860 ) Tidying up I've been wanting to do for some time. Follow-up to #4857.	2023-08-01 18:14:16 +03:00
Joonas Koivunen	78fa2b13e5	test: written_size_bytes_delta (#4857 ) Two stabs at this, by mocking a http receiver and the globals out (now reverted) and then by separating the timeline dependency and just testing what kind of events certain timelines produce. I think this pattern could work for some of our problems. Follow-up to #4822.	2023-08-01 15:30:36 +03:00
John Spray	7c076edeea	pageserver: tweak period of imitate_layer_accesses (#4859 ) ## Problem When the eviction threshold is an integer multiple of the eviction period, it is unreliable to skip imitating accesses based on whether the last imitation was more recent than the threshold. This is because a finite time passes between the time used for the periodic execution, and the 'now' time used for updating last_layer_access_imitation. When this is just a few milliseconds, and everything else is on-time, then a 5 second threshold with a 1 second period will end up entering its 5th iteration slightly _less than_ 5 seconds since last_layer_access_imitation, and thereby skipping instead of running the imitation. If a few milliseconds then pass before we check the access time of a file that _should_ have been bumped by the imitation pass, then we end up evicting something we shouldn't have evicted. ## Summary of changes We can make this race far less likely by using the threshold minus one interval as the period for re-executing the imitate_layer_accesses: that way we're not vulnerable to racing by just a few millis, and there would have to be a delay of the order `period` to cause us to wrongly evict a layer. This is not a complete solution: it would be good to revisit this and use a non-walltime mechanism for pinning these layers into local storage, rather than relying on bumping access times.	2023-08-01 13:17:49 +01:00
Arpad Müller	69528b7c30	Prepare k-merge in compaction for async I/O (#4836 ) ## Problem The k-merge in pageserver compaction currently relies on iterators over the keys and also over the values. This approach does not support async code because we are using iterators and those don't support async in general. Also, the k-merge implementation we use doesn't support async either. Instead, as we already load all the keys into memory, the plan is to just do the sorting in-memory for now, switch to async, and then once we want to support workloads that don't have all keys stored in memory, we can look into switching to a k-merge implementation that supports async instead. ## Summary of changes The core of this PR is the move from functions on the `PersistentLayer` trait to return custom iterator types to inherent functions on `DeltaLayer` that return buffers with all keys or value references. Value references are a type we created in this PR, containing a `BlobRef` as well as an `Arc` pointer to the `DeltaLayerInner`, so that we can lazily load the values during compaction. This preserves the property of the current code. This PR does not switch us to doing the k-merge via sort on slices, but with this PR, doing such a switch is relatively easy and only requires changes of the compaction code itself. Part of https://github.com/neondatabase/neon/issues/4743	2023-08-01 13:38:35 +02:00
Konstantin Knizhnik	a98a80abc2	Define NEON_SMGR to make it possible for extensions to use Neon SMGR API (#4840 ) ## Problem See https://neondb.slack.com/archives/C036U0GRMRB/p1689148023067319 ## Summary of changes Define NEON_SMGR in smgr.h ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>	2023-08-01 10:04:45 +03:00
Alex Chi Z	7b6c849456	support isolation level + read only for http batch sql (#4830 ) We will retrieve `neon-batch-isolation-level` and `neon-batch-read-only` from the http header, which sets the txn properties. https://github.com/neondatabase/serverless/pull/38#issuecomment-1653130981 --------- Signed-off-by: Alex Chi Z <chi@neon.tech>	2023-08-01 02:59:11 +03:00
Joonas Koivunen	326189d950	consumption_metrics: send timeline_written_size_delta (#4822 ) We want to have timeline_written_size_delta which is defined as difference to the previously sent `timeline_written_size` from the current `timeline_written_size`. Solution is to send it. On the first round `disk_consistent_lsn` is used which is captured during `load` time. After that an incremental "event" is sent on every collection. Incremental "events" are not part of deduplication. I've added some infrastructure to allow somewhat typesafe `EventType::Absolute` and `EventType::Incremental` factories per metrics, now that we have our first `EventType::Incremental` usage.	2023-07-31 22:10:19 +03:00
bojanserafimov	ddbe170454	Prewarm compute nodes (#4828 )	2023-07-31 14:13:32 -04:00
Alexander Bayandin	39e458f049	test_compatibility: fix pg_tenant_only_port port collision (#4850 ) ## Problem Compatibility tests fail from time to time due to `pg_tenant_only_port` port collision (added in https://github.com/neondatabase/neon/pull/4731) ## Summary of changes - replace `pg_tenant_only_port` value in config with new port - remove old logic that we don't need anymore - unify config overrides	2023-07-31 20:49:46 +03:00
Vadim Kharitonov	e1424647a0	Update pg_embedding to 0.3.1 version (#4811 )	2023-07-31 20:23:18 +03:00
Yinnan Yao	705ae2dce9	Fix error message for listen_pg_addr_tenant_only binding (#4787 ) ## Problem Wrong use of `conf.listen_pg_addr` in `error!()`. ## Summary of changes Use `listen_pg_addr_tenant_only` instead of `conf.listen_pg_addr`. Signed-off-by: yaoyinnan <35447132+yaoyinnan@users.noreply.github.com>	2023-07-31 14:40:52 +01:00
Conrad Ludgate	eb78603121	proxy: div by zero (#4845 ) ## Problem 1. In the CacheInvalid state loop, we weren't checking the `num_retries`. If this managed to get up to `32`, the retry_after procedure would compute 2^32 which would overflow to 0 and trigger a div by zero 2. When fixing the above, I started working on a flow diagram for the state machine logic and realised it was more complex than it had to be: a. We start in a `Cached` state b. `Cached`: call `connect_once`. After the first connect_once error, we always move to the `CacheInvalid` state, otherwise, we return the connection. c. `CacheInvalid`: we attempt to `wake_compute` and we either switch to Cached or we retry this step (or we error). d. `Cached`: call `connect_once`. We either retry this step or we have a connection (or we error) - After num_retries > 1 we never switch back to `CacheInvalid`. ## Summary of changes 1. Insert a `num_retries` check in the `handle_try_wake` procedure. Also using floats in the retry_after procedure to prevent the overflow entirely 2. Refactor connect_to_compute to be more linear in design.	2023-07-31 09:30:24 -04:00
John Spray	f0ad603693	pageserver: add unit test for deleted_at in IndexPart (#4844 ) ## Problem Existing IndexPart unit tests only exercised the version 1 format (i.e. without deleted_at set). ## Summary of changes Add a test that sets version to 2, and sets a value for deleted_at. Closes https://github.com/neondatabase/neon/issues/4162	2023-07-31 12:51:18 +01:00
Arpad Müller	e5183f85dc	Make DiskBtreeReader::dump async (#4838 ) ## Problem `DiskBtreeReader::dump` calls `read_blk` internally, which we want to make async in the future. As it is currently relying on recursion, and async doesn't like recursion, we want to find an alternative to that and instead traverse the tree using a loop and a manual stack. ## Summary of changes * Make `DiskBtreeReader::dump` and all the places calling it async * Make `DiskBtreeReader::dump` non-recursive internally and use a stack instead. It now deparses the node in each iteration, which isn't optimal, but on the other hand it's hard to store the node as it is referencing the buffer. Self referential data are hard in Rust. For a dumping function, speed isn't a priority so we deparse the node multiple times now (up to branching factor many times). Part of https://github.com/neondatabase/neon/issues/4743 I have verified that output is unchanged by comparing the output of this command both before and after this patch: ``` cargo test -p pageserver -- particular_data --nocapture ```	2023-07-31 12:52:29 +02:00
Joonas Koivunen	89ee8f2028	fix: demote warnings, fix flakyness (#4837 ) `WARN ... found future (image\|delta) layer` are not actionable log lines. They don't need to be warnings. `info!` is enough. This also fixes some known but not tracked flakyness in [`test_remote_timeline_client_calls_started_metric`][evidence]. [evidence]: https://neon-github-public-dev.s3.amazonaws.com/reports/pr-4829/5683495367/index.html#/testresult/34fe79e24729618b Closes #3369. Closes #4473.	2023-07-31 07:43:12 +00:00
Alex Chi Z	a8f3540f3d	proxy: add unit test for wake_compute (#4819 ) ## Problem ref https://github.com/neondatabase/neon/pull/4721, ref https://github.com/neondatabase/neon/issues/4709 ## Summary of changes This PR adds unit tests for wake_compute. The patch adds a new variant `Test` to auth backends. When `wake_compute` is called, we will verify if it is the exact operation sequence we are expecting. The operation sequence now contains 3 more operations: `Wake`, `WakeRetry`, and `WakeFail`. The unit tests for proxy connects are now complete and I'll continue work on WebSocket e2e test in future PRs. --------- Signed-off-by: Alex Chi Z <chi@neon.tech>	2023-07-28 19:10:55 -04:00
Konstantin Knizhnik	4338eed8c4	Make it possible to grant self permissions to self-created roles (#4821 ) ## Problem See: https://neondb.slack.com/archives/C04USJQNLD6/p1689973957908869 ## Summary of changes Bump Postgres version ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>	2023-07-28 22:06:03 +03:00
Joonas Koivunen	2fbdf26094	test: raise timeout to avoid flakiness (#4832 ) 2s timeout was too tight for our CI, [evidence](https://neon-github-public-dev.s3.amazonaws.com/reports/main/5669956577/index.html#/testresult/6388e31182cc2d6e). 15s might be better. Also clean up code no longer needed after #4204.	2023-07-28 14:32:01 -04:00
Alexander Bayandin	7374634845	test_runner: clean up test_compatibility (#4770 ) ## Problem We have some amount of outdated logic in test_compatibility, that we don't need anymore. ## Summary of changes - Remove `PR4425_ALLOWED_DIFF` and tune `dump_differs` method to accept allowed diffs in the future (a cleanup after https://github.com/neondatabase/neon/pull/4425) - Remove etcd-related code (a cleanup after https://github.com/neondatabase/neon/pull/2733) - Don't set `preserve_database_files`	2023-07-28 16:15:31 +01:00
Alexander Bayandin	9fdd3a4a1e	test_runner: add amcheck to test_compatibility (#4772 ) Run `pg_amcheck` in forward and backward compatibility tests to catch some data corruption. ## Summary of changes - Add amcheck compiling to Makefile - Add `pg_amcheck` to test_compatibility	2023-07-28 16:00:55 +01:00
Alek Westover	3681fc39fd	modify `relative_path_to_s3_object` logic for `prefix=None` (#4795 ) see added unit tests for more description	2023-07-28 10:03:18 -04:00
Joonas Koivunen	67d2fa6dec	test: fix `test_neon_cli_basics` flakyness without making it better for future (#4827 ) The test was starting two endpoints on the same branch as discovered by @petuhovskiy. The fix is to allow passing branch-name from the python side over to neon_local, which already accepted it. Split from #4824, which will handle making this more misuse resistant.	2023-07-27 19:13:58 +03:00
Dmitry Rodionov	cafbe8237e	Move tenant/delete.rs to tenant/timeline/delete.rs (#4825 ) move tenant/delete.rs to tenant/timeline/delete.rs to prepare for appearance of tenant deletion routines in tenant/delete.rs	2023-07-27 15:52:36 +03:00
Joonas Koivunen	3e425c40c0	fix(compute_ctl): remove stray variable in error message (#4823 ) error is not needed because anyhow will have the cause chain reported anyways. related to test_neon_cli_basics being flaky, but doesn't actually fix any flakyness, just the obvious stray `{e}`.	2023-07-27 15:40:53 +03:00
Joonas Koivunen	395bd9174e	test: allow future image layer warning (#4818 ) https://neon-github-public-dev.s3.amazonaws.com/reports/main/5670795960/index.html#suites/837740b64a53e769572c4ed7b7a7eeeb/5a73fa4a69399123/retries Allow it because we are doing immediate stop.	2023-07-27 10:22:44 +03:00
Alek Westover	b9a7a661d0	add list of public extensions and lookup table for libraries (#4807 )	2023-07-26 15:55:55 -04:00
Joonas Koivunen	48ce95533c	test: allow normal warnings in test_threshold_based_eviction (#4801 ) See: https://neon-github-public-dev.s3.amazonaws.com/reports/main/5654328815/index.html#suites/3fc871d9ee8127d8501d607e03205abb/3482458eba88c021	2023-07-26 20:20:12 +03:00
Dmitry Rodionov	874c31976e	dedup cleanup fs traces (#4778 ) This is a follow up for discussion: https://github.com/neondatabase/neon/pull/4552#discussion_r1253417777 see context there	2023-07-26 18:39:32 +03:00
Conrad Ludgate	231d7a7616	proxy: retry compute wake in auth (#4817 ) ## Problem wake_compute can fail sometimes but is eligible for retries. We retry during the main connect, but not during auth. ## Summary of changes retry wake_compute during auth flow if there was an error talking to control plane, or if there was a temporary error in waking the compute node	2023-07-26 16:34:46 +01:00
arpad-m	5705413d90	Use OnceLock instead of manually implementing it (#4805 ) ## Problem In https://github.com/neondatabase/neon/issues/4743 , I'm trying to make more of the pageserver async, but in order for that to happen, I need to be able to persist the result of `ImageLayer::load` across await points. For that to happen, the return value needs to be `Send`. ## Summary of changes Use `OnceLock` in the image layer instead of manually implementing it with booleans, locks and `Option`. Part of #4743	2023-07-26 17:20:09 +02:00
Conrad Ludgate	35370f967f	proxy: add some connection init logs (#4812 ) ## Problem The first session event we emit is after we receive the first startup packet from the client. This means we can't detect any issues between TCP open and handling of the first PG packet ## Summary of changes Add some new logs for websocket upgrade and connection handling	2023-07-26 15:03:51 +00:00
Alexander Bayandin	b98419ee56	Fix allure report overwriting for different Postgres versions (#4806 ) ## Problem We've got an example of Allure reports from 2 different runners for the same build that started to upload at the exact second, making one overwrite another ## Summary of changes - Use the Postgres version to distinguish artifacts (along with the build type)	2023-07-26 15:19:18 +01:00
Alexander Bayandin	86a61b318b	Bump certifi from 2022.12.7 to 2023.7.22 (#4815 )	2023-07-26 16:32:56 +03:00