clippy

minor fixes
few more panics instead of bailouts
2026-05-24 08:30:37 +00:00 · 2023-06-23 15:48:42 +02:00 · 2023-06-23 13:00:28 +02:00 · 2023-06-23 13:00:17 +02:00 · 2023-06-23 12:47:33 +02:00 · 2023-06-16 17:34:08 +02:00
58 changed files with 1597 additions and 2714 deletions
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -180,8 +180,7 @@ jobs:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init

-    # Increase timeout to 8h, default timeout is 6h
-    timeout-minutes: 480
+    timeout-minutes: 360 # 6h

    steps:
    - uses: actions/checkout@v3
@@ -322,6 +321,8 @@ jobs:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init

+    timeout-minutes: 360 # 6h
+
    steps:
    - uses: actions/checkout@v3

@@ -413,6 +414,8 @@ jobs:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init

+    timeout-minutes: 360 # 6h
+
    steps:
    - uses: actions/checkout@v3

@@ -498,6 +501,8 @@ jobs:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init

+    timeout-minutes: 360 # 6h
+
    steps:
    - uses: actions/checkout@v3

--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -916,7 +916,7 @@ jobs:
            exit 1
          fi

-      - name: Create git tag
+      - name: Create tag "release-${{ needs.tag.outputs.build-tag }}"
        if: github.ref_name == 'release'
        uses: actions/github-script@v6
        with:
@@ -926,7 +926,7 @@ jobs:
            github.rest.git.createRef({
              owner: context.repo.owner,
              repo: context.repo.repo,
-              ref: "refs/tags/${{ needs.tag.outputs.build-tag }}",
+              ref: "refs/tags/release-${{ needs.tag.outputs.build-tag }}",
              sha: context.sha,
            })

--- a/Cargo.lock
+++ b/Cargo.lock
@@ -924,14 +924,12 @@ dependencies = [
 "opentelemetry",
 "postgres",
 "regex",
- "remote_storage",
 "reqwest",
 "serde",
 "serde_json",
 "tar",
 "tokio",
 "tokio-postgres",
- "toml_edit",
 "tracing",
 "tracing-opentelemetry",
 "tracing-subscriber",
@@ -999,7 +997,6 @@ dependencies = [
 "tar",
 "thiserror",
 "toml",
- "tracing",
 "url",
 "utils",
 "workspace_hack",
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -481,23 +481,6 @@ RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_1.tar.
    make -j $(getconf _NPROCESSORS_ONLN) install && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/rdkit.control

-#########################################################################################
-#
-# Layer "pg-uuidv7-pg-build"
-# compile pg_uuidv7 extension
-#
-#########################################################################################
-FROM build-deps AS pg-uuidv7-pg-build
-COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
-
-ENV PATH "/usr/local/pgsql/bin/:$PATH"
-RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz -O pg_uuidv7.tar.gz && \
-    echo "0d0759ab01b7fb23851ecffb0bce27822e1868a4a5819bfd276101c716637a7a pg_uuidv7.tar.gz" | sha256sum --check && \
-    mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xvzf ../pg_uuidv7.tar.gz --strip-components=1 -C . && \
-    make -j $(getconf _NPROCESSORS_ONLN) && \
-    make -j $(getconf _NPROCESSORS_ONLN) install && \
-    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_uuidv7.control
-
 #########################################################################################
 #
 # Layer "rust extensions"
@@ -631,7 +614,6 @@ COPY --from=kq-imcx-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-cron-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-pgx-ulid-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=rdkit-pg-build /usr/local/pgsql/ /usr/local/pgsql/
-COPY --from=pg-uuidv7-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY pgxn/ pgxn/

 RUN make -j $(getconf _NPROCESSORS_ONLN) \
--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -30,5 +30,3 @@ url.workspace = true
 compute_api.workspace = true
 utils.workspace = true
 workspace_hack.workspace = true
-toml_edit.workspace = true
-remote_storage = { version = "0.1", path = "../libs/remote_storage/" }
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -27,8 +27,7 @@
 //! compute_ctl -D /var/db/postgres/compute \
 //!             -C 'postgresql://cloud_admin@localhost/postgres' \
 //!             -S /var/db/postgres/specs/current.json \
-//!             -b /usr/local/bin/postgres \
-//!             -r {"bucket": "my-bucket", "region": "eu-central-1"}
+//!             -b /usr/local/bin/postgres
 //! ```
 //!
 use std::collections::HashMap;
@@ -36,7 +35,7 @@ use std::fs::File;
 use std::panic;
 use std::path::Path;
 use std::process::exit;
-use std::sync::{mpsc, Arc, Condvar, Mutex, OnceLock};
+use std::sync::{mpsc, Arc, Condvar, Mutex};
 use std::{thread, time::Duration};

 use anyhow::{Context, Result};
@@ -49,7 +48,6 @@ use compute_api::responses::ComputeStatus;

 use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec};
 use compute_tools::configurator::launch_configurator;
-use compute_tools::extension_server::{get_pg_version, init_remote_storage};
 use compute_tools::http::api::launch_http_server;
 use compute_tools::logger::*;
 use compute_tools::monitor::launch_monitor;
@@ -66,20 +64,6 @@ fn main() -> Result<()> {
    info!("build_tag: {build_tag}");

    let matches = cli().get_matches();
-    let pgbin_default = String::from("postgres");
-    let pgbin = matches.get_one::<String>("pgbin").unwrap_or(&pgbin_default);
-
-    let remote_ext_config = matches.get_one::<String>("remote-ext-config");
-    let ext_remote_storage = match remote_ext_config {
-        Some(x) => match init_remote_storage(x) {
-            Ok(y) => Some(y),
-            Err(e) => {
-                dbg!("Error {:?}, setting remote storage to None", e);
-                None
-            }
-        },
-        None => None,
-    };

    let http_port = *matches
        .get_one::<u16>("http-port")
@@ -144,6 +128,9 @@ fn main() -> Result<()> {
    let compute_id = matches.get_one::<String>("compute-id");
    let control_plane_uri = matches.get_one::<String>("control-plane-uri");

+    // `pgbin` must always be present here; presumably the CLI definition defaults it to 'postgres' (confirm), otherwise this unwrap panics
+    let pgbin = matches.get_one::<String>("pgbin").unwrap();
+
    let spec;
    let mut live_config_allowed = false;
    match spec_json {
@@ -181,7 +168,6 @@ fn main() -> Result<()> {

    let mut new_state = ComputeState::new();
    let spec_set;
-
    if let Some(spec) = spec {
        let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow::anyhow!(msg))?;
        new_state.pspec = Some(pspec);
@@ -193,13 +179,9 @@ fn main() -> Result<()> {
        connstr: Url::parse(connstr).context("cannot parse connstr as a URL")?,
        pgdata: pgdata.to_string(),
        pgbin: pgbin.to_string(),
-        pgversion: get_pg_version(pgbin),
        live_config_allowed,
        state: Mutex::new(new_state),
        state_changed: Condvar::new(),
-        ext_remote_storage,
-        available_libraries: OnceLock::new(),
-        available_extensions: OnceLock::new(),
    };
    let compute = Arc::new(compute_node);

@@ -208,8 +190,6 @@ fn main() -> Result<()> {
    let _http_handle =
        launch_http_server(http_port, &compute).expect("cannot launch http endpoint thread");

-    let extension_server_port: u16 = http_port;
-
    if !spec_set {
        // No spec provided, hang waiting for it.
        info!("no compute spec provided, waiting");
@@ -250,7 +230,7 @@ fn main() -> Result<()> {
    // Start Postgres
    let mut delay_exit = false;
    let mut exit_code = None;
-    let pg = match compute.start_compute(extension_server_port) {
+    let pg = match compute.start_compute() {
        Ok(pg) => Some(pg),
        Err(err) => {
            error!("could not start the compute node: {:?}", err);
@@ -369,12 +349,6 @@ fn cli() -> clap::Command {
                .long("control-plane-uri")
                .value_name("CONTROL_PLANE_API_BASE_URI"),
        )
-        .arg(
-            Arg::new("remote-ext-config")
-                .short('r')
-                .long("remote-ext-config")
-                .value_name("REMOTE_EXT_CONFIG"),
-        )
 }

 #[test]
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -1,15 +1,13 @@
-use std::collections::HashMap;
 use std::fs;
 use std::os::unix::fs::PermissionsExt;
 use std::path::Path;
 use std::process::{Command, Stdio};
 use std::str::FromStr;
-use std::sync::{Condvar, Mutex, OnceLock};
+use std::sync::{Condvar, Mutex};

 use anyhow::{Context, Result};
 use chrono::{DateTime, Utc};
 use postgres::{Client, NoTls};
-use tokio;
 use tokio_postgres;
 use tracing::{info, instrument, warn};
 use utils::id::{TenantId, TimelineId};
@@ -18,11 +16,9 @@ use utils::lsn::Lsn;
 use compute_api::responses::{ComputeMetrics, ComputeStatus};
 use compute_api::spec::{ComputeMode, ComputeSpec};

-use remote_storage::{GenericRemoteStorage, RemotePath};
-
+use crate::config;
 use crate::pg_helpers::*;
 use crate::spec::*;
-use crate::{config, extension_server};

 /// Compute node info shared across several `compute_ctl` threads.
 pub struct ComputeNode {
@@ -30,7 +26,6 @@ pub struct ComputeNode {
    pub connstr: url::Url,
    pub pgdata: String,
    pub pgbin: String,
-    pub pgversion: String,
    /// We should only allow live re- / configuration of the compute node if
    /// it uses 'pull model', i.e. it can go to control-plane and fetch
    /// the latest configuration. Otherwise, there could be a case:
@@ -50,10 +45,6 @@ pub struct ComputeNode {
    pub state: Mutex<ComputeState>,
    /// `Condvar` to allow notifying waiters about state changes.
    pub state_changed: Condvar,
-    ///  S3 extensions configuration variables
-    pub ext_remote_storage: Option<GenericRemoteStorage>,
-    pub available_libraries: OnceLock<HashMap<String, RemotePath>>,
-    pub available_extensions: OnceLock<HashMap<String, Vec<RemotePath>>>,
 }

 #[derive(Clone, Debug)]
@@ -142,84 +133,6 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
    }
 }

-/// Create special neon_superuser role, that's a slightly nerfed version of a real superuser
-/// that we give to customers
-fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
-    let roles = spec
-        .cluster
-        .roles
-        .iter()
-        .map(|r| format!("'{}'", escape_literal(&r.name)))
-        .collect::<Vec<_>>();
-
-    let dbs = spec
-        .cluster
-        .databases
-        .iter()
-        .map(|db| format!("'{}'", escape_literal(&db.name)))
-        .collect::<Vec<_>>();
-
-    let roles_decl = if roles.is_empty() {
-        String::from("roles text[] := NULL;")
-    } else {
-        format!(
-            r#"
-               roles text[] := ARRAY(SELECT rolname
-                                     FROM pg_catalog.pg_roles
-                                     WHERE rolname IN ({}));"#,
-            roles.join(", ")
-        )
-    };
-
-    let database_decl = if dbs.is_empty() {
-        String::from("dbs text[] := NULL;")
-    } else {
-        format!(
-            r#"
-               dbs text[] := ARRAY(SELECT datname
-                                   FROM pg_catalog.pg_database
-                                   WHERE datname IN ({}));"#,
-            dbs.join(", ")
-        )
-    };
-
-    // ALL PRIVILEGES grants CREATE, CONNECT, and TEMPORARY on all databases
-    // (see https://www.postgresql.org/docs/current/ddl-priv.html)
-    let query = format!(
-        r#"
-            DO $$
-                DECLARE
-                    r text;
-                    {}
-                    {}
-                BEGIN
-                    IF NOT EXISTS (
-                        SELECT FROM pg_catalog.pg_roles WHERE rolname = 'neon_superuser')
-                    THEN
-                        CREATE ROLE neon_superuser CREATEDB CREATEROLE NOLOGIN IN ROLE pg_read_all_data, pg_write_all_data;
-                        IF array_length(roles, 1) IS NOT NULL THEN
-                            EXECUTE format('GRANT neon_superuser TO %s',
-                                           array_to_string(ARRAY(SELECT quote_ident(x) FROM unnest(roles) as x), ', '));
-                            FOREACH r IN ARRAY roles LOOP
-                                EXECUTE format('ALTER ROLE %s CREATEROLE CREATEDB', quote_ident(r));
-                            END LOOP;
-                        END IF;
-                        IF array_length(dbs, 1) IS NOT NULL THEN
-                            EXECUTE format('GRANT ALL PRIVILEGES ON DATABASE %s TO neon_superuser',
-                                           array_to_string(ARRAY(SELECT quote_ident(x) FROM unnest(dbs) as x), ', '));
-                        END IF;
-                    END IF;
-                END
-            $$;"#,
-        roles_decl, database_decl,
-    );
-    info!("Neon superuser created:\n{}", &query);
-    client
-        .simple_query(&query)
-        .map_err(|e| anyhow::anyhow!(e).context(query))?;
-    Ok(())
-}
-
 impl ComputeNode {
    pub fn set_status(&self, status: ComputeStatus) {
        let mut state = self.state.lock().unwrap();
@@ -332,22 +245,14 @@ impl ComputeNode {
    /// Do all the preparations like PGDATA directory creation, configuration,
    /// safekeepers sync, basebackup, etc.
    #[instrument(skip(self, compute_state))]
-    pub fn prepare_pgdata(
-        &self,
-        compute_state: &ComputeState,
-        extension_server_port: u16,
-    ) -> Result<()> {
+    pub fn prepare_pgdata(&self, compute_state: &ComputeState) -> Result<()> {
        let pspec = compute_state.pspec.as_ref().expect("spec must be set");
        let spec = &pspec.spec;
        let pgdata_path = Path::new(&self.pgdata);

        // Remove/create an empty pgdata directory and put configuration there.
        self.create_pgdata()?;
-        config::write_postgres_conf(
-            &pgdata_path.join("postgresql.conf"),
-            &pspec.spec,
-            Some(extension_server_port),
-        )?;
+        config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &pspec.spec)?;

        // Syncing safekeepers is only safe with primary nodes: if a primary
        // is already connected it will be kicked out, so a secondary (standby)
@@ -442,8 +347,6 @@ impl ComputeNode {
                    .map_err(|_| anyhow::anyhow!("invalid connstr"))?;

                let mut client = Client::connect(zenith_admin_connstr.as_str(), NoTls)?;
-                // Disable forwarding so that users don't get a cloud_admin role
-                client.simple_query("SET neon.forward_ddl = false")?;
                client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?;
                client.simple_query("GRANT zenith_admin TO cloud_admin")?;
                drop(client);
@@ -454,16 +357,14 @@ impl ComputeNode {
            Ok(client) => client,
        };

+        // Proceed with post-startup configuration. Note that the order of operations is important.
        // Disable DDL forwarding because control plane already knows about these roles/databases.
        client.simple_query("SET neon.forward_ddl = false")?;
-
-        // Proceed with post-startup configuration. Note, that order of operations is important.
        let spec = &compute_state.pspec.as_ref().expect("spec must be set").spec;
-        create_neon_superuser(spec, &mut client)?;
        handle_roles(spec, &mut client)?;
        handle_databases(spec, &mut client)?;
        handle_role_deletions(spec, self.connstr.as_str(), &mut client)?;
-        handle_grants(spec, self.connstr.as_str())?;
+        handle_grants(spec, self.connstr.as_str(), &mut client)?;
        handle_extensions(spec, &mut client)?;

        // 'Close' connection
@@ -489,7 +390,7 @@ impl ComputeNode {

        // Write new config
        let pgdata_path = Path::new(&self.pgdata);
-        config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &spec, None)?;
+        config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &spec)?;

        let mut client = Client::connect(self.connstr.as_str(), NoTls)?;
        self.pg_reload_conf(&mut client)?;
@@ -501,7 +402,7 @@ impl ComputeNode {
            handle_roles(&spec, &mut client)?;
            handle_databases(&spec, &mut client)?;
            handle_role_deletions(&spec, self.connstr.as_str(), &mut client)?;
-            handle_grants(&spec, self.connstr.as_str())?;
+            handle_grants(&spec, self.connstr.as_str(), &mut client)?;
            handle_extensions(&spec, &mut client)?;
        }

@@ -519,7 +420,7 @@ impl ComputeNode {
    }

    #[instrument(skip(self))]
-    pub fn start_compute(&self, extension_server_port: u16) -> Result<std::process::Child> {
+    pub fn start_compute(&self) -> Result<std::process::Child> {
        let compute_state = self.state.lock().unwrap().clone();
        let pspec = compute_state.pspec.as_ref().expect("spec must be set");
        info!(
@@ -530,9 +431,7 @@ impl ComputeNode {
            pspec.timeline_id,
        );

-        self.prepare_external_extensions(&compute_state)?;
-
-        self.prepare_pgdata(&compute_state, extension_server_port)?;
+        self.prepare_pgdata(&compute_state)?;

        let start_time = Utc::now();

@@ -668,131 +567,4 @@ LIMIT 100",
            "{{\"pg_stat_statements\": []}}".to_string()
        }
    }
-
-    // If remote extension storage is configured,
-    // download extension control files
-    // and shared preload libraries.
-    #[tokio::main]
-    pub async fn prepare_external_extensions(&self, compute_state: &ComputeState) -> Result<()> {
-        if let Some(ref ext_remote_storage) = self.ext_remote_storage {
-            let pspec = compute_state.pspec.as_ref().expect("spec must be set");
-            // download preload shared libraries before postgres start (if any)
-            let spec = &pspec.spec;
-
-            // 1. parse private extension paths from spec
-            let private_ext_prefixes = match &spec.private_extensions {
-                Some(private_extensions) => private_extensions.clone(),
-                None => Vec::new(),
-            };
-
-            info!("private_ext_prefixes: {:?}", &private_ext_prefixes);
-
-            // 2. parse shared_preload_libraries from spec
-            let mut libs_vec = Vec::new();
-
-            if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") {
-                libs_vec = libs
-                    .split(&[',', '\'', ' '])
-                    .filter(|s| *s != "neon" && !s.is_empty())
-                    .map(str::to_string)
-                    .collect();
-            }
-
-            info!(
-                "shared_preload_libraries parsed from spec.cluster.settings: {:?}",
-                libs_vec
-            );
-
-            // also parse shared_preload_libraries from provided postgresql.conf
-            // that is used in neon_local and python tests
-            if let Some(conf) = &spec.cluster.postgresql_conf {
-                let conf_lines = conf.split('\n').collect::<Vec<&str>>();
-
-                let mut shared_preload_libraries_line = "";
-                for line in conf_lines {
-                    if line.starts_with("shared_preload_libraries") {
-                        shared_preload_libraries_line = line;
-                    }
-                }
-
-                let mut preload_libs_vec = Vec::new();
-                if let Some(libs) = shared_preload_libraries_line.split("='").nth(1) {
-                    preload_libs_vec = libs
-                        .split(&[',', '\'', ' '])
-                        .filter(|s| *s != "neon" && !s.is_empty())
-                        .map(str::to_string)
-                        .collect();
-                }
-
-                info!(
-                    "shared_preload_libraries parsed from spec.cluster.postgresql_conf: {:?}",
-                    preload_libs_vec
-                );
-
-                libs_vec.extend(preload_libs_vec);
-            }
-
-            // download extension control files & shared_preload_libraries
-
-            let available_extensions = extension_server::get_available_extensions(
-                ext_remote_storage,
-                &self.pgbin,
-                &self.pgversion,
-                &private_ext_prefixes,
-            )
-            .await?;
-            self.available_extensions
-                .set(available_extensions)
-                .expect("available_extensions.set error");
-
-            info!("Libraries to download: {:?}", &libs_vec);
-            let available_libraries = extension_server::get_available_libraries(
-                ext_remote_storage,
-                &self.pgbin,
-                &self.pgversion,
-                &private_ext_prefixes,
-                &libs_vec,
-            )
-            .await?;
-            self.available_libraries
-                .set(available_libraries)
-                .expect("available_libraries.set error");
-        }
-
-        Ok(())
-    }
-
-    pub async fn download_extension_sql_files(&self, filename: String) -> Result<()> {
-        match &self.ext_remote_storage {
-            None => anyhow::bail!("No remote extension storage"),
-            Some(remote_storage) => {
-                extension_server::download_extension_sql_files(
-                    &filename,
-                    remote_storage,
-                    &self.pgbin,
-                    self.available_extensions
-                        .get()
-                        .context("available_extensions broke")?,
-                )
-                .await
-            }
-        }
-    }
-
-    pub async fn download_library_file(&self, filename: String) -> Result<()> {
-        match &self.ext_remote_storage {
-            None => anyhow::bail!("No remote extension storage"),
-            Some(remote_storage) => {
-                extension_server::download_library_file(
-                    &filename,
-                    remote_storage,
-                    &self.pgbin,
-                    self.available_libraries
-                        .get()
-                        .context("available_libraries broke")?,
-                )
-                .await
-            }
-        }
-    }
 }
--- a/compute_tools/src/config.rs
+++ b/compute_tools/src/config.rs
@@ -33,11 +33,7 @@ pub fn line_in_file(path: &Path, line: &str) -> Result<bool> {
 }

 /// Create or completely rewrite configuration file specified by `path`
-pub fn write_postgres_conf(
-    path: &Path,
-    spec: &ComputeSpec,
-    extension_server_port: Option<u16>,
-) -> Result<()> {
+pub fn write_postgres_conf(path: &Path, spec: &ComputeSpec) -> Result<()> {
    // File::create() destroys the file content if it exists.
    let mut file = File::create(path)?;

@@ -99,9 +95,5 @@ pub fn write_postgres_conf(
        writeln!(file, "# Managed by compute_ctl: end")?;
    }

-    if let Some(port) = extension_server_port {
-        writeln!(file, "neon.extension_server_port={}", port)?;
-    }
-
    Ok(())
 }
--- a/compute_tools/src/extension_server.rs
+++ b/compute_tools/src/extension_server.rs
@@ -1,379 +0,0 @@
-// Download extension files from the extension store
-// and put them in the right place in the postgres directory
-use anyhow::{self, bail, Context, Result};
-use remote_storage::*;
-use serde_json::{self, Value};
-use std::collections::HashMap;
-use std::fs::File;
-use std::io::{BufWriter, Write};
-use std::num::{NonZeroU32, NonZeroUsize};
-use std::path::{Path, PathBuf};
-use std::str;
-use tokio::io::AsyncReadExt;
-use tracing::info;
-
-const SHARE_EXT_PATH: &str = "share/postgresql/extension";
-
-fn get_pg_config(argument: &str, pgbin: &str) -> String {
-    // gives the result of `pg_config [argument]`
-    // where argument is a flag like `--version` or `--sharedir`
-    let pgconfig = pgbin.replace("postgres", "pg_config");
-    let config_output = std::process::Command::new(pgconfig)
-        .arg(argument)
-        .output()
-        .expect("pg_config error");
-    std::str::from_utf8(&config_output.stdout)
-        .expect("pg_config error")
-        .trim()
-        .to_string()
-}
-
-pub fn get_pg_version(pgbin: &str) -> String {
-    // pg_config --version returns a (platform specific) human readable string
-    // such as "PostgreSQL 15.4". We parse this to v14/v15
-    let human_version = get_pg_config("--version", pgbin);
-    if human_version.contains("15") {
-        return "v15".to_string();
-    } else if human_version.contains("14") {
-        return "v14".to_string();
-    }
-    panic!("Unsuported postgres version {human_version}");
-}
-
-async fn download_helper(
-    remote_storage: &GenericRemoteStorage,
-    remote_from_path: &RemotePath,
-    remote_from_prefix: Option<&Path>,
-    download_location: &Path,
-) -> anyhow::Result<()> {
-    // downloads file at remote_from_path to download_location/[file_name]
-
-    // we cannot use remote_from_path.object_name() here
-    // because extension files can be in subdirectories of the extension store.
-    //
-    // To handle this, we use remote_from_prefix to strip the prefix from the path
-    // this gives us the relative path of the file in the extension store,
-    // and we use this relative path to construct the local path.
-    //
-    let local_path = match remote_from_prefix {
-        Some(prefix) => {
-            let p = remote_from_path
-                .get_path()
-                .strip_prefix(prefix)
-                .expect("bad prefix");
-
-            download_location.join(p)
-        }
-        None => download_location.join(remote_from_path.object_name().expect("bad object")),
-    };
-
-    if local_path.exists() {
-        info!("File {:?} already exists. Skipping download", &local_path);
-        return Ok(());
-    }
-
-    info!(
-        "Downloading {:?} to location {:?}",
-        &remote_from_path, &local_path
-    );
-    let mut download = remote_storage.download(remote_from_path).await?;
-    let mut write_data_buffer = Vec::new();
-    download
-        .download_stream
-        .read_to_end(&mut write_data_buffer)
-        .await?;
-    if remote_from_prefix.is_some() {
-        if let Some(prefix) = local_path.parent() {
-            info!(
-                "Downloading file with prefix. Create directory {:?}",
-                prefix
-            );
-            // if directory already exists, this is a no-op
-            std::fs::create_dir_all(prefix)?;
-        }
-    }
-
-    let mut output_file = BufWriter::new(File::create(local_path)?);
-    output_file.write_all(&write_data_buffer)?;
-    Ok(())
-}
-
-// download extension control files
-//
-// if private_ext_prefixes is provided - search also in private extension paths
-//
-pub async fn get_available_extensions(
-    remote_storage: &GenericRemoteStorage,
-    pgbin: &str,
-    pg_version: &str,
-    private_ext_prefixes: &Vec<String>,
-) -> anyhow::Result<HashMap<String, Vec<RemotePath>>> {
-    let local_sharedir = Path::new(&get_pg_config("--sharedir", pgbin)).join("extension");
-
-    let mut paths: Vec<RemotePath> = Vec::new();
-    // public extensions
-    paths.push(RemotePath::new(
-        &Path::new(pg_version).join(SHARE_EXT_PATH),
-    )?);
-    // private extensions
-    for private_prefix in private_ext_prefixes {
-        paths.push(RemotePath::new(
-            &Path::new(pg_version)
-                .join(private_prefix)
-                .join(SHARE_EXT_PATH),
-        )?);
-    }
-
-    let all_available_files = list_files_in_prefixes_for_extensions(remote_storage, &paths).await?;
-
-    info!(
-        "list of available_extension files {:?}",
-        &all_available_files
-    );
-
-    // download all control files
-    for (obj_name, obj_paths) in &all_available_files {
-        for obj_path in obj_paths {
-            if obj_name.ends_with("control") {
-                download_helper(remote_storage, obj_path, None, &local_sharedir).await?;
-            }
-        }
-    }
-
-    Ok(all_available_files)
-}
-
-// Download requested shared_preload_libraries
-//
-// Note that tenant_id is not optional here, because we only download libraries
-// after we know the tenant spec and the tenant_id.
-//
-// return list of all library files to use it in the future searches
-pub async fn get_available_libraries(
-    remote_storage: &GenericRemoteStorage,
-    pgbin: &str,
-    pg_version: &str,
-    private_ext_prefixes: &Vec<String>,
-    preload_libraries: &Vec<String>,
-) -> anyhow::Result<HashMap<String, RemotePath>> {
-    let local_libdir: PathBuf = Path::new(&get_pg_config("--pkglibdir", pgbin)).into();
-    // Construct a hashmap of all available libraries
-    // example (key, value) pair: test_lib0.so, v14/lib/test_lib0.so
-
-    let mut paths: Vec<RemotePath> = Vec::new();
-    // public libraries
-    paths.push(
-        RemotePath::new(&Path::new(&pg_version).join("lib/"))
-            .expect("The hard coded path here is valid"),
-    );
-    // private libraries
-    for private_prefix in private_ext_prefixes {
-        paths.push(
-            RemotePath::new(&Path::new(&pg_version).join(private_prefix).join("lib"))
-                .expect("The hard coded path here is valid"),
-        );
-    }
-
-    let all_available_libraries = list_files_in_prefixes(remote_storage, &paths).await?;
-
-    info!("list of library files {:?}", &all_available_libraries);
-
-    // download all requested libraries
-    for lib_name in preload_libraries {
-        // add file extension if it isn't in the filename
-        let lib_name_with_ext = enforce_so_end(lib_name);
-        info!("looking for library {:?}", &lib_name_with_ext);
-
-        match all_available_libraries.get(&*lib_name_with_ext) {
-            Some(remote_path) => {
-                download_helper(remote_storage, remote_path, None, &local_libdir).await?
-            }
-            None => {
-                let file_path = local_libdir.join(&lib_name_with_ext);
-                if file_path.exists() {
-                    info!("File {:?} already exists. Skipping download", &file_path);
-                } else {
-                    bail!("Shared library file {lib_name} is not found in the extension store")
-                }
-            }
-        }
-    }
-
-    Ok(all_available_libraries)
-}
-
-// download all sqlfiles (and possibly data files) for a given extension name
-//
-pub async fn download_extension_sql_files(
-    ext_name: &str,
-    remote_storage: &GenericRemoteStorage,
-    pgbin: &str,
-    all_available_files: &HashMap<String, Vec<RemotePath>>,
-) -> Result<()> {
-    let local_sharedir = Path::new(&get_pg_config("--sharedir", pgbin)).join("extension");
-    let mut downloaded_something = false;
-
-    if let Some(files) = all_available_files.get(ext_name) {
-        for file in files {
-            if file.extension().context("bad file name")? != "control" {
-                // find files prefix to handle cases when extension files are stored
-                // in a directory with the same name as the extension
-                // example:
-                // share/postgresql/extension/extension_name/extension_name--1.0.sql
-                let index = file
-                    .get_path()
-                    .to_str()
-                    .context("invalid path")?
-                    .find(ext_name)
-                    .context("invalid path")?;
-
-                let prefix_str =
-                    file.get_path().to_str().context("invalid path")?[..index].to_string();
-                let remote_from_prefix = if prefix_str.is_empty() {
-                    None
-                } else {
-                    Some(Path::new(&prefix_str))
-                };
-
-                download_helper(remote_storage, file, remote_from_prefix, &local_sharedir).await?;
-                downloaded_something = true;
-            }
-        }
-    }
-    if !downloaded_something {
-        bail!("Files for extension {ext_name} are not found in the extension store");
-    }
-    Ok(())
-}
-
-// appends an .so suffix to libname if it does not already have one
-fn enforce_so_end(libname: &str) -> String {
-    if !libname.ends_with(".so") {
-        format!("{}.so", libname)
-    } else {
-        libname.to_string()
-    }
-}
-
-// download shared library file
-pub async fn download_library_file(
-    lib_name: &str,
-    remote_storage: &GenericRemoteStorage,
-    pgbin: &str,
-    all_available_libraries: &HashMap<String, RemotePath>,
-) -> Result<()> {
-    let local_libdir: PathBuf = Path::new(&get_pg_config("--pkglibdir", pgbin)).into();
-    let lib_name_with_ext = enforce_so_end(lib_name);
-    info!("looking for library {:?}", &lib_name_with_ext);
-    match all_available_libraries.get(&*lib_name_with_ext) {
-        Some(remote_path) => {
-            download_helper(remote_storage, remote_path, None, &local_libdir).await?
-        }
-        None => bail!("Shared library file {lib_name} is not found in the extension store"),
-    }
-    Ok(())
-}
-
-pub fn init_remote_storage(remote_ext_config: &str) -> anyhow::Result<GenericRemoteStorage> {
-    let remote_ext_config: serde_json::Value = serde_json::from_str(remote_ext_config)?;
-    let remote_ext_bucket = match &remote_ext_config["bucket"] {
-        Value::String(x) => x,
-        _ => bail!("remote_ext_config missing bucket"),
-    };
-    let remote_ext_region = match &remote_ext_config["region"] {
-        Value::String(x) => x,
-        _ => bail!("remote_ext_config missing region"),
-    };
-    let remote_ext_endpoint = match &remote_ext_config["endpoint"] {
-        Value::String(x) => Some(x.clone()),
-        _ => None,
-    };
-    let remote_ext_prefix = match &remote_ext_config["prefix"] {
-        Value::String(x) => Some(x.clone()),
-        _ => None,
-    };
-
-    // load will not be large, so default parameters are fine
-    let config = S3Config {
-        bucket_name: remote_ext_bucket.to_string(),
-        bucket_region: remote_ext_region.to_string(),
-        prefix_in_bucket: remote_ext_prefix,
-        endpoint: remote_ext_endpoint,
-        concurrency_limit: NonZeroUsize::new(100).expect("100 != 0"),
-        max_keys_per_list_response: None,
-    };
-    let config = RemoteStorageConfig {
-        max_concurrent_syncs: NonZeroUsize::new(100).expect("100 != 0"),
-        max_sync_errors: NonZeroU32::new(100).expect("100 != 0"),
-        storage: RemoteStorageKind::AwsS3(config),
-    };
-    GenericRemoteStorage::from_config(&config)
-}
-
-// helper to collect all files in the given prefixes
-// returns hashmap of (file_name, file_remote_path)
-async fn list_files_in_prefixes(
-    remote_storage: &GenericRemoteStorage,
-    paths: &Vec<RemotePath>,
-) -> Result<HashMap<String, RemotePath>> {
-    let mut res = HashMap::new();
-
-    for path in paths {
-        for file in remote_storage.list_files(Some(path)).await? {
-            res.insert(
-                file.object_name().expect("bad object").to_owned(),
-                file.to_owned(),
-            );
-        }
-    }
-
-    Ok(res)
-}
-
-// helper to extract extension name
-// extension files can be in subdirectories of the extension store.
-// examples of layout:
-//
-// share/postgresql/extension/extension_name--1.0.sql
-//
-// or
-//
-// share/postgresql/extension/extension_name/extension_name--1.0.sql
-// share/postgresql/extension/extension_name/extra_data.csv
-//
-// Note: we **assume** that the  extension files is in one of these formats.
-// If it is not, this code will not download it.
-fn get_ext_name(path: &str) -> Result<&str> {
-    let path_suffix: Vec<&str> = path.split(&format!("{SHARE_EXT_PATH}/")).collect();
-
-    let path_suffix = path_suffix.last().expect("bad ext name");
-    // the order of these is important
-    // otherwise we'll return incorrect extension name
-    // for path like share/postgresql/extension/extension_name/extension_name--1.0.sql
-    for index in ["/", "--"] {
-        if let Some(index) = path_suffix.find(index) {
-            return Ok(&path_suffix[..index]);
-        }
-    }
-    Ok(path_suffix)
-}
-
-// helper to collect files of given prefixes for extensions
-// and group them by extension
-// returns a hashmap of (extension_name, Vector of remote paths for all files needed for this extension)
-async fn list_files_in_prefixes_for_extensions(
-    remote_storage: &GenericRemoteStorage,
-    paths: &Vec<RemotePath>,
-) -> Result<HashMap<String, Vec<RemotePath>>> {
-    let mut result = HashMap::new();
-    for path in paths {
-        for file in remote_storage.list_files(Some(path)).await? {
-            let file_ext_name = get_ext_name(file.get_path().to_str().context("invalid path")?)?;
-            let ext_file_list = result
-                .entry(file_ext_name.to_string())
-                .or_insert(Vec::new());
-            ext_file_list.push(file.to_owned());
-        }
-    }
-    Ok(result)
-}
--- a/compute_tools/src/http/api.rs
+++ b/compute_tools/src/http/api.rs
@@ -121,62 +121,8 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
            }
        }

-        // download extension files from S3 on demand
-        (&Method::POST, route) if route.starts_with("/extension_server/") => {
-            info!("serving {:?} POST request", route);
-            info!("req.uri {:?}", req.uri());
-
-            let mut is_library = false;
-
-            if let Some(params) = req.uri().query() {
-                info!("serving {:?} POST request with params: {}", route, params);
-
-                if params == "is_library=true" {
-                    is_library = true;
-                } else {
-                    let mut resp = Response::new(Body::from("Wrong request parameters"));
-                    *resp.status_mut() = StatusCode::BAD_REQUEST;
-                    return resp;
-                }
-            }
-
-            let filename = route.split('/').last().unwrap().to_string();
-
-            info!(
-                "serving /extension_server POST request, filename: {:?} is_library: {}",
-                filename, is_library
-            );
-
-            if is_library {
-                match compute.download_library_file(filename.to_string()).await {
-                    Ok(_) => Response::new(Body::from("OK")),
-                    Err(e) => {
-                        error!("library download failed: {}", e);
-                        let mut resp = Response::new(Body::from(e.to_string()));
-                        *resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
-                        resp
-                    }
-                }
-            } else {
-                match compute
-                    .download_extension_sql_files(filename.to_string())
-                    .await
-                {
-                    Ok(_) => Response::new(Body::from("OK")),
-                    Err(e) => {
-                        error!("extension download failed: {}", e);
-                        let mut resp = Response::new(Body::from(e.to_string()));
-                        *resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
-                        resp
-                    }
-                }
-            }
-        }
-
        // Return the `404 Not Found` for any other routes.
-        method => {
-            info!("404 Not Found for {:?}", method);
-
+        _ => {
            let mut not_found = Response::new(Body::from("404 Not Found"));
            *not_found.status_mut() = StatusCode::NOT_FOUND;
            not_found
--- a/compute_tools/src/http/openapi_spec.yaml
+++ b/compute_tools/src/http/openapi_spec.yaml
@@ -139,34 +139,6 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/GenericError"
-  /extension_server:
-    post:
-      tags:
-      - Extension
-      summary: Download extension from S3 to local folder.
-      description: ""
-      operationId: downloadExtension
-      responses:
-        200:
-          description: Extension downloaded
-          content:
-            text/plain:
-              schema:
-                type: string
-                description: Error text or 'OK' if download succeeded.
-                example: "OK"
-        400:
-        description: Request is invalid.
-        content:
-          application/json:
-            schema:
-              $ref: "#/components/schemas/GenericError"
-        500:
-        description: Extension download request failed.
-        content:
-          application/json:
-            schema:
-              $ref: "#/components/schemas/GenericError"

 components:
  securitySchemes:
--- a/compute_tools/src/lib.rs
+++ b/compute_tools/src/lib.rs
@@ -9,7 +9,6 @@ pub mod http;
 #[macro_use]
 pub mod logger;
 pub mod compute;
-pub mod extension_server;
 pub mod monitor;
 pub mod params;
 pub mod pg_helpers;
--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -17,7 +17,7 @@ use compute_api::spec::{Database, GenericOption, GenericOptions, PgIdent, Role};
 const POSTGRES_WAIT_TIMEOUT: Duration = Duration::from_millis(60 * 1000); // milliseconds

 /// Escape a string for including it in a SQL literal
-pub fn escape_literal(s: &str) -> String {
+fn escape_literal(s: &str) -> String {
    s.replace('\'', "''").replace('\\', "\\\\")
 }

--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -124,7 +124,7 @@ pub fn get_spec_from_control_plane(
 pub fn handle_configuration(spec: &ComputeSpec, pgdata_path: &Path) -> Result<()> {
    // File `postgresql.conf` is no longer included into `basebackup`, so just
    // always write all config into it creating new file.
-    config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec, None)?;
+    config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec)?;

    update_pg_hba(pgdata_path)?;

@@ -269,13 +269,17 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
                xact.execute(query.as_str(), &[])?;
            }
            RoleAction::Create => {
-                let mut query: String = format!(
-                    "CREATE ROLE {} CREATEROLE CREATEDB IN ROLE neon_superuser",
-                    name.pg_quote()
-                );
+                let mut query: String = format!("CREATE ROLE {} ", name.pg_quote());
                info!("role create query: '{}'", &query);
                query.push_str(&role.to_pg_options());
                xact.execute(query.as_str(), &[])?;
+
+                let grant_query = format!(
+                    "GRANT pg_read_all_data, pg_write_all_data TO {}",
+                    name.pg_quote()
+                );
+                xact.execute(grant_query.as_str(), &[])?;
+                info!("role grant query: '{}'", &grant_query);
            }
        }

@@ -472,11 +476,6 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
                query.push_str(&db.to_pg_options());
                let _guard = info_span!("executing", query).entered();
                client.execute(query.as_str(), &[])?;
-                let grant_query: String = format!(
-                    "GRANT ALL PRIVILEGES ON DATABASE {} TO neon_superuser",
-                    name.pg_quote()
-                );
-                client.execute(grant_query.as_str(), &[])?;
            }
        };

@@ -496,9 +495,35 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
 /// Grant CREATE ON DATABASE to the database owner and do some other alters and grants
 /// to allow users creating trusted extensions and re-creating `public` schema, for example.
 #[instrument(skip_all)]
-pub fn handle_grants(spec: &ComputeSpec, connstr: &str) -> Result<()> {
+pub fn handle_grants(spec: &ComputeSpec, connstr: &str, client: &mut Client) -> Result<()> {
    info!("cluster spec grants:");

+    // We now have a separate `web_access` role to connect to the database
+    // via the web interface and proxy link auth. And also we grant a
+    // read / write all data privilege to every role. So also grant
+    // create to everyone.
+    // XXX: later we should stop messing with Postgres ACL in such horrible
+    // ways.
+    let roles = spec
+        .cluster
+        .roles
+        .iter()
+        .map(|r| r.name.pg_quote())
+        .collect::<Vec<_>>();
+
+    for db in &spec.cluster.databases {
+        let dbname = &db.name;
+
+        let query: String = format!(
+            "GRANT CREATE ON DATABASE {} TO {}",
+            dbname.pg_quote(),
+            roles.join(", ")
+        );
+        info!("grant query {}", &query);
+
+        client.execute(query.as_str(), &[])?;
+    }
+
    // Do some per-database access adjustments. We'd better do this at db creation time,
    // but CREATE DATABASE isn't transactional. So we cannot create db + do some grants
    // atomically.
--- a/control_plane/Cargo.toml
+++ b/control_plane/Cargo.toml
@@ -32,4 +32,3 @@ utils.workspace = true

 compute_api.workspace = true
 workspace_hack.workspace = true
-tracing.workspace = true
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -657,8 +657,6 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                .get_one::<String>("endpoint_id")
                .ok_or_else(|| anyhow!("No endpoint ID was provided to start"))?;

-            let remote_ext_config = sub_args.get_one::<String>("remote-ext-config");
-
            // If --safekeepers argument is given, use only the listed safekeeper nodes.
            let safekeepers =
                if let Some(safekeepers_str) = sub_args.get_one::<String>("safekeepers") {
@@ -700,7 +698,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                    _ => {}
                }
                println!("Starting existing endpoint {endpoint_id}...");
-                endpoint.start(&auth_token, safekeepers, remote_ext_config)?;
+                endpoint.start(&auth_token, safekeepers)?;
            } else {
                let branch_name = sub_args
                    .get_one::<String>("branch-name")
@@ -744,7 +742,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                    pg_version,
                    mode,
                )?;
-                ep.start(&auth_token, safekeepers, remote_ext_config)?;
+                ep.start(&auth_token, safekeepers)?;
            }
        }
        "stop" => {
@@ -1004,12 +1002,6 @@ fn cli() -> Command {
        .help("Additional pageserver's configuration options or overrides, refer to pageserver's 'config-override' CLI parameter docs for more")
        .required(false);

-    let remote_ext_config_args = Arg::new("remote-ext-config")
-        .long("remote-ext-config")
-        .num_args(1)
-        .help("Configure the S3 bucket that we search for extensions in.")
-        .required(false);
-
    let lsn_arg = Arg::new("lsn")
        .long("lsn")
        .help("Specify Lsn on the timeline to start from. By default, end of the timeline would be used.")
@@ -1160,7 +1152,6 @@ fn cli() -> Command {
                    .arg(pg_version_arg)
                    .arg(hot_standby_arg)
                    .arg(safekeepers_arg)
-                    .arg(remote_ext_config_args)
                )
                .subcommand(
                    Command::new("stop")
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -311,7 +311,7 @@ impl Endpoint {

                // TODO: use future host field from safekeeper spec
                // Pass the list of safekeepers to the replica so that it can connect to any of them,
-                // whichever is available.
+                // whichever is available.
                let sk_ports = self
                    .env
                    .safekeepers
@@ -408,12 +408,7 @@ impl Endpoint {
        Ok(())
    }

-    pub fn start(
-        &self,
-        auth_token: &Option<String>,
-        safekeepers: Vec<NodeId>,
-        remote_ext_config: Option<&String>,
-    ) -> Result<()> {
+    pub fn start(&self, auth_token: &Option<String>, safekeepers: Vec<NodeId>) -> Result<()> {
        if self.status() == "running" {
            anyhow::bail!("The endpoint is already running");
        }
@@ -481,7 +476,6 @@ impl Endpoint {
            pageserver_connstring: Some(pageserver_connstring),
            safekeeper_connstrings,
            storage_auth_token: auth_token.clone(),
-            private_extensions: Some(vec![self.tenant_id.to_string()]), //DEBUG ONLY
        };
        let spec_path = self.endpoint_path().join("spec.json");
        std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
@@ -513,9 +507,6 @@ impl Endpoint {
            .stdin(std::process::Stdio::null())
            .stderr(logfile.try_clone()?)
            .stdout(logfile);
-        if let Some(remote_ext_config) = remote_ext_config {
-            cmd.args(["--remote-ext-config", remote_ext_config]);
-        }
        let _child = cmd.spawn()?;

        // Wait for it to start
--- a/docs/rfcs/024-extension-loading.md
+++ b/docs/rfcs/024-extension-loading.md
@@ -1,301 +0,0 @@
-# Supporting custom user Extensions
-
-Created 2023-05-03
-
-## Motivation
-
-There are many extensions in the PostgreSQL ecosystem, and not all extensions
-are of a quality that we can confidently support them. Additionally, our
-current extension inclusion mechanism has several problems because we build all
-extensions into the primary Compute image: We build the extensions every time
-we build the compute image regardless of whether we actually need to rebuild
-the image, and the inclusion of these extensions in the image adds a hard
-dependency on all supported extensions - thus increasing the image size, and
-with it the time it takes to download that image - increasing first start
-latency.
-
-This RFC proposes a dynamic loading mechanism that solves most of these
-problems.
-
-## Summary
-
-`compute_ctl` is made responsible for loading extensions on-demand into
-the container's file system for dynamically loaded extensions, and will also
-make sure that the extensions in `shared_preload_libraries` are downloaded
-before the compute node starts.
-
-## Components
-
-compute_ctl, PostgreSQL, neon (extension), Compute Host Node, Extension Store
-
-## Requirements
-
-Compute nodes with no extra extensions should not be negatively impacted by
-the existence of support for many extensions.
-
-Installing an extension into PostgreSQL should be easy.
-
-Non-preloaded extensions shouldn't impact startup latency.
-
-Uninstalled extensions shouldn't impact query latency.
-
-A small latency penalty for dynamically loaded extensions is acceptable in
-the first seconds of compute startup, but not in steady-state operations.
-
-## Proposed implementation
-
-### On-demand, JIT-loading of extensions
-
-TLDR; we download extensions as soon as we need them, or when we have spare
-time.
-
-That means, we first download the extensions required to start the PostMaster
-(`shared_preload_libraries` and their dependencies), then the libraries required
-before a backend can start processing user input (`preload_libraries` and
-dependencies), and then (with network limits applied) the remainder of the
-configured extensions, with prioritization for installed extensions.
-
-If PostgreSQL tries to load a library that is not yet fully on disk, it will
-ask `compute_ctl` first if the extension has been downloaded yet, and will wait
-for `compute_ctl` to finish downloading that extension. `compute_ctl` will
-prioritize downloading that extension over other extensions that were not yet
-requested.
-
-#### Workflow
-
-```mermaid
-sequenceDiagram
-    autonumber
-    participant EX as External (control plane, ...)
-    participant CTL as compute_ctl
-    participant ST as extension store
-    actor PG as PostgreSQL
-
-    EX ->>+ CTL: Start compute with config X
-
-    note over CTL: The configuration contains a list of all <br/>extensions available to that compute node, etc.
-
-    par Optionally parallel or concurrent
-        loop Available extensions
-            CTL ->>+ ST: Download control file of extension
-            activate CTL
-            ST ->>- CTL: Finish downloading control file
-            CTL ->>- CTL: Put control file in extensions directory
-        end
-
-        loop For each extension in shared_preload_libraries
-            CTL ->>+ ST: Download extension's data
-            activate CTL
-            ST ->>- CTL: Finish downloading
-            CTL ->>- CTL: Put extension's files in the right place
-        end
-    end
-
-    CTL ->>+ PG: Start PostgreSQL
-
-    note over CTL: PostgreSQL can now start accepting <br/>connections. However, users may still need to wait <br/>for preload_libraries extensions to get downloaded.
-
-    par Load preload_libraries
-        loop For each extension in preload_libraries
-            CTL ->>+ ST: Download extension's data
-            activate CTL
-            ST ->>- CTL: Finish downloading
-            CTL ->>- CTL: Put extension's files in the right place
-        end
-    end
-
-    note over CTL: After this, connections don't have any hard <br/>waits for extension files left, except for those <br/>connections that override preload_libraries <br/>in their startup packet
-
-    par PG's internal_load_library(library)
-        alt Library is not yet loaded
-            PG ->>+ CTL: Load library X
-            CTL ->>+ ST: Download the extension that provides X
-            ST ->>- CTL: Finish downloading
-            CTL ->> CTL: Put extension's files in the right place
-            CTL ->>- PG: Ready
-        else Library is already loaded
-            note over PG: No-op
-        end
-    and Download all remaining extensions
-        loop Extension X
-            CTL ->>+ ST: Download not-yet-downloaded extension X
-            activate CTL
-            ST ->>- CTL: Finish downloading
-            CTL ->>- CTL: Put extension's files in the right place
-        end
-    end
-
-    deactivate PG
-    deactivate CTL
-```
-
-#### Summary
-
-Pros:
- - Startup is only as slow as it takes to load all (shared_)preload_libraries
- - Supports BYO Extension
-
-Cons:
- - O(sizeof(extensions)) IO requirement for loading all extensions.
-
-### Alternative solutions
-
-1. Allow users to add their extensions to the base image
-   
-   Pros:
-    - Easy to deploy
-
-   Cons:
-    - Doesn't scale - first start size is dependent on image size;
-    - All extensions are shared across all users: It doesn't allow users to
-      bring their own restrictive-licensed extensions
-
-2. Bring Your Own compute image
-   
-   Pros:
-    - Still easy to deploy
-    - User can bring own patched version of PostgreSQL
-
-   Cons:
-    - First start latency is O(sizeof(extensions image))
-    - Warm instance pool for skipping pod schedule latency is not feasible with
-      O(n) custom images
-    - Support channels are difficult to manage
-
-3. Download all user extensions in bulk on compute start
-   
-   Pros:
-    - Easy to deploy
-    - No startup latency issues for "clean" users.
-    - Warm instance pool for skipping pod schedule latency is possible
-
-   Cons:
-    - Downloading all extensions in advance takes a lot of time, thus startup
-      latency issues
-
-4. Store user's extensions in persistent storage
-   
-   Pros:
-    - Easy to deploy
-    - No startup latency issues
-    - Warm instance pool for skipping pod schedule latency is possible
-
-   Cons:
-    - EC2 instances have only limited number of attachments shared between EBS
-      volumes, direct-attached NVMe drives, and ENIs.
-    - Compute instance migration isn't trivially solved for EBS mounts (e.g.
-      the device is unavailable whilst moving the mount between instances).
-    - EBS can only mount on one instance at a time (except the expensive IO2
-      device type).
-
-5. Store user's extensions in network drive
-   
-   Pros:
-    - Easy to deploy
-    - Few startup latency issues
-    - Warm instance pool for skipping pod schedule latency is possible
-
-   Cons:
-    - We'd need networked drives, and a lot of them, which would store many
-      duplicate extensions.
-    - **UNCHECKED:** Compute instance migration may not work nicely with
-      networked IOs
-
-
-### Idea extensions
-
-The extension store does not have to be S3 directly, but could be a Node-local
-caching service on top of S3. This would reduce the load on the network for
-popular extensions.
-
-## Extension Store implementation
-
-Extension Store in our case is a private S3 bucket.
-Extensions are stored as tarballs in the bucket. The tarball contains the extension's control file and all the files that the extension needs to run.
-
-We may also store the control file separately from the tarball to speed up the extension loading.
-
-`s3://<the-bucket>/extensions/ext-name/sha-256+1234abcd1234abcd1234abcd1234abcd/bundle.tar`
-
-where `ext-name` is an extension name and `sha-256+1234abcd1234abcd1234abcd1234abcd` is a hash of a specific extension version tarball.
-
-To ensure security, there is no direct access to the S3 bucket from compute node.
-
-Control plane forms a list of extensions available to the compute node 
-and forms a short-lived [pre-signed URL](https://docs.aws.amazon.com/AmazonS3/latest/userguide/ShareObjectPreSignedURL.html) 
-for each extension that is available to the compute node.
-
-so, `compute_ctl` receives spec in the following format
-
-```
-"extensions": [{
-  "meta_format": 1,
-  "extension_name": "postgis",
-  "link": "https://<the-bucket>/extensions/sha-256+1234abcd1234abcd1234abcd1234abcd/bundle.tar?AWSAccessKeyId=1234abcd1234abcd1234abcd1234abcd&Expires=1234567890&Signature=1234abcd1234abcd1234abcd1234abcd",
-  ...
-}]
-```
-
-`compute_ctl` then downloads the extension from the link and unpacks it to the right place.
-
-### How do we handle private extensions?
-
-Private and public extensions are treated equally from the Extension Store perspective.
-The only difference is that the private extensions are not listed in the user UI (managed by control plane).
-
-### How to add new extension to the Extension Store?
-
-Since we need to verify that the extension is compatible with the compute node and doesn't contain any malicious code, 
-we need to review the extension before adding it to the Extension Store.
-
-I do not expect that we will have a lot of extensions to review, so we can do it manually for now.
-
-Some admin UI may be added later to automate this process.
-
-The list of extensions available to a compute node is stored in the console database.
-
-### How is the list of available extensions managed? 
-
-We need to add new tables to the console database to store the list of available extensions, their versions and access rights.
-
-something like this:
-
-```
-CREATE TABLE extensions (
-    id SERIAL PRIMARY KEY,
-    name VARCHAR(255) NOT NULL,
-    version VARCHAR(255) NOT NULL,
-    hash VARCHAR(255) NOT NULL, // this is the path to the extension in the Extension Store
-    supported_postgres_versions integer[] NOT NULL, 
-    is_public BOOLEAN NOT NULL, // public extensions are available to all users
-    is_shared_preload BOOLEAN NOT NULL, // these extensions require postgres restart
-    is_preload BOOLEAN NOT NULL,
-    license VARCHAR(255) NOT NULL,
-);
-
-CREATE TABLE user_extensions (
-    user_id INTEGER NOT NULL,
-    extension_id INTEGER NOT NULL,
-    FOREIGN KEY (user_id) REFERENCES users (id),
-    FOREIGN KEY (extension_id) REFERENCES extensions (id)
-);
-```
-
-When new extension is added to the Extension Store, we add a new record to the table and set permissions.
- 
-In UI, user may select the extensions that they want to use with their compute node.
-
-NOTE: Extensions that require postgres restart will not be available until the next compute restart.
-Also, currently user cannot force postgres restart. We should add this feature later.
-
-For other extensions, we must communicate updates to `compute_ctl` and they will be downloaded in the background.
-
-### How can user update the extension?
-
-User can update the extension by selecting the new version of the extension in the UI.
-
-### Alternatives
-
-For extensions written on trusted languages we can also adopt
-`dbdev` PostgreSQL Package Manager based on `pg_tle` by Supabase.
-This will increase the amount supported extensions and decrease the amount of work required to support them.
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -60,8 +60,6 @@ pub struct ComputeSpec {
    /// If set, 'storage_auth_token' is used as the password to authenticate to
    /// the pageserver and safekeepers.
    pub storage_auth_token: Option<String>,
-
-    pub private_extensions: Option<Vec<String>>,
 }

 #[serde_as]
--- a/libs/metrics/src/lib.rs
+++ b/libs/metrics/src/lib.rs
@@ -23,7 +23,6 @@ use prometheus::{Registry, Result};
 pub mod launch_timestamp;
 mod wrappers;
 pub use wrappers::{CountedReader, CountedWriter};
-pub mod metric_vec_duration;

 pub type UIntGauge = GenericGauge<AtomicU64>;
 pub type UIntGaugeVec = GenericGaugeVec<AtomicU64>;
--- a/libs/metrics/src/metric_vec_duration.rs
+++ b/libs/metrics/src/metric_vec_duration.rs
@@ -1,23 +0,0 @@
-//! Helpers for observing duration on HistogramVec / CounterVec / GaugeVec / MetricVec<T>.
-
-use std::{future::Future, time::Instant};
-
-pub trait DurationResultObserver {
-    fn observe_result<T, E>(&self, res: &Result<T, E>, duration: std::time::Duration);
-}
-
-pub async fn observe_async_block_duration_by_result<
-    T,
-    E,
-    F: Future<Output = Result<T, E>>,
-    O: DurationResultObserver,
->(
-    observer: &O,
-    block: F,
-) -> Result<T, E> {
-    let start = Instant::now();
-    let result = block.await;
-    let duration = start.elapsed();
-    observer.observe_result(&result, duration);
-    result
-}
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -154,6 +154,7 @@ pub enum ActivatingFrom {
 /// A state of a timeline in pageserver's memory.
 #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
 pub enum TimelineState {
+    Creating,
    /// The timeline is recognized by the pageserver but is not yet operational.
    /// In particular, the walreceiver connection loop is not running for this timeline.
    /// It will eventually transition to state Active or Broken.
@@ -165,7 +166,10 @@ pub enum TimelineState {
    /// It cannot transition back into any other state.
    Stopping,
    /// The timeline is broken and not operational (previous states: Loading or Active).
-    Broken { reason: String, backtrace: String },
+    Broken {
+        reason: String,
+        backtrace: String,
+    },
 }

 #[serde_as]
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -184,20 +184,6 @@ pub enum GenericRemoteStorage {
 }

 impl GenericRemoteStorage {
-    // A function for listing all the files in a "directory"
-    // Example:
-    // list_files("foo/bar") = ["foo/bar/a.txt", "foo/bar/b.txt"]
-    pub async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
-        match self {
-            Self::LocalFs(s) => s.list_files(folder).await,
-            Self::AwsS3(s) => s.list_files(folder).await,
-            Self::Unreliable(s) => s.list_files(folder).await,
-        }
-    }
-
-    // lists common *prefixes*, if any of files
-    // Example:
-    // list_prefixes("foo123","foo567","bar123","bar432") = ["foo", "bar"]
    pub async fn list_prefixes(
        &self,
        prefix: Option<&RemotePath>,
@@ -209,6 +195,14 @@ impl GenericRemoteStorage {
        }
    }

+    pub async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
+        match self {
+            Self::LocalFs(s) => s.list_files(folder).await,
+            Self::AwsS3(s) => s.list_files(folder).await,
+            Self::Unreliable(s) => s.list_files(folder).await,
+        }
+    }
+
    pub async fn upload(
        &self,
        from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -349,17 +349,10 @@ impl RemoteStorage for S3Bucket {

    /// See the doc for `RemoteStorage::list_files`
    async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
-        let mut folder_name = folder
+        let folder_name = folder
            .map(|p| self.relative_path_to_s3_object(p))
            .or_else(|| self.prefix_in_bucket.clone());

-        // remove leading "/" if one exists
-        if let Some(folder_name_slash) = folder_name.clone() {
-            if folder_name_slash.starts_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
-                folder_name = Some(folder_name_slash[1..].to_string());
-            }
-        }
-
        // AWS may need to break the response into several parts
        let mut continuation_token = None;
        let mut all_files = vec![];
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -495,50 +495,50 @@ fn start_pageserver(
                Ok(())
            },
        );
-    }

-    if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint {
-        let background_jobs_barrier = background_jobs_barrier;
-        let metrics_ctx = RequestContext::todo_child(
-            TaskKind::MetricsCollection,
-            // This task itself shouldn't download anything.
-            // The actual size calculation does need downloads, and
-            // creates a child context with the right DownloadBehavior.
-            DownloadBehavior::Error,
-        );
-        task_mgr::spawn(
-            crate::BACKGROUND_RUNTIME.handle(),
-            TaskKind::MetricsCollection,
-            None,
-            None,
-            "consumption metrics collection",
-            true,
-            async move {
-                // first wait until background jobs are cleared to launch.
-                //
-                // this is because we only process active tenants and timelines, and the
-                // Timeline::get_current_logical_size will spawn the logical size calculation,
-                // which will not be rate-limited.
-                let cancel = task_mgr::shutdown_token();
+        if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint {
+            let background_jobs_barrier = background_jobs_barrier;
+            let metrics_ctx = RequestContext::todo_child(
+                TaskKind::MetricsCollection,
+                // This task itself shouldn't download anything.
+                // The actual size calculation does need downloads, and
+                // creates a child context with the right DownloadBehavior.
+                DownloadBehavior::Error,
+            );
+            task_mgr::spawn(
+                MGMT_REQUEST_RUNTIME.handle(),
+                TaskKind::MetricsCollection,
+                None,
+                None,
+                "consumption metrics collection",
+                true,
+                async move {
+                    // first wait until background jobs are cleared to launch.
+                    //
+                    // this is because we only process active tenants and timelines, and the
+                    // Timeline::get_current_logical_size will spawn the logical size calculation,
+                    // which will not be rate-limited.
+                    let cancel = task_mgr::shutdown_token();

-                tokio::select! {
-                    _ = cancel.cancelled() => { return Ok(()); },
-                    _ = background_jobs_barrier.wait() => {}
-                };
+                    tokio::select! {
+                        _ = cancel.cancelled() => { return Ok(()); },
+                        _ = background_jobs_barrier.wait() => {}
+                    };

-                pageserver::consumption_metrics::collect_metrics(
-                    metric_collection_endpoint,
-                    conf.metric_collection_interval,
-                    conf.cached_metric_collection_interval,
-                    conf.synthetic_size_calculation_interval,
-                    conf.id,
-                    metrics_ctx,
-                )
-                .instrument(info_span!("metrics_collection"))
-                .await?;
-                Ok(())
-            },
-        );
+                    pageserver::consumption_metrics::collect_metrics(
+                        metric_collection_endpoint,
+                        conf.metric_collection_interval,
+                        conf.cached_metric_collection_interval,
+                        conf.synthetic_size_calculation_interval,
+                        conf.id,
+                        metrics_ctx,
+                    )
+                    .instrument(info_span!("metrics_collection"))
+                    .await?;
+                    Ok(())
+                },
+            );
+        }
    }

    // Spawn a task to listen for libpq connections. It will spawn further tasks
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -1128,6 +1128,8 @@ async fn disk_usage_eviction_run(
        freed_bytes: 0,
    };

+    use crate::task_mgr::MGMT_REQUEST_RUNTIME;
+
    let (tx, rx) = tokio::sync::oneshot::channel();

    let state = get_state(&r);
@@ -1145,7 +1147,7 @@ async fn disk_usage_eviction_run(
    let _g = cancel.drop_guard();

    crate::task_mgr::spawn(
-        crate::task_mgr::BACKGROUND_RUNTIME.handle(),
+        MGMT_REQUEST_RUNTIME.handle(),
        TaskKind::DiskUsageEviction,
        None,
        None,
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1,4 +1,3 @@
-use metrics::metric_vec_duration::DurationResultObserver;
 use metrics::{
    register_counter_vec, register_histogram, register_histogram_vec, register_int_counter,
    register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge_vec,
@@ -425,27 +424,6 @@ pub static SMGR_QUERY_TIME: Lazy<HistogramVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub struct BasebackupQueryTime(HistogramVec);
-pub static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|| {
-    BasebackupQueryTime({
-        register_histogram_vec!(
-            "pageserver_basebackup_query_seconds",
-            "Histogram of basebackup queries durations, by result type",
-            &["result"],
-            CRITICAL_OP_BUCKETS.into(),
-        )
-        .expect("failed to define a metric")
-    })
-});
-
-impl DurationResultObserver for BasebackupQueryTime {
-    fn observe_result<T, E>(&self, res: &Result<T, E>, duration: std::time::Duration) {
-        let label_value = if res.is_ok() { "ok" } else { "error" };
-        let metric = self.0.get_metric_with_label_values(&[label_value]).unwrap();
-        metric.observe(duration.as_secs_f64());
-    }
-}
-
 pub static LIVE_CONNECTIONS_COUNT: Lazy<IntGaugeVec> = Lazy::new(|| {
    register_int_gauge_vec!(
        "pageserver_live_connections",
@@ -768,6 +746,7 @@ impl StorageTimeMetrics {

 #[derive(Debug)]
 pub struct TimelineMetrics {
+    fake: bool,
    tenant_id: String,
    timeline_id: String,
    pub get_reconstruct_data_time_histo: Histogram,
@@ -792,6 +771,7 @@ pub struct TimelineMetrics {

 impl TimelineMetrics {
    pub fn new(
+        fake: bool,
        tenant_id: &TenantId,
        timeline_id: &TimelineId,
        evictions_with_low_residence_duration_builder: EvictionsWithLowResidenceDurationBuilder,
@@ -845,7 +825,13 @@ impl TimelineMetrics {
        let evictions_with_low_residence_duration =
            evictions_with_low_residence_duration_builder.build(&tenant_id, &timeline_id);

-        TimelineMetrics {
+        // TODO(chi): remove this once we remove Lazy for all metrics. Until then, touch these counters here;
+        // otherwise they will not appear in the exporter and the integration tests will fail.
+        MATERIALIZED_PAGE_CACHE_HIT_DIRECT.get();
+        MATERIALIZED_PAGE_CACHE_HIT.get();
+
+        let m = TimelineMetrics {
+            fake,
            tenant_id,
            timeline_id,
            get_reconstruct_data_time_histo,
@@ -867,12 +853,16 @@ impl TimelineMetrics {
                evictions_with_low_residence_duration,
            ),
            read_num_fs_layers,
-        }
-    }
-}
+        };

-impl Drop for TimelineMetrics {
-    fn drop(&mut self) {
+        if fake {
+            m.remove_metrics();
+        }
+
+        m
+    }
+
+    fn remove_metrics(&self) {
        let tenant_id = &self.tenant_id;
        let timeline_id = &self.timeline_id;
        let _ = GET_RECONSTRUCT_DATA_TIME.remove_label_values(&[tenant_id, timeline_id]);
@@ -909,6 +899,14 @@ impl Drop for TimelineMetrics {
    }
 }

+impl Drop for TimelineMetrics {
+    fn drop(&mut self) {
+        if !self.fake {
+            self.remove_metrics();
+        }
+    }
+}
+
 pub fn remove_tenant_metrics(tenant_id: &TenantId) {
    let tid = tenant_id.to_string();
    let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]);
@@ -1319,8 +1317,4 @@ pub fn preinitialize_metrics() {

    // Same as above for this metric, but, it's a Vec-type metric for which we don't know all the labels.
    BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT.reset();
-
-    // Python tests need these.
-    MATERIALIZED_PAGE_CACHE_HIT_DIRECT.get();
-    MATERIALIZED_PAGE_CACHE_HIT.get();
 }
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -14,6 +14,7 @@ use bytes::Buf;
 use bytes::Bytes;
 use futures::Stream;
 use pageserver_api::models::TenantState;
+use pageserver_api::models::TimelineState;
 use pageserver_api::models::{
    PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse,
    PagestreamErrorResponse, PagestreamExistsRequest, PagestreamExistsResponse,
@@ -24,6 +25,7 @@ use postgres_backend::{self, is_expected_io_error, AuthType, PostgresBackend, Qu
 use pq_proto::framed::ConnectionError;
 use pq_proto::FeStartupPacket;
 use pq_proto::{BeMessage, FeMessage, RowDescriptor};
+use std::collections::hash_map::Entry;
 use std::io;
 use std::net::TcpListener;
 use std::pin::pin;
@@ -51,6 +53,8 @@ use crate::metrics::{LIVE_CONNECTIONS_COUNT, SMGR_QUERY_TIME};
 use crate::task_mgr;
 use crate::task_mgr::TaskKind;
 use crate::tenant;
+use crate::tenant::compare_arced_timeline;
+use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::mgr;
 use crate::tenant::mgr::GetTenantError;
 use crate::tenant::{Tenant, Timeline};
@@ -487,11 +491,20 @@ impl PageServerHandler {
    where
        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
    {
+        debug_assert_current_span_has_tenant_and_timeline_id();
+
        task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
        // Create empty timeline
        info!("creating new timeline");
        let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?;
-        let timeline = tenant.create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx)?;
+
+        let (guard, real_timeline_not_in_tenants_map) = tenant
+            .create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx)
+            .await?;
+
+        // TODO spawn flush loop of timeline early (before activation),
+        // but then we need to take care of shutting it down in case we fail
+        // (bootstrap_timeline probably also needs it?)

        // TODO mark timeline as not ready until it reaches end_lsn.
        // We might have some wal to import as well, and we should prevent compute
@@ -505,21 +518,49 @@ impl PageServerHandler {

        // Import basebackup provided via CopyData
        info!("importing basebackup");
-        pgb.write_message_noflush(&BeMessage::CopyInResponse)?;
-        pgb.flush().await?;
+        let doit = async {
+            pgb.write_message_noflush(&BeMessage::CopyInResponse)?;
+            pgb.flush().await?;

-        let mut copyin_reader = pin!(StreamReader::new(copyin_stream(pgb)));
-        timeline
-            .import_basebackup_from_tar(
-                &mut copyin_reader,
-                base_lsn,
-                self.broker_client.clone(),
-                &ctx,
-            )
-            .await?;
+            let mut copyin_reader = pin!(StreamReader::new(copyin_stream(pgb)));
+            real_timeline_not_in_tenants_map
+                .import_basebackup_from_tar(&mut copyin_reader, base_lsn, &ctx)
+                .await?;

-        // Read the end of the tar archive.
-        read_tar_eof(copyin_reader).await?;
+            // Read the end of the tar archive.
+            read_tar_eof(copyin_reader).await?;
+            anyhow::Ok(())
+        };
+        let placeholder_timeline = match doit.await {
+            Ok(()) => {
+                match guard.creation_complete_remove_uninit_marker_and_get_placeholder_timeline() {
+                    Ok(placeholder_timeline) => placeholder_timeline,
+                    Err(err) => {
+                        error!(
+                            "failed to remove uninit marker for new_timeline_id={timeline_id}: {err:#}"
+                        );
+                        return Err(QueryError::Other(err.context("remove uninit marker file")));
+                    }
+                }
+            }
+            Err(e) => {
+                debug_assert_current_span_has_tenant_and_timeline_id();
+                guard.creation_failed();
+                return Err(QueryError::Other(e));
+            }
+        };
+
+        // TODO: share this placeholder-replacement logic with Tenant::create_timeline
+        match tenant.timelines.lock().unwrap().entry(timeline_id) {
+            Entry::Vacant(_) => unreachable!("we created a placeholder earlier, and load_local_timeline should have inserted the real timeline"),
+            Entry::Occupied(mut o) => {
+                info!("replacing placeholder timeline with the real one");
+                assert_eq!(placeholder_timeline.current_state(), TimelineState::Creating);
+                assert!(compare_arced_timeline(&placeholder_timeline, o.get()));
+                let replaced_placeholder = o.insert(Arc::clone(&real_timeline_not_in_tenants_map));
+                assert!(compare_arced_timeline(&replaced_placeholder, &placeholder_timeline));
+            },
+        }

        // TODO check checksum
        // Meanwhile you can verify client-side by taking fullbackup
@@ -527,7 +568,9 @@ impl PageServerHandler {
        // It wouldn't work if base came from vanilla postgres though,
        // since we discard some log files.

-        info!("done");
+        info!("done, activating timeline");
+        real_timeline_not_in_tenants_map.activate(self.broker_client.clone(), None, &ctx);
+
        Ok(())
    }

@@ -913,24 +956,10 @@ where
                None
            };

-            metrics::metric_vec_duration::observe_async_block_duration_by_result(
-                &*crate::metrics::BASEBACKUP_QUERY_TIME,
-                async move {
-                    self.handle_basebackup_request(
-                        pgb,
-                        tenant_id,
-                        timeline_id,
-                        lsn,
-                        None,
-                        false,
-                        ctx,
-                    )
-                    .await?;
-                    pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
-                    anyhow::Ok(())
-                },
-            )
-            .await?;
+            // Run the basebackup request, then report command completion to the client.
+            self.handle_basebackup_request(pgb, tenant_id, timeline_id, lsn, None, false, ctx)
+                .await?;
+            pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
        }
        // return pair of prev_lsn and last_lsn
        else if query_string.starts_with("get_last_record_rlsn ") {
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -273,6 +273,8 @@ pub enum TaskKind {

    DebugTool,

+    CreateTimeline,
+
    #[cfg(test)]
    UnitTest,
 }
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -20,7 +20,9 @@ use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::task_mgr::{self, TaskKind};
 use crate::tenant::config::TenantConfOpt;
-use crate::tenant::{create_tenant_files, CreateTenantFilesMode, Tenant, TenantState};
+use crate::tenant::{
+    create_tenant_files, CreateTenantFilesMode, Tenant, TenantState, TimelineLoadCause,
+};
 use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME};

 use utils::fs_ext::PathExt;
@@ -121,6 +123,7 @@ pub async fn init_tenant_mgr(
                        &tenant_dir_path,
                        broker_client.clone(),
                        remote_storage.clone(),
+                        TimelineLoadCause::Startup,
                        Some(init_order.clone()),
                        &ctx,
                    ) {
@@ -157,6 +160,7 @@ pub fn schedule_local_tenant_processing(
    tenant_path: &Path,
    broker_client: storage_broker::BrokerClientChannel,
    remote_storage: Option<GenericRemoteStorage>,
+    cause: TimelineLoadCause,
    init_order: Option<InitializationOrder>,
    ctx: &RequestContext,
 ) -> anyhow::Result<Arc<Tenant>> {
@@ -174,6 +178,7 @@ pub fn schedule_local_tenant_processing(
        })?,
        "Cannot load tenant from empty directory {tenant_path:?}"
    );
+    // TODO ensure there's no uninit mark / handle it correctly during ignore and load

    let tenant_id = tenant_path
        .file_name()
@@ -216,6 +221,7 @@ pub fn schedule_local_tenant_processing(
            tenant_id,
            broker_client,
            remote_storage,
+            cause,
            init_order,
            ctx,
        )
@@ -315,7 +321,7 @@ pub async fn create_tenant(
        //       See https://github.com/neondatabase/neon/issues/4233

        let created_tenant =
-            schedule_local_tenant_processing(conf, &tenant_directory, broker_client, remote_storage, None, ctx)?;
+            schedule_local_tenant_processing(conf, &tenant_directory, broker_client, remote_storage, TimelineLoadCause::TenantCreate, None, ctx)?;
        // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
        //      See https://github.com/neondatabase/neon/issues/4233

@@ -463,7 +469,7 @@ pub async fn load_tenant(
                .with_context(|| format!("Failed to remove tenant ignore mark {tenant_ignore_mark:?} during tenant loading"))?;
        }

-        let new_tenant = schedule_local_tenant_processing(conf, &tenant_path, broker_client, remote_storage, None, ctx)
+        let new_tenant = schedule_local_tenant_processing(conf, &tenant_path, broker_client, remote_storage, TimelineLoadCause::TenantLoad, None, ctx)
            .with_context(|| {
                format!("Failed to schedule tenant processing in path {tenant_path:?}")
            })?;
@@ -536,7 +542,7 @@ pub async fn attach_tenant(
            .context("check for attach marker file existence")?;
        anyhow::ensure!(marker_file_exists, "create_tenant_files should have created the attach marker file");

-        let attached_tenant = schedule_local_tenant_processing(conf, &tenant_dir, broker_client, Some(remote_storage), None, ctx)?;
+        let attached_tenant = schedule_local_tenant_processing(conf, &tenant_dir, broker_client, Some(remote_storage), TimelineLoadCause::Attach, None, ctx)?;
        // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
        //      See https://github.com/neondatabase/neon/issues/4233

--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -12,7 +12,7 @@ use crate::context::RequestContext;
 use crate::repository::{Key, Value};
 use crate::task_mgr::TaskKind;
 use crate::walrecord::NeonWalRecord;
-use anyhow::{Context, Result};
+use anyhow::Result;
 use bytes::Bytes;
 use enum_map::EnumMap;
 use enumset::EnumSet;
@@ -343,8 +343,7 @@ impl LayerAccessStats {
 /// All layers should implement a minimal `std::fmt::Debug` without tenant or
 /// timeline names, because those are known in the context of which the layers
 /// are used in (timeline).
-#[async_trait::async_trait]
-pub trait Layer: std::fmt::Debug + Send + Sync + 'static {
+pub trait Layer: std::fmt::Debug + Send + Sync {
    /// Range of keys that this layer covers
    fn get_key_range(&self) -> Range<Key>;

@@ -374,42 +373,13 @@ pub trait Layer: std::fmt::Debug + Send + Sync + 'static {
    /// is available. If this returns ValueReconstructResult::Continue, look up
    /// the predecessor layer and call again with the same 'reconstruct_data' to
    /// collect more data.
-    fn get_value_reconstruct_data_blocking(
+    fn get_value_reconstruct_data(
        &self,
        key: Key,
        lsn_range: Range<Lsn>,
-        reconstruct_data: ValueReconstructState,
-        ctx: RequestContext,
-    ) -> Result<(ValueReconstructState, ValueReconstructResult)>;
-
-    /// CANCEL SAFETY: if the returned future is dropped,
-    /// the wrapped closure still run to completion and the return value discarded.
-    /// For the case of get_value_reconstruct_data, we expect the closure to not
-    /// have any side effects, as it only attempts to read a layer (and stuff like
-    /// page cache isn't considered a real side effect).
-    /// But, ...
-    /// TRACING:
-    /// If the returned future is cancelled, the spawn_blocking span can outlive
-    /// the caller's span.
-    /// So, technically, we should be using `parent: None` and `follows_from: current`
-    /// instead. However, in practice, the advantage of maintaining the span stack
-    /// in logs outweighs the disadvantage of having a dangling span in a case that
-    /// is not expected to happen because in pageserver we generally don't drop pending futures.
-    async fn get_value_reconstruct_data(
-        self: Arc<Self>,
-        key: Key,
-        lsn_range: Range<Lsn>,
-        reconstruct_data: ValueReconstructState,
-        ctx: RequestContext,
-    ) -> Result<(ValueReconstructState, ValueReconstructResult)> {
-        let span = tracing::info_span!("get_value_reconstruct_data_spawn_blocking");
-        tokio::task::spawn_blocking(move || {
-            let _enter = span.enter();
-            self.get_value_reconstruct_data_blocking(key, lsn_range, reconstruct_data, ctx)
-        })
-        .await
-        .context("spawn_blocking")?
-    }
+        reconstruct_data: &mut ValueReconstructState,
+        ctx: &RequestContext,
+    ) -> Result<ValueReconstructResult>;

    /// A short ID string that uniquely identifies the given layer within a [`LayerMap`].
    fn short_id(&self) -> String;
@@ -529,7 +499,6 @@ impl LayerDescriptor {
    }
 }

-#[async_trait::async_trait]
 impl Layer for LayerDescriptor {
    fn get_key_range(&self) -> Range<Key> {
        self.key.clone()
@@ -543,13 +512,13 @@ impl Layer for LayerDescriptor {
        self.is_incremental
    }

-    fn get_value_reconstruct_data_blocking(
+    fn get_value_reconstruct_data(
        &self,
        _key: Key,
        _lsn_range: Range<Lsn>,
-        _reconstruct_data: ValueReconstructState,
-        _ctx: RequestContext,
-    ) -> Result<(ValueReconstructState, ValueReconstructResult)> {
+        _reconstruct_data: &mut ValueReconstructState,
+        _ctx: &RequestContext,
+    ) -> Result<ValueReconstructResult> {
        todo!("This method shouldn't be part of the Layer trait")
    }

--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -218,7 +218,6 @@ impl std::fmt::Debug for DeltaLayerInner {
    }
 }

-#[async_trait::async_trait]
 impl Layer for DeltaLayer {
    /// debugging function to print out the contents of the layer
    fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
@@ -295,13 +294,13 @@ impl Layer for DeltaLayer {
        Ok(())
    }

-    fn get_value_reconstruct_data_blocking(
+    fn get_value_reconstruct_data(
        &self,
        key: Key,
        lsn_range: Range<Lsn>,
-        mut reconstruct_state: ValueReconstructState,
-        ctx: RequestContext,
-    ) -> anyhow::Result<(ValueReconstructState, ValueReconstructResult)> {
+        reconstruct_state: &mut ValueReconstructState,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<ValueReconstructResult> {
        ensure!(lsn_range.start >= self.desc.lsn_range.start);
        let mut need_image = true;

@@ -309,7 +308,7 @@ impl Layer for DeltaLayer {

        {
            // Open the file and lock the metadata in memory
-            let inner = self.load(LayerAccessKind::GetValueReconstructData, &ctx)?;
+            let inner = self.load(LayerAccessKind::GetValueReconstructData, ctx)?;

            // Scan the page versions backwards, starting from `lsn`.
            let file = &inner.file;
@@ -375,9 +374,9 @@ impl Layer for DeltaLayer {
        // If an older page image is needed to reconstruct the page, let the
        // caller know.
        if need_image {
-            Ok((reconstruct_state, ValueReconstructResult::Continue))
+            Ok(ValueReconstructResult::Continue)
        } else {
-            Ok((reconstruct_state, ValueReconstructResult::Complete))
+            Ok(ValueReconstructResult::Complete)
        }
    }

--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -149,7 +149,6 @@ impl std::fmt::Debug for ImageLayerInner {
    }
 }

-#[async_trait::async_trait]
 impl Layer for ImageLayer {
    /// debugging function to print out the contents of the layer
    fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
@@ -182,18 +181,18 @@ impl Layer for ImageLayer {
    }

    /// Look up given page in the file
-    fn get_value_reconstruct_data_blocking(
+    fn get_value_reconstruct_data(
        &self,
        key: Key,
        lsn_range: Range<Lsn>,
-        mut reconstruct_state: ValueReconstructState,
-        ctx: RequestContext,
-    ) -> anyhow::Result<(ValueReconstructState, ValueReconstructResult)> {
+        reconstruct_state: &mut ValueReconstructState,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<ValueReconstructResult> {
        assert!(self.desc.key_range.contains(&key));
        assert!(lsn_range.start >= self.lsn);
        assert!(lsn_range.end >= self.lsn);

-        let inner = self.load(LayerAccessKind::GetValueReconstructData, &ctx)?;
+        let inner = self.load(LayerAccessKind::GetValueReconstructData, ctx)?;

        let file = inner.file.as_ref().unwrap();
        let tree_reader = DiskBtreeReader::new(inner.index_start_blk, inner.index_root_blk, file);
@@ -211,9 +210,9 @@ impl Layer for ImageLayer {
            let value = Bytes::from(blob);

            reconstruct_state.img = Some((self.lsn, value));
-            Ok((reconstruct_state, ValueReconstructResult::Complete))
+            Ok(ValueReconstructResult::Complete)
        } else {
-            Ok((reconstruct_state, ValueReconstructResult::Missing))
+            Ok(ValueReconstructResult::Missing)
        }
    }

--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -110,7 +110,6 @@ impl InMemoryLayer {
    }
 }

-#[async_trait::async_trait]
 impl Layer for InMemoryLayer {
    fn get_key_range(&self) -> Range<Key> {
        Key::MIN..Key::MAX
@@ -191,13 +190,13 @@ impl Layer for InMemoryLayer {
    }

    /// Look up given value in the layer.
-    fn get_value_reconstruct_data_blocking(
+    fn get_value_reconstruct_data(
        &self,
        key: Key,
        lsn_range: Range<Lsn>,
-        mut reconstruct_state: ValueReconstructState,
-        _ctx: RequestContext,
-    ) -> anyhow::Result<(ValueReconstructState, ValueReconstructResult)> {
+        reconstruct_state: &mut ValueReconstructState,
+        _ctx: &RequestContext,
+    ) -> anyhow::Result<ValueReconstructResult> {
        ensure!(lsn_range.start >= self.start_lsn);
        let mut need_image = true;

@@ -214,7 +213,7 @@ impl Layer for InMemoryLayer {
                match value {
                    Value::Image(img) => {
                        reconstruct_state.img = Some((*entry_lsn, img));
-                        return Ok((reconstruct_state, ValueReconstructResult::Complete));
+                        return Ok(ValueReconstructResult::Complete);
                    }
                    Value::WalRecord(rec) => {
                        let will_init = rec.will_init();
@@ -234,9 +233,9 @@ impl Layer for InMemoryLayer {
        // If an older page image is needed to reconstruct the page, let the
        // caller know.
        if need_image {
-            Ok((reconstruct_state, ValueReconstructResult::Continue))
+            Ok(ValueReconstructResult::Continue)
        } else {
-            Ok((reconstruct_state, ValueReconstructResult::Complete))
+            Ok(ValueReconstructResult::Complete)
        }
    }
 }
--- a/pageserver/src/tenant/storage_layer/remote_layer.rs
+++ b/pageserver/src/tenant/storage_layer/remote_layer.rs
@@ -6,7 +6,7 @@ use crate::context::RequestContext;
 use crate::repository::Key;
 use crate::tenant::layer_map::BatchedUpdates;
 use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
-use crate::tenant::storage_layer::{Layer, ValueReconstructState};
+use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
 use anyhow::{bail, Result};
 use pageserver_api::models::HistoricLayerInfo;
 use std::ops::Range;
@@ -21,7 +21,7 @@ use utils::{
 use super::filename::{DeltaFileName, ImageFileName};
 use super::{
    DeltaLayer, ImageLayer, LayerAccessStats, LayerAccessStatsReset, LayerIter, LayerKeyIter,
-    LayerResidenceStatus, PersistentLayer, PersistentLayerDesc, ValueReconstructResult,
+    LayerResidenceStatus, PersistentLayer, PersistentLayerDesc,
 };

 /// RemoteLayer is a not yet downloaded [`ImageLayer`] or
@@ -63,15 +63,14 @@ impl std::fmt::Debug for RemoteLayer {
    }
 }

-#[async_trait::async_trait]
 impl Layer for RemoteLayer {
-    fn get_value_reconstruct_data_blocking(
+    fn get_value_reconstruct_data(
        &self,
        _key: Key,
        _lsn_range: Range<Lsn>,
-        _reconstruct_state: ValueReconstructState,
-        _ctx: RequestContext,
-    ) -> Result<(ValueReconstructState, ValueReconstructResult)> {
+        _reconstruct_state: &mut ValueReconstructState,
+        _ctx: &RequestContext,
+    ) -> Result<ValueReconstructResult> {
        bail!(
            "layer {} needs to be downloaded",
            self.filename().file_name()
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -68,13 +68,13 @@ use utils::{
    simple_rcu::{Rcu, RcuReadGuard},
 };

-use crate::page_cache;
 use crate::repository::GcResult;
 use crate::repository::{Key, Value};
 use crate::task_mgr::TaskKind;
 use crate::walredo::WalRedoManager;
 use crate::METADATA_FILE_NAME;
 use crate::ZERO_PAGE;
+use crate::{import_datadir, page_cache};
 use crate::{is_temporary, task_mgr};

 pub(super) use self::eviction_task::EvictionTaskTenantState;
@@ -86,6 +86,7 @@ use super::layer_map::BatchedUpdates;
 use super::remote_timeline_client::index::IndexPart;
 use super::remote_timeline_client::RemoteTimelineClient;
 use super::storage_layer::{DeltaLayer, ImageLayer, Layer, LayerAccessStatsReset};
+use super::TimelineLoadCause;

 #[derive(Debug, PartialEq, Eq, Clone, Copy)]
 pub(super) enum FlushLoopState {
@@ -129,7 +130,7 @@ pub struct Timeline {

    pub pg_version: u32,

-    pub(crate) layers: Arc<tokio::sync::RwLock<LayerMap<dyn PersistentLayer>>>,
+    pub(crate) layers: tokio::sync::RwLock<LayerMap<dyn PersistentLayer>>,

    /// Set of key ranges which should be covered by image layers to
    /// allow GC to remove old layers. This set is created by GC and its cutoff LSN is also stored.
@@ -555,14 +556,13 @@ impl Timeline {
            None => None,
        };

-        let reconstruct_state = ValueReconstructState {
+        let mut reconstruct_state = ValueReconstructState {
            records: Vec::new(),
            img: cached_page_img,
        };

        let timer = self.metrics.get_reconstruct_data_time_histo.start_timer();
-        let reconstruct_state = self
-            .get_reconstruct_data(key, lsn, reconstruct_state, ctx)
+        self.get_reconstruct_data(key, lsn, &mut reconstruct_state, ctx)
            .await?;
        timer.stop_and_record();

@@ -691,12 +691,22 @@ impl Timeline {
    /// Flush to disk all data that was written with the put_* functions
    #[instrument(skip(self), fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id))]
    pub async fn freeze_and_flush(&self) -> anyhow::Result<()> {
+        if self.current_state() == TimelineState::Creating {
+            // make a few additional sanity checks before panicking
+            assert!(self.layers.read().await.open_layer.is_none());
+            panic!("caller must prevent calls for timelines in Creating state")
+        }
        self.freeze_inmem_layer(false).await;
        self.flush_frozen_layers_and_wait().await
    }

    /// Outermost timeline compaction operation; downloads needed layers.
    pub async fn compact(self: &Arc<Self>, ctx: &RequestContext) -> anyhow::Result<()> {
+        assert!(
+            !matches!(self.current_state(), TimelineState::Creating),
+            "caller must prevent calls for timelines in Creating state"
+        );
+
        const ROUNDS: usize = 2;

        let last_record_lsn = self.get_last_record_lsn();
@@ -950,12 +960,33 @@ impl Timeline {
        background_jobs_can_start: Option<&completion::Barrier>,
        ctx: &RequestContext,
    ) {
+        if self.current_state() == TimelineState::Creating {
+            panic!("timelines in Creating state are never activated");
+        }
+        self.maybe_spawn_flush_loop();
        self.launch_wal_receiver(ctx, broker_client);
        self.set_state(TimelineState::Active);
        self.launch_eviction_task(background_jobs_can_start);
    }

    pub fn set_state(&self, new_state: TimelineState) {
+        if self.current_state() == TimelineState::Creating {
+            // Do a few assertions before panicking to detect other code that is lacking checks for `Creating` state.
+            assert_eq!(
+                *self.flush_loop_state.lock().unwrap(),
+                FlushLoopState::NotStarted
+            );
+            assert!(
+                self.layers
+                    .try_read()
+                    .expect("we would never be modifying Timeline::layers in a Creating timeline")
+                    .open_layer
+                    .is_none(),
+                "would have nothing to flush anyways"
+            );
+            assert!(self.walreceiver.lock().unwrap().is_none());
+            panic!("timelines in Creating state never change state");
+        }
        match (self.current_state(), new_state) {
            (equal_state_1, equal_state_2) if equal_state_1 == equal_state_2 => {
                warn!("Ignoring new state, equal to the existing one: {equal_state_2:?}");
@@ -1023,6 +1054,12 @@ impl Timeline {
        loop {
            let current_state = receiver.borrow().clone();
            match current_state {
+                TimelineState::Creating => {
+                    // A timeline _object_ in state Creating never transitions out of it.
+                    // It gets replaced by another object in Loading state once creation is done.
+                    // So, `self` is not the right object to subscribe to.
+                    panic!("timelines in Creating state never change state, hence can't wait for it to become active");
+                }
                TimelineState::Loading => {
                    receiver
                        .changed()
@@ -1391,13 +1428,18 @@ impl Timeline {
        timeline_id: TimelineId,
        tenant_id: TenantId,
        walredo_mgr: Arc<dyn WalRedoManager + Send + Sync>,
-        remote_client: Option<RemoteTimelineClient>,
+        remote_client: Option<Arc<RemoteTimelineClient>>,
        pg_version: u32,
+        is_create_placeholder: bool,
        initial_logical_size_can_start: Option<completion::Barrier>,
        initial_logical_size_attempt: Option<completion::Completion>,
    ) -> Arc<Self> {
        let disk_consistent_lsn = metadata.disk_consistent_lsn();
-        let (state, _) = watch::channel(TimelineState::Loading);
+        let (state, _) = watch::channel(if is_create_placeholder {
+            TimelineState::Creating
+        } else {
+            TimelineState::Loading
+        });

        let (layer_flush_start_tx, _) = tokio::sync::watch::channel(0);
        let (layer_flush_done_tx, _) = tokio::sync::watch::channel((0, Ok(())));
@@ -1419,13 +1461,13 @@ impl Timeline {
                timeline_id,
                tenant_id,
                pg_version,
-                layers: Arc::new(tokio::sync::RwLock::new(LayerMap::default())),
+                layers: tokio::sync::RwLock::new(LayerMap::default()),
                wanted_image_layers: Mutex::new(None),

                walredo_mgr,
                walreceiver: Mutex::new(None),

-                remote_client: remote_client.map(Arc::new),
+                remote_client,

                // initialize in-memory 'last_record_lsn' from 'disk_consistent_lsn'.
                last_record_lsn: SeqWait::new(RecordLsn {
@@ -1441,6 +1483,7 @@ impl Timeline {
                ancestor_lsn: metadata.ancestor_lsn(),

                metrics: TimelineMetrics::new(
+                    is_create_placeholder,
                    &tenant_id,
                    &timeline_id,
                    crate::metrics::EvictionsWithLowResidenceDurationBuilder::new(
@@ -1596,20 +1639,41 @@ impl Timeline {
        ));
    }

-    ///
-    /// Initialize with an empty layer map. Used when creating a new timeline.
-    ///
-    pub(super) fn init_empty_layer_map(&self, start_lsn: Lsn) {
-        let mut layers = self.layers.try_write().expect(
-            "in the context where we call this function, no other task has access to the object",
-        );
-        layers.next_open_layer_at = Some(Lsn(start_lsn.0));
+    /// Prepares timeline data by loading it from the basebackup archive.
+    pub(crate) async fn import_basebackup_from_tar(
+        self: &Arc<Self>,
+        copyin_read: &mut (impl tokio::io::AsyncRead + Send + Sync + Unpin),
+        base_lsn: Lsn,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<()> {
+        import_datadir::import_basebackup_from_tar(self, copyin_read, base_lsn, ctx)
+            .await
+            .context("Failed to import basebackup")?;
+
+        // The flush loop needs to be spawned in order to be able to flush.
+        // We want to run a proper checkpoint before we mark the timeline as available to the outside world,
+        // thus we spawn the flush loop manually and skip the flush_loop setup in initialize_with_lock.
+        self.maybe_spawn_flush_loop();
+
+        fail::fail_point!("before-checkpoint-new-timeline", |_| {
+            bail!("failpoint before-checkpoint-new-timeline");
+        });
+
+        self.freeze_and_flush()
+            .await
+            .context("Failed to flush after basebackup import")?;
+
+        Ok(())
    }

    ///
    /// Scan the timeline directory to populate the layer map.
    ///
-    pub(super) async fn load_layer_map(&self, disk_consistent_lsn: Lsn) -> anyhow::Result<()> {
+    pub(super) async fn load_layer_map(
+        &self,
+        cause: &TimelineLoadCause,
+        disk_consistent_lsn: Lsn,
+    ) -> anyhow::Result<()> {
        let mut layers = self.layers.write().await;
        let mut updates = layers.batch_update();
        let mut num_layers = 0;
@@ -1712,7 +1776,19 @@ impl Timeline {
        }

        updates.flush();
-        layers.next_open_layer_at = Some(Lsn(disk_consistent_lsn.0) + 1);
+
+        if disk_consistent_lsn == Lsn(0) {
+            // If disk_consistent_lsn is 0, then we're still in bootstrap/basebackup_import/create_test_timeline.
+            // Set next_open_layer_at to initdb_lsn to enable the put@initdb_lsn optimization in flush_frozen_layer.
+            assert!(matches!(cause, TimelineLoadCause::TimelineCreate { .. }));
+            assert_eq!(
+                num_layers, 0,
+                "if we crash, creating timelines get removed from disk"
+            );
+            layers.next_open_layer_at = Some(self.initdb_lsn);
+        } else {
+            layers.next_open_layer_at = Some(Lsn(disk_consistent_lsn.0) + 1);
+        }

        info!(
            "loaded layer map with {} layers at {}, total physical size: {}",
@@ -2128,6 +2204,10 @@ impl Timeline {
    ) -> Result<u64, CalculateLogicalSizeError> {
        debug_assert_current_span_has_tenant_and_timeline_id();

+        if self.current_state() == TimelineState::Creating {
+            panic!("cannot calculate logical size for timeline in Creating state");
+        }
+
        let mut timeline_state_updates = self.subscribe_for_state_updates();
        let self_calculation = Arc::clone(self);

@@ -2148,7 +2228,8 @@ impl Timeline {
                            TimelineState::Active => continue,
                            TimelineState::Broken { .. }
                            | TimelineState::Stopping
-                            | TimelineState::Loading => {
+                            | TimelineState::Loading
+                            | TimelineState::Creating  => {
                                break format!("aborted because timeline became inactive (new state: {new_state:?})")
                            }
                        }
@@ -2353,9 +2434,9 @@ impl Timeline {
        &self,
        key: Key,
        request_lsn: Lsn,
-        mut reconstruct_state: ValueReconstructState,
+        reconstruct_state: &mut ValueReconstructState,
        ctx: &RequestContext,
-    ) -> Result<ValueReconstructState, PageReconstructError> {
+    ) -> Result<(), PageReconstructError> {
        // Start from the current timeline.
        let mut timeline_owned;
        let mut timeline = self;
@@ -2385,12 +2466,12 @@ impl Timeline {
            // The function should have updated 'state'
            //info!("CALLED for {} at {}: {:?} with {} records, cached {}", key, cont_lsn, result, reconstruct_state.records.len(), cached_lsn);
            match result {
-                ValueReconstructResult::Complete => return Ok(reconstruct_state),
+                ValueReconstructResult::Complete => return Ok(()),
                ValueReconstructResult::Continue => {
                    // If we reached an earlier cached page image, we're done.
                    if cont_lsn == cached_lsn + 1 {
                        MATERIALIZED_PAGE_CACHE_HIT.inc_by(1);
-                        return Ok(reconstruct_state);
+                        return Ok(());
                    }
                    if prev_lsn <= cont_lsn {
                        // Didn't make any progress in last iteration. Error out to avoid
@@ -2494,19 +2575,13 @@ impl Timeline {
                            // Get all the data needed to reconstruct the page version from this layer.
                            // But if we have an older cached page image, no need to go past that.
                            let lsn_floor = max(cached_lsn + 1, start_lsn);
-                            result = match Arc::clone(open_layer)
-                                .get_value_reconstruct_data(
-                                    key,
-                                    lsn_floor..cont_lsn,
-                                    reconstruct_state,
-                                    ctx.attached_child(),
-                                )
-                                .await
-                            {
-                                Ok((new_reconstruct_state, result)) => {
-                                    reconstruct_state = new_reconstruct_state;
-                                    result
-                                }
+                            result = match open_layer.get_value_reconstruct_data(
+                                key,
+                                lsn_floor..cont_lsn,
+                                reconstruct_state,
+                                ctx,
+                            ) {
+                                Ok(result) => result,
                                Err(e) => return Err(PageReconstructError::from(e)),
                            };
                            cont_lsn = lsn_floor;
@@ -2527,19 +2602,13 @@ impl Timeline {
                        if cont_lsn > start_lsn {
                            //info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.filename().display());
                            let lsn_floor = max(cached_lsn + 1, start_lsn);
-                            result = match Arc::clone(frozen_layer)
-                                .get_value_reconstruct_data(
-                                    key,
-                                    lsn_floor..cont_lsn,
-                                    reconstruct_state,
-                                    ctx.attached_child(),
-                                )
-                                .await
-                            {
-                                Ok((new_reconstruct_state, result)) => {
-                                    reconstruct_state = new_reconstruct_state;
-                                    result
-                                }
+                            result = match frozen_layer.get_value_reconstruct_data(
+                                key,
+                                lsn_floor..cont_lsn,
+                                reconstruct_state,
+                                ctx,
+                            ) {
+                                Ok(result) => result,
                                Err(e) => return Err(PageReconstructError::from(e)),
                            };
                            cont_lsn = lsn_floor;
@@ -2568,19 +2637,13 @@ impl Timeline {
                            // Get all the data needed to reconstruct the page version from this layer.
                            // But if we have an older cached page image, no need to go past that.
                            let lsn_floor = max(cached_lsn + 1, lsn_floor);
-                            result = match Arc::clone(&layer)
-                                .get_value_reconstruct_data(
-                                    key,
-                                    lsn_floor..cont_lsn,
-                                    reconstruct_state,
-                                    ctx.attached_child(),
-                                )
-                                .await
-                            {
-                                Ok((new_reconstruct_state, result)) => {
-                                    reconstruct_state = new_reconstruct_state;
-                                    result
-                                }
+                            result = match layer.get_value_reconstruct_data(
+                                key,
+                                lsn_floor..cont_lsn,
+                                reconstruct_state,
+                                ctx,
+                            ) {
+                                Ok(result) => result,
                                Err(e) => return Err(PageReconstructError::from(e)),
                            };
                            cont_lsn = lsn_floor;
@@ -3389,14 +3452,14 @@ struct CompactLevel0Phase1StatsBuilder {
    version: Option<u64>,
    tenant_id: Option<TenantId>,
    timeline_id: Option<TimelineId>,
-    read_lock_acquisition_micros: DurationRecorder,
-    read_lock_held_spawn_blocking_startup_micros: DurationRecorder,
-    read_lock_held_prerequisites_micros: DurationRecorder,
-    read_lock_held_compute_holes_micros: DurationRecorder,
-    read_lock_drop_micros: DurationRecorder,
-    prepare_iterators_micros: DurationRecorder,
-    write_layer_files_micros: DurationRecorder,
+    first_read_lock_acquisition_micros: DurationRecorder,
+    get_level0_deltas_plus_drop_lock_micros: DurationRecorder,
    level0_deltas_count: Option<usize>,
+    time_spent_between_locks: DurationRecorder,
+    second_read_lock_acquisition_micros: DurationRecorder,
+    second_read_lock_held_micros: DurationRecorder,
+    sort_holes_micros: DurationRecorder,
+    write_layer_files_micros: DurationRecorder,
    new_deltas_count: Option<usize>,
    new_deltas_size: Option<u64>,
 }
@@ -3409,14 +3472,14 @@ struct CompactLevel0Phase1Stats {
    tenant_id: TenantId,
    #[serde_as(as = "serde_with::DisplayFromStr")]
    timeline_id: TimelineId,
-    read_lock_acquisition_micros: RecordedDuration,
-    read_lock_held_spawn_blocking_startup_micros: RecordedDuration,
-    read_lock_held_prerequisites_micros: RecordedDuration,
-    read_lock_held_compute_holes_micros: RecordedDuration,
-    read_lock_drop_micros: RecordedDuration,
-    prepare_iterators_micros: RecordedDuration,
-    write_layer_files_micros: RecordedDuration,
+    first_read_lock_acquisition_micros: RecordedDuration,
+    get_level0_deltas_plus_drop_lock_micros: RecordedDuration,
    level0_deltas_count: usize,
+    time_spent_between_locks: RecordedDuration,
+    second_read_lock_acquisition_micros: RecordedDuration,
+    second_read_lock_held_micros: RecordedDuration,
+    sort_holes_micros: RecordedDuration,
+    write_layer_files_micros: RecordedDuration,
    new_deltas_count: usize,
    new_deltas_size: u64,
 }
@@ -3425,51 +3488,54 @@ impl TryFrom<CompactLevel0Phase1StatsBuilder> for CompactLevel0Phase1Stats {
    type Error = anyhow::Error;

    fn try_from(value: CompactLevel0Phase1StatsBuilder) -> Result<Self, Self::Error> {
-        Ok(Self {
-            version: value.version.ok_or_else(|| anyhow!("version not set"))?,
-            tenant_id: value
-                .tenant_id
-                .ok_or_else(|| anyhow!("tenant_id not set"))?,
-            timeline_id: value
-                .timeline_id
-                .ok_or_else(|| anyhow!("timeline_id not set"))?,
-            read_lock_acquisition_micros: value
-                .read_lock_acquisition_micros
+        let CompactLevel0Phase1StatsBuilder {
+            version,
+            tenant_id,
+            timeline_id,
+            first_read_lock_acquisition_micros,
+            get_level0_deltas_plus_drop_lock_micros,
+            level0_deltas_count,
+            time_spent_between_locks,
+            second_read_lock_acquisition_micros,
+            second_read_lock_held_micros,
+            sort_holes_micros,
+            write_layer_files_micros,
+            new_deltas_count,
+            new_deltas_size,
+        } = value;
+        Ok(CompactLevel0Phase1Stats {
+            version: version.ok_or_else(|| anyhow::anyhow!("version not set"))?,
+            tenant_id: tenant_id.ok_or_else(|| anyhow::anyhow!("tenant_id not set"))?,
+            timeline_id: timeline_id.ok_or_else(|| anyhow::anyhow!("timeline_id not set"))?,
+            first_read_lock_acquisition_micros: first_read_lock_acquisition_micros
                .into_recorded()
-                .ok_or_else(|| anyhow!("read_lock_acquisition_micros not set"))?,
-            read_lock_held_spawn_blocking_startup_micros: value
-                .read_lock_held_spawn_blocking_startup_micros
+                .ok_or_else(|| anyhow::anyhow!("first_read_lock_acquisition_micros not set"))?,
+            get_level0_deltas_plus_drop_lock_micros: get_level0_deltas_plus_drop_lock_micros
                .into_recorded()
-                .ok_or_else(|| anyhow!("read_lock_held_spawn_blocking_startup_micros not set"))?,
-            read_lock_held_prerequisites_micros: value
-                .read_lock_held_prerequisites_micros
+                .ok_or_else(|| {
+                    anyhow::anyhow!("get_level0_deltas_plus_drop_lock_micros not set")
+                })?,
+            level0_deltas_count: level0_deltas_count
+                .ok_or_else(|| anyhow::anyhow!("level0_deltas_count not set"))?,
+            time_spent_between_locks: time_spent_between_locks
                .into_recorded()
-                .ok_or_else(|| anyhow!("read_lock_held_prerequisites_micros not set"))?,
-            read_lock_held_compute_holes_micros: value
-                .read_lock_held_compute_holes_micros
+                .ok_or_else(|| anyhow::anyhow!("time_spent_between_locks not set"))?,
+            second_read_lock_acquisition_micros: second_read_lock_acquisition_micros
                .into_recorded()
-                .ok_or_else(|| anyhow!("read_lock_held_compute_holes_micros not set"))?,
-            read_lock_drop_micros: value
-                .read_lock_drop_micros
+                .ok_or_else(|| anyhow::anyhow!("second_read_lock_acquisition_micros not set"))?,
+            second_read_lock_held_micros: second_read_lock_held_micros
                .into_recorded()
-                .ok_or_else(|| anyhow!("read_lock_drop_micros not set"))?,
-            prepare_iterators_micros: value
-                .prepare_iterators_micros
+                .ok_or_else(|| anyhow::anyhow!("second_read_lock_held_micros not set"))?,
+            sort_holes_micros: sort_holes_micros
                .into_recorded()
-                .ok_or_else(|| anyhow!("prepare_iterators_micros not set"))?,
-            write_layer_files_micros: value
-                .write_layer_files_micros
+                .ok_or_else(|| anyhow::anyhow!("sort_holes_micros not set"))?,
+            write_layer_files_micros: write_layer_files_micros
                .into_recorded()
-                .ok_or_else(|| anyhow!("write_layer_files_micros not set"))?,
-            level0_deltas_count: value
-                .level0_deltas_count
-                .ok_or_else(|| anyhow!("level0_deltas_count not set"))?,
-            new_deltas_count: value
-                .new_deltas_count
-                .ok_or_else(|| anyhow!("new_deltas_count not set"))?,
-            new_deltas_size: value
-                .new_deltas_size
-                .ok_or_else(|| anyhow!("new_deltas_size not set"))?,
+                .ok_or_else(|| anyhow::anyhow!("write_layer_files_micros not set"))?,
+            new_deltas_count: new_deltas_count
+                .ok_or_else(|| anyhow::anyhow!("new_deltas_count not set"))?,
+            new_deltas_size: new_deltas_size
+                .ok_or_else(|| anyhow::anyhow!("new_deltas_size not set"))?,
        })
    }
 }
@@ -3480,18 +3546,30 @@ impl Timeline {
    /// This method takes the `_layer_removal_cs` guard to highlight it required downloads are
    /// returned as an error. If the `layer_removal_cs` boundary is changed not to be taken in the
    /// start of level0 files compaction, the on-demand download should be revisited as well.
-    fn compact_level0_phase1(
-        self: Arc<Self>,
+    async fn compact_level0_phase1(
+        &self,
        _layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
-        layers: tokio::sync::OwnedRwLockReadGuard<LayerMap<dyn PersistentLayer>>,
-        mut stats: CompactLevel0Phase1StatsBuilder,
        target_file_size: u64,
        ctx: &RequestContext,
    ) -> Result<CompactLevel0Phase1Result, CompactionError> {
-        stats.read_lock_held_spawn_blocking_startup_micros =
-            stats.read_lock_acquisition_micros.till_now(); // set by caller
+        let mut stats = CompactLevel0Phase1StatsBuilder {
+            version: Some(1),
+            tenant_id: Some(self.tenant_id),
+            timeline_id: Some(self.timeline_id),
+            ..Default::default()
+        };
+
+        let begin = tokio::time::Instant::now();
+        let layers = self.layers.read().await;
+        let now = tokio::time::Instant::now();
+        stats.first_read_lock_acquisition_micros =
+            DurationRecorder::Recorded(RecordedDuration(now - begin), now);
        let mut level0_deltas = layers.get_level0_deltas()?;
+        drop(layers);
        stats.level0_deltas_count = Some(level0_deltas.len());
+        stats.get_level0_deltas_plus_drop_lock_micros =
+            stats.first_read_lock_acquisition_micros.till_now();
+
        // Only compact if enough layers have accumulated.
        let threshold = self.get_compaction_threshold();
        if level0_deltas.is_empty() || level0_deltas.len() < threshold {
@@ -3569,53 +3647,6 @@ impl Timeline {
        // we don't accidentally use it later in the function.
        drop(level0_deltas);

-        stats.read_lock_held_prerequisites_micros = stats
-            .read_lock_held_spawn_blocking_startup_micros
-            .till_now();
-
-        // Determine N largest holes where N is number of compacted layers.
-        let max_holes = deltas_to_compact.len();
-        let last_record_lsn = self.get_last_record_lsn();
-        let min_hole_range = (target_file_size / page_cache::PAGE_SZ as u64) as i128;
-        let min_hole_coverage_size = 3; // TODO: something more flexible?
-
-        // min-heap (reserve space for one more element added before eviction)
-        let mut heap: BinaryHeap<Hole> = BinaryHeap::with_capacity(max_holes + 1);
-        let mut prev: Option<Key> = None;
-        for (next_key, _next_lsn, _size) in itertools::process_results(
-            deltas_to_compact.iter().map(|l| l.key_iter(ctx)),
-            |iter_iter| iter_iter.kmerge_by(|a, b| a.0 <= b.0),
-        )? {
-            if let Some(prev_key) = prev {
-                // just first fast filter
-                if next_key.to_i128() - prev_key.to_i128() >= min_hole_range {
-                    let key_range = prev_key..next_key;
-                    // Measuring hole by just subtraction of i128 representation of key range boundaries
-                    // has not so much sense, because largest holes will corresponds field1/field2 changes.
-                    // But we are mostly interested to eliminate holes which cause generation of excessive image layers.
-                    // That is why it is better to measure size of hole as number of covering image layers.
-                    let coverage_size = layers.image_coverage(&key_range, last_record_lsn)?.len();
-                    if coverage_size >= min_hole_coverage_size {
-                        heap.push(Hole {
-                            key_range,
-                            coverage_size,
-                        });
-                        if heap.len() > max_holes {
-                            heap.pop(); // remove smallest hole
-                        }
-                    }
-                }
-            }
-            prev = Some(next_key.next());
-        }
-        stats.read_lock_held_compute_holes_micros =
-            stats.read_lock_held_prerequisites_micros.till_now();
-        drop(layers);
-        stats.read_lock_drop_micros = stats.read_lock_held_compute_holes_micros.till_now();
-        let mut holes = heap.into_vec();
-        holes.sort_unstable_by_key(|hole| hole.key_range.start);
-        let mut next_hole = 0; // index of next hole in holes vector
-
        // This iterator walks through all key-value pairs from all the layers
        // we're compacting, in key, LSN order.
        let all_values_iter = itertools::process_results(
@@ -3655,7 +3686,50 @@ impl Timeline {
            },
        )?;

-        stats.prepare_iterators_micros = stats.read_lock_drop_micros.till_now();
+        // Determine N largest holes where N is number of compacted layers.
+        let max_holes = deltas_to_compact.len();
+        let last_record_lsn = self.get_last_record_lsn();
+        stats.time_spent_between_locks = stats.get_level0_deltas_plus_drop_lock_micros.till_now();
+        let layers = self.layers.read().await; // Isn't it better to hold the original layers lock till here?
+        stats.second_read_lock_acquisition_micros = stats.time_spent_between_locks.till_now();
+        let min_hole_range = (target_file_size / page_cache::PAGE_SZ as u64) as i128;
+        let min_hole_coverage_size = 3; // TODO: something more flexible?
+
+        // min-heap (reserve space for one more element added before eviction)
+        let mut heap: BinaryHeap<Hole> = BinaryHeap::with_capacity(max_holes + 1);
+        let mut prev: Option<Key> = None;
+        for (next_key, _next_lsn, _size) in itertools::process_results(
+            deltas_to_compact.iter().map(|l| l.key_iter(ctx)),
+            |iter_iter| iter_iter.kmerge_by(|a, b| a.0 <= b.0),
+        )? {
+            if let Some(prev_key) = prev {
+                // just first fast filter
+                if next_key.to_i128() - prev_key.to_i128() >= min_hole_range {
+                    let key_range = prev_key..next_key;
+                    // Measuring a hole by simply subtracting the i128 representations of the key range boundaries
+                    // does not make much sense, because the largest holes will correspond to field1/field2 changes.
+                    // But we are mostly interested in eliminating holes which cause generation of excessive image layers.
+                    // That is why it is better to measure the size of a hole as the number of covering image layers.
+                    let coverage_size = layers.image_coverage(&key_range, last_record_lsn)?.len();
+                    if coverage_size >= min_hole_coverage_size {
+                        heap.push(Hole {
+                            key_range,
+                            coverage_size,
+                        });
+                        if heap.len() > max_holes {
+                            heap.pop(); // remove smallest hole
+                        }
+                    }
+                }
+            }
+            prev = Some(next_key.next());
+        }
+        drop(layers);
+        stats.second_read_lock_held_micros = stats.second_read_lock_acquisition_micros.till_now();
+        let mut holes = heap.into_vec();
+        holes.sort_unstable_by_key(|hole| hole.key_range.start);
+        let mut next_hole = 0; // index of next hole in holes vector
+        stats.sort_holes_micros = stats.second_read_lock_held_micros.till_now();

        // Merge the contents of all the input delta layers into a new set
        // of delta layers, based on the current partitioning.
@@ -3815,7 +3889,7 @@ impl Timeline {
            layer_paths.pop().unwrap();
        }

-        stats.write_layer_files_micros = stats.prepare_iterators_micros.till_now();
+        stats.write_layer_files_micros = stats.sort_holes_micros.till_now();
        stats.new_deltas_count = Some(new_layers.len());
        stats.new_deltas_size = Some(new_layers.iter().map(|l| l.desc.file_size).sum());

@@ -3854,36 +3928,9 @@ impl Timeline {
        let CompactLevel0Phase1Result {
            new_layers,
            deltas_to_compact,
-        } = {
-            let phase1_span = info_span!("compact_level0_phase1");
-            let myself = Arc::clone(self);
-            let ctx = ctx.attached_child(); // technically, the spawn_blocking can outlive this future
-            let mut stats = CompactLevel0Phase1StatsBuilder {
-                version: Some(2),
-                tenant_id: Some(self.tenant_id),
-                timeline_id: Some(self.timeline_id),
-                ..Default::default()
-            };
-
-            let begin = tokio::time::Instant::now();
-            let phase1_layers_locked = Arc::clone(&self.layers).read_owned().await;
-            let now = tokio::time::Instant::now();
-            stats.read_lock_acquisition_micros =
-                DurationRecorder::Recorded(RecordedDuration(now - begin), now);
-            let layer_removal_cs = layer_removal_cs.clone();
-            tokio::task::spawn_blocking(move || {
-                let _entered = phase1_span.enter();
-                myself.compact_level0_phase1(
-                    layer_removal_cs,
-                    phase1_layers_locked,
-                    stats,
-                    target_file_size,
-                    &ctx,
-                )
-            })
-            .await
-            .context("spawn_blocking")??
-        };
+        } = self
+            .compact_level0_phase1(layer_removal_cs.clone(), target_file_size, ctx)
+            .await?;

        if new_layers.is_empty() && deltas_to_compact.is_empty() {
            // nothing to do
@@ -4109,6 +4156,11 @@ impl Timeline {
        let now = SystemTime::now();
        let mut result: GcResult = GcResult::default();

+        if self.current_state() == TimelineState::Creating {
+            debug!("timeline creating placeholder does not need GC");
+            return Ok(GcResult::default());
+        }
+
        // Nothing to GC. Return early.
        let latest_gc_cutoff = *self.get_latest_gc_cutoff_lsn();
        if latest_gc_cutoff >= new_gc_cutoff {
--- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
@@ -151,6 +151,7 @@ pub(super) async fn connection_manager_loop_step(
                        Ok(()) => {
                            let new_state = connection_manager_state.timeline.current_state();
                            match new_state {
+                                TimelineState::Creating => unreachable!("walreceiver should never be launched on a timeline in Creating state"),
                                // we're already active as walreceiver, no need to reactivate
                                TimelineState::Active => continue,
                                TimelineState::Broken { .. } | TimelineState::Stopping => {
--- a/pgxn/neon/Makefile
+++ b/pgxn/neon/Makefile
@@ -4,7 +4,6 @@
 MODULE_big = neon
 OBJS = \
 	$(WIN32RES) \
-	extension_server.o \
 	file_cache.o \
 	libpagestore.o \
 	libpqwalproposer.o \
--- a/pgxn/neon/extension_server.c
+++ b/pgxn/neon/extension_server.c
@@ -1,103 +0,0 @@
-
-/*-------------------------------------------------------------------------
- *
- * extension_server.c
- *	  Request compute_ctl to download extension files.
- *
- * IDENTIFICATION
- *	 contrib/neon/extension_server.c
- *
- *-------------------------------------------------------------------------
- */
-#include "postgres.h"
-#include "tcop/pquery.h"
-#include "tcop/utility.h"
-#include "access/xact.h"
-#include "utils/hsearch.h"
-#include "utils/memutils.h"
-#include "commands/defrem.h"
-#include "miscadmin.h"
-#include "utils/acl.h"
-#include "fmgr.h"
-#include "utils/guc.h"
-#include "port.h"
-#include "fmgr.h"
-
-#include <curl/curl.h>
-
-static int extension_server_port = 0;
-
-static download_extension_file_hook_type prev_download_extension_file_hook = NULL;
-
-// to download all SQL files for an extension:
-// curl -X POST http://localhost:8080/extension_server/postgis
-//
-// to download specific library file:
-// curl -X POST http://localhost:8080/extension_server/postgis-3.so?=true
-static bool
-neon_download_extension_file_http(const char *filename, bool is_library)
-{
-    CURL *curl;
-    CURLcode res;
-    char *compute_ctl_url;
-    char *postdata;
-    bool ret = false;
-
-    if ((curl = curl_easy_init()) == NULL)
-    {
-        elog(ERROR, "Failed to initialize curl handle");
-    }
-
-
-    if (is_library)
-    {
-        elog(LOG, "request library");
-    }
-
-    compute_ctl_url = psprintf("http://localhost:%d/extension_server/%s%s",
-                      extension_server_port, filename, is_library?"?is_library=true":"");
-
-    elog(LOG, "curl_easy_perform() url: %s", compute_ctl_url);
-
-    curl_easy_setopt(curl, CURLOPT_CUSTOMREQUEST, "POST");
-    curl_easy_setopt(curl, CURLOPT_URL, compute_ctl_url);
-    curl_easy_setopt(curl, CURLOPT_TIMEOUT, 3L /* seconds */);
-
-
-    if (curl)
-    {
-        /* Perform the request, res will get the return code */
-        res = curl_easy_perform(curl);
-        /* Check for errors */
-        if (res == CURLE_OK)
-        {
-            elog(LOG, "curl_easy_perform() succeeded");
-            ret = true;
-        }
-        else
-        {
-            elog(WARNING, "curl_easy_perform() failed: %s\n", curl_easy_strerror(res));
-        }
-
-        /* always cleanup */
-        curl_easy_cleanup(curl);
-    }
-
-    return ret;
-}
-
-void pg_init_extension_server()
-{
-    DefineCustomIntVariable("neon.extension_server_port",
-                            "connection string to the compute_ctl",
-                            NULL,
-                            &extension_server_port,
-                            0, 0, INT_MAX,
-                            PGC_POSTMASTER,
-                            0, /* no flags required */
-                            NULL, NULL, NULL);
-
-    // set download_extension_file_hook
-    prev_download_extension_file_hook = download_extension_file_hook;
-    download_extension_file_hook = neon_download_extension_file_http;
-}
--- a/pgxn/neon/extension_server.h
+++ b/pgxn/neon/extension_server.h
@@ -1 +0,0 @@
-
--- a/pgxn/neon/neon.c
+++ b/pgxn/neon/neon.c
@@ -35,11 +35,8 @@ _PG_init(void)
 {
 	pg_init_libpagestore();
 	pg_init_walproposer();
-
 	InitControlPlaneConnector();

-	pg_init_extension_server();
-
        // Important: This must happen after other parts of the extension
        // are loaded, otherwise any settings to GUCs that were set before
        // the extension was loaded will be removed.
--- a/pgxn/neon/neon.h
+++ b/pgxn/neon/neon.h
@@ -21,8 +21,6 @@ extern char *neon_tenant;
 extern void pg_init_libpagestore(void);
 extern void pg_init_walproposer(void);

-extern void pg_init_extension_server(void);
-
 /*
 * Returns true if we shouldn't do REDO on that block in record indicated by
 * block_id; false otherwise.
--- a/rust-toolchain.toml
+++ b/rust-toolchain.toml
@@ -1,5 +1,5 @@
 [toolchain]
-channel = "1.70.0"
+channel = "1.68.2"
 profile = "default"
 # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy.
 # https://rust-lang.github.io/rustup/concepts/profiles.html
--- a/test_runner/fixtures/metrics.py
+++ b/test_runner/fixtures/metrics.py
@@ -62,7 +62,6 @@ PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = (
    "pageserver_getpage_reconstruct_seconds_bucket",
    "pageserver_getpage_reconstruct_seconds_count",
    "pageserver_getpage_reconstruct_seconds_sum",
-    *[f"pageserver_basebackup_query_seconds_{x}" for x in ["bucket", "count", "sum"]],
 )

 PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -534,16 +534,6 @@ class S3Storage:
            "AWS_SECRET_ACCESS_KEY": self.secret_key,
        }

-    def to_string(self) -> str:
-        return json.dumps(
-            {
-                "bucket": self.bucket_name,
-                "region": self.bucket_region,
-                "endpoint": self.endpoint,
-                "prefix": self.prefix_in_bucket,
-            }
-        )
-

 RemoteStorage = Union[LocalFsStorage, S3Storage]

@@ -610,12 +600,10 @@ class NeonEnvBuilder:
        self.rust_log_override = rust_log_override
        self.port_distributor = port_distributor
        self.remote_storage = remote_storage
-        self.ext_remote_storage: Optional[S3Storage] = None
-        self.remote_storage_client: Optional[Any] = None
        self.remote_storage_users = remote_storage_users
        self.broker = broker
        self.run_id = run_id
-        self.mock_s3_server: MockS3Server = mock_s3_server
+        self.mock_s3_server = mock_s3_server
        self.pageserver_config_override = pageserver_config_override
        self.num_safekeepers = num_safekeepers
        self.safekeepers_id_start = safekeepers_id_start
@@ -663,24 +651,15 @@ class NeonEnvBuilder:
        remote_storage_kind: RemoteStorageKind,
        test_name: str,
        force_enable: bool = True,
-        enable_remote_extensions: bool = False,
    ):
        if remote_storage_kind == RemoteStorageKind.NOOP:
            return
        elif remote_storage_kind == RemoteStorageKind.LOCAL_FS:
            self.enable_local_fs_remote_storage(force_enable=force_enable)
        elif remote_storage_kind == RemoteStorageKind.MOCK_S3:
-            self.enable_mock_s3_remote_storage(
-                bucket_name=test_name,
-                force_enable=force_enable,
-                enable_remote_extensions=enable_remote_extensions,
-            )
+            self.enable_mock_s3_remote_storage(bucket_name=test_name, force_enable=force_enable)
        elif remote_storage_kind == RemoteStorageKind.REAL_S3:
-            self.enable_real_s3_remote_storage(
-                test_name=test_name,
-                force_enable=force_enable,
-                enable_remote_extensions=enable_remote_extensions,
-            )
+            self.enable_real_s3_remote_storage(test_name=test_name, force_enable=force_enable)
        else:
            raise RuntimeError(f"Unknown storage type: {remote_storage_kind}")

@@ -694,15 +673,11 @@ class NeonEnvBuilder:
        assert force_enable or self.remote_storage is None, "remote storage is enabled already"
        self.remote_storage = LocalFsStorage(Path(self.repo_dir / "local_fs_remote_storage"))

-    def enable_mock_s3_remote_storage(
-        self, bucket_name: str, force_enable: bool = True, enable_remote_extensions: bool = False
-    ):
+    def enable_mock_s3_remote_storage(self, bucket_name: str, force_enable: bool = True):
        """
        Sets up the pageserver to use the S3 mock server, creates the bucket, if it's not present already.
        Starts up the mock server, if that does not run yet.
        Errors, if the pageserver has some remote storage configuration already, unless `force_enable` is not set to `True`.
-
-        Also creates the bucket for extensions, self.ext_remote_storage bucket
        """
        assert force_enable or self.remote_storage is None, "remote storage is enabled already"
        mock_endpoint = self.mock_s3_server.endpoint()
@@ -723,22 +698,9 @@ class NeonEnvBuilder:
            bucket_region=mock_region,
            access_key=self.mock_s3_server.access_key(),
            secret_key=self.mock_s3_server.secret_key(),
-            prefix_in_bucket="pageserver",
        )

-        if enable_remote_extensions:
-            self.ext_remote_storage = S3Storage(
-                bucket_name=bucket_name,
-                endpoint=mock_endpoint,
-                bucket_region=mock_region,
-                access_key=self.mock_s3_server.access_key(),
-                secret_key=self.mock_s3_server.secret_key(),
-                prefix_in_bucket="ext",
-            )
-
-    def enable_real_s3_remote_storage(
-        self, test_name: str, force_enable: bool = True, enable_remote_extensions: bool = False
-    ):
+    def enable_real_s3_remote_storage(self, test_name: str, force_enable: bool = True):
        """
        Sets up configuration to use real s3 endpoint without mock server
        """
@@ -775,18 +737,9 @@ class NeonEnvBuilder:
            bucket_region=region,
            access_key=access_key,
            secret_key=secret_key,
-            prefix_in_bucket=f"{self.remote_storage_prefix}/pageserver",
+            prefix_in_bucket=self.remote_storage_prefix,
        )

-        if enable_remote_extensions:
-            self.ext_remote_storage = S3Storage(
-                bucket_name=bucket_name,
-                bucket_region=region,
-                access_key=access_key,
-                secret_key=secret_key,
-                prefix_in_bucket=f"{self.remote_storage_prefix}/ext",
-            )
-
    def cleanup_local_storage(self):
        if self.preserve_database_files:
            return
@@ -820,7 +773,6 @@ class NeonEnvBuilder:
        # `self.remote_storage_prefix` is coupled with `S3Storage` storage type,
        # so this line effectively a no-op
        assert isinstance(self.remote_storage, S3Storage)
-        assert self.remote_storage_client is not None

        if self.keep_remote_storage_contents:
            log.info("keep_remote_storage_contents skipping remote storage cleanup")
@@ -950,8 +902,6 @@ class NeonEnv:
        self.neon_binpath = config.neon_binpath
        self.pg_distrib_dir = config.pg_distrib_dir
        self.endpoint_counter = 0
-        self.remote_storage_client = config.remote_storage_client
-        self.ext_remote_storage = config.ext_remote_storage

        # generate initial tenant ID here instead of letting 'neon init' generate it,
        # so that we don't need to dig it out of the config file afterwards.
@@ -1538,7 +1488,6 @@ class NeonCli(AbstractNeonCli):
        safekeepers: Optional[List[int]] = None,
        tenant_id: Optional[TenantId] = None,
        lsn: Optional[Lsn] = None,
-        remote_ext_config: Optional[str] = None,
    ) -> "subprocess.CompletedProcess[str]":
        args = [
            "endpoint",
@@ -1548,8 +1497,6 @@ class NeonCli(AbstractNeonCli):
            "--pg-version",
            self.env.pg_version,
        ]
-        if remote_ext_config is not None:
-            args.extend(["--remote-ext-config", remote_ext_config])
        if lsn is not None:
            args.append(f"--lsn={lsn}")
        args.extend(["--pg-port", str(pg_port)])
@@ -2411,7 +2358,7 @@ class Endpoint(PgProtocol):

        return self

-    def start(self, remote_ext_config: Optional[str] = None) -> "Endpoint":
+    def start(self) -> "Endpoint":
        """
        Start the Postgres instance.
        Returns self.
@@ -2427,7 +2374,6 @@ class Endpoint(PgProtocol):
            http_port=self.http_port,
            tenant_id=self.tenant_id,
            safekeepers=self.active_safekeepers,
-            remote_ext_config=remote_ext_config,
        )
        self.running = True

@@ -2517,7 +2463,6 @@ class Endpoint(PgProtocol):
        hot_standby: bool = False,
        lsn: Optional[Lsn] = None,
        config_lines: Optional[List[str]] = None,
-        remote_ext_config: Optional[str] = None,
    ) -> "Endpoint":
        """
        Create an endpoint, apply config, and start Postgres.
@@ -2532,7 +2477,7 @@ class Endpoint(PgProtocol):
            config_lines=config_lines,
            hot_standby=hot_standby,
            lsn=lsn,
-        ).start(remote_ext_config=remote_ext_config)
+        ).start()

        log.info(f"Postgres startup took {time.time() - started_at} seconds")

@@ -2566,7 +2511,6 @@ class EndpointFactory:
        lsn: Optional[Lsn] = None,
        hot_standby: bool = False,
        config_lines: Optional[List[str]] = None,
-        remote_ext_config: Optional[str] = None,
    ) -> Endpoint:
        ep = Endpoint(
            self.env,
@@ -2583,7 +2527,6 @@ class EndpointFactory:
            hot_standby=hot_standby,
            config_lines=config_lines,
            lsn=lsn,
-            remote_ext_config=remote_ext_config,
        )

    def create(
@@ -3136,6 +3079,21 @@ def fork_at_current_lsn(
    return env.neon_cli.create_branch(new_branch_name, ancestor_branch_name, tenant_id, current_lsn)


+def last_flush_lsn_checkpoint(
+    env: NeonEnv, endpoint: Endpoint, tenant_id: TenantId, timeline_id: TimelineId
+) -> Lsn:
+    """
+    Wait for pageserver to catch up to the latest flush LSN of the given endpoint, then
+    checkpoint the pageserver.
+    """
+    last_flush_lsn = wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
+    ps_http = env.pageserver.http_client()
+    wait_for_last_record_lsn(ps_http, tenant_id, timeline_id, last_flush_lsn)
+    # force a checkpoint to trigger upload
+    ps_http.timeline_checkpoint(tenant_id, timeline_id)
+    return last_flush_lsn
+
+
 def last_flush_lsn_upload(
    env: NeonEnv, endpoint: Endpoint, tenant_id: TenantId, timeline_id: TimelineId
 ) -> Lsn:
@@ -3144,10 +3102,7 @@ def last_flush_lsn_upload(
    checkpoint pageserver, and wait for it to be uploaded (remote_consistent_lsn
    reaching flush LSN).
    """
-    last_flush_lsn = wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
+    last_flush_lsn = last_flush_lsn_checkpoint(env, endpoint, tenant_id, timeline_id)
    ps_http = env.pageserver.http_client()
-    wait_for_last_record_lsn(ps_http, tenant_id, timeline_id, last_flush_lsn)
-    # force a checkpoint to trigger upload
-    ps_http.timeline_checkpoint(tenant_id, timeline_id)
    wait_for_upload(ps_http, tenant_id, timeline_id, last_flush_lsn)
    return last_flush_lsn
--- a/test_runner/fixtures/types.py
+++ b/test_runner/fixtures/types.py
@@ -89,9 +89,6 @@ class TenantId(Id):
    def __repr__(self) -> str:
        return f'`TenantId("{self.id.hex()}")'

-    def __str__(self) -> str:
-        return self.id.hex()
-

 class TimelineId(Id):
    def __repr__(self) -> str:
--- a/test_runner/regress/test_broken_timeline.py
+++ b/test_runner/regress/test_broken_timeline.py
@@ -172,8 +172,10 @@ def test_timeline_create_break_after_uninit_mark(neon_simple_env: NeonEnv):

    # Introduce failpoint when creating a new timeline uninit mark, before any other files were created
    pageserver_http.configure_failpoints(("after-timeline-uninit-mark-creation", "return"))
-    with pytest.raises(Exception, match="after-timeline-uninit-mark-creation"):
+    with pytest.raises(Exception, match="create timeline files"):
        _ = env.neon_cli.create_timeline("test_timeline_create_break_after_uninit_mark", tenant_id)
+    env.pageserver.allowed_errors.append(".*InternalServerError.*create timeline files")
+    env.pageserver.allowed_errors.append(".*hitting failpoint after-timeline-uninit-mark-creation")

    # Creating the timeline didn't finish. The other timelines on tenant should still be present and work normally.
    # "New" timeline is not present in the list, allowing pageserver to retry the same request
--- a/test_runner/regress/test_compatibility.py
+++ b/test_runner/regress/test_compatibility.py
@@ -2,7 +2,6 @@ import copy
 import os
 import shutil
 import subprocess
-import tempfile
 from pathlib import Path
 from typing import Any, Optional

@@ -449,7 +448,7 @@ def dump_differs(first: Path, second: Path, output: Path) -> bool:
    """

    with output.open("w") as stdout:
-        res = subprocess.run(
+        rv = subprocess.run(
            [
                "diff",
                "--unified",  # Make diff output more readable
@@ -461,53 +460,4 @@ def dump_differs(first: Path, second: Path, output: Path) -> bool:
            stdout=stdout,
        )

-    differs = res.returncode != 0
-
-    # TODO: Remove after https://github.com/neondatabase/neon/pull/4425 is merged, and a couple of releases are made
-    if differs:
-        with tempfile.NamedTemporaryFile(mode="w") as tmp:
-            tmp.write(PR4425_ALLOWED_DIFF)
-            tmp.flush()
-
-            allowed = subprocess.run(
-                [
-                    "diff",
-                    "--unified",  # Make diff output more readable
-                    r"--ignore-matching-lines=^---",  # Ignore diff headers
-                    r"--ignore-matching-lines=^\+\+\+",  # Ignore diff headers
-                    "--ignore-matching-lines=^@@",  # Ignore diff blocks location
-                    "--ignore-matching-lines=^ *$",  # Ignore lines with only spaces
-                    "--ignore-matching-lines=^ --.*",  # Ignore the " --" lines for compatibility with PG14
-                    "--ignore-blank-lines",
-                    str(output),
-                    str(tmp.name),
-                ],
-            )
-
-            differs = allowed.returncode != 0
-
-    return differs
-
-
-PR4425_ALLOWED_DIFF = """
--- /tmp/test_output/test_backward_compatibility[release-pg15]/compatibility_snapshot/dump.sql 2023-06-08 18:12:45.000000000 +0000
-+++ /tmp/test_output/test_backward_compatibility[release-pg15]/dump.sql        2023-06-13 07:25:35.211733653 +0000
-@@ -13,12 +13,20 @@
-
- CREATE ROLE cloud_admin;
- ALTER ROLE cloud_admin WITH SUPERUSER INHERIT CREATEROLE CREATEDB LOGIN REPLICATION BYPASSRLS;
-+CREATE ROLE neon_superuser;
-+ALTER ROLE neon_superuser WITH NOSUPERUSER INHERIT CREATEROLE CREATEDB NOLOGIN NOREPLICATION NOBYPASSRLS;
-
- --
- -- User Configurations
- --
-
-
-+--
-+-- Role memberships
-+--
-+
-+GRANT pg_read_all_data TO neon_superuser GRANTED BY cloud_admin;
-+GRANT pg_write_all_data TO neon_superuser GRANTED BY cloud_admin;
-"""
+    return rv.returncode != 0
--- a/test_runner/regress/test_download_extensions.py
+++ b/test_runner/regress/test_download_extensions.py
@@ -1,252 +0,0 @@
-import os
-from contextlib import closing
-from io import BytesIO
-
-import pytest
-from fixtures.log_helper import log
-from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, RemoteStorageKind
-from fixtures.pg_version import PgVersion
-from fixtures.types import TenantId
-
-NUM_EXT = 3
-
-
-def control_file_content(owner, i):
-    output = f"""# mock {owner} extension{i}
-comment = 'This is a mock extension'
-default_version = '1.0'
-module_pathname = '$libdir/test_ext{i}'
-relocatable = true"""
-    return output
-
-
-def sql_file_content():
-    output = """
-            CREATE FUNCTION test_ext_add(integer, integer) RETURNS integer
-    AS 'select $1 + $2;'
-    LANGUAGE SQL
-    IMMUTABLE
-    RETURNS NULL ON NULL INPUT;
-        """
-    return output
-
-
-# Prepare some mock extension files and upload them to the bucket
-# returns a list of files that should be cleaned up after the test
-def prepare_mock_ext_storage(
-    pg_version: PgVersion,
-    tenant_id: TenantId,
-    pg_bin: PgBin,
-    ext_remote_storage,
-    remote_storage_client,
-):
-    bucket_prefix = ext_remote_storage.prefix_in_bucket
-    private_prefix = str(tenant_id)
-    PUB_EXT_ROOT = f"v{pg_version}/share/postgresql/extension"
-    PRIVATE_EXT_ROOT = f"v{pg_version}/{private_prefix}/share/postgresql/extension"
-    LOCAL_EXT_ROOT = f"pg_install/{PUB_EXT_ROOT}"
-
-    PUB_LIB_ROOT = f"v{pg_version}/lib"
-    PRIVATE_LIB_ROOT = f"v{pg_version}/{private_prefix}/lib"
-    LOCAL_LIB_ROOT = f"{pg_bin.pg_lib_dir}/postgresql"
-
-    log.info(
-        f"""
-            PUB_EXT_ROOT: {PUB_EXT_ROOT}
-            PRIVATE_EXT_ROOT: {PRIVATE_EXT_ROOT}
-            LOCAL_EXT_ROOT: {LOCAL_EXT_ROOT}
-            PUB_LIB_ROOT: {PUB_LIB_ROOT}
-            PRIVATE_LIB_ROOT: {PRIVATE_LIB_ROOT}
-            LOCAL_LIB_ROOT: {LOCAL_LIB_ROOT}
-            """
-    )
-
-    cleanup_files = []
-
-    # Upload several test_ext{i}.control files to the bucket
-    for i in range(NUM_EXT):
-        public_ext = BytesIO(bytes(control_file_content("public", i), "utf-8"))
-        public_remote_name = f"{bucket_prefix}/{PUB_EXT_ROOT}/test_ext{i}.control"
-        public_local_name = f"{LOCAL_EXT_ROOT}/test_ext{i}.control"
-        private_ext = BytesIO(bytes(control_file_content(str(tenant_id), i), "utf-8"))
-        private_remote_name = f"{bucket_prefix}/{PRIVATE_EXT_ROOT}/private_ext{i}.control"
-        private_local_name = f"{LOCAL_EXT_ROOT}/private_ext{i}.control"
-        cleanup_files += [public_local_name, private_local_name]
-
-        remote_storage_client.upload_fileobj(
-            public_ext, ext_remote_storage.bucket_name, public_remote_name
-        )
-        remote_storage_client.upload_fileobj(
-            private_ext, ext_remote_storage.bucket_name, private_remote_name
-        )
-
-    # Upload SQL file for the extension we're going to create
-    sql_filename = "test_ext0--1.0.sql"
-    test_sql_public_remote_path = f"{bucket_prefix}/{PUB_EXT_ROOT}/{sql_filename}"
-    test_sql_local_path = f"{LOCAL_EXT_ROOT}/{sql_filename}"
-    test_ext_sql_file = BytesIO(bytes(sql_file_content(), "utf-8"))
-    remote_storage_client.upload_fileobj(
-        test_ext_sql_file,
-        ext_remote_storage.bucket_name,
-        test_sql_public_remote_path,
-    )
-    cleanup_files += [test_sql_local_path]
-
-    # upload some fake library files
-    for i in range(2):
-        public_library = BytesIO(bytes("\n111\n", "utf-8"))
-        public_remote_name = f"{bucket_prefix}/{PUB_LIB_ROOT}/test_lib{i}.so"
-        public_local_name = f"{LOCAL_LIB_ROOT}/test_lib{i}.so"
-        private_library = BytesIO(bytes("\n111\n", "utf-8"))
-        private_remote_name = f"{bucket_prefix}/{PRIVATE_LIB_ROOT}/private_lib{i}.so"
-        private_local_name = f"{LOCAL_LIB_ROOT}/private_lib{i}.so"
-
-        log.info(f"uploading library to {public_remote_name}")
-        log.info(f"uploading library to {private_remote_name}")
-
-        remote_storage_client.upload_fileobj(
-            public_library,
-            ext_remote_storage.bucket_name,
-            public_remote_name,
-        )
-        remote_storage_client.upload_fileobj(
-            private_library,
-            ext_remote_storage.bucket_name,
-            private_remote_name,
-        )
-        cleanup_files += [public_local_name, private_local_name]
-
-    return cleanup_files
-
-
-# Generate mock extension files and upload them to the bucket.
-#
-# Then check that compute nodes can download them and use them
-# to CREATE EXTENSION and LOAD 'library.so'
-#
-# NOTE: You must have appropriate AWS credentials to run REAL_S3 test.
-# It may also be necessary to set the following environment variables:
-#   export AWS_ACCESS_KEY_ID='test'
-#   export AWS_SECRET_ACCESS_KEY='test'
-#   export AWS_SECURITY_TOKEN='test'
-#   export AWS_SESSION_TOKEN='test'
-#   export AWS_DEFAULT_REGION='us-east-1'
-
-
-@pytest.mark.parametrize(
-    "remote_storage_kind", [RemoteStorageKind.MOCK_S3, RemoteStorageKind.REAL_S3]
-)
-def test_remote_extensions(
-    neon_env_builder: NeonEnvBuilder,
-    remote_storage_kind: RemoteStorageKind,
-    pg_version: PgVersion,
-    pg_bin: PgBin,
-):
-    neon_env_builder.enable_remote_storage(
-        remote_storage_kind=remote_storage_kind,
-        test_name="test_remote_extensions",
-        enable_remote_extensions=True,
-    )
-    neon_env_builder.num_safekeepers = 3
-    env = neon_env_builder.init_start()
-    tenant_id, _ = env.neon_cli.create_tenant()
-    env.neon_cli.create_timeline("test_remote_extensions", tenant_id=tenant_id)
-
-    assert env.ext_remote_storage is not None
-    assert env.remote_storage_client is not None
-
-    # Prepare some mock extension files and upload them to the bucket
-    cleanup_files = prepare_mock_ext_storage(
-        pg_version,
-        tenant_id,
-        pg_bin,
-        env.ext_remote_storage,
-        env.remote_storage_client,
-    )
-    # Start a compute node and check that it can download the extensions
-    # and use them to CREATE EXTENSION and LOAD 'library.so'
-    #
-    # This block is wrapped in a try/finally so that the downloaded files
-    # are cleaned up even if the test fails
-    try:
-        endpoint = env.endpoints.create_start(
-            "test_remote_extensions",
-            tenant_id=tenant_id,
-            remote_ext_config=env.ext_remote_storage.to_string(),
-            # config_lines=["log_min_messages=debug3"],
-        )
-        with closing(endpoint.connect()) as conn:
-            with conn.cursor() as cur:
-                # Test query: check that test_ext0 was successfully downloaded
-                cur.execute("SELECT * FROM pg_available_extensions")
-                all_extensions = [x[0] for x in cur.fetchall()]
-                log.info(all_extensions)
-                for i in range(NUM_EXT):
-                    assert f"test_ext{i}" in all_extensions
-                    assert f"private_ext{i}" in all_extensions
-
-                cur.execute("CREATE EXTENSION test_ext0")
-                cur.execute("SELECT extname FROM pg_extension")
-                all_extensions = [x[0] for x in cur.fetchall()]
-                log.info(all_extensions)
-                assert "test_ext0" in all_extensions
-
-                # Try to load existing library file
-                try:
-                    cur.execute("LOAD 'test_lib0.so'")
-                except Exception as e:
-                    # expected to fail with
-                    # could not load library ... test_ext.so: file too short
-                    # because test_lib0.so is not real library file
-                    log.info("LOAD test_lib0.so failed (expectedly): %s", e)
-                    assert "file too short" in str(e)
-
-                # Try to load private library file
-                try:
-                    cur.execute("LOAD 'private_lib0.so'")
-                except Exception as e:
-                    # expected to fail with
-                    # could not load library ... test_ext.so: file too short
-                    # because test_lib0.so is not real library file
-                    log.info("LOAD private_lib0.so failed (expectedly): %s", e)
-                    assert "file too short" in str(e)
-
-                # Try to load existing library file without .so extension
-                try:
-                    cur.execute("LOAD 'test_lib1'")
-                except Exception as e:
-                    # expected to fail with
-                    # could not load library ... test_lib1.so: file too short
-                    # because test_lib1.so is not real library file
-                    log.info("LOAD test_lib1 failed (expectedly): %s", e)
-                    assert "file too short" in str(e)
-
-                # Try to load non-existent library file
-                try:
-                    cur.execute("LOAD 'test_lib_fail.so'")
-                except Exception as e:
-                    # expected to fail because test_lib_fail.so is not found
-                    log.info("LOAD test_lib_fail.so failed (expectedly): %s", e)
-                    assert (
-                        """could not access file "test_lib_fail.so": No such file or directory"""
-                        in str(e)
-                    )
-
-    finally:
-        # this is important because if the files aren't cleaned up then the test can
-        # pass even without successfully downloading the files if a previous run (or
-        # run with different type of remote storage) of the test did download the
-        # files
-        for file in cleanup_files:
-            try:
-                os.remove(file)
-                log.info(f"Deleted {file}")
-            except FileNotFoundError:
-                log.info(f"{file} does not exist, so cannot be deleted")
-
-
-# TODO
-# @pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
-# def test_remote_extensions_shared_preload_libraries(
-#     neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind, pg_version: PgVersion
-# ):
--- a/test_runner/regress/test_hot_standby.py
+++ b/test_runner/regress/test_hot_standby.py
@@ -1,5 +1,3 @@
-import time
-
 import pytest
 from fixtures.neon_fixtures import NeonEnv

@@ -12,10 +10,9 @@ def test_hot_standby(neon_simple_env: NeonEnv):
        branch_name="main",
        endpoint_id="primary",
    ) as primary:
-        time.sleep(1)
        with env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary") as secondary:
            primary_lsn = None
-            caught_up = False
+            cought_up = False
            queries = [
                "SHOW neon.timeline_id",
                "SHOW neon.tenant_id",
@@ -59,7 +56,7 @@ def test_hot_standby(neon_simple_env: NeonEnv):
                    res = s_cur.fetchone()
                    assert res is not None

-                while not caught_up:
+                while not cought_up:
                    with s_con.cursor() as secondary_cursor:
                        secondary_cursor.execute("SELECT pg_last_wal_replay_lsn()")
                        res = secondary_cursor.fetchone()
@@ -69,7 +66,7 @@ def test_hot_standby(neon_simple_env: NeonEnv):
                        # due to e.g. autovacuum, but that shouldn't impact the content
                        # of the tables, so we check whether we've replayed up to at
                        # least after the commit of the `test` table.
-                        caught_up = secondary_lsn >= primary_lsn
+                        cought_up = secondary_lsn >= primary_lsn

                # Explicit commit to flush any transient transaction-level state.
                s_con.commit()
--- a/test_runner/regress/test_import.py
+++ b/test_runner/regress/test_import.py
@@ -133,17 +133,24 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build
        )

    # Importing empty file fails
+    log.info("importing empty_file")
    empty_file = os.path.join(test_output_dir, "empty_file")
    with open(empty_file, "w") as _:
        with pytest.raises(Exception):
            import_tar(empty_file, empty_file)

+    assert timeline not in {TimelineId(t["timeline_id"]) for t in client.timeline_list(tenant)}
+
    # Importing corrupt backup fails
+    log.info("importing corrupt_base_tar")
    with pytest.raises(Exception):
        import_tar(corrupt_base_tar, wal_tar)

+    assert timeline not in {TimelineId(t["timeline_id"]) for t in client.timeline_list(tenant)}
+
    # A tar with trailing garbage is currently accepted. It prints a warning
    # to the pageserver log, however. Check that.
+    log.info("importing base_plus_garbage_tar")
    import_tar(base_plus_garbage_tar, wal_tar)
    assert env.pageserver.log_contains(
        ".*WARN.*ignored .* unexpected bytes after the tar archive.*"
--- a/test_runner/regress/test_remote_storage.py
+++ b/test_runner/regress/test_remote_storage.py
@@ -2,12 +2,11 @@
 # env NEON_PAGESERVER_OVERRIDES="remote_storage={local_path='/tmp/neon_zzz/'}" poetry ......

 import os
-import queue
 import shutil
 import threading
 import time
 from pathlib import Path
-from typing import Dict, List, Optional, Tuple
+from typing import Dict, List, Tuple

 import pytest
 from fixtures.log_helper import log
@@ -674,21 +673,26 @@ def test_empty_branch_remote_storage_upload(


@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
-def test_empty_branch_remote_storage_upload_on_restart(
+def test_empty_branch_remote_storage_upload_failure(
    neon_env_builder: NeonEnvBuilder,
    remote_storage_kind: RemoteStorageKind,
 ):
    """
-    Branches off a root branch, but does not write anything to the new branch, so
-    it has a metadata file only.
+    Branching is not acknowledged until the index_part.json is uploaded.

-    Ensures the branch is not on the remote storage and restarts the pageserver
-    — the upload should be scheduled by load, and create_timeline should await
-    for it even though it gets 409 Conflict.
+    Fails the index_part.json upload with a failpoint.
+    Ensures that timeline creation fails because of that.
+    Stops the pageserver.
+    Restarts it, still with failpoint enabled.
+    Waits for tenant to finish loading.
+    Ensures the timeline does not exist locally nor remotely.
+
+    Disables the failpoint.
+    Ensures that timeline can be created.
    """
    neon_env_builder.enable_remote_storage(
        remote_storage_kind=remote_storage_kind,
-        test_name="test_empty_branch_remote_storage_upload_on_restart",
+        test_name="test_empty_branch_remote_storage_upload_failures",
    )

    env = neon_env_builder.init_start()
@@ -714,9 +718,14 @@ def test_empty_branch_remote_storage_upload_on_restart(
    # index upload is now hitting the failpoint, it should block the shutdown
    env.pageserver.stop(immediate=True)

+    env.pageserver.allowed_errors.append(
+        f".*failed to create on-disk state for new_timeline_id={new_branch_timeline_id}.*wait for initial uploads to complete.*upload queue was stopped"
+    )
+
    timeline_path = (
        Path("tenants") / str(env.initial_tenant) / "timelines" / str(new_branch_timeline_id)
    )
+    uninit_marker_path = env.repo_dir / timeline_path.with_suffix(".___uninit")

    local_metadata = env.repo_dir / timeline_path / "metadata"
    assert local_metadata.is_file()
@@ -727,54 +736,26 @@ def test_empty_branch_remote_storage_upload_on_restart(
        not new_branch_on_remote_storage.exists()
    ), "failpoint should had prohibited index_part.json upload"

-    # during reconciliation we should had scheduled the uploads and on the
-    # retried create_timeline, we will await for those to complete on next
-    # client.timeline_create
-    env.pageserver.start(extra_env_vars={"FAILPOINTS": "before-upload-index=return"})
+    # restart without failpoint
+    env.pageserver.start()

-    # sleep a bit to force the upload task go into exponential backoff
-    time.sleep(1)
+    wait_until_tenant_state(client, env.initial_tenant, "Active", 5)

-    q: queue.Queue[Optional[PageserverApiException]] = queue.Queue()
-    barrier = threading.Barrier(2)
+    # retry creation
+    client.timeline_create(
+        tenant_id=env.initial_tenant,
+        ancestor_timeline_id=env.initial_timeline,
+        new_timeline_id=new_branch_timeline_id,
+        pg_version=env.pg_version,
+    )

-    def create_in_background():
-        barrier.wait()
-        try:
-            client.timeline_create(
-                tenant_id=env.initial_tenant,
-                ancestor_timeline_id=env.initial_timeline,
-                new_timeline_id=new_branch_timeline_id,
-                pg_version=env.pg_version,
-            )
-            q.put(None)
-        except PageserverApiException as e:
-            q.put(e)
+    assert_nothing_to_upload(client, env.initial_tenant, new_branch_timeline_id)

-    create_thread = threading.Thread(target=create_in_background)
-    create_thread.start()
-
-    try:
-        # maximize chances of actually waiting for the uploads by create_timeline
-        barrier.wait()
-
-        assert not new_branch_on_remote_storage.exists(), "failpoint should had stopped uploading"
-
-        client.configure_failpoints(("before-upload-index", "off"))
-        conflict = q.get()
-
-        assert conflict, "create_timeline should not have succeeded"
-        assert (
-            conflict.status_code == 409
-        ), "timeline was created before restart, and uploads scheduled during initial load, so we expect 409 conflict"
-
-        assert_nothing_to_upload(client, env.initial_tenant, new_branch_timeline_id)
-
-        assert (
-            new_branch_on_remote_storage / "index_part.json"
-        ).is_file(), "uploads scheduled during initial load should had been awaited for"
-    finally:
-        create_thread.join()
+    assert (env.repo_dir / timeline_path).exists()
+    assert not uninit_marker_path.exists()
+    assert (
+        new_branch_on_remote_storage / "index_part.json"
+    ).is_file(), "uploads scheduled during initial load should had been awaited for"


 def wait_upload_queue_empty(
--- a/test_runner/regress/test_tenant_size.py
+++ b/test_runner/regress/test_tenant_size.py
@@ -16,7 +16,6 @@ from fixtures.pg_version import PgVersion, xfail_on_postgres
 from fixtures.types import Lsn, TenantId, TimelineId


-@pytest.mark.xfail
 def test_empty_tenant_size(neon_simple_env: NeonEnv, test_output_dir: Path):
    env = neon_simple_env
    (tenant_id, _) = env.neon_cli.create_tenant()
@@ -45,16 +44,12 @@ def test_empty_tenant_size(neon_simple_env: NeonEnv, test_output_dir: Path):
        # we've disabled the autovacuum and checkpoint
        # so background processes should not change the size.
        # If this test will flake we should probably loosen the check
-        assert (
-            size == initial_size
-        ), f"starting idle compute should not change the tenant size (Currently {size}, expected {initial_size})"
+        assert size == initial_size, "starting idle compute should not change the tenant size"

    # the size should be the same, until we increase the size over the
    # gc_horizon
    size, inputs = http_client.tenant_size_and_modelinputs(tenant_id)
-    assert (
-        size == initial_size
-    ), f"tenant_size should not be affected by shutdown of compute (Currently {size}, expected {initial_size})"
+    assert size == initial_size, "tenant_size should not be affected by shutdown of compute"

    expected_inputs = {
        "segments": [
@@ -323,7 +318,6 @@ def test_only_heads_within_horizon(neon_simple_env: NeonEnv, test_output_dir: Pa
    size_debug_file.write(size_debug)


-@pytest.mark.xfail
 def test_single_branch_get_tenant_size_grows(
    neon_env_builder: NeonEnvBuilder, test_output_dir: Path, pg_version: PgVersion
 ):
@@ -339,13 +333,13 @@ def test_single_branch_get_tenant_size_grows(
    # inserts is larger than gc_horizon. for example 0x20000 here hid the fact
    # that there next_gc_cutoff could be smaller than initdb_lsn, which will
    # obviously lead to issues when calculating the size.
-    gc_horizon = 0x3BA00
+    gc_horizon = 0x38000

    # it's a bit of a hack, but different versions of postgres have different
    # amount of WAL generated for the same amount of data. so we need to
    # adjust the gc_horizon accordingly.
    if pg_version == PgVersion.V14:
-        gc_horizon = 0x4A000
+        gc_horizon = 0x40000

    neon_env_builder.pageserver_config_override = f"tenant_config={{compaction_period='0s', gc_period='0s', pitr_interval='0sec', gc_horizon={gc_horizon}}}"

@@ -366,11 +360,11 @@ def test_single_branch_get_tenant_size_grows(
        if current_lsn - initdb_lsn >= gc_horizon:
            assert (
                size >= prev_size
-            ), f"tenant_size may grow or not grow, because we only add gc_horizon amount of WAL to initial snapshot size (Currently at: {current_lsn}, Init at: {initdb_lsn})"
+            ), "tenant_size may grow or not grow, because we only add gc_horizon amount of WAL to initial snapshot size"
        else:
            assert (
                size > prev_size
-            ), f"tenant_size should grow, because we continue to add WAL to initial snapshot size (Currently at: {current_lsn}, Init at: {initdb_lsn})"
+            ), "tenant_size should grow, because we continue to add WAL to initial snapshot size"

    def get_current_consistent_size(
        env: NeonEnv,
--- a/test_runner/regress/test_tenants.py
+++ b/test_runner/regress/test_tenants.py
@@ -20,6 +20,8 @@ from fixtures.neon_fixtures import (
    NeonEnvBuilder,
    RemoteStorageKind,
    available_remote_storages,
+    last_flush_lsn_checkpoint,
+    last_flush_lsn_upload,
 )
 from fixtures.pageserver.utils import timeline_delete_wait_completed
 from fixtures.types import Lsn, TenantId, TimelineId
@@ -251,8 +253,12 @@ def test_pageserver_metrics_removed_after_detach(
    tenant_1, _ = env.neon_cli.create_tenant()
    tenant_2, _ = env.neon_cli.create_tenant()

-    env.neon_cli.create_timeline("test_metrics_removed_after_detach", tenant_id=tenant_1)
-    env.neon_cli.create_timeline("test_metrics_removed_after_detach", tenant_id=tenant_2)
+    tenant_1_timeline = env.neon_cli.create_timeline(
+        "test_metrics_removed_after_detach", tenant_id=tenant_1
+    )
+    tenant_2_timeline = env.neon_cli.create_timeline(
+        "test_metrics_removed_after_detach", tenant_id=tenant_2
+    )

    endpoint_tenant1 = env.endpoints.create_start(
        "test_metrics_removed_after_detach", tenant_id=tenant_1
@@ -261,13 +267,20 @@ def test_pageserver_metrics_removed_after_detach(
        "test_metrics_removed_after_detach", tenant_id=tenant_2
    )

-    for endpoint in [endpoint_tenant1, endpoint_tenant2]:
+    for endpoint, timeline_id in [
+        (endpoint_tenant1, tenant_1_timeline),
+        (endpoint_tenant2, tenant_2_timeline),
+    ]:
        with closing(endpoint.connect()) as conn:
            with conn.cursor() as cur:
                cur.execute("CREATE TABLE t(key int primary key, value text)")
                cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'")
                cur.execute("SELECT sum(key) FROM t")
                assert cur.fetchone() == (5000050000,)
+            if remote_storage_kind != RemoteStorageKind.NOOP:
+                last_flush_lsn_upload(env, endpoint, endpoint.tenant_id, timeline_id)
+            else:
+                last_flush_lsn_checkpoint(env, endpoint, endpoint.tenant_id, timeline_id)
        endpoint.stop()

    def get_ps_metric_samples_for_tenant(tenant_id: TenantId) -> List[Sample]:
--- a/test_runner/regress/test_timeline_delete.py
+++ b/test_runner/regress/test_timeline_delete.py
@@ -275,7 +275,6 @@ def assert_prefix_empty(neon_env_builder: NeonEnvBuilder, prefix: Optional[str]
    assert isinstance(neon_env_builder.remote_storage, S3Storage)

    # Note that this doesn't use pagination, so list is not guaranteed to be exhaustive.
-    assert neon_env_builder.remote_storage_client is not None
    response = neon_env_builder.remote_storage_client.list_objects_v2(
        Bucket=neon_env_builder.remote_storage.bucket_name,
        Prefix=prefix or neon_env_builder.remote_storage.prefix_in_bucket or "",
@@ -306,7 +305,7 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild
    )
    # this happens, because the stuck timeline is visible to shutdown
    env.pageserver.allowed_errors.append(
-        ".*freeze_and_flush_on_shutdown.+: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited"
+        ".*shutdown_pageserver.*freeze_and_flush timeline failed timeline_id=.* err=cannot flush frozen layers when flush_loop is not running, state is Exited"
    )

    ps_http = env.pageserver.http_client()
@@ -629,7 +628,7 @@ def test_timeline_delete_works_for_remote_smoke(
        )

    # for some reason the check above doesnt immediately take effect for the below.
-    # Assume it is mock server inconsistency and check twice.
+    # Assume it is mock server incosistency and check twice.
    wait_until(
        2,
        0.5,
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
Author	SHA1	Message	Date
Christian Schwarz	1a18d44013	clippy	2023-06-23 15:48:42 +02:00
Christian Schwarz	8f8b7ad4fd	minor fixes	2023-06-23 13:00:28 +02:00
Christian Schwarz	70fdaa47f1	few more panics instead of bailouts	2023-06-23 13:00:17 +02:00
Christian Schwarz	b5cf92b948	Merge remote-tracking branch 'origin/main' into problame/async-timeline-get/refactor-timeline-initialization-to-avoid-holding-tenants-timelines-lock	2023-06-23 12:47:33 +02:00
Christian Schwarz	a109a246f3	remove debug code	2023-06-16 17:34:08 +02:00
Christian Schwarz	a0b6f0c052	reduce diff	2023-06-16 17:31:45 +02:00
Christian Schwarz	9ce6e7c86b	convert more 'no-op if Creating' into check_at_caller+asserts	2023-06-16 17:31:18 +02:00
Dmitry Rodionov	e839c97188	fix uninit glitch cherry-picked from https://github.com/neondatabase/neon/pull/4458	2023-06-16 13:29:27 +02:00
Christian Schwarz	0c0cd1857d	Revert the Tenant::load changes to fix uninit timeline marker processing Dmitry is fixing it differently in https://github.com/neondatabase/neon/pull/4458, going to cherry-pick that.	2023-06-16 13:28:32 +02:00
Christian Schwarz	b725cc879a	fixup the merge, unit tests pass, some python tests will fail	2023-06-14 20:54:28 +02:00
Christian Schwarz	4f4073a124	Merge remote-tracking branch 'origin/main' into problame/async-timeline-get/refactor-timeline-initialization-to-avoid-holding-tenants-timelines-lock	2023-06-14 19:46:54 +02:00
Christian Schwarz	64efcbf8da	Merge remote-tracking branch 'origin/main' into problame/async-timeline-get/refactor-timeline-initialization-to-avoid-holding-tenants-timelines-lock	2023-06-07 18:04:28 +02:00
Christian Schwarz	abb4112a28	fixup: test_pageserver_metrics_removed_after_detach[debug-pg14-noop]' last_flush_lsn_upload obviously doesn't work with NOOP storage kind but we need the on-disk state at that lsn, so, wait for on-disk consistent in that case	2023-06-07 15:09:44 +02:00
Christian Schwarz	a97b21ab13	Merge remote-tracking branch 'origin/main' into problame/async-timeline-get/refactor-timeline-initialization-to-avoid-holding-tenants-timelines-lock	2023-06-07 14:47:19 +02:00
Christian Schwarz	9d3267e474	clippy continued	2023-05-26 19:33:11 +02:00
Christian Schwarz	6b25cb5030	clippy	2023-05-26 18:36:13 +02:00
Christian Schwarz	f91ad65fb3	Merge branch 'problame/async-timeline-get/dont-hold-timelines-lock-inside-tenant-state-send-modify' into problame/async-timeline-get/refactor-timeline-initialization-to-avoid-holding-tenants-timelines-lock	2023-05-26 18:24:23 +02:00
Christian Schwarz	9a4789ec73	demote warn line to info-level, as the log line in set_stopping() is also info!() This should fix the failed regress tests that barked on allowed_errors	2023-05-26 18:22:41 +02:00
Christian Schwarz	72159ee686	Merge remote-tracking branch 'origin/main' into problame/async-timeline-get/dont-hold-timelines-lock-inside-tenant-state-send-modify	2023-05-26 18:03:35 +02:00
Christian Schwarz	be74662d05	re-introduce the check for 0 layers, based on cause	2023-05-26 17:52:26 +02:00
Christian Schwarz	e7c4ef9f4f	don't hold TENANTS lock while waiting for set_stopping()	2023-05-26 17:46:09 +02:00
Christian Schwarz	13d3f4c29f	set_stopping(): report in result if not transitioning to Stopping	2023-05-26 17:46:09 +02:00
Christian Schwarz	67258af8a2	Revert "test_broken_timelines: regex needs changing due to changes in this PR" This reverts commit `17ba307004`.	2023-05-26 17:40:37 +02:00
Christian Schwarz	17ba307004	test_broken_timelines: regex needs changing due to changes in this PR The regex is different because tenant2 is not broken anymore with this PR, because we allow empty timeline dirs to load	2023-05-26 17:40:34 +02:00
Christian Schwarz	e1486444d6	Revert "test_broken_timelines: wait for tenants to load" This reverts commit `c6f9b8f318`.	2023-05-26 17:40:27 +02:00
Christian Schwarz	c6f9b8f318	test_broken_timelines: wait for tenants to load Without this, we rely on the basebackup request to wait for the tenant to load. It works, but, would be nice to rule it out, no?	2023-05-26 17:39:14 +02:00
Christian Schwarz	ba3e3bdddf	clippy	2023-05-26 17:07:15 +02:00
Christian Schwarz	71f9bbef0d	fix the test timeline creation functions	2023-05-26 16:01:45 +02:00
Christian Schwarz	4680f8c60b	finish WIP: keep the real timeline from create_empty_timeline outside of timelines map until it has finished filling	2023-05-26 15:29:19 +02:00
Christian Schwarz	3c1fc2617c	WIP	2023-05-26 14:24:23 +02:00
Christian Schwarz	60cc197ce3	fix test_timeline_create_break_after_uninit_mark (the refactoring added .context())	2023-05-26 10:19:24 +02:00
Christian Schwarz	609a929968	instrument shutdown_all_tenants code path, include timeline_id in logs if failed to flush This can be extracted into an independent commit.	2023-05-26 10:12:33 +02:00
Christian Schwarz	f2abc4c933	independent fix: test_pageserver_metrics_removed_after_detach didn't wait for uploads This resulted in unexpectedly absent metrics `pageserver_remote_timeline_client_bytes_finished` tripping the assert quoted below. Not sure why this PR (#4350) exposed this problem though. Are we detaching faster? If so, why? AssertionError: assert {'pageserver_...s_count', ...} == {'pageserver_...s_count', ...} Extra items in the right set: 'pageserver_remote_timeline_client_bytes_started_total' 'pageserver_remote_timeline_client_bytes_finished_total' Full diff: { 'pageserver_created_persistent_files_total', 'pageserver_current_logical_size', 'pageserver_evictions_total', 'pageserver_evictions_with_low_residence_duration_total', 'pageserver_getpage_reconstruct_seconds_bucket', 'pageserver_getpage_reconstruct_seconds_count', 'pageserver_getpage_reconstruct_seconds_sum', 'pageserver_io_operations_bytes_total', 'pageserver_io_operations_seconds_bucket', 'pageserver_io_operations_seconds_count', 'pageserver_io_operations_seconds_sum', 'pageserver_last_record_lsn', 'pageserver_materialized_cache_hits_total', 'pageserver_remote_operation_seconds_bucket', 'pageserver_remote_operation_seconds_count', 'pageserver_remote_operation_seconds_sum', 'pageserver_remote_physical_size', - 'pageserver_remote_timeline_client_bytes_finished_total', - 'pageserver_remote_timeline_client_bytes_started_total', 'pageserver_remote_timeline_client_calls_started_bucket', 'pageserver_remote_timeline_client_calls_started_count', 'pageserver_remote_timeline_client_calls_started_sum', 'pageserver_remote_timeline_client_calls_unfinished', 'pageserver_resident_physical_size', 'pageserver_smgr_query_seconds_bucket', 'pageserver_smgr_query_seconds_count', 'pageserver_smgr_query_seconds_sum', 'pageserver_storage_operations_seconds_count_total', 'pageserver_storage_operations_seconds_sum_total', 'pageserver_tenant_states_count', 'pageserver_wait_lsn_seconds_bucket', 
'pageserver_wait_lsn_seconds_count', 'pageserver_wait_lsn_seconds_sum', 'pageserver_written_persistent_bytes_total', }	2023-05-26 09:54:30 +02:00
Christian Schwarz	b09beaa4fe	log while waiting for tenant to finish activation	2023-05-26 09:34:12 +02:00
Christian Schwarz	1367e2b0ee	improve TenantState doc comments, repeating what's in the Mermaid diagram	2023-05-26 09:31:44 +02:00
Christian Schwarz	122e23071b	fix the tests (commenting out too-conservative "Timeline has no ancestor and no layer files" assert)	2023-05-26 09:23:26 +02:00
Christian Schwarz	696c6ed6ff	fix cfg(test) code to the extent that clippy passes	2023-05-26 08:49:42 +02:00
Christian Schwarz	0874e27023	refactor timeline initialization High-level ideas: - placeholder Timeline object in timelines map during a timeline creation - the timeline creations (branch, bootstrap, import_from_basebackup) prepare durable state (on-disk & remote), if necessary using _another_ _temporary_ Timeline object - once the timeline creations have prepared the durable state, they use the normal load routine (load_local_timeline) that is also used during pageserver startup - Once the loading is done, we replace the placeholder timeline object with the real one	2023-05-25 23:01:40 +02:00
Christian Schwarz	6fe39ecbf7	add ability to have fake metrics (needed in next patch so we can have to Timeline objects with the same id in memory)	2023-05-25 23:01:40 +02:00
Christian Schwarz	a0c2a85505	timeline_init_and_sync: don't hold Tenant::timelines while load_layer_map This patch inlines `initialize_with_lock` and then reorganizes the code such that we can `load_layer_map` without holding the `Tenant::timelines` lock. As a nice aside, we can get rid of the dummy() uninit mark, which has always been a terrible hack.	2023-05-25 23:01:40 +02:00
Christian Schwarz	dd0f5c4ef3	Merge remote-tracking branch 'origin/main' into problame/async-timeline-get/dont-hold-timelines-lock-inside-tenant-state-send-modify	2023-05-25 22:20:52 +02:00
Christian Schwarz	de780d2e0f	make TenantState::{Loading,Attaching,Activating} owned by spawn_load / spawn_attach See the Mermaid diagram in the doc comment for the now-possible state transitions. The two core insights / changes are: - spawn_load and spawn_attach own the tenant state until they're done - once load()/attach() calls are done - if they failed, transition them to Broken directly (we know that there's no background activity because we didn't call activate yet) - if they succeed, call activate. We can make it infallible. How? Later. - set_broken() and set_stopping() are changed to wait for spawn_load() / spawn_attach() to finish. This sounds scary because it might hinder detach or shutdown, but actually, concurrent attach+detach, or attach+shutdown, or load+shutdown, or attach+shutdown were just racy. With this change, they're not anymore. We can add a CancellationToken stored in Tenant for load/attach and cancel it from set_stopping() or set_broken() if necessary in the future. So, why can activate() be infallible now: because we declare that spawn_load and spawn_attach own the tenant state until they're done. And we enforce that ownership using the wait_for at the start of set_stopping and set_broken.	2023-05-25 15:02:43 +02:00
Christian Schwarz	f18d9f555b	Revert "Revert "use tokio::sync::watch::Receiver::wait_for"" This reverts commit `eaf270c648`.	2023-05-25 14:58:49 +02:00
Christian Schwarz	05a2fe08d1	Merge branch 'problame/infallible-timeline-activate/4-make-infallible' into problame/async-timeline-get/dont-hold-timelines-lock-inside-tenant-state-send-modify	2023-05-25 14:58:19 +02:00
Christian Schwarz	eaf270c648	Revert "use tokio::sync::watch::Receiver::wait_for" This reverts commit `fe4ef121b6`.	2023-05-25 14:57:41 +02:00
Christian Schwarz	ddad0928c5	Merge branch 'problame/infallible-timeline-activate/3-funnel-storage-broker-client' into problame/infallible-timeline-activate/4-make-infallible	2023-05-25 14:53:32 +02:00
Christian Schwarz	96c550222b	apply heikki's comment suggestion	2023-05-25 14:53:20 +02:00
Christian Schwarz	cf8ff7edad	explainer comment on storage_broker::connect async weirdness	2023-05-25 14:51:48 +02:00
Christian Schwarz	da6573f551	Merge branch 'problame/infallible-timeline-activate/3-funnel-storage-broker-client' into problame/infallible-timeline-activate/4-make-infallible	2023-05-25 10:54:30 +02:00
Christian Schwarz	2fee8c884f	Merge remote-tracking branch 'origin/main' into problame/infallible-timeline-activate/3-funnel-storage-broker-client	2023-05-25 10:54:03 +02:00
Christian Schwarz	fe4ef121b6	use tokio::sync::watch::Receiver::wait_for	2023-05-25 10:44:26 +02:00
Christian Schwarz	641ca994dc	assert_eq suggestion	2023-05-25 09:55:32 +02:00
Christian Schwarz	413598b19b	fix merge fallout (?)	2023-05-24 17:42:51 +02:00
Christian Schwarz	b345f32e3f	Merge branch 'problame/infallible-timeline-activate/4-make-infallible' into problame/async-timeline-get/dont-hold-timelines-lock-inside-tenant-state-send-modify	2023-05-24 17:25:35 +02:00
Christian Schwarz	69cfa9fe61	launch_wal_receiver: apply joonas's review suggestion (visibility + doc comment)	2023-05-24 17:20:03 +02:00
Christian Schwarz	2c424c8f4e	Revert "activate_timelines counter is now == not_broken_timelines.len()" not_broken_timelines is an iterator, doesn't have `len()`. This reverts commit `4001f441c0`.	2023-05-24 17:19:22 +02:00
Christian Schwarz	4001f441c0	activate_timelines counter is now == not_broken_timelines.len()	2023-05-24 17:14:49 +02:00
Christian Schwarz	ef956c47fc	make it clear that `walreceiver_status` is always used in the branch where it's produced	2023-05-24 17:12:35 +02:00
Christian Schwarz	8606b6abe5	Merge remote-tracking branch 'origin/problame/infallible-timeline-activate/3-funnel-storage-broker-client' into problame/infallible-timeline-activate/4-make-infallible	2023-05-24 17:02:18 +02:00
Christian Schwarz	732f60317b	Merge remote-tracking branch 'origin/main' into problame/infallible-timeline-activate/3-funnel-storage-broker-client	2023-05-24 16:58:25 +02:00
Christian Schwarz	b54431bbd3	pass the BrokerClientChannel by value & clone it as necessary It's a wrapper around an inner Arc anyways Also, this gets rid of the OnceCell	2023-05-24 12:29:05 +02:00
Christian Schwarz	def5eb8542	Merge branch 'problame/infallible-timeline-activate/2-pushup-tenant-and-timeline-activation' into problame/infallible-timeline-activate/3-funnel-storage-broker-client	2023-05-24 11:57:37 +02:00
Christian Schwarz	07da786ed3	apply joonas's suggestion to use parent: None + follows_from	2023-05-24 11:56:26 +02:00
Christian Schwarz	75c3c43b2e	don't unwrap() the `activate()` result in spawn_load / spawn_attach	2023-05-24 11:36:07 +02:00
Christian Schwarz	bdf03eab58	Merge branch 'problame/infallible-timeline-activate/2-pushup-tenant-and-timeline-activation' into problame/infallible-timeline-activate/3-funnel-storage-broker-client	2023-05-24 11:32:38 +02:00
Christian Schwarz	32c85fa87a	Merge remote-tracking branch 'origin/main' into problame/infallible-timeline-activate/2-pushup-tenant-and-timeline-activation	2023-05-24 11:31:00 +02:00
Christian Schwarz	b2e0c58a8c	Merge branch 'problame/infallible-timeline-activate/4-make-infallible' into problame/async-timeline-get/dont-hold-timelines-lock-inside-tenant-state-send-modify	2023-05-23 20:44:34 +02:00
Christian Schwarz	94f30f0660	Merge branch 'problame/infallible-timeline-activate/3-funnel-storage-broker-client' into problame/infallible-timeline-activate/4-make-infallible	2023-05-23 20:44:12 +02:00
Christian Schwarz	a55d224923	tests would fail because broker client needs to be launched on a tokio runtime thread	2023-05-23 20:43:10 +02:00
Christian Schwarz	4f586ac101	Merge branch 'problame/infallible-timeline-activate/2-pushup-tenant-and-timeline-activation' into problame/infallible-timeline-activate/3-funnel-storage-broker-client	2023-05-23 20:42:54 +02:00
Christian Schwarz	feb2e80b83	tests were failing because activate() was outside of a span with tenant_id	2023-05-23 20:36:32 +02:00
Christian Schwarz	ee22e81583	don't hold timelines lock inside set_stopping()	2023-05-23 20:11:15 +02:00
Christian Schwarz	3e604eaa39	refactor: introduce TenantState::Activating to avoid holding timelines lock inside Tenant::activate	2023-05-23 20:03:12 +02:00
Christian Schwarz	8bcb542a3b	refactor: make timeline activation infallible Timeline::activate() was only fallible because `launch_wal_receiver` was. `launch_wal_receiver` was fallible only because of some preliminary checks in `WalReceiver::start`. Turns out these checks can be shifted to the type system by delaying creation of the `WalReceiver` struct to the point where we activate the timeline. The changes in this PR were enabled by my previous refactoring that funneled the broker_client from pageserver startup to the activate() call sites.	2023-05-23 19:27:06 +02:00
Christian Schwarz	17b081d294	refactor: eliminate global storage_broker client state (This is prep work to make `Timeline::activate` infallible.) This patch removes the global storage_broker client instance from the pageserver codebase. Instead, pageserver startup instantiates it and passes it down to the `Timeline::activate` function, which in turn passes it to the WalReceiver, which is the entity that actually uses it.	2023-05-23 19:27:06 +02:00
Christian Schwarz	d5337e6a65	refactor responsibility for tenant/timeline activation (This is prep work to make `Timeline::activate()` infallible.) The current possibility for failure in `Timeline::activate()` is the broker client's presence / absence. It should be an assert, but we're careful with these. So, I'm planning to pass in the broker client to activate(), thereby eliminating the possibility of its absence. In the unit tests, we don't have a broker client. So, I thought I'd be in trouble because the unit tests also called `activate()` before this PR. However, closer inspection reveals a long-standing FIXME about this, which is addressed by this patch. It turns out that the unit tests don't actually need the background loops to be running. They just need the state value to be `Active`. So, for the tests, we just set it to that value but don't spawn the background loops. We'll need to revisit this if we ever do more Rust unit tests in the future. But right now, this refactoring improves the code, so, let's revisit when we get there.	2023-05-23 19:26:36 +02:00
Christian Schwarz	cc96a5186d	tenant_map_insert: don't expose the vacant entry to the closure This tightens up the API a little. Byproduct of some refactoring work that I'm doing right now.	2023-05-23 19:25:47 +02:00