rename fn

fix cancellation
fix http connect config
2026-01-20 03:42:55 +00:00 · 2024-08-16 08:54:25 +01:00 · 2024-08-16 08:44:03 +01:00 · 2024-08-16 08:42:14 +01:00
30 changed files with 506 additions and 1634 deletions
--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -379,7 +379,7 @@ where
    }
 }

-pub(crate) fn process_has_stopped(pid: Pid) -> anyhow::Result<bool> {
+fn process_has_stopped(pid: Pid) -> anyhow::Result<bool> {
    match kill(pid, None) {
        // Process exists, keep waiting
        Ok(_) => Ok(false),
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -15,9 +15,7 @@ use control_plane::local_env::{
 };
 use control_plane::pageserver::PageServerNode;
 use control_plane::safekeeper::SafekeeperNode;
-use control_plane::storage_controller::{
-    NeonStorageControllerStartArgs, NeonStorageControllerStopArgs, StorageController,
-};
+use control_plane::storage_controller::StorageController;
 use control_plane::{broker, local_env};
 use pageserver_api::config::{
    DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT,
@@ -1054,36 +1052,6 @@ fn get_start_timeout(args: &ArgMatches) -> &Duration {
    humantime_duration.as_ref()
 }

-fn storage_controller_start_args(args: &ArgMatches) -> NeonStorageControllerStartArgs {
-    let maybe_instance_id = args.get_one::<u8>("instance-id");
-
-    let base_port = args.get_one::<u16>("base-port");
-
-    if maybe_instance_id.is_some() && base_port.is_none() {
-        panic!("storage-controller start specificied instance-id but did not provide base-port");
-    }
-
-    let start_timeout = args
-        .get_one::<humantime::Duration>("start-timeout")
-        .expect("invalid value for start-timeout");
-
-    NeonStorageControllerStartArgs {
-        instance_id: maybe_instance_id.copied().unwrap_or(1),
-        base_port: base_port.copied(),
-        start_timeout: *start_timeout,
-    }
-}
-
-fn storage_controller_stop_args(args: &ArgMatches) -> NeonStorageControllerStopArgs {
-    let maybe_instance_id = args.get_one::<u8>("instance-id");
-    let immediate = args.get_one::<String>("stop-mode").map(|s| s.as_str()) == Some("immediate");
-
-    NeonStorageControllerStopArgs {
-        instance_id: maybe_instance_id.copied().unwrap_or(1),
-        immediate,
-    }
-}
-
 async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
    match sub_match.subcommand() {
        Some(("start", subcommand_args)) => {
@@ -1145,14 +1113,19 @@ async fn handle_storage_controller(
    let svc = StorageController::from_env(env);
    match sub_match.subcommand() {
        Some(("start", start_match)) => {
-            if let Err(e) = svc.start(storage_controller_start_args(start_match)).await {
+            if let Err(e) = svc.start(get_start_timeout(start_match)).await {
                eprintln!("start failed: {e}");
                exit(1);
            }
        }

        Some(("stop", stop_match)) => {
-            if let Err(e) = svc.stop(storage_controller_stop_args(stop_match)).await {
+            let immediate = stop_match
+                .get_one::<String>("stop-mode")
+                .map(|s| s.as_str())
+                == Some("immediate");
+
+            if let Err(e) = svc.stop(immediate).await {
                eprintln!("stop failed: {}", e);
                exit(1);
            }
@@ -1255,12 +1228,7 @@ async fn handle_start_all(
    // Only start the storage controller if the pageserver is configured to need it
    if env.control_plane_api.is_some() {
        let storage_controller = StorageController::from_env(env);
-        if let Err(e) = storage_controller
-            .start(NeonStorageControllerStartArgs::with_default_instance_id(
-                (*retry_timeout).into(),
-            ))
-            .await
-        {
+        if let Err(e) = storage_controller.start(retry_timeout).await {
            eprintln!("storage_controller start failed: {:#}", e);
            try_stop_all(env, true).await;
            exit(1);
@@ -1390,21 +1358,10 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
        eprintln!("neon broker stop failed: {e:#}");
    }

-    // Stop all storage controller instances. In the most common case there's only one,
-    // but iterate though the base data directory in order to discover the instances.
-    let storcon_instances = env
-        .storage_controller_instances()
-        .await
-        .expect("Must inspect data dir");
-    for (instance_id, _instance_dir_path) in storcon_instances {
+    if env.control_plane_api.is_some() {
        let storage_controller = StorageController::from_env(env);
-        let stop_args = NeonStorageControllerStopArgs {
-            instance_id,
-            immediate,
-        };
-
-        if let Err(e) = storage_controller.stop(stop_args).await {
-            eprintln!("Storage controller instance {instance_id} stop failed: {e:#}");
+        if let Err(e) = storage_controller.stop(immediate).await {
+            eprintln!("storage controller stop failed: {e:#}");
        }
    }
 }
@@ -1544,18 +1501,6 @@ fn cli() -> Command {
        .action(ArgAction::SetTrue)
        .required(false);

-    let instance_id = Arg::new("instance-id")
-        .long("instance-id")
-        .help("Identifier used to distinguish storage controller instances (default 1)")
-        .value_parser(value_parser!(u8))
-        .required(false);
-
-    let base_port = Arg::new("base-port")
-        .long("base-port")
-        .help("Base port for the storage controller instance idenfified by instance-id (defaults to pagserver cplane api)")
-        .value_parser(value_parser!(u16))
-        .required(false);
-
    Command::new("Neon CLI")
        .arg_required_else_help(true)
        .version(GIT_VERSION)
@@ -1664,12 +1609,9 @@ fn cli() -> Command {
                .arg_required_else_help(true)
                .about("Manage storage_controller")
                .subcommand(Command::new("start").about("Start storage controller")
-                            .arg(timeout_arg.clone())
-                            .arg(instance_id.clone())
-                            .arg(base_port))
+                            .arg(timeout_arg.clone()))
                .subcommand(Command::new("stop").about("Stop storage controller")
-                            .arg(stop_mode_arg.clone())
-                            .arg(instance_id))
+                            .arg(stop_mode_arg.clone()))
        )
        .subcommand(
            Command::new("safekeeper")
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -156,11 +156,6 @@ pub struct NeonStorageControllerConf {
    #[serde(with = "humantime_serde")]
    pub max_warming_up: Duration,

-    pub start_as_candidate: bool,
-
-    /// Database url used when running multiple storage controller instances
-    pub database_url: Option<SocketAddr>,
-
    /// Threshold for auto-splitting a tenant into shards
    pub split_threshold: Option<u64>,

@@ -179,8 +174,6 @@ impl Default for NeonStorageControllerConf {
        Self {
            max_offline: Self::DEFAULT_MAX_OFFLINE_INTERVAL,
            max_warming_up: Self::DEFAULT_MAX_WARMING_UP_INTERVAL,
-            start_as_candidate: false,
-            database_url: None,
            split_threshold: None,
            max_secondary_lag_bytes: None,
        }
@@ -399,36 +392,6 @@ impl LocalEnv {
        }
    }

-    /// Inspect the base data directory and extract the instance id and instance directory path
-    /// for all storage controller instances
-    pub async fn storage_controller_instances(&self) -> std::io::Result<Vec<(u8, PathBuf)>> {
-        let mut instances = Vec::default();
-
-        let dir = std::fs::read_dir(self.base_data_dir.clone())?;
-        for dentry in dir {
-            let dentry = dentry?;
-            let is_dir = dentry.metadata()?.is_dir();
-            let filename = dentry.file_name().into_string().unwrap();
-            let parsed_instance_id = match filename.strip_prefix("storage_controller_") {
-                Some(suffix) => suffix.parse::<u8>().ok(),
-                None => None,
-            };
-
-            let is_instance_dir = is_dir && parsed_instance_id.is_some();
-
-            if !is_instance_dir {
-                continue;
-            }
-
-            instances.push((
-                parsed_instance_id.expect("Checked previously"),
-                dentry.path(),
-            ));
-        }
-
-        Ok(instances)
-    }
-
    pub fn register_branch_mapping(
        &mut self,
        branch_name: String,
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -3,8 +3,6 @@ use crate::{
    local_env::{LocalEnv, NeonStorageControllerConf},
 };
 use camino::{Utf8Path, Utf8PathBuf};
-use hyper::Uri;
-use nix::unistd::Pid;
 use pageserver_api::{
    controller_api::{
        NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest, TenantCreateRequest,
@@ -20,7 +18,7 @@ use pageserver_client::mgmt_api::ResponseErrorMessageExt;
 use postgres_backend::AuthType;
 use reqwest::Method;
 use serde::{de::DeserializeOwned, Deserialize, Serialize};
-use std::{fs, net::SocketAddr, path::PathBuf, str::FromStr, sync::OnceLock};
+use std::{fs, str::FromStr, time::Duration};
 use tokio::process::Command;
 use tracing::instrument;
 use url::Url;
@@ -31,14 +29,12 @@ use utils::{

 pub struct StorageController {
    env: LocalEnv,
+    listen: String,
    private_key: Option<Vec<u8>>,
    public_key: Option<String>,
+    postgres_port: u16,
    client: reqwest::Client,
    config: NeonStorageControllerConf,
-
-    // The listen addresses is learned when starting the storage controller,
-    // hence the use of OnceLock to init it at the right time.
-    listen: OnceLock<SocketAddr>,
 }

 const COMMAND: &str = "storage_controller";
@@ -47,36 +43,6 @@ const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16;

 const DB_NAME: &str = "storage_controller";

-pub struct NeonStorageControllerStartArgs {
-    pub instance_id: u8,
-    pub base_port: Option<u16>,
-    pub start_timeout: humantime::Duration,
-}
-
-impl NeonStorageControllerStartArgs {
-    pub fn with_default_instance_id(start_timeout: humantime::Duration) -> Self {
-        Self {
-            instance_id: 1,
-            base_port: None,
-            start_timeout,
-        }
-    }
-}
-
-pub struct NeonStorageControllerStopArgs {
-    pub instance_id: u8,
-    pub immediate: bool,
-}
-
-impl NeonStorageControllerStopArgs {
-    pub fn with_default_instance_id(immediate: bool) -> Self {
-        Self {
-            instance_id: 1,
-            immediate,
-        }
-    }
-}
-
 #[derive(Serialize, Deserialize)]
 pub struct AttachHookRequest {
    pub tenant_shard_id: TenantShardId,
@@ -101,6 +67,23 @@ pub struct InspectResponse {

 impl StorageController {
    pub fn from_env(env: &LocalEnv) -> Self {
+        // Makes no sense to construct this if pageservers aren't going to use it: assume
+        // pageservers have control plane API set
+        let listen_url = env.control_plane_api.clone().unwrap();
+
+        let listen = format!(
+            "{}:{}",
+            listen_url.host_str().unwrap(),
+            listen_url.port().unwrap()
+        );
+
+        // Convention: NeonEnv in python tests reserves the next port after the control_plane_api
+        // port, for use by our captive postgres.
+        let postgres_port = listen_url
+            .port()
+            .expect("Control plane API setting should always have a port")
+            + 1;
+
        // Assume all pageservers have symmetric auth configuration: this service
        // expects to use one JWT token to talk to all of them.
        let ps_conf = env
@@ -143,28 +126,20 @@ impl StorageController {

        Self {
            env: env.clone(),
+            listen,
            private_key,
            public_key,
+            postgres_port,
            client: reqwest::ClientBuilder::new()
                .build()
                .expect("Failed to construct http client"),
            config: env.storage_controller.clone(),
-            listen: OnceLock::default(),
        }
    }

-    fn storage_controller_instance_dir(&self, instance_id: u8) -> PathBuf {
-        self.env
-            .base_data_dir
-            .join(format!("storage_controller_{}", instance_id))
-    }
-
-    fn pid_file(&self, instance_id: u8) -> Utf8PathBuf {
-        Utf8PathBuf::from_path_buf(
-            self.storage_controller_instance_dir(instance_id)
-                .join("storage_controller.pid"),
-        )
-        .expect("non-Unicode path")
+    fn pid_file(&self) -> Utf8PathBuf {
+        Utf8PathBuf::from_path_buf(self.env.base_data_dir.join("storage_controller.pid"))
+            .expect("non-Unicode path")
    }

    /// PIDFile for the postgres instance used to store storage controller state
@@ -209,9 +184,9 @@ impl StorageController {
    }

    /// Readiness check for our postgres process
-    async fn pg_isready(&self, pg_bin_dir: &Utf8Path, postgres_port: u16) -> anyhow::Result<bool> {
+    async fn pg_isready(&self, pg_bin_dir: &Utf8Path) -> anyhow::Result<bool> {
        let bin_path = pg_bin_dir.join("pg_isready");
-        let args = ["-h", "localhost", "-p", &format!("{}", postgres_port)];
+        let args = ["-h", "localhost", "-p", &format!("{}", self.postgres_port)];
        let exitcode = Command::new(bin_path).args(args).spawn()?.wait().await?;

        Ok(exitcode.success())
@@ -224,8 +199,8 @@ impl StorageController {
    /// who just want to run `cargo neon_local` without knowing about diesel.
    ///
    /// Returns the database url
-    pub async fn setup_database(&self, postgres_port: u16) -> anyhow::Result<String> {
-        let database_url = format!("postgresql://localhost:{}/{DB_NAME}", postgres_port);
+    pub async fn setup_database(&self) -> anyhow::Result<String> {
+        let database_url = format!("postgresql://localhost:{}/{DB_NAME}", self.postgres_port);

        let pg_bin_dir = self.get_pg_bin_dir().await?;
        let createdb_path = pg_bin_dir.join("createdb");
@@ -234,7 +209,7 @@ impl StorageController {
                "-h",
                "localhost",
                "-p",
-                &format!("{}", postgres_port),
+                &format!("{}", self.postgres_port),
                DB_NAME,
            ])
            .output()
@@ -255,14 +230,13 @@ impl StorageController {

    pub async fn connect_to_database(
        &self,
-        postgres_port: u16,
    ) -> anyhow::Result<(
        tokio_postgres::Client,
        tokio_postgres::Connection<tokio_postgres::Socket, tokio_postgres::tls::NoTlsStream>,
    )> {
        tokio_postgres::Config::new()
            .host("localhost")
-            .port(postgres_port)
+            .port(self.postgres_port)
            // The user is the ambient operating system user name.
            // That is an impurity which we want to fix in => TODO https://github.com/neondatabase/neon/issues/8400
            //
@@ -278,115 +252,72 @@ impl StorageController {
            .map_err(anyhow::Error::new)
    }

-    pub async fn start(&self, start_args: NeonStorageControllerStartArgs) -> anyhow::Result<()> {
-        let instance_dir = self.storage_controller_instance_dir(start_args.instance_id);
-        if let Err(err) = tokio::fs::create_dir(&instance_dir).await {
-            if err.kind() != std::io::ErrorKind::AlreadyExists {
-                panic!("Failed to create instance dir {instance_dir:?}");
-            }
-        }
+    pub async fn start(&self, retry_timeout: &Duration) -> anyhow::Result<()> {
+        // Start a vanilla Postgres process used by the storage controller for persistence.
+        let pg_data_path = Utf8PathBuf::from_path_buf(self.env.base_data_dir.clone())
+            .unwrap()
+            .join("storage_controller_db");
+        let pg_bin_dir = self.get_pg_bin_dir().await?;
+        let pg_lib_dir = self.get_pg_lib_dir().await?;
+        let pg_log_path = pg_data_path.join("postgres.log");

-        let (listen, postgres_port) = {
-            if let Some(base_port) = start_args.base_port {
-                (
-                    format!("127.0.0.1:{base_port}"),
-                    self.config
-                        .database_url
-                        .expect("--base-port requires NeonStorageControllerConf::database_url")
-                        .port(),
-                )
-            } else {
-                let listen_url = self.env.control_plane_api.clone().unwrap();
-
-                let listen = format!(
-                    "{}:{}",
-                    listen_url.host_str().unwrap(),
-                    listen_url.port().unwrap()
-                );
-
-                (listen, listen_url.port().unwrap() + 1)
+        if !tokio::fs::try_exists(&pg_data_path).await? {
+            // Initialize empty database
+            let initdb_path = pg_bin_dir.join("initdb");
+            let mut child = Command::new(&initdb_path)
+                .envs(vec![
+                    ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
+                    ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
+                ])
+                .args(["-D", pg_data_path.as_ref()])
+                .spawn()
+                .expect("Failed to spawn initdb");
+            let status = child.wait().await?;
+            if !status.success() {
+                anyhow::bail!("initdb failed with status {status}");
            }
        };

-        let socket_addr = listen
-            .parse()
-            .expect("listen address is a valid socket address");
-        self.listen
-            .set(socket_addr)
-            .expect("StorageController::listen is only set here");
+        // Write a minimal config file:
+        // - Specify the port, since this is chosen dynamically
+        // - Switch off fsync, since we're running on lightweight test environments and when e.g. scale testing
+        //   the storage controller we don't want a slow local disk to interfere with that.
+        //
+        // NB: it's important that we rewrite this file on each start command so we propagate changes
+        // from `LocalEnv`'s config file (`.neon/config`).
+        tokio::fs::write(
+            &pg_data_path.join("postgresql.conf"),
+            format!("port = {}\nfsync=off\n", self.postgres_port),
+        )
+        .await?;

-        // Do we remove the pid file on stop?
-        let pg_started = self.is_postgres_running().await?;
-        let pg_lib_dir = self.get_pg_lib_dir().await?;
+        println!("Starting storage controller database...");
+        let db_start_args = [
+            "-w",
+            "-D",
+            pg_data_path.as_ref(),
+            "-l",
+            pg_log_path.as_ref(),
+            "start",
+        ];

-        if !pg_started {
-            // Start a vanilla Postgres process used by the storage controller for persistence.
-            let pg_data_path = Utf8PathBuf::from_path_buf(self.env.base_data_dir.clone())
-                .unwrap()
-                .join("storage_controller_db");
-            let pg_bin_dir = self.get_pg_bin_dir().await?;
-            let pg_log_path = pg_data_path.join("postgres.log");
+        background_process::start_process(
+            "storage_controller_db",
+            &self.env.base_data_dir,
+            pg_bin_dir.join("pg_ctl").as_std_path(),
+            db_start_args,
+            vec![
+                ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
+                ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
+            ],
+            background_process::InitialPidFile::Create(self.postgres_pid_file()),
+            retry_timeout,
+            || self.pg_isready(&pg_bin_dir),
+        )
+        .await?;

-            if !tokio::fs::try_exists(&pg_data_path).await? {
-                // Initialize empty database
-                let initdb_path = pg_bin_dir.join("initdb");
-                let mut child = Command::new(&initdb_path)
-                    .envs(vec![
-                        ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
-                        ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
-                    ])
-                    .args(["-D", pg_data_path.as_ref()])
-                    .spawn()
-                    .expect("Failed to spawn initdb");
-                let status = child.wait().await?;
-                if !status.success() {
-                    anyhow::bail!("initdb failed with status {status}");
-                }
-            };
-
-            // Write a minimal config file:
-            // - Specify the port, since this is chosen dynamically
-            // - Switch off fsync, since we're running on lightweight test environments and when e.g. scale testing
-            //   the storage controller we don't want a slow local disk to interfere with that.
-            //
-            // NB: it's important that we rewrite this file on each start command so we propagate changes
-            // from `LocalEnv`'s config file (`.neon/config`).
-            tokio::fs::write(
-                &pg_data_path.join("postgresql.conf"),
-                format!("port = {}\nfsync=off\n", postgres_port),
-            )
-            .await?;
-
-            println!("Starting storage controller database...");
-            let db_start_args = [
-                "-w",
-                "-D",
-                pg_data_path.as_ref(),
-                "-l",
-                pg_log_path.as_ref(),
-                "start",
-            ];
-
-            background_process::start_process(
-                "storage_controller_db",
-                &self.env.base_data_dir,
-                pg_bin_dir.join("pg_ctl").as_std_path(),
-                db_start_args,
-                vec![
-                    ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
-                    ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
-                ],
-                background_process::InitialPidFile::Create(self.postgres_pid_file()),
-                &start_args.start_timeout,
-                || self.pg_isready(&pg_bin_dir, postgres_port),
-            )
-            .await?;
-
-            // Run migrations on every startup, in case something changed.
-            self.setup_database(postgres_port).await?;
-        }
-
-        let database_url = format!("postgresql://localhost:{}/{DB_NAME}", postgres_port);
+        // Run migrations on every startup, in case something changed.
+        let database_url = self.setup_database().await?;

        // We support running a startup SQL script to fiddle with the database before we launch storcon.
        // This is used by the test suite.
@@ -408,7 +339,7 @@ impl StorageController {
                }
            }
        };
-        let (mut client, conn) = self.connect_to_database(postgres_port).await?;
+        let (mut client, conn) = self.connect_to_database().await?;
        let conn = tokio::spawn(conn);
        let tx = client.build_transaction();
        let tx = tx.start().await?;
@@ -417,20 +348,9 @@ impl StorageController {
        drop(client);
        conn.await??;

-        let listen = self
-            .listen
-            .get()
-            .expect("cell is set earlier in this function");
-        let address_for_peers = Uri::builder()
-            .scheme("http")
-            .authority(format!("{}:{}", listen.ip(), listen.port()))
-            .path_and_query("")
-            .build()
-            .unwrap();
-
        let mut args = vec![
            "-l",
-            &listen.to_string(),
+            &self.listen,
            "--dev",
            "--database-url",
            &database_url,
@@ -438,17 +358,10 @@ impl StorageController {
            &humantime::Duration::from(self.config.max_offline).to_string(),
            "--max-warming-up-interval",
            &humantime::Duration::from(self.config.max_warming_up).to_string(),
-            "--address-for-peers",
-            &address_for_peers.to_string(),
        ]
        .into_iter()
        .map(|s| s.to_string())
        .collect::<Vec<_>>();
-
-        if self.config.start_as_candidate {
-            args.push("--start-as-candidate".to_string());
-        }
-
        if let Some(private_key) = &self.private_key {
            let claims = Claims::new(None, Scope::PageServerApi);
            let jwt_token =
@@ -481,15 +394,15 @@ impl StorageController {

        background_process::start_process(
            COMMAND,
-            &instance_dir,
+            &self.env.base_data_dir,
            &self.env.storage_controller_bin(),
            args,
            vec![
                ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
                ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
            ],
-            background_process::InitialPidFile::Create(self.pid_file(start_args.instance_id)),
-            &start_args.start_timeout,
+            background_process::InitialPidFile::Create(self.pid_file()),
+            retry_timeout,
            || async {
                match self.ready().await {
                    Ok(_) => Ok(true),
@@ -502,35 +415,8 @@ impl StorageController {
        Ok(())
    }

-    pub async fn stop(&self, stop_args: NeonStorageControllerStopArgs) -> anyhow::Result<()> {
-        background_process::stop_process(
-            stop_args.immediate,
-            COMMAND,
-            &self.pid_file(stop_args.instance_id),
-        )?;
-
-        let storcon_instances = self.env.storage_controller_instances().await?;
-        for (instance_id, instanced_dir_path) in storcon_instances {
-            if instance_id == stop_args.instance_id {
-                continue;
-            }
-
-            let pid_file = instanced_dir_path.join("storage_controller.pid");
-            let pid = tokio::fs::read_to_string(&pid_file)
-                .await
-                .map_err(|err| {
-                    anyhow::anyhow!("Failed to read storcon pid file at {pid_file:?}: {err}")
-                })?
-                .parse::<i32>()
-                .expect("pid is valid i32");
-
-            let other_proc_alive = !background_process::process_has_stopped(Pid::from_raw(pid))?;
-            if other_proc_alive {
-                // There is another storage controller instance running, so we return
-                // and leave the database running.
-                return Ok(());
-            }
-        }
+    pub async fn stop(&self, immediate: bool) -> anyhow::Result<()> {
+        background_process::stop_process(immediate, COMMAND, &self.pid_file())?;

        let pg_data_path = self.env.base_data_dir.join("storage_controller_db");
        let pg_bin_dir = self.get_pg_bin_dir().await?;
@@ -543,51 +429,27 @@ impl StorageController {
            .wait()
            .await?;
        if !stop_status.success() {
-            match self.is_postgres_running().await {
-                Ok(false) => {
-                    println!("Storage controller database is already stopped");
-                    return Ok(());
-                }
-                Ok(true) => {
-                    anyhow::bail!("Failed to stop storage controller database");
-                }
-                Err(err) => {
-                    anyhow::bail!("Failed to stop storage controller database: {err}");
-                }
+            let pg_status_args = ["-D", &pg_data_path.to_string_lossy(), "status"];
+            let status_exitcode = Command::new(pg_bin_dir.join("pg_ctl"))
+                .args(pg_status_args)
+                .spawn()?
+                .wait()
+                .await?;
+
+            // pg_ctl status returns this exit code if postgres is not running: in this case it is
+            // fine that stop failed.  Otherwise it is an error that stop failed.
+            const PG_STATUS_NOT_RUNNING: i32 = 3;
+            if Some(PG_STATUS_NOT_RUNNING) == status_exitcode.code() {
+                println!("Storage controller database is already stopped");
+                return Ok(());
+            } else {
+                anyhow::bail!("Failed to stop storage controller database: {stop_status}")
            }
        }

        Ok(())
    }

-    async fn is_postgres_running(&self) -> anyhow::Result<bool> {
-        let pg_data_path = self.env.base_data_dir.join("storage_controller_db");
-        let pg_bin_dir = self.get_pg_bin_dir().await?;
-
-        let pg_status_args = ["-D", &pg_data_path.to_string_lossy(), "status"];
-        let status_exitcode = Command::new(pg_bin_dir.join("pg_ctl"))
-            .args(pg_status_args)
-            .spawn()?
-            .wait()
-            .await?;
-
-        // pg_ctl status returns this exit code if postgres is not running: in this case it is
-        // fine that stop failed.  Otherwise it is an error that stop failed.
-        const PG_STATUS_NOT_RUNNING: i32 = 3;
-        const PG_NO_DATA_DIR: i32 = 4;
-        const PG_STATUS_RUNNING: i32 = 0;
-        match status_exitcode.code() {
-            Some(PG_STATUS_NOT_RUNNING) => Ok(false),
-            Some(PG_NO_DATA_DIR) => Ok(false),
-            Some(PG_STATUS_RUNNING) => Ok(true),
-            Some(code) => Err(anyhow::anyhow!(
-                "pg_ctl status returned unexpected status code: {:?}",
-                code
-            )),
-            None => Err(anyhow::anyhow!("pg_ctl status returned no status code")),
-        }
-    }
-
    fn get_claims_for_path(path: &str) -> anyhow::Result<Option<Claims>> {
        let category = match path.find('/') {
            Some(idx) => &path[..idx],
@@ -613,31 +475,15 @@ impl StorageController {
        RQ: Serialize + Sized,
        RS: DeserializeOwned + Sized,
    {
-        // In the special case of the `storage_controller start` subcommand, we wish
-        // to use the API endpoint of the newly started storage controller in order
-        // to pass the readiness check. In this scenario [`Self::listen`] will be set
-        // (see [`Self::start`]).
-        //
-        // Otherwise, we infer the storage controller api endpoint from the configured
-        // control plane API.
-        let url = if let Some(socket_addr) = self.listen.get() {
-            Url::from_str(&format!(
-                "http://{}:{}/{path}",
-                socket_addr.ip().to_canonical(),
-                socket_addr.port()
-            ))
-            .unwrap()
-        } else {
-            // The configured URL has the /upcall path prefix for pageservers to use: we will strip that out
-            // for general purpose API access.
-            let listen_url = self.env.control_plane_api.clone().unwrap();
-            Url::from_str(&format!(
-                "http://{}:{}/{path}",
-                listen_url.host_str().unwrap(),
-                listen_url.port().unwrap()
-            ))
-            .unwrap()
-        };
+        // The configured URL has the /upcall path prefix for pageservers to use: we will strip that out
+        // for general purpose API access.
+        let listen_url = self.env.control_plane_api.clone().unwrap();
+        let url = Url::from_str(&format!(
+            "http://{}:{}/{path}",
+            listen_url.host_str().unwrap(),
+            listen_url.port().unwrap()
+        ))
+        .unwrap();

        let mut builder = self.client.request(method, url);
        if let Some(body) = body {
--- a/docs/rfcs/036-physical-replication.md
+++ b/docs/rfcs/036-physical-replication.md
@@ -1,265 +0,0 @@
-# Physical Replication
-
-This RFC is a bit special in that we have already implemented physical
-replication a long time ago. However, we never properly wrote down all
-the decisions and assumptions, and in the last months when more users
-have started to use the feature, numerous issues have surfaced.
-
-This RFC documents the design decisions that have been made.
-
-## Summary
-
-PostgreSQL has a feature called streaming replication, where a replica
-streams WAL from the primary and continuously applies it. It is also
-known as "physical replication", to distinguish it from logical
-replication.  In PostgreSQL, a replica is initialized by taking a
-physical backup of the primary. In Neon, the replica is initialized
-from a slim "base backup" from the pageserver, just like a primary,
-and the primary and the replicas connect to the same pageserver,
-sharing the storage.
-
-There are two kinds of read-only replicas in Neon:
- replicas that follow the primary, and
- "static" replicas that are pinned at a particular LSN.
-
-A static replica is useful e.g. for performing time-travel queries and
-running one-off slow queries without affecting the primary. A replica
-that follows the primary can be used e.g. to scale out read-only
-workloads.
-
-## Motivation
-
-Read-only replicas allow offloading read-only queries. It's useful for
-isolation, if you want to make sure that read-only queries don't
-affect the primary, and it's also an easy way to provide guaranteed
-read-only access to an application, without having to mess with access
-controls.
-
-## Non Goals (if relevant)
-
-This RFC is all about WAL-based *physical* replication. Logical
-replication is a different feature.
-
-Neon also has the capability to launch "static" read-only nodes which
-do not follow the primary, but are pinned to a particular LSN. They
-can be used for long-running one-off queries, or for Point-in-time
-queries. They work similarly to read replicas that follow the primary,
-but some things are simpler: there are no concerns about cache
-invalidation when the data changes on the primary, or worrying about
-transactions that are in-progress on the primary.
-
-## Impacted components (e.g. pageserver, safekeeper, console, etc)
-
- Control plane launches the replica
- Replica Postgres instance connects to the safekeepers, to stream the WAL
- The primary does not know about the standby, except for the hot standby feedback
- The primary and replicas all connect to the same pageservers
-
-
-# Context
-
-Some useful things to know about hot standby and replicas in
-PostgreSQL.
-
-## PostgreSQL startup sequence
-
-"Running" and "start up" terms are little imprecise. PostgreSQL
-replica startup goes through several stages:
-
-1. First, the process is started up, and various initialization steps
-   are performed, like initializing shared memory. If you try to
-   connect to the server in this stage, you get an error: ERROR: the
-   database system is starting up. This stage happens very quickly.
-
-2. Then the server reads the checkpoint record from the WAL and starts
-   the WAL replay starting from the checkpoint. This works differently
-   in Neon: we start the WAL replay at the basebackup LSN, not from a
-   checkpoint! If you connect to the server in this state, you get an
-   error: ERROR: the database system is not yet accepting
-   connections. We proceed to the next stage, when the WAL replay sees
-   a running-xacts record. Or in Neon, the "CLOG scanning" mechanism
-   can allow us to move directly to next stage, with all the caveats
-   listed in this RFC.
-
-3. When the running-xacts information is established, the server
-   starts to accept connections normally.
-
-From PostgreSQL's point of view, the server is already running in
-stage 2, even though it's not accepting connections yet. Our
-`compute_ctl` does not consider it as running until stage 3. If the
-transition from stage 2 to 3 doesn't happen fast enough, the control
-plane will mark the start operation as failed.
-
-
-## Decisions, Issues
-
-### Cache invalidation in replica
-
-When a read replica follows the primary in PostgreSQL, it needs to
-stream all the WAL from the primary and apply all the records, to keep
-the local copy of the data consistent with the primary. In Neon, the
-replica can fetch the updated page versions from the pageserver, so
-it's not necessary to apply all the WAL. However, it needs to ensure
-that any pages that are currently in the Postgres buffer cache, or the
-Local File Cache, are either updated, or thrown away so that the next
-read of the page will fetch the latest version.
-
-We choose to apply the WAL records for pages that are already in the
-buffer cache, and skip records for other pages. Somewhat arbitrarily,
-we also apply records affecting catalog relations, fetching the old
-page version from the pageserver if necessary first. See
-`neon_redo_read_buffer_filter()` function.
-
-The replica wouldn't necessarily need to see all the WAL records, only
-the records that apply to cached pages. For simplicity, we do stream
-all the WAL to the replica, and the replica simply ignores WAL records
-that require no action.
-
-Like in PostgreSQL, the read replica maintains a "replay LSN", which
-is the LSN up to which the replica has received and replayed the
-WAL. The replica can lag behind the primary, if it cannot quite keep
-up with the primary, or if a long-running query conflicts with changes
-that are about to be applied, or even intentionally if the user wishes
-to see delayed data (see recovery_min_apply_delay). It's important
-that the replica sees a consistent view of the whole cluster at the
-replay LSN, when it's lagging behind.
-
-In Neon, the replica connects to a safekeeper to get the WAL
-stream. That means that the safekeepers must be able to regurgitate
-the original WAL as far back as the replay LSN of any running read
-replica. (A static read-only node that does not follow the primary
-does not require a WAL stream however). The primary does not need to
-be running, and when it is, the replicas don't incur any extra
-overhead to the primary (see hot standby feedback though).
-
-### In-progress transactions
-
-In PostgreSQL, when a hot standby server starts up, it cannot
-immediately open up for queries (see [PostgreSQL startup
-sequence]). It first needs to establish a complete list of in-progress
-transactions, including subtransactions, that are running at the
-primary, at the current replay LSN. Normally that happens quickly,
-when the replica sees a "running-xacts" WAL record, because the
-primary writes a running-xacts WAL record at every checkpoint, and in
-PostgreSQL the replica always starts the WAL replay from a checkpoint
-REDO point. (A shutdown checkpoint WAL record also implies that all
-the non-prepared transactions have ended.) If there are a lot of
-subtransactions in progress, however, the standby might need to wait
-for old transactions to complete before it can open up for queries.
-
-In Neon that problem is worse: a replica can start at any LSN, so
-there's no guarantee that it will see a running-xacts record any time
-soon. In particular, if the primary is not running when the replica is
-started, it might never see a running-xacts record.
-
-To make things worse, we initially missed this issue, and always
-started accepting queries at replica startup, even if it didn't have
-the transaction information. That could lead to incorrect query
-results and data corruption later. However, as we fixed that, we
-introduced a new problem compared to what we had before: previously
-the replica would always start up, but after fixing that bug, it might
-not. In a superficial way, the old behavior was better (but could lead
-to serious issues later!). That made fixing that bug very hard,
-because as we fixed it, we made things (superficially) worse for
-others.
-
-See https://github.com/neondatabase/neon/pull/7288 which fixed the
-bug, and follow-up PRs https://github.com/neondatabase/neon/pull/8323
-and https://github.com/neondatabase/neon/pull/8484 to try to claw back
-the cases that started to cause trouble after fixing it. As of this
-writing, there are still cases where a replica might not immediately
-start up, causing the control plane operation to fail; the remaining
-issues are tracked in https://github.com/neondatabase/neon/issues/6211.
-
-One long-term fix for this is to switch to using so-called CSN
-snapshots in read replica. That would make it unnecessary to have the
-full in-progress transaction list in the replica at startup time. See
-https://commitfest.postgresql.org/48/4912/ for a work-in-progress
-patch to upstream to implement that.
-
-Another thing we could do is to teach the control plane about that
-distinction between "starting up" and "running but haven't received
-running-xacts information yet", so that we could keep the replica
-waiting longer in that stage, and also give any client connections the
-same `ERROR: the database system is not yet accepting connections`
-error that you get in standalone PostgreSQL in that state.
-
-
-### Recovery conflicts and Hot standby feedback
-
-It's possible that a tuple version is vacuumed away in the primary,
-even though it is still needed by a running transaction in the
-replica. This is called a "recovery conflict", and PostgreSQL provides
-various options for dealing with it. By default, the WAL replay will
-wait up to 30 s for the conflicting query to finish. After that, it
-will kill the running query, so that the WAL replay can proceed.
-
-Another way to avoid the situation is to enable the
-[`hot_standby_feedback`](https://www.postgresql.org/docs/current/runtime-config-replication.html#GUC-HOT-STANDBY-FEEDBACK)
-option. When it is enabled, the primary will refrain from vacuuming
-tuples that are still needed in the replica. That means potentially
-bloating the primary, which violates the usual rule that read replicas
-don't affect the operations on the primary, which is why it's off by
-default. We leave it to users to decide if they want to turn it on,
-same as PostgreSQL.
-
-Neon supports `hot_standby_feedback` by passing the feedback messages
-from the replica to the safekeepers, and from safekeepers to the
-primary.
-
-### Relationship of settings between primary and replica
-
-In order to enter hot standby mode, some configuration options need to
-be set to the same or larger values in the standby, compared to the
-primary.  See [explanation in the PostgreSQL
-docs](https://www.postgresql.org/docs/current/hot-standby.html#HOT-STANDBY-ADMIN)
-
-In Neon, we have this problem too. To prevent customers from hitting
-it, the control plane automatically adjusts the settings of a replica,
-so that they match or exceed the primary's settings (see
-https://github.com/neondatabase/cloud/issues/14903). However, you
-can still hit the issue if the primary is restarted with larger
-settings, while the replica is running.
-
-
-### Interaction with Pageserver GC
-
-The read replica can lag behind the primary. If there are recovery
-conflicts or the replica cannot keep up for some reason, the lag can
-in principle grow indefinitely. The replica will issue all GetPage
-requests to the pageservers at the current replay LSN, and needs to
-see the old page versions.
-
-If the retention period in the pageserver is set to be small, it may
-have already garbage collected away the old page versions. That will
-cause read errors in the compute, and can mean that the replica cannot
-make progress with the replication anymore.
-
-There is a mechanism for replica to pass information about its replay
-LSN to the pageserver, so that the pageserver refrains from GC'ing
-data that is still needed by the standby. It's called
-'standby_horizon' in the pageserver code, see
-https://github.com/neondatabase/neon/pull/7368. A separate "lease"
-mechanism also is in the works, where the replica could hold a lease
-on the old LSN, preventing the pageserver from advancing the GC
-horizon past that point. The difference is that the standby_horizon
-mechanism relies on a feedback message from replica to safekeeper,
-while the lease API is exposed directly from the pageserver. A static
-read-only node is not connected to safekeepers, so it cannot use the
-standby_horizon mechanism.
-
-
-### Synchronous replication
-
-We haven't put any effort into synchronous replication yet.
-
-PostgreSQL provides multiple levels of synchronicity. In the weaker
-levels, a transaction is not acknowledged as committed to the client
-in the primary until the WAL has been streamed to a replica or flushed
-to disk there. Those modes don't make sense in Neon, because the
-safekeepers handle durability.
-
-`synchronous_commit=remote_apply` mode would make sense. In that mode,
-the commit is not acknowledged to the client until it has been
-replayed in the replica. That ensures that after commit, you can see
-the commit in the replica too (aka. read-your-write consistency).
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -50,6 +50,7 @@ pub mod defaults {
        DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_HTTP_LISTEN_PORT, DEFAULT_PG_LISTEN_ADDR,
        DEFAULT_PG_LISTEN_PORT,
    };
+    use pageserver_api::models::ImageCompressionAlgorithm;
    pub use storage_broker::DEFAULT_ENDPOINT as BROKER_DEFAULT_ENDPOINT;

    pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "300 s";
@@ -89,7 +90,8 @@ pub mod defaults {

    pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB

-    pub const DEFAULT_IMAGE_COMPRESSION: &str = "zstd(1)";
+    pub const DEFAULT_IMAGE_COMPRESSION: ImageCompressionAlgorithm =
+        ImageCompressionAlgorithm::Disabled;

    pub const DEFAULT_VALIDATE_VECTORED_GET: bool = false;

@@ -476,7 +478,7 @@ impl PageServerConfigBuilder {
            max_vectored_read_bytes: Set(MaxVectoredReadBytes(
                NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(),
            )),
-            image_compression: Set(DEFAULT_IMAGE_COMPRESSION.parse().unwrap()),
+            image_compression: Set(DEFAULT_IMAGE_COMPRESSION),
            ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
            l0_flush: Set(L0FlushConfig::default()),
            compact_level0_phase1_value_access: Set(CompactL0Phase1ValueAccess::default()),
@@ -1063,7 +1065,7 @@ impl PageServerConf {
                NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
                    .expect("Invalid default constant"),
            ),
-            image_compression: defaults::DEFAULT_IMAGE_COMPRESSION.parse().unwrap(),
+            image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
            ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
            l0_flush: L0FlushConfig::default(),
            compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
@@ -1303,7 +1305,7 @@ background_task_maximum_delay = '334 s'
                    NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
                        .expect("Invalid default constant")
                ),
-                image_compression: defaults::DEFAULT_IMAGE_COMPRESSION.parse().unwrap(),
+                image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
                ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
                l0_flush: L0FlushConfig::default(),
                compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
@@ -1376,7 +1378,7 @@ background_task_maximum_delay = '334 s'
                    NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
                        .expect("Invalid default constant")
                ),
-                image_compression: defaults::DEFAULT_IMAGE_COMPRESSION.parse().unwrap(),
+                image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
                ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
                l0_flush: L0FlushConfig::default(),
                compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
--- a/pageserver/src/tenant/secondary/heatmap.rs
+++ b/pageserver/src/tenant/secondary/heatmap.rs
@@ -29,16 +29,16 @@ pub(super) struct HeatMapTenant {
 #[derive(Serialize, Deserialize)]
 pub(crate) struct HeatMapTimeline {
    #[serde_as(as = "DisplayFromStr")]
-    pub(crate) timeline_id: TimelineId,
+    pub(super) timeline_id: TimelineId,

-    pub(crate) layers: Vec<HeatMapLayer>,
+    pub(super) layers: Vec<HeatMapLayer>,
 }

 #[serde_as]
 #[derive(Serialize, Deserialize)]
 pub(crate) struct HeatMapLayer {
-    pub(crate) name: LayerName,
-    pub(crate) metadata: LayerFileMetadata,
+    pub(super) name: LayerName,
+    pub(super) metadata: LayerFileMetadata,

    #[serde_as(as = "TimestampSeconds<i64>")]
    pub(super) access_time: SystemTime,
--- a/pageserver/src/tenant/storage_layer/split_writer.rs
+++ b/pageserver/src/tenant/storage_layer/split_writer.rs
@@ -208,8 +208,6 @@ impl SplitDeltaLayerWriter {

 #[cfg(test)]
 mod tests {
-    use rand::{RngCore, SeedableRng};
-
    use crate::{
        tenant::{
            harness::{TenantHarness, TIMELINE_ID},
@@ -231,10 +229,7 @@ mod tests {
    }

    fn get_large_img() -> Bytes {
-        let mut rng = rand::rngs::SmallRng::seed_from_u64(42);
-        let mut data = vec![0; 8192];
-        rng.fill_bytes(&mut data);
-        data.into()
+        vec![0; 8192].into()
    }

    #[tokio::test]
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -2977,7 +2977,11 @@ impl Timeline {
                LayerVisibilityHint::Visible => {
                    // Layer is visible to one or more read LSNs: elegible for inclusion in layer map
                    let last_activity_ts = layer.latest_activity();
-                    Some((layer.layer_desc(), layer.metadata(), last_activity_ts))
+                    Some(HeatMapLayer::new(
+                        layer.layer_desc().layer_name(),
+                        layer.metadata(),
+                        last_activity_ts,
+                    ))
                }
                LayerVisibilityHint::Covered => {
                    // Layer is resident but unlikely to be read: not elegible for inclusion in heatmap.
@@ -2986,23 +2990,7 @@ impl Timeline {
            }
        });

-        let mut layers = resident.collect::<Vec<_>>();
-
-        // Sort layers in order of which to download first.  For a large set of layers to download, we
-        // want to prioritize those layers which are most likely to still be in the resident many minutes
-        // or hours later:
-        // - Download L0s last, because they churn the fastest: L0s on a fast-writing tenant might
-        //   only exist for a few minutes before being compacted into L1s.
-        // - For L1 & image layers, download most recent LSNs first: the older the LSN, the sooner
-        //   the layer is likely to be covered by an image layer during compaction.
-        layers.sort_by_key(|(desc, _meta, _atime)| {
-            std::cmp::Reverse((!LayerMap::is_l0(&desc.key_range), desc.lsn_range.end))
-        });
-
-        let layers = layers
-            .into_iter()
-            .map(|(desc, meta, atime)| HeatMapLayer::new(desc.layer_name(), meta, atime))
-            .collect();
+        let layers = resident.collect();

        Some(HeatMapTimeline::new(self.timeline_id, layers))
    }
@@ -4528,7 +4516,6 @@ impl DurationRecorder {
 /// the layer descriptor requires the user to provide the ranges, which should cover all
 /// keys specified in the `data` field.
 #[cfg(test)]
-#[derive(Clone)]
 pub struct DeltaLayerTestDesc {
    pub lsn_range: Range<Lsn>,
    pub key_range: Range<Key>,
@@ -4558,13 +4545,6 @@ impl DeltaLayerTestDesc {
            data,
        }
    }
-
-    pub(crate) fn layer_name(&self) -> LayerName {
-        LayerName::Delta(super::storage_layer::DeltaLayerName {
-            key_range: self.key_range.clone(),
-            lsn_range: self.lsn_range.clone(),
-        })
-    }
 }

 impl Timeline {
@@ -5788,110 +5768,12 @@ fn is_send() {

 #[cfg(test)]
 mod tests {
-    use pageserver_api::key::Key;
    use utils::{id::TimelineId, lsn::Lsn};

-    use crate::{
-        repository::Value,
-        tenant::{
-            harness::{test_img, TenantHarness},
-            layer_map::LayerMap,
-            storage_layer::{Layer, LayerName},
-            timeline::{DeltaLayerTestDesc, EvictionError},
-            Timeline,
-        },
+    use crate::tenant::{
+        harness::TenantHarness, storage_layer::Layer, timeline::EvictionError, Timeline,
    };

-    #[tokio::test]
-    async fn test_heatmap_generation() {
-        let harness = TenantHarness::create("heatmap_generation").await.unwrap();
-
-        let covered_delta = DeltaLayerTestDesc::new_with_inferred_key_range(
-            Lsn(0x10)..Lsn(0x20),
-            vec![(
-                Key::from_hex("620000000033333333444444445500000000").unwrap(),
-                Lsn(0x11),
-                Value::Image(test_img("foo")),
-            )],
-        );
-        let visible_delta = DeltaLayerTestDesc::new_with_inferred_key_range(
-            Lsn(0x10)..Lsn(0x20),
-            vec![(
-                Key::from_hex("720000000033333333444444445500000000").unwrap(),
-                Lsn(0x11),
-                Value::Image(test_img("foo")),
-            )],
-        );
-        let l0_delta = DeltaLayerTestDesc::new(
-            Lsn(0x20)..Lsn(0x30),
-            Key::from_hex("000000000000000000000000000000000000").unwrap()
-                ..Key::from_hex("FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF").unwrap(),
-            vec![(
-                Key::from_hex("720000000033333333444444445500000000").unwrap(),
-                Lsn(0x25),
-                Value::Image(test_img("foo")),
-            )],
-        );
-        let delta_layers = vec![
-            covered_delta.clone(),
-            visible_delta.clone(),
-            l0_delta.clone(),
-        ];
-
-        let image_layer = (
-            Lsn(0x40),
-            vec![(
-                Key::from_hex("620000000033333333444444445500000000").unwrap(),
-                test_img("bar"),
-            )],
-        );
-        let image_layers = vec![image_layer];
-
-        let (tenant, ctx) = harness.load().await;
-        let timeline = tenant
-            .create_test_timeline_with_layers(
-                TimelineId::generate(),
-                Lsn(0x10),
-                14,
-                &ctx,
-                delta_layers,
-                image_layers,
-                Lsn(0x100),
-            )
-            .await
-            .unwrap();
-
-        // Layer visibility is an input to heatmap generation, so refresh it first
-        timeline.update_layer_visibility().await.unwrap();
-
-        let heatmap = timeline
-            .generate_heatmap()
-            .await
-            .expect("Infallible while timeline is not shut down");
-
-        assert_eq!(heatmap.timeline_id, timeline.timeline_id);
-
-        // L0 should come last
-        assert_eq!(heatmap.layers.last().unwrap().name, l0_delta.layer_name());
-
-        let mut last_lsn = Lsn::MAX;
-        for layer in heatmap.layers {
-            // Covered layer should be omitted
-            assert!(layer.name != covered_delta.layer_name());
-
-            let layer_lsn = match &layer.name {
-                LayerName::Delta(d) => d.lsn_range.end,
-                LayerName::Image(i) => i.lsn,
-            };
-
-            // Apart from L0s, newest Layers should come first
-            if !LayerMap::is_l0(layer.name.key_range()) {
-                assert!(layer_lsn <= last_lsn);
-                last_lsn = layer_lsn;
-            }
-        }
-    }
-
    #[tokio::test]
    async fn two_layer_eviction_attempts_at_the_same_time() {
        let harness = TenantHarness::create("two_layer_eviction_attempts_at_the_same_time")
--- a/pgxn/neon/file_cache.c
+++ b/pgxn/neon/file_cache.c
@@ -41,8 +41,6 @@

 #include "hll.h"

-#define CriticalAssert(cond) do if (!(cond)) elog(PANIC, "Assertion %s failed at %s:%d: ", #cond, __FILE__, __LINE__); while (0)
-
 /*
 * Local file cache is used to temporary store relations pages in local file system.
 * All blocks of all relations are stored inside one file and addressed using shared hash map.
@@ -53,43 +51,19 @@
 *
 * Cache is always reconstructed at node startup, so we do not need to save mapping somewhere and worry about
 * its consistency.
-
- *
- * ## Holes
- *
- * The LFC can be resized on the fly, up to a maximum size that's determined
- * at server startup (neon.max_file_cache_size). After server startup, we
- * expand the underlying file when needed, until it reaches the soft limit
- * (neon.file_cache_size_limit). If the soft limit is later reduced, we shrink
- * the LFC by punching holes in the underlying file with a
- * fallocate(FALLOC_FL_PUNCH_HOLE) call. The nominal size of the file doesn't
- * shrink, but the disk space it uses does.
- *
- * Each hole is tracked by a dummy FileCacheEntry, which are kept in the
- * 'holes' linked list. They are entered into the chunk hash table, with a
- * special key where the blockNumber is used to store the 'offset' of the
- * hole, and all other fields are zero. Holes are never looked up in the hash
- * table, we only enter them there to have a FileCacheEntry that we can keep
- * in the linked list. If the soft limit is raised again, we reuse the holes
- * before extending the nominal size of the file.
 */

 /* Local file storage allocation chunk.
- * Should be power of two. Using larger than page chunks can
+ * Should be power of two and not less than 32. Using larger than page chunks can
 * 1. Reduce hash-map memory footprint: 8TB database contains billion pages
 *    and size of hash entry is 40 bytes, so we need 40Gb just for hash map.
 *    1Mb chunks can reduce hash map size to 320Mb.
 * 2. Improve access locality, subsequent pages will be allocated together improving seqscan speed
 */
 #define BLOCKS_PER_CHUNK	128 /* 1Mb chunk */
-/*
- * Smaller chunk seems to be better for OLTP workload
- */
-// #define BLOCKS_PER_CHUNK	8 /* 64kb chunk */
 #define MB					((uint64)1024*1024)

 #define SIZE_MB_TO_CHUNKS(size) ((uint32)((size) * MB / BLCKSZ / BLOCKS_PER_CHUNK))
-#define CHUNK_BITMAP_SIZE ((BLOCKS_PER_CHUNK + 31) / 32)

 typedef struct FileCacheEntry
 {
@@ -97,8 +71,8 @@ typedef struct FileCacheEntry
 	uint32		hash;
 	uint32		offset;
 	uint32		access_count;
-	uint32		bitmap[CHUNK_BITMAP_SIZE];
-	dlist_node	list_node;		/* LRU/holes list node */
+	uint32		bitmap[BLOCKS_PER_CHUNK / 32];
+	dlist_node	lru_node;		/* LRU list node */
 } FileCacheEntry;

 typedef struct FileCacheControl
@@ -113,7 +87,6 @@ typedef struct FileCacheControl
 	uint64		writes;
 	dlist_head	lru;			/* double linked list for LRU replacement
 								 * algorithm */
-	dlist_head  holes;          /* double linked list of punched holes */
 	HyperLogLogState wss_estimation; /* estimation of working set size */
 } FileCacheControl;

@@ -162,7 +135,6 @@ lfc_disable(char const *op)
 		lfc_ctl->used = 0;
 		lfc_ctl->limit = 0;
 		dlist_init(&lfc_ctl->lru);
-		dlist_init(&lfc_ctl->holes);

 		if (lfc_desc > 0)
 		{
@@ -242,18 +214,18 @@ lfc_shmem_startup(void)
 	if (!found)
 	{
 		int			fd;
-		uint32		n_chunks = SIZE_MB_TO_CHUNKS(lfc_max_size);
+		uint32		lfc_size = SIZE_MB_TO_CHUNKS(lfc_max_size);

 		lfc_lock = (LWLockId) GetNamedLWLockTranche("lfc_lock");
 		info.keysize = sizeof(BufferTag);
 		info.entrysize = sizeof(FileCacheEntry);

 		/*
-		 * n_chunks+1 because we add new element to hash table before eviction
+		 * lfc_size+1 because we add new element to hash table before eviction
 		 * of victim
 		 */
 		lfc_hash = ShmemInitHash("lfc_hash",
-								 n_chunks + 1, n_chunks + 1,
+								 lfc_size + 1, lfc_size + 1,
 								 &info,
 								 HASH_ELEM | HASH_BLOBS);
 		lfc_ctl->generation = 0;
@@ -263,7 +235,6 @@ lfc_shmem_startup(void)
 		lfc_ctl->misses = 0;
 		lfc_ctl->writes = 0;
 		dlist_init(&lfc_ctl->lru);
-		dlist_init(&lfc_ctl->holes);

 		/* Initialize hyper-log-log structure for estimating working set size */
 		initSHLL(&lfc_ctl->wss_estimation);
@@ -339,31 +310,14 @@ lfc_change_limit_hook(int newval, void *extra)
 		 * Shrink cache by throwing away least recently accessed chunks and
 		 * returning their space to file system
 		 */
-		FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->lru));
-		FileCacheEntry *hole;
-		uint32		offset = victim->offset;
-		uint32		hash;
-		bool		found;
-		BufferTag	holetag;
+		FileCacheEntry *victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru));

-		CriticalAssert(victim->access_count == 0);
+		Assert(victim->access_count == 0);
 #ifdef FALLOC_FL_PUNCH_HOLE
 		if (fallocate(lfc_desc, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, (off_t) victim->offset * BLOCKS_PER_CHUNK * BLCKSZ, BLOCKS_PER_CHUNK * BLCKSZ) < 0)
 			neon_log(LOG, "Failed to punch hole in file: %m");
 #endif
-		/* We remove the old entry, and re-enter a hole to the hash table */
 		hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL);
-
-		memset(&holetag, 0, sizeof(holetag));
-		holetag.blockNum = offset;
-		hash = get_hash_value(lfc_hash, &holetag);
-		hole = hash_search_with_hash_value(lfc_hash, &holetag, hash, HASH_ENTER, &found);
-		hole->hash = hash;
-		hole->offset = offset;
-		hole->access_count = 0;
-		CriticalAssert(!found);
-		dlist_push_tail(&lfc_ctl->holes, &hole->list_node);
-
 		lfc_ctl->used -= 1;
 	}
 	lfc_ctl->limit = new_size;
@@ -455,8 +409,6 @@ lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
 	CopyNRelFileInfoToBufTag(tag, rinfo);
 	tag.forkNum = forkNum;
 	tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1);
-
-	CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);
 	hash = get_hash_value(lfc_hash, &tag);

 	LWLockAcquire(lfc_lock, LW_SHARED);
@@ -488,7 +440,6 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
 	tag.forkNum = forkNum;
 	tag.blockNum = (blkno & ~(BLOCKS_PER_CHUNK - 1));

-	CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);
 	hash = get_hash_value(lfc_hash, &tag);

 	LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
@@ -519,7 +470,7 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
 	{
 		bool		has_remaining_pages;

-		for (int i = 0; i < CHUNK_BITMAP_SIZE; i++)
+		for (int i = 0; i < (BLOCKS_PER_CHUNK / 32); i++)
 		{
 			if (entry->bitmap[i] != 0)
 			{
@@ -534,8 +485,8 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
 		 */
 		if (!has_remaining_pages)
 		{
-			dlist_delete(&entry->list_node);
-			dlist_push_head(&lfc_ctl->lru, &entry->list_node);
+			dlist_delete(&entry->lru_node);
+			dlist_push_head(&lfc_ctl->lru, &entry->lru_node);
 		}
 	}

@@ -574,8 +525,6 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 	CopyNRelFileInfoToBufTag(tag, rinfo);
 	tag.forkNum = forkNum;
 	tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1);
-
-	CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);
 	hash = get_hash_value(lfc_hash, &tag);

 	LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
@@ -602,7 +551,7 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 	}
 	/* Unlink entry from LRU list to pin it for the duration of IO operation */
 	if (entry->access_count++ == 0)
-		dlist_delete(&entry->list_node);
+		dlist_delete(&entry->lru_node);
 	generation = lfc_ctl->generation;
 	entry_offset = entry->offset;

@@ -620,12 +569,12 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,

 	if (lfc_ctl->generation == generation)
 	{
-		CriticalAssert(LFC_ENABLED());
+		Assert(LFC_ENABLED());
 		lfc_ctl->hits += 1;
 		pgBufferUsage.file_cache.hits += 1;
-		CriticalAssert(entry->access_count > 0);
+		Assert(entry->access_count > 0);
 		if (--entry->access_count == 0)
-			dlist_push_tail(&lfc_ctl->lru, &entry->list_node);
+			dlist_push_tail(&lfc_ctl->lru, &entry->lru_node);
 	}
 	else
 		result = false;
@@ -664,8 +613,6 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void
 	tag.forkNum = forkNum;
 	tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1);
 	CopyNRelFileInfoToBufTag(tag, rinfo);
-
-	CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);
 	hash = get_hash_value(lfc_hash, &tag);

 	LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
@@ -685,7 +632,7 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void
 		 * operation
 		 */
 		if (entry->access_count++ == 0)
-			dlist_delete(&entry->list_node);
+			dlist_delete(&entry->lru_node);
 	}
 	else
 	{
@@ -708,26 +655,13 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void
 		if (lfc_ctl->used >= lfc_ctl->limit && !dlist_is_empty(&lfc_ctl->lru))
 		{
 			/* Cache overflow: evict least recently used chunk */
-			FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->lru));
+			FileCacheEntry *victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru));

-			CriticalAssert(victim->access_count == 0);
+			Assert(victim->access_count == 0);
 			entry->offset = victim->offset; /* grab victim's chunk */
 			hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL);
 			neon_log(DEBUG2, "Swap file cache page");
 		}
-		else if (!dlist_is_empty(&lfc_ctl->holes))
-		{
-			/* We can reuse a hole that was left behind when the LFC was shrunk previously */
-			FileCacheEntry *hole = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->holes));
-			uint32		offset = hole->offset;
-			bool		found;
-
-			hash_search_with_hash_value(lfc_hash, &hole->key, hole->hash, HASH_REMOVE, &found);
-			CriticalAssert(found);
-
-			lfc_ctl->used += 1;
-			entry->offset = offset;	/* reuse the hole */
-		}
 		else
 		{
 			lfc_ctl->used += 1;
@@ -755,11 +689,11 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void

 		if (lfc_ctl->generation == generation)
 		{
-			CriticalAssert(LFC_ENABLED());
+			Assert(LFC_ENABLED());
 			/* Place entry to the head of LRU list */
-			CriticalAssert(entry->access_count > 0);
+			Assert(entry->access_count > 0);
 			if (--entry->access_count == 0)
-				dlist_push_tail(&lfc_ctl->lru, &entry->list_node);
+				dlist_push_tail(&lfc_ctl->lru, &entry->lru_node);

 			entry->bitmap[chunk_offs >> 5] |= (1 << (chunk_offs & 31));
 		}
@@ -774,6 +708,7 @@ typedef struct
 } NeonGetStatsCtx;

 #define NUM_NEON_GET_STATS_COLS	2
+#define NUM_NEON_GET_STATS_ROWS	3

 PG_FUNCTION_INFO_V1(neon_get_lfc_stats);
 Datum
@@ -809,6 +744,7 @@ neon_get_lfc_stats(PG_FUNCTION_ARGS)
 						   INT8OID, -1, 0);

 		fctx->tupdesc = BlessTupleDesc(tupledesc);
+		funcctx->max_calls = NUM_NEON_GET_STATS_ROWS;
 		funcctx->user_fctx = fctx;

 		/* Return to original context when allocating transient memory */
@@ -842,11 +778,6 @@ neon_get_lfc_stats(PG_FUNCTION_ARGS)
 			if (lfc_ctl)
 				value = lfc_ctl->writes;
 			break;
-		case 4:
-			key = "file_cache_size";
-			if (lfc_ctl)
-				value = lfc_ctl->size;
-			break;
 		default:
 			SRF_RETURN_DONE(funcctx);
 	}
@@ -970,7 +901,7 @@ local_cache_pages(PG_FUNCTION_ARGS)
 				hash_seq_init(&status, lfc_hash);
 				while ((entry = hash_seq_search(&status)) != NULL)
 				{
-					for (int i = 0; i < CHUNK_BITMAP_SIZE; i++)
+					for (int i = 0; i < BLOCKS_PER_CHUNK / 32; i++)
 						n_pages += pg_popcount32(entry->bitmap[i]);
 				}
 			}
--- a/pgxn/neon/neon_pgversioncompat.h
+++ b/pgxn/neon/neon_pgversioncompat.h
@@ -54,10 +54,6 @@

 #define BufTagGetNRelFileInfo(tag) tag.rnode

-#define BufTagGetRelNumber(tagp) ((tagp)->rnode.relNode)
-
-#define InvalidRelFileNumber InvalidOid
-
 #define SMgrRelGetRelInfo(reln) \
 	(reln->smgr_rnode.node)

--- a/proxy/src/cancellation.rs
+++ b/proxy/src/cancellation.rs
@@ -151,21 +151,34 @@ impl<P: CancellationPublisherMut> CancellationHandler<Option<Arc<Mutex<P>>>> {
 #[derive(Clone)]
 pub struct CancelClosure {
    socket_addr: SocketAddr,
-    cancel_token: CancelToken,
+    cancel_token: Option<CancelToken>,
 }

 impl CancelClosure {
    pub fn new(socket_addr: SocketAddr, cancel_token: CancelToken) -> Self {
        Self {
            socket_addr,
-            cancel_token,
+            cancel_token: Some(cancel_token),
        }
    }
+
+    #[cfg(test)]
+    pub fn test() -> Self {
+        use std::net::{Ipv4Addr, SocketAddrV4};
+
+        Self {
+            socket_addr: SocketAddr::V4(SocketAddrV4::new(Ipv4Addr::from_bits(0), 0)),
+            cancel_token: None,
+        }
+    }
+
    /// Cancels the query running on user's compute node.
    pub async fn try_cancel_query(self) -> Result<(), CancelError> {
-        let socket = TcpStream::connect(self.socket_addr).await?;
-        self.cancel_token.cancel_query_raw(socket, NoTls).await?;
-        info!("query was cancelled");
+        if let Some(cancel_token) = self.cancel_token {
+            let socket = TcpStream::connect(self.socket_addr).await?;
+            cancel_token.cancel_query_raw(socket, NoTls).await?;
+            info!("query was cancelled");
+        }
        Ok(())
    }
 }
--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -16,8 +16,10 @@ use rustls::{client::danger::ServerCertVerifier, pki_types::InvalidDnsNameError}
 use std::{io, net::SocketAddr, sync::Arc, time::Duration};
 use thiserror::Error;
 use tokio::net::TcpStream;
-use tokio_postgres::tls::MakeTlsConnect;
-use tokio_postgres_rustls::MakeRustlsConnect;
+use tokio_postgres::{
+    tls::{MakeTlsConnect, NoTlsError},
+    Client, Connection,
+};
 use tracing::{error, info, warn};

 const COULD_NOT_CONNECT: &str = "Couldn't connect to compute node";
@@ -42,6 +44,12 @@ pub enum ConnectionError {
    TooManyConnectionAttempts(#[from] ApiLockError),
 }

+impl From<NoTlsError> for ConnectionError {
+    fn from(value: NoTlsError) -> Self {
+        Self::CouldNotConnect(io::Error::new(io::ErrorKind::Other, value.to_string()))
+    }
+}
+
 impl UserFacingError for ConnectionError {
    fn to_string_client(&self) -> String {
        use ConnectionError::*;
@@ -273,6 +281,30 @@ pub struct PostgresConnection {
 }

 impl ConnCfg {
+    /// Connect to the corresponding compute node using the supplied TLS connector (`mktls`).
+    pub async fn managed_connect<M: MakeTlsConnect<tokio::net::TcpStream>>(
+        &self,
+        ctx: &RequestMonitoring,
+        timeout: Duration,
+        mktls: &mut M,
+    ) -> Result<(SocketAddr, Client, Connection<TcpStream, M::Stream>), ConnectionError>
+    where
+        ConnectionError: From<M::Error>,
+    {
+        let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
+        let (socket_addr, stream, host) = self.connect_raw(timeout).await?;
+        drop(pause);
+
+        let tls = mktls.make_tls_connect(host)?;
+
+        // connect_raw() will not use TLS if sslmode is "disable"
+        let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
+        let (client, connection) = self.0.connect_raw(stream, tls).await?;
+        drop(pause);
+
+        Ok((socket_addr, client, connection))
+    }
+
    /// Connect to a corresponding compute node.
    pub async fn connect(
        &self,
@@ -281,10 +313,6 @@ impl ConnCfg {
        aux: MetricsAuxInfo,
        timeout: Duration,
    ) -> Result<PostgresConnection, ConnectionError> {
-        let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
-        let (socket_addr, stream, host) = self.connect_raw(timeout).await?;
-        drop(pause);
-
        let client_config = if allow_self_signed_compute {
            // Allow all certificates for creating the connection
            let verifier = Arc::new(AcceptEverythingVerifier) as Arc<dyn ServerCertVerifier>;
@@ -298,21 +326,15 @@ impl ConnCfg {
        let client_config = client_config.with_no_client_auth();

        let mut mk_tls = tokio_postgres_rustls::MakeRustlsConnect::new(client_config);
-        let tls = <MakeRustlsConnect as MakeTlsConnect<tokio::net::TcpStream>>::make_tls_connect(
-            &mut mk_tls,
-            host,
-        )?;

-        // connect_raw() will not use TLS if sslmode is "disable"
-        let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
-        let (client, connection) = self.0.connect_raw(stream, tls).await?;
-        drop(pause);
+        let (socket_addr, client, connection) =
+            self.managed_connect(ctx, timeout, &mut mk_tls).await?;
        tracing::Span::current().record("pid", tracing::field::display(client.get_process_id()));
        let stream = connection.stream.into_inner();

        info!(
            cold_start_info = ctx.cold_start_info().as_str(),
-            "connected to compute node at {host} ({socket_addr}) sslmode={:?}",
+            "connected to compute node ({socket_addr}) sslmode={:?}",
            self.0.get_ssl_mode()
        );

--- a/proxy/src/serverless/backend.rs
+++ b/proxy/src/serverless/backend.rs
@@ -5,7 +5,8 @@ use tracing::{field::display, info};

 use crate::{
    auth::{backend::ComputeCredentials, check_peer_addr_is_in_list, AuthError},
-    compute,
+    cancellation::CancelClosure,
+    compute::{self, ConnectionError},
    config::{AuthenticationConfig, ProxyConfig},
    console::{
        errors::{GetAuthInfoError, WakeComputeError},
@@ -142,7 +143,7 @@ pub enum HttpConnError {
    #[error("pooled connection closed at inconsistent state")]
    ConnectionClosedAbruptly(#[from] tokio::sync::watch::error::SendError<uuid::Uuid>),
    #[error("could not connection to compute")]
-    ConnectionError(#[from] tokio_postgres::Error),
+    ConnectionError(#[from] ConnectionError),

    #[error("could not get auth info")]
    GetAuthInfo(#[from] GetAuthInfoError),
@@ -229,17 +230,16 @@ impl ConnectMechanism for TokioMechanism {
        let host = node_info.config.get_host()?;
        let permit = self.locks.get_permit(&host).await?;

-        let mut config = (*node_info.config).clone();
-        let config = config
-            .user(&self.conn_info.user_info.user)
-            .password(&*self.conn_info.password)
-            .dbname(&self.conn_info.dbname)
-            .connect_timeout(timeout);
+        let (socket_addr, client, connection) = permit.release_result(
+            node_info
+                .config
+                .managed_connect(ctx, timeout, &mut tokio_postgres::NoTls)
+                .await,
+        )?;

-        let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
-        let res = config.connect(tokio_postgres::NoTls).await;
-        drop(pause);
-        let (client, connection) = permit.release_result(res)?;
+        // NB: CancelToken is supposed to hold socket_addr, but we use connect_raw.
+        // Yet another reason to rework the connection establishing code.
+        let cancel_closure = CancelClosure::new(socket_addr, client.cancel_token());

        tracing::Span::current().record("pid", tracing::field::display(client.get_process_id()));
        Ok(poll_client(
@@ -250,8 +250,14 @@ impl ConnectMechanism for TokioMechanism {
            connection,
            self.conn_id,
            node_info.aux.clone(),
+            cancel_closure,
        ))
    }

-    fn update_connect_config(&self, _config: &mut compute::ConnCfg) {}
+    fn update_connect_config(&self, config: &mut compute::ConnCfg) {
+        config
+            .user(&self.conn_info.user_info.user)
+            .dbname(&self.conn_info.dbname)
+            .password(&self.conn_info.password);
+    }
 }
--- a/proxy/src/serverless/conn_pool.rs
+++ b/proxy/src/serverless/conn_pool.rs
@@ -12,11 +12,13 @@ use std::{
    ops::Deref,
    sync::atomic::{self, AtomicUsize},
 };
+use tokio::net::TcpStream;
 use tokio::time::Instant;
 use tokio_postgres::tls::NoTlsStream;
-use tokio_postgres::{AsyncMessage, ReadyForQueryStatus, Socket};
+use tokio_postgres::{AsyncMessage, ReadyForQueryStatus};
 use tokio_util::sync::CancellationToken;

+use crate::cancellation::CancelClosure;
 use crate::console::messages::{ColdStartInfo, MetricsAuxInfo};
 use crate::metrics::{HttpEndpointPoolsGuard, Metrics};
 use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS};
@@ -463,14 +465,16 @@ impl<C: ClientInnerExt> GlobalConnPool<C> {
    }
 }

+#[allow(clippy::too_many_arguments)]
 pub fn poll_client<C: ClientInnerExt>(
    global_pool: Arc<GlobalConnPool<C>>,
    ctx: &RequestMonitoring,
    conn_info: ConnInfo,
    client: C,
-    mut connection: tokio_postgres::Connection<Socket, NoTlsStream>,
+    mut connection: tokio_postgres::Connection<TcpStream, NoTlsStream>,
    conn_id: uuid::Uuid,
    aux: MetricsAuxInfo,
+    cancel_closure: CancelClosure,
 ) -> Client<C> {
    let conn_gauge = Metrics::get().proxy.db_connections.guard(ctx.protocol());
    let mut session_id = ctx.session_id();
@@ -572,6 +576,7 @@ pub fn poll_client<C: ClientInnerExt>(
        cancel,
        aux,
        conn_id,
+        cancel_closure,
    };
    Client::new(inner, conn_info, pool_clone)
 }
@@ -582,6 +587,7 @@ struct ClientInner<C: ClientInnerExt> {
    cancel: CancellationToken,
    aux: MetricsAuxInfo,
    conn_id: uuid::Uuid,
+    cancel_closure: CancelClosure,
 }

 impl<C: ClientInnerExt> Drop for ClientInner<C> {
@@ -646,7 +652,7 @@ impl<C: ClientInnerExt> Client<C> {
            pool,
        }
    }
-    pub fn inner(&mut self) -> (&mut C, Discard<'_, C>) {
+    pub fn inner(&mut self) -> (&mut C, &CancelClosure, Discard<'_, C>) {
        let Self {
            inner,
            pool,
@@ -654,7 +660,11 @@ impl<C: ClientInnerExt> Client<C> {
            span: _,
        } = self;
        let inner = inner.as_mut().expect("client inner should not be removed");
-        (&mut inner.inner, Discard { pool, conn_info })
+        (
+            &mut inner.inner,
+            &inner.cancel_closure,
+            Discard { pool, conn_info },
+        )
    }
 }

@@ -751,6 +761,7 @@ mod tests {
                cold_start_info: crate::console::messages::ColdStartInfo::Warm,
            },
            conn_id: uuid::Uuid::new_v4(),
+            cancel_closure: CancelClosure::test(),
        }
    }

@@ -785,7 +796,7 @@ mod tests {
        {
            let mut client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone());
            assert_eq!(0, pool.get_global_connections_count());
-            client.inner().1.discard();
+            client.inner().2.discard();
            // Discard should not add the connection from the pool.
            assert_eq!(0, pool.get_global_connections_count());
        }
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -26,7 +26,6 @@ use tokio_postgres::error::ErrorPosition;
 use tokio_postgres::error::SqlState;
 use tokio_postgres::GenericClient;
 use tokio_postgres::IsolationLevel;
-use tokio_postgres::NoTls;
 use tokio_postgres::ReadyForQueryStatus;
 use tokio_postgres::Transaction;
 use tokio_util::sync::CancellationToken;
@@ -261,7 +260,9 @@ pub async fn handle(

            let mut message = e.to_string_client();
            let db_error = match &e {
-                SqlOverHttpError::ConnectCompute(HttpConnError::ConnectionError(e))
+                SqlOverHttpError::ConnectCompute(HttpConnError::ConnectionError(
+                    crate::compute::ConnectionError::Postgres(e),
+                ))
                | SqlOverHttpError::Postgres(e) => e.as_db_error(),
                _ => None,
            };
@@ -622,8 +623,7 @@ impl QueryData {
        client: &mut Client<tokio_postgres::Client>,
        parsed_headers: HttpHeaders,
    ) -> Result<String, SqlOverHttpError> {
-        let (inner, mut discard) = client.inner();
-        let cancel_token = inner.cancel_token();
+        let (inner, cancel_token, mut discard) = client.inner();

        let res = match select(
            pin!(query_to_json(&*inner, self, &mut 0, parsed_headers)),
@@ -647,7 +647,7 @@ impl QueryData {
            // The query was cancelled.
            Either::Right((_cancelled, query)) => {
                tracing::info!("cancelling query");
-                if let Err(err) = cancel_token.cancel_query(NoTls).await {
+                if let Err(err) = cancel_token.clone().try_cancel_query().await {
                    tracing::error!(?err, "could not cancel query");
                }
                // wait for the query cancellation
@@ -663,7 +663,9 @@ impl QueryData {
                    // query failed or was cancelled.
                    Ok(Err(error)) => {
                        let db_error = match &error {
-                            SqlOverHttpError::ConnectCompute(HttpConnError::ConnectionError(e))
+                            SqlOverHttpError::ConnectCompute(HttpConnError::ConnectionError(
+                                crate::compute::ConnectionError::Postgres(e),
+                            ))
                            | SqlOverHttpError::Postgres(e) => e.as_db_error(),
                            _ => None,
                        };
@@ -694,8 +696,7 @@ impl BatchQueryData {
        parsed_headers: HttpHeaders,
    ) -> Result<String, SqlOverHttpError> {
        info!("starting transaction");
-        let (inner, mut discard) = client.inner();
-        let cancel_token = inner.cancel_token();
+        let (inner, cancel_token, mut discard) = client.inner();
        let mut builder = inner.build_transaction();
        if let Some(isolation_level) = parsed_headers.txn_isolation_level {
            builder = builder.isolation_level(isolation_level);
@@ -728,7 +729,7 @@ impl BatchQueryData {
                    json_output
                }
                Err(SqlOverHttpError::Cancelled(_)) => {
-                    if let Err(err) = cancel_token.cancel_query(NoTls).await {
+                    if let Err(err) = cancel_token.clone().try_cancel_query().await {
                        tracing::error!(?err, "could not cancel query");
                    }
                    // TODO: after cancelling, wait to see if we can get a status. maybe the connection is still safe.
--- a/safekeeper/src/safekeeper.rs
+++ b/safekeeper/src/safekeeper.rs
@@ -92,7 +92,7 @@ impl TermHistory {
    }

    /// Find point of divergence between leader (walproposer) term history and
-    /// safekeeper. Arguments are not symmetric as proposer history ends at
+    /// safekeeper. Arguments are not symmetric as proposer history ends at
    /// +infinity while safekeeper at flush_lsn.
    /// C version is at walproposer SendProposerElected.
    pub fn find_highest_common_point(
@@ -701,13 +701,7 @@ where
            .with_label_values(&["handle_elected"])
            .start_timer();

-        info!(
-            "received ProposerElected {:?}, term={}, last_log_term={}, flush_lsn={}",
-            msg,
-            self.state.acceptor_state.term,
-            self.get_last_log_term(),
-            self.flush_lsn()
-        );
+        info!("received ProposerElected {:?}", msg);
        if self.state.acceptor_state.term < msg.term {
            let mut state = self.state.start_change();
            state.acceptor_state.term = msg.term;
@@ -719,43 +713,22 @@ where
            return Ok(None);
        }

-        // Before truncating WAL check-cross the check divergence point received
-        // from the walproposer.
-        let sk_th = self.get_term_history();
-        let last_common_point = match TermHistory::find_highest_common_point(
-            &msg.term_history,
-            &sk_th,
-            self.flush_lsn(),
-        ) {
-            // No common point. Expect streaming from the beginning of the
-            // history like walproposer while we don't have proper init.
-            None => *msg.term_history.0.first().ok_or(anyhow::anyhow!(
-                "empty walproposer term history {:?}",
-                msg.term_history
-            ))?,
-            Some(lcp) => lcp,
-        };
-        // This is expected to happen in a rare race when another connection
-        // from the same walproposer writes + flushes WAL after this connection
-        // sent flush_lsn in VoteRequest; for instance, very late
-        // ProposerElected message delivery after another connection was
-        // established and wrote WAL. In such cases error is transient;
-        // reconnection makes safekeeper send newest term history and flush_lsn
-        // and walproposer recalculates the streaming point. OTOH repeating
-        // error indicates a serious bug.
-        if last_common_point.lsn != msg.start_streaming_at {
-            bail!("refusing ProposerElected with unexpected truncation point: lcp={:?} start_streaming_at={}, term={}, sk_th={:?} flush_lsn={}, wp_th={:?}",
-                    last_common_point, msg.start_streaming_at,
-                    self.state.acceptor_state.term, sk_th, self.flush_lsn(), msg.term_history,
-            );
+        // This might happen in a rare race when another (old) connection from
+        // the same walproposer writes + flushes WAL after this connection
+        // already sent flush_lsn in VoteRequest. It is generally safe to
+        // proceed, but to prevent commit_lsn surprisingly going down we should
+        // either refuse the session (simpler) or skip the part we already have
+        // from the stream (can be implemented).
+        if msg.term == self.get_last_log_term() && self.flush_lsn() > msg.start_streaming_at {
+            bail!("refusing ProposerElected which is going to overwrite correct WAL: term={}, flush_lsn={}, start_streaming_at={}; restarting the handshake should help",
+                   msg.term, self.flush_lsn(), msg.start_streaming_at)
        }
-
-        // We are also expected to never attempt to truncate committed data.
+        // Otherwise we must never attempt to truncate committed data.
        assert!(
            msg.start_streaming_at >= self.state.inmem.commit_lsn,
-            "attempt to truncate committed data: start_streaming_at={}, commit_lsn={}, term={}, sk_th={:?} flush_lsn={}, wp_th={:?}",
-            msg.start_streaming_at, self.state.inmem.commit_lsn,
-            self.state.acceptor_state.term, sk_th, self.flush_lsn(), msg.term_history,
+            "attempt to truncate committed data: start_streaming_at={}, commit_lsn={}",
+            msg.start_streaming_at,
+            self.state.inmem.commit_lsn
        );

        // Before first WAL write initialize its segment. It makes first segment
@@ -770,6 +743,9 @@ where
                .await?;
        }

+        // TODO: cross-check the divergence point: verify that msg.start_streaming_at corresponds
+        // to the intersection of our term history and the term history from msg.
+
        // truncate wal, update the LSNs
        self.wal_store.truncate_wal(msg.start_streaming_at).await?;

@@ -1093,7 +1069,7 @@ mod tests {

        let pem = ProposerElected {
            term: 1,
-            start_streaming_at: Lsn(3),
+            start_streaming_at: Lsn(1),
            term_history: TermHistory(vec![TermLsn {
                term: 1,
                lsn: Lsn(3),
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -520,19 +520,6 @@ async fn handle_node_status(req: Request<Body>) -> Result<Response<Body>, ApiErr
    json_response(StatusCode::OK, node_status)
 }

-async fn handle_get_leader(req: Request<Body>) -> Result<Response<Body>, ApiError> {
-    check_permissions(&req, Scope::Admin)?;
-
-    let state = get_state(&req);
-    let leader = state.service.get_leader().await.map_err(|err| {
-        ApiError::InternalServerError(anyhow::anyhow!(
-            "Failed to read leader from database: {err}"
-        ))
-    })?;
-
-    json_response(StatusCode::OK, leader)
-}
-
 async fn handle_node_drain(req: Request<Body>) -> Result<Response<Body>, ApiError> {
    check_permissions(&req, Scope::Admin)?;

@@ -1029,9 +1016,6 @@ pub fn make_router(
        .get("/control/v1/node/:node_id", |r| {
            named_request_span(r, handle_node_status, RequestName("control_v1_node_status"))
        })
-        .get("/control/v1/leader", |r| {
-            named_request_span(r, handle_get_leader, RequestName("control_v1_get_leader"))
-        })
        .put("/control/v1/node/:node_id/drain", |r| {
            named_request_span(r, handle_node_drain, RequestName("control_v1_node_drain"))
        })
--- a/storage_controller/src/peer_client.rs
+++ b/storage_controller/src/peer_client.rs
@@ -1,7 +1,7 @@
 use crate::tenant_shard::ObservedState;
 use pageserver_api::shard::TenantShardId;
 use serde::{Deserialize, Serialize};
-use std::{collections::HashMap, time::Duration};
+use std::collections::HashMap;
 use tokio_util::sync::CancellationToken;

 use hyper::Uri;
@@ -69,8 +69,6 @@ impl PeerClient {
            req
        };

-        let req = req.timeout(Duration::from_secs(2));
-
        let res = req
            .send()
            .await
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -20,8 +20,7 @@ use crate::{
    metrics,
    peer_client::{GlobalObservedState, PeerClient},
    persistence::{
-        AbortShardSplitStatus, ControllerPersistence, DatabaseResult, MetadataHealthPersistence,
-        TenantFilter,
+        AbortShardSplitStatus, ControllerPersistence, MetadataHealthPersistence, TenantFilter,
    },
    reconciler::{ReconcileError, ReconcileUnits, ReconcilerConfig, ReconcilerConfigBuilder},
    scheduler::{MaySchedule, ScheduleContext, ScheduleMode},
@@ -490,6 +489,11 @@ pub(crate) enum ReconcileResultRequest {
    Stop,
 }

+struct LeaderStepDownState {
+    observed: GlobalObservedState,
+    leader: ControllerPersistence,
+}
+
 impl Service {
    pub fn get_config(&self) -> &Config {
        &self.config
@@ -500,8 +504,7 @@ impl Service {
    #[instrument(skip_all)]
    async fn startup_reconcile(
        self: &Arc<Service>,
-        current_leader: Option<ControllerPersistence>,
-        leader_step_down_state: Option<GlobalObservedState>,
+        leader_step_down_state: Option<LeaderStepDownState>,
        bg_compute_notify_result_tx: tokio::sync::mpsc::Sender<
            Result<(), (TenantShardId, NotifyError)>,
        >,
@@ -519,15 +522,17 @@ impl Service {
            .checked_add(STARTUP_RECONCILE_TIMEOUT / 2)
            .expect("Reconcile timeout is a modest constant");

-        let observed = if let Some(state) = leader_step_down_state {
+        let (observed, current_leader) = if let Some(state) = leader_step_down_state {
            tracing::info!(
                "Using observed state received from leader at {}",
-                current_leader.as_ref().unwrap().address
+                state.leader.address,
            );
-
-            state
+            (state.observed, Some(state.leader))
        } else {
-            self.build_global_observed_state(node_scan_deadline).await
+            (
+                self.build_global_observed_state(node_scan_deadline).await,
+                None,
+            )
        };

        // Accumulate a list of any tenant locations that ought to be detached
@@ -1377,32 +1382,13 @@ impl Service {
                };

                let leadership_status = this.inner.read().unwrap().get_leadership_status();
-                let leader = match this.get_leader().await {
-                    Ok(ok) => ok,
-                    Err(err) => {
-                        tracing::error!(
-                            "Failed to query database for current leader: {err}. Aborting start-up ..."
-                        );
-                        std::process::exit(1);
-                    }
-                };
-
-                let leader_step_down_state = match leadership_status {
-                    LeadershipStatus::Candidate => {
-                        if let Some(ref leader) = leader {
-                            this.request_step_down(leader).await
-                        } else {
-                            tracing::info!(
-                                "No leader found to request step down from. Will build observed state."
-                            );
-                            None
-                        }
-                    }
+                let peer_observed_state = match leadership_status {
+                    LeadershipStatus::Candidate => this.request_step_down().await,
                    LeadershipStatus::Leader => None,
                    LeadershipStatus::SteppedDown => unreachable!(),
                };

-                this.startup_reconcile(leader, leader_step_down_state, bg_compute_notify_result_tx)
+                this.startup_reconcile(peer_observed_state, bg_compute_notify_result_tx)
                    .await;

                drop(startup_completion);
@@ -4664,10 +4650,6 @@ impl Service {
            ))
    }

-    pub(crate) async fn get_leader(&self) -> DatabaseResult<Option<ControllerPersistence>> {
-        self.persistence.get_leader().await
-    }
-
    pub(crate) async fn node_register(
        &self,
        register_req: NodeRegisterRequest,
@@ -6360,7 +6342,6 @@ impl Service {

    pub(crate) async fn step_down(&self) -> GlobalObservedState {
        tracing::info!("Received step down request from peer");
-        failpoint_support::sleep_millis_async!("sleep-on-step-down-handling");

        self.inner.write().unwrap().step_down();
        // TODO: would it make sense to have a time-out for this?
@@ -6386,31 +6367,50 @@ impl Service {
    ///
    /// On failures to query the database or step down error responses the process is killed
    /// and we rely on k8s to retry.
-    async fn request_step_down(
-        &self,
-        leader: &ControllerPersistence,
-    ) -> Option<GlobalObservedState> {
-        tracing::info!("Sending step down request to {leader:?}");
-
-        // TODO: jwt token
-        let client = PeerClient::new(
-            Uri::try_from(leader.address.as_str()).expect("Failed to build leader URI"),
-            self.config.jwt_token.clone(),
-        );
-        let state = client.step_down(&self.cancel).await;
-        match state {
-            Ok(state) => Some(state),
+    async fn request_step_down(&self) -> Option<LeaderStepDownState> {
+        let leader = match self.persistence.get_leader().await {
+            Ok(leader) => leader,
            Err(err) => {
-                // TODO: Make leaders periodically update a timestamp field in the
-                // database and, if the leader is not reachable from the current instance,
-                // but inferred as alive from the timestamp, abort start-up. This avoids
-                // a potential scenario in which we have two controllers acting as leaders.
                tracing::error!(
-                    "Leader ({}) did not respond to step-down request: {}",
-                    leader.address,
-                    err
+                    "Failed to query database for current leader: {err}. Aborting start-up ..."
                );
+                std::process::exit(1);
+            }
+        };

+        match leader {
+            Some(leader) => {
+                tracing::info!("Sending step down request to {leader:?}");
+
+                // TODO: jwt token
+                let client = PeerClient::new(
+                    Uri::try_from(leader.address.as_str()).expect("Failed to build leader URI"),
+                    self.config.jwt_token.clone(),
+                );
+                let state = client.step_down(&self.cancel).await;
+                match state {
+                    Ok(state) => Some(LeaderStepDownState {
+                        observed: state,
+                        leader: leader.clone(),
+                    }),
+                    Err(err) => {
+                        // TODO: Make leaders periodically update a timestamp field in the
+                        // database and, if the leader is not reachable from the current instance,
+                        // but inferred as alive from the timestamp, abort start-up. This avoids
+                        // a potential scenario in which we have two controllers acting as leaders.
+                        tracing::error!(
+                            "Leader ({}) did not respond to step-down request: {}",
+                            leader.address,
+                            err
+                        );
+                        None
+                    }
+                }
+            }
+            None => {
+                tracing::info!(
+                    "No leader found to request step down from. Will build observed state."
+                );
                None
            }
        }
--- a/storage_scrubber/src/main.rs
+++ b/storage_scrubber/src/main.rs
@@ -3,10 +3,9 @@ use camino::Utf8PathBuf;
 use pageserver_api::controller_api::{MetadataHealthUpdateRequest, MetadataHealthUpdateResponse};
 use pageserver_api::shard::TenantShardId;
 use reqwest::{Method, Url};
-use storage_controller_client::control_api;
 use storage_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode};
 use storage_scrubber::pageserver_physical_gc::GcMode;
-use storage_scrubber::scan_pageserver_metadata::scan_pageserver_metadata;
+use storage_scrubber::scan_pageserver_metadata::scan_metadata;
 use storage_scrubber::tenant_snapshot::SnapshotDownloader;
 use storage_scrubber::{find_large_objects, ControllerClientConfig};
 use storage_scrubber::{
@@ -69,7 +68,7 @@ enum Command {
        #[arg(long = "tenant-id", num_args = 0..)]
        tenant_ids: Vec<TenantShardId>,
        #[arg(long = "post", default_value_t = false)]
-        post_to_storcon: bool,
+        post_to_storage_controller: bool,
        #[arg(long, default_value = None)]
        /// For safekeeper node_kind only, points to db with debug dump
        dump_db_connstr: Option<String>,
@@ -101,16 +100,6 @@ enum Command {
        #[arg(long = "concurrency", short = 'j', default_value_t = 64)]
        concurrency: usize,
    },
-    CronJob {
-        // PageserverPhysicalGc
-        #[arg(long = "min-age")]
-        gc_min_age: humantime::Duration,
-        #[arg(short, long, default_value_t = GcMode::IndicesOnly)]
-        gc_mode: GcMode,
-        // ScanMetadata
-        #[arg(long = "post", default_value_t = false)]
-        post_to_storcon: bool,
-    },
 }

 #[tokio::main]
@@ -128,7 +117,6 @@ async fn main() -> anyhow::Result<()> {
        Command::TenantSnapshot { .. } => "tenant-snapshot",
        Command::PageserverPhysicalGc { .. } => "pageserver-physical-gc",
        Command::FindLargeObjects { .. } => "find-large-objects",
-        Command::CronJob { .. } => "cron-job",
    };
    let _guard = init_logging(&format!(
        "{}_{}_{}_{}.log",
@@ -138,13 +126,12 @@ async fn main() -> anyhow::Result<()> {
        chrono::Utc::now().format("%Y_%m_%d__%H_%M_%S")
    ));

-    let controller_client = cli.controller_api.map(|controller_api| {
+    let controller_client_conf = cli.controller_api.map(|controller_api| {
        ControllerClientConfig {
            controller_api,
            // Default to no key: this is a convenience when working in a development environment
            controller_jwt: cli.controller_jwt.unwrap_or("".to_owned()),
        }
-        .build_client()
    });

    match cli.command {
@@ -152,7 +139,7 @@ async fn main() -> anyhow::Result<()> {
            json,
            tenant_ids,
            node_kind,
-            post_to_storcon,
+            post_to_storage_controller,
            dump_db_connstr,
            dump_db_table,
        } => {
@@ -191,14 +178,53 @@ async fn main() -> anyhow::Result<()> {
                }
                Ok(())
            } else {
-                scan_pageserver_metadata_cmd(
-                    bucket_config,
-                    controller_client.as_ref(),
-                    tenant_ids,
-                    json,
-                    post_to_storcon,
-                )
-                .await
+                if controller_client_conf.is_none() && post_to_storage_controller {
+                    return Err(anyhow!("Posting pageserver scan health status to storage controller requires `--controller-api` and `--controller-jwt` to run"));
+                }
+                match scan_metadata(bucket_config.clone(), tenant_ids).await {
+                    Err(e) => {
+                        tracing::error!("Failed: {e}");
+                        Err(e)
+                    }
+                    Ok(summary) => {
+                        if json {
+                            println!("{}", serde_json::to_string(&summary).unwrap())
+                        } else {
+                            println!("{}", summary.summary_string());
+                        }
+
+                        if post_to_storage_controller {
+                            if let Some(conf) = controller_client_conf {
+                                let controller_client = conf.build_client();
+                                let body = summary.build_health_update_request();
+                                controller_client
+                                    .dispatch::<MetadataHealthUpdateRequest, MetadataHealthUpdateResponse>(
+                                        Method::POST,
+                                        "control/v1/metadata_health/update".to_string(),
+                                        Some(body),
+                                    )
+                                    .await?;
+                            }
+                        }
+
+                        if summary.is_fatal() {
+                            tracing::error!("Fatal scrub errors detected");
+                        } else if summary.is_empty() {
+                            // Strictly speaking an empty bucket is a valid bucket, but if someone ran the
+                            // scrubber they were likely expecting to scan something, and if we see no timelines
+                            // at all then it's likely due to some configuration issues like a bad prefix
+                            tracing::error!(
+                                "No timelines found in bucket {} prefix {}",
+                                bucket_config.bucket,
+                                bucket_config
+                                    .prefix_in_bucket
+                                    .unwrap_or("<none>".to_string())
+                            );
+                        }
+
+                        Ok(())
+                    }
+                }
            }
        }
        Command::FindGarbage {
@@ -228,14 +254,31 @@ async fn main() -> anyhow::Result<()> {
            min_age,
            mode,
        } => {
-            pageserver_physical_gc_cmd(
-                &bucket_config,
-                controller_client.as_ref(),
+            match (&controller_client_conf, mode) {
+                (Some(_), _) => {
+                    // Any mode may run when controller API is set
+                }
+                (None, GcMode::Full) => {
+                    // The part of physical GC where we erase ancestor layers cannot be done safely without
+                    // confirming the most recent complete shard split with the controller.  Refuse to run, rather
+                    // than doing it unsafely.
+                    return Err(anyhow!("Full physical GC requires `--controller-api` and `--controller-jwt` to run"));
+                }
+                (None, GcMode::DryRun | GcMode::IndicesOnly) => {
+                    // These GcModes do not require the controller to run.
+                }
+            }
+
+            let summary = pageserver_physical_gc(
+                bucket_config,
+                controller_client_conf,
                tenant_ids,
-                min_age,
+                min_age.into(),
                mode,
            )
-            .await
+            .await?;
+            println!("{}", serde_json::to_string(&summary).unwrap());
+            Ok(())
        }
        Command::FindLargeObjects {
            min_size,
@@ -252,142 +295,5 @@ async fn main() -> anyhow::Result<()> {
            println!("{}", serde_json::to_string(&summary).unwrap());
            Ok(())
        }
-        Command::CronJob {
-            gc_min_age,
-            gc_mode,
-            post_to_storcon,
-        } => {
-            run_cron_job(
-                bucket_config,
-                controller_client.as_ref(),
-                gc_min_age,
-                gc_mode,
-                post_to_storcon,
-            )
-            .await
-        }
-    }
-}
-
-/// Runs the scrubber cron job.
-/// 1. Do pageserver physical gc
-/// 2. Scan pageserver metadata
-pub async fn run_cron_job(
-    bucket_config: BucketConfig,
-    controller_client: Option<&control_api::Client>,
-    gc_min_age: humantime::Duration,
-    gc_mode: GcMode,
-    post_to_storcon: bool,
-) -> anyhow::Result<()> {
-    tracing::info!(%gc_min_age, %gc_mode, "Running pageserver-physical-gc");
-    pageserver_physical_gc_cmd(
-        &bucket_config,
-        controller_client,
-        Vec::new(),
-        gc_min_age,
-        gc_mode,
-    )
-    .await?;
-    tracing::info!(%post_to_storcon, node_kind = %NodeKind::Pageserver, "Running scan-metadata");
-    scan_pageserver_metadata_cmd(
-        bucket_config,
-        controller_client,
-        Vec::new(),
-        true,
-        post_to_storcon,
-    )
-    .await?;
-
-    Ok(())
-}
-
-pub async fn pageserver_physical_gc_cmd(
-    bucket_config: &BucketConfig,
-    controller_client: Option<&control_api::Client>,
-    tenant_shard_ids: Vec<TenantShardId>,
-    min_age: humantime::Duration,
-    mode: GcMode,
-) -> anyhow::Result<()> {
-    match (controller_client, mode) {
-        (Some(_), _) => {
-            // Any mode may run when controller API is set
-        }
-        (None, GcMode::Full) => {
-            // The part of physical GC where we erase ancestor layers cannot be done safely without
-            // confirming the most recent complete shard split with the controller.  Refuse to run, rather
-            // than doing it unsafely.
-            return Err(anyhow!(
-                "Full physical GC requires `--controller-api` and `--controller-jwt` to run"
-            ));
-        }
-        (None, GcMode::DryRun | GcMode::IndicesOnly) => {
-            // These GcModes do not require the controller to run.
-        }
-    }
-
-    let summary = pageserver_physical_gc(
-        bucket_config,
-        controller_client,
-        tenant_shard_ids,
-        min_age.into(),
-        mode,
-    )
-    .await?;
-    println!("{}", serde_json::to_string(&summary).unwrap());
-    Ok(())
-}
-
-pub async fn scan_pageserver_metadata_cmd(
-    bucket_config: BucketConfig,
-    controller_client: Option<&control_api::Client>,
-    tenant_shard_ids: Vec<TenantShardId>,
-    json: bool,
-    post_to_storcon: bool,
-) -> anyhow::Result<()> {
-    if controller_client.is_none() && post_to_storcon {
-        return Err(anyhow!("Posting pageserver scan health status to storage controller requires `--controller-api` and `--controller-jwt` to run"));
-    }
-    match scan_pageserver_metadata(bucket_config.clone(), tenant_shard_ids).await {
-        Err(e) => {
-            tracing::error!("Failed: {e}");
-            Err(e)
-        }
-        Ok(summary) => {
-            if json {
-                println!("{}", serde_json::to_string(&summary).unwrap())
-            } else {
-                println!("{}", summary.summary_string());
-            }
-
-            if post_to_storcon {
-                if let Some(client) = controller_client {
-                    let body = summary.build_health_update_request();
-                    client
-                        .dispatch::<MetadataHealthUpdateRequest, MetadataHealthUpdateResponse>(
-                            Method::POST,
-                            "control/v1/metadata_health/update".to_string(),
-                            Some(body),
-                        )
-                        .await?;
-                }
-            }
-
-            if summary.is_fatal() {
-                tracing::error!("Fatal scrub errors detected");
-            } else if summary.is_empty() {
-                // Strictly speaking an empty bucket is a valid bucket, but if someone ran the
-                // scrubber they were likely expecting to scan something, and if we see no timelines
-                // at all then it's likely due to some configuration issues like a bad prefix
-                tracing::error!(
-                    "No timelines found in bucket {} prefix {}",
-                    bucket_config.bucket,
-                    bucket_config
-                        .prefix_in_bucket
-                        .unwrap_or("<none>".to_string())
-                );
-            }
-
-            Ok(())
-        }
    }
 }
--- a/storage_scrubber/src/pageserver_physical_gc.rs
+++ b/storage_scrubber/src/pageserver_physical_gc.rs
@@ -4,7 +4,9 @@ use std::time::{Duration, SystemTime};

 use crate::checks::{list_timeline_blobs, BlobDataParseResult};
 use crate::metadata_stream::{stream_tenant_timelines, stream_tenants};
-use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId};
+use crate::{
+    init_remote, BucketConfig, ControllerClientConfig, NodeKind, RootTarget, TenantShardTimelineId,
+};
 use aws_sdk_s3::Client;
 use futures_util::{StreamExt, TryStreamExt};
 use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
@@ -471,8 +473,8 @@ async fn gc_ancestor(
 /// This type of GC is not necessary for correctness: rather it serves to reduce wasted storage capacity, and
 /// make sure that object listings don't get slowed down by large numbers of garbage objects.
 pub async fn pageserver_physical_gc(
-    bucket_config: &BucketConfig,
-    controller_client: Option<&control_api::Client>,
+    bucket_config: BucketConfig,
+    controller_client_conf: Option<ControllerClientConfig>,
    tenant_shard_ids: Vec<TenantShardId>,
    min_age: Duration,
    mode: GcMode,
@@ -556,7 +558,7 @@ pub async fn pageserver_physical_gc(
        let timelines = timelines.map_ok(|ttid| {
            gc_timeline(
                &s3_client,
-                bucket_config,
+                &bucket_config,
                &min_age,
                &target,
                mode,
@@ -572,7 +574,7 @@ pub async fn pageserver_physical_gc(
    }

    // Execute cross-shard GC, using the accumulator's full view of all the shards built in the per-shard GC
-    let Some(client) = controller_client else {
+    let Some(controller_client) = controller_client_conf.map(|c| c.build_client()) else {
        tracing::info!("Skipping ancestor layer GC, because no `--controller-api` was specified");
        return Ok(summary);
    };
@@ -581,13 +583,13 @@ pub async fn pageserver_physical_gc(
        .unwrap()
        .into_inner()
        .unwrap()
-        .into_gc_ancestors(client, &mut summary)
+        .into_gc_ancestors(&controller_client, &mut summary)
        .await;

    for ancestor_shard in ancestor_shards {
        gc_ancestor(
            &s3_client,
-            bucket_config,
+            &bucket_config,
            &target,
            &min_age,
            ancestor_shard,
--- a/storage_scrubber/src/scan_pageserver_metadata.rs
+++ b/storage_scrubber/src/scan_pageserver_metadata.rs
@@ -116,7 +116,7 @@ Index versions: {version_summary}
 }

 /// Scan the pageserver metadata in an S3 bucket, reporting errors and statistics.
-pub async fn scan_pageserver_metadata(
+pub async fn scan_metadata(
    bucket_config: BucketConfig,
    tenant_ids: Vec<TenantShardId>,
 ) -> anyhow::Result<MetadataSummary> {
--- a/test_runner/conftest.py
+++ b/test_runner/conftest.py
@@ -3,7 +3,6 @@ pytest_plugins = (
    "fixtures.parametrize",
    "fixtures.httpserver",
    "fixtures.compute_reconfigure",
-    "fixtures.storage_controller_proxy",
    "fixtures.neon_fixtures",
    "fixtures.benchmark_fixture",
    "fixtures.pg_stats",
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -497,7 +497,6 @@ class NeonEnvBuilder:
        pageserver_aux_file_policy: Optional[AuxFileStore] = None,
        pageserver_default_tenant_config_compaction_algorithm: Optional[Dict[str, Any]] = None,
        safekeeper_extra_opts: Optional[list[str]] = None,
-        storage_controller_port_override: Optional[int] = None,
    ):
        self.repo_dir = repo_dir
        self.rust_log_override = rust_log_override
@@ -550,8 +549,6 @@ class NeonEnvBuilder:

        self.safekeeper_extra_opts = safekeeper_extra_opts

-        self.storage_controller_port_override = storage_controller_port_override
-
        assert test_name.startswith(
            "test_"
        ), "Unexpectedly instantiated from outside a test function"
@@ -1057,7 +1054,6 @@ class NeonEnv:
    """

    BASE_PAGESERVER_ID = 1
-    storage_controller: NeonStorageController | NeonProxiedStorageController

    def __init__(self, config: NeonEnvBuilder):
        self.repo_dir = config.repo_dir
@@ -1088,41 +1084,27 @@ class NeonEnv:
        self.initial_tenant = config.initial_tenant
        self.initial_timeline = config.initial_timeline

+        # Find two adjacent ports for storage controller and its postgres DB.  This
+        # loop would eventually throw from get_port() if we run out of ports (extremely
+        # unlikely): usually we find two adjacent free ports on the first iteration.
+        while True:
+            self.storage_controller_port = self.port_distributor.get_port()
+            storage_controller_pg_port = self.port_distributor.get_port()
+            if storage_controller_pg_port == self.storage_controller_port + 1:
+                break
+
        # The URL for the pageserver to use as its control_plane_api config
-        if config.storage_controller_port_override is not None:
-            log.info(
-                f"Using storage controller api override {config.storage_controller_port_override}"
-            )
-
-            self.storage_controller_port = config.storage_controller_port_override
-            self.storage_controller = NeonProxiedStorageController(
-                self, config.storage_controller_port_override, config.auth_enabled
-            )
-        else:
-            # Find two adjacent ports for storage controller and its postgres DB.  This
-            # loop would eventually throw from get_port() if we run out of ports (extremely
-            # unlikely): usually we find two adjacent free ports on the first iteration.
-            while True:
-                storage_controller_port = self.port_distributor.get_port()
-                storage_controller_pg_port = self.port_distributor.get_port()
-                if storage_controller_pg_port == storage_controller_port + 1:
-                    break
-
-            self.storage_controller_port = storage_controller_port
-            self.storage_controller = NeonStorageController(
-                self, storage_controller_port, config.auth_enabled
-            )
-
-            log.info(
-                f"Using generated control_plane_api: {self.storage_controller.upcall_api_endpoint()}"
-            )
-
-        self.storage_controller_api: str = self.storage_controller.api_root()
-        self.control_plane_api: str = self.storage_controller.upcall_api_endpoint()
+        self.control_plane_api: str = f"http://127.0.0.1:{self.storage_controller_port}/upcall/v1"
+        # The base URL of the storage controller
+        self.storage_controller_api: str = f"http://127.0.0.1:{self.storage_controller_port}"

        # For testing this with a fake HTTP server, enable passing through a URL from config
        self.control_plane_compute_hook_api = config.control_plane_compute_hook_api

+        self.storage_controller: NeonStorageController = NeonStorageController(
+            self, config.auth_enabled
+        )
+
        self.pageserver_virtual_file_io_engine = config.pageserver_virtual_file_io_engine
        self.pageserver_aux_file_policy = config.pageserver_aux_file_policy

@@ -1162,6 +1144,7 @@ class NeonEnv:
                "listen_http_addr": f"localhost:{pageserver_port.http}",
                "pg_auth_type": pg_auth_type,
                "http_auth_type": http_auth_type,
+                "image_compression": "zstd",
            }
            if self.pageserver_virtual_file_io_engine is not None:
                ps_cfg["virtual_file_io_engine"] = self.pageserver_virtual_file_io_engine
@@ -1886,24 +1869,16 @@ class NeonCli(AbstractNeonCli):
    def storage_controller_start(
        self,
        timeout_in_seconds: Optional[int] = None,
-        instance_id: Optional[int] = None,
-        base_port: Optional[int] = None,
    ):
        cmd = ["storage_controller", "start"]
        if timeout_in_seconds is not None:
            cmd.append(f"--start-timeout={timeout_in_seconds}s")
-        if instance_id is not None:
-            cmd.append(f"--instance-id={instance_id}")
-        if base_port is not None:
-            cmd.append(f"--base-port={base_port}")
        return self.raw_cli(cmd)

-    def storage_controller_stop(self, immediate: bool, instance_id: Optional[int] = None):
+    def storage_controller_stop(self, immediate: bool):
        cmd = ["storage_controller", "stop"]
        if immediate:
            cmd.extend(["-m", "immediate"])
-        if instance_id is not None:
-            cmd.append(f"--instance-id={instance_id}")
        return self.raw_cli(cmd)

    def pageserver_start(
@@ -2214,30 +2189,17 @@ class PageserverSchedulingPolicy(str, Enum):
    PAUSE_FOR_RESTART = "PauseForRestart"


-class StorageControllerLeadershipStatus(str, Enum):
-    LEADER = "leader"
-    STEPPED_DOWN = "stepped_down"
-    CANDIDATE = "candidate"
-
-
 class NeonStorageController(MetricsGetter, LogUtils):
-    def __init__(self, env: NeonEnv, port: int, auth_enabled: bool):
+    def __init__(self, env: NeonEnv, auth_enabled: bool):
        self.env = env
-        self.port: int = port
-        self.api: str = f"http://127.0.0.1:{port}"
        self.running = False
        self.auth_enabled = auth_enabled
        self.allowed_errors: list[str] = DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS
-        self.logfile = self.env.repo_dir / "storage_controller_1" / "storage_controller.log"
+        self.logfile = self.workdir / "storage_controller.log"

-    def start(
-        self,
-        timeout_in_seconds: Optional[int] = None,
-        instance_id: Optional[int] = None,
-        base_port: Optional[int] = None,
-    ):
+    def start(self, timeout_in_seconds: Optional[int] = None):
        assert not self.running
-        self.env.neon_cli.storage_controller_start(timeout_in_seconds, instance_id, base_port)
+        self.env.neon_cli.storage_controller_start(timeout_in_seconds)
        self.running = True
        return self

@@ -2247,12 +2209,6 @@ class NeonStorageController(MetricsGetter, LogUtils):
            self.running = False
        return self

-    def upcall_api_endpoint(self) -> str:
-        return f"{self.api}/upcall/v1"
-
-    def api_root(self) -> str:
-        return self.api
-
    @staticmethod
    def retryable_node_operation(op, ps_id, max_attempts, backoff):
        while max_attempts > 0:
@@ -2281,9 +2237,7 @@ class NeonStorageController(MetricsGetter, LogUtils):

    def assert_no_errors(self):
        assert_no_errors(
-            self.logfile,
-            "storage_controller",
-            self.allowed_errors,
+            self.env.repo_dir / "storage_controller.log", "storage_controller", self.allowed_errors
        )

    def pageserver_api(self) -> PageserverHttpClient:
@@ -2295,7 +2249,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
        auth_token = None
        if self.auth_enabled:
            auth_token = self.env.auth_keys.generate_token(scope=TokenScope.PAGE_SERVER_API)
-        return PageserverHttpClient(self.port, lambda: True, auth_token)
+        return PageserverHttpClient(self.env.storage_controller_port, lambda: True, auth_token)

    def request(self, method, *args, **kwargs) -> requests.Response:
        resp = requests.request(method, *args, **kwargs)
@@ -2312,13 +2266,13 @@ class NeonStorageController(MetricsGetter, LogUtils):
        return headers

    def get_metrics(self) -> Metrics:
-        res = self.request("GET", f"{self.api}/metrics")
+        res = self.request("GET", f"{self.env.storage_controller_api}/metrics")
        return parse_metrics(res.text)

    def ready(self) -> bool:
        status = None
        try:
-            resp = self.request("GET", f"{self.api}/ready")
+            resp = self.request("GET", f"{self.env.storage_controller_api}/ready")
            status = resp.status_code
        except StorageControllerApiException as e:
            status = e.status_code
@@ -2351,7 +2305,7 @@ class NeonStorageController(MetricsGetter, LogUtils):

        response = self.request(
            "POST",
-            f"{self.api}/debug/v1/attach-hook",
+            f"{self.env.storage_controller_api}/debug/v1/attach-hook",
            json=body,
            headers=self.headers(TokenScope.ADMIN),
        )
@@ -2362,7 +2316,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
    def attach_hook_drop(self, tenant_shard_id: Union[TenantId, TenantShardId]):
        self.request(
            "POST",
-            f"{self.api}/debug/v1/attach-hook",
+            f"{self.env.storage_controller_api}/debug/v1/attach-hook",
            json={"tenant_shard_id": str(tenant_shard_id), "node_id": None},
            headers=self.headers(TokenScope.ADMIN),
        )
@@ -2373,7 +2327,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
        """
        response = self.request(
            "POST",
-            f"{self.api}/debug/v1/inspect",
+            f"{self.env.storage_controller_api}/debug/v1/inspect",
            json={"tenant_shard_id": str(tenant_shard_id)},
            headers=self.headers(TokenScope.ADMIN),
        )
@@ -2396,7 +2350,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
        log.info(f"node_register({body})")
        self.request(
            "POST",
-            f"{self.api}/control/v1/node",
+            f"{self.env.storage_controller_api}/control/v1/node",
            json=body,
            headers=self.headers(TokenScope.ADMIN),
        )
@@ -2405,7 +2359,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
        log.info(f"node_delete({node_id})")
        self.request(
            "DELETE",
-            f"{self.api}/control/v1/node/{node_id}",
+            f"{self.env.storage_controller_api}/control/v1/node/{node_id}",
            headers=self.headers(TokenScope.ADMIN),
        )

@@ -2413,7 +2367,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
        log.info(f"node_drain({node_id})")
        self.request(
            "PUT",
-            f"{self.api}/control/v1/node/{node_id}/drain",
+            f"{self.env.storage_controller_api}/control/v1/node/{node_id}/drain",
            headers=self.headers(TokenScope.ADMIN),
        )

@@ -2421,7 +2375,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
        log.info(f"cancel_node_drain({node_id})")
        self.request(
            "DELETE",
-            f"{self.api}/control/v1/node/{node_id}/drain",
+            f"{self.env.storage_controller_api}/control/v1/node/{node_id}/drain",
            headers=self.headers(TokenScope.ADMIN),
        )

@@ -2429,7 +2383,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
        log.info(f"node_fill({node_id})")
        self.request(
            "PUT",
-            f"{self.api}/control/v1/node/{node_id}/fill",
+            f"{self.env.storage_controller_api}/control/v1/node/{node_id}/fill",
            headers=self.headers(TokenScope.ADMIN),
        )

@@ -2437,22 +2391,14 @@ class NeonStorageController(MetricsGetter, LogUtils):
        log.info(f"cancel_node_fill({node_id})")
        self.request(
            "DELETE",
-            f"{self.api}/control/v1/node/{node_id}/fill",
+            f"{self.env.storage_controller_api}/control/v1/node/{node_id}/fill",
            headers=self.headers(TokenScope.ADMIN),
        )

    def node_status(self, node_id):
        response = self.request(
            "GET",
-            f"{self.api}/control/v1/node/{node_id}",
-            headers=self.headers(TokenScope.ADMIN),
-        )
-        return response.json()
-
-    def get_leader(self):
-        response = self.request(
-            "GET",
-            f"{self.api}/control/v1/leader",
+            f"{self.env.storage_controller_api}/control/v1/node/{node_id}",
            headers=self.headers(TokenScope.ADMIN),
        )
        return response.json()
@@ -2460,7 +2406,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
    def node_list(self):
        response = self.request(
            "GET",
-            f"{self.api}/control/v1/node",
+            f"{self.env.storage_controller_api}/control/v1/node",
            headers=self.headers(TokenScope.ADMIN),
        )
        return response.json()
@@ -2468,7 +2414,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
    def tenant_list(self):
        response = self.request(
            "GET",
-            f"{self.api}/debug/v1/tenant",
+            f"{self.env.storage_controller_api}/debug/v1/tenant",
            headers=self.headers(TokenScope.ADMIN),
        )
        return response.json()
@@ -2478,7 +2424,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
        body["node_id"] = node_id
        self.request(
            "PUT",
-            f"{self.api}/control/v1/node/{node_id}/config",
+            f"{self.env.storage_controller_api}/control/v1/node/{node_id}/config",
            json=body,
            headers=self.headers(TokenScope.ADMIN),
        )
@@ -2513,7 +2459,7 @@ class NeonStorageController(MetricsGetter, LogUtils):

        response = self.request(
            "POST",
-            f"{self.api}/v1/tenant",
+            f"{self.env.storage_controller_api}/v1/tenant",
            json=body,
            headers=self.headers(TokenScope.PAGE_SERVER_API),
        )
@@ -2526,7 +2472,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
        """
        response = self.request(
            "GET",
-            f"{self.api}/debug/v1/tenant/{tenant_id}/locate",
+            f"{self.env.storage_controller_api}/debug/v1/tenant/{tenant_id}/locate",
            headers=self.headers(TokenScope.ADMIN),
        )
        body = response.json()
@@ -2539,7 +2485,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
        """
        response = self.request(
            "GET",
-            f"{self.api}/control/v1/tenant/{tenant_id}",
+            f"{self.env.storage_controller_api}/control/v1/tenant/{tenant_id}",
            headers=self.headers(TokenScope.ADMIN),
        )
        response.raise_for_status()
@@ -2550,7 +2496,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
    ) -> list[TenantShardId]:
        response = self.request(
            "PUT",
-            f"{self.api}/control/v1/tenant/{tenant_id}/shard_split",
+            f"{self.env.storage_controller_api}/control/v1/tenant/{tenant_id}/shard_split",
            json={"new_shard_count": shard_count, "new_stripe_size": shard_stripe_size},
            headers=self.headers(TokenScope.ADMIN),
        )
@@ -2562,7 +2508,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
    def tenant_shard_migrate(self, tenant_shard_id: TenantShardId, dest_ps_id: int):
        self.request(
            "PUT",
-            f"{self.api}/control/v1/tenant/{tenant_shard_id}/migrate",
+            f"{self.env.storage_controller_api}/control/v1/tenant/{tenant_shard_id}/migrate",
            json={"tenant_shard_id": str(tenant_shard_id), "node_id": dest_ps_id},
            headers=self.headers(TokenScope.ADMIN),
        )
@@ -2573,7 +2519,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
        log.info(f"tenant_policy_update({tenant_id}, {body})")
        self.request(
            "PUT",
-            f"{self.api}/control/v1/tenant/{tenant_id}/policy",
+            f"{self.env.storage_controller_api}/control/v1/tenant/{tenant_id}/policy",
            json=body,
            headers=self.headers(TokenScope.ADMIN),
        )
@@ -2581,14 +2527,14 @@ class NeonStorageController(MetricsGetter, LogUtils):
    def tenant_import(self, tenant_id: TenantId):
        self.request(
            "POST",
-            f"{self.api}/debug/v1/tenant/{tenant_id}/import",
+            f"{self.env.storage_controller_api}/debug/v1/tenant/{tenant_id}/import",
            headers=self.headers(TokenScope.ADMIN),
        )

    def reconcile_all(self):
        r = self.request(
            "POST",
-            f"{self.api}/debug/v1/reconcile_all",
+            f"{self.env.storage_controller_api}/debug/v1/reconcile_all",
            headers=self.headers(TokenScope.ADMIN),
        )
        r.raise_for_status()
@@ -2621,7 +2567,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
        """
        self.request(
            "POST",
-            f"{self.api}/debug/v1/consistency_check",
+            f"{self.env.storage_controller_api}/debug/v1/consistency_check",
            headers=self.headers(TokenScope.ADMIN),
        )
        log.info("storage controller passed consistency check")
@@ -2694,7 +2640,7 @@ class NeonStorageController(MetricsGetter, LogUtils):

        self.request(
            "POST",
-            f"{self.api}/control/v1/metadata_health/update",
+            f"{self.env.storage_controller_api}/control/v1/metadata_health/update",
            json=body,
            headers=self.headers(TokenScope.SCRUBBER),
        )
@@ -2702,7 +2648,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
    def metadata_health_list_unhealthy(self):
        response = self.request(
            "GET",
-            f"{self.api}/control/v1/metadata_health/unhealthy",
+            f"{self.env.storage_controller_api}/control/v1/metadata_health/unhealthy",
            headers=self.headers(TokenScope.ADMIN),
        )
        return response.json()
@@ -2712,7 +2658,7 @@ class NeonStorageController(MetricsGetter, LogUtils):

        response = self.request(
            "POST",
-            f"{self.api}/control/v1/metadata_health/outdated",
+            f"{self.env.storage_controller_api}/control/v1/metadata_health/outdated",
            json=body,
            headers=self.headers(TokenScope.ADMIN),
        )
@@ -2735,7 +2681,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
        log.info("Asking storage controller to step down")
        response = self.request(
            "PUT",
-            f"{self.api}/control/v1/step_down",
+            f"{self.env.storage_controller_api}/control/v1/step_down",
            headers=self.headers(TokenScope.ADMIN),
        )

@@ -2752,7 +2698,7 @@ class NeonStorageController(MetricsGetter, LogUtils):

        res = self.request(
            "PUT",
-            f"{self.api}/debug/v1/failpoints",
+            f"{self.env.storage_controller_api}/debug/v1/failpoints",
            json=[{"name": name, "actions": actions} for name, actions in pairs],
            headers=self.headers(TokenScope.ADMIN),
        )
@@ -2822,21 +2768,9 @@ class NeonStorageController(MetricsGetter, LogUtils):
                parsed_tid, wait_ms=250
            )

-    def get_leadership_status(self) -> StorageControllerLeadershipStatus:
-        metric_values = {}
-        for status in StorageControllerLeadershipStatus:
-            metric_value = self.get_metric_value(
-                "storage_controller_leadership_status", filter={"status": status}
-            )
-            metric_values[status] = metric_value
-
-        assert list(metric_values.values()).count(1) == 1
-
-        for status, metric_value in metric_values.items():
-            if metric_value == 1:
-                return status
-
-        raise AssertionError("unreachable")
+    @property
+    def workdir(self) -> Path:
+        return self.env.repo_dir

    def __enter__(self) -> "NeonStorageController":
        return self
@@ -2850,59 +2784,6 @@ class NeonStorageController(MetricsGetter, LogUtils):
        self.stop(immediate=True)


-class NeonProxiedStorageController(NeonStorageController):
-    def __init__(self, env: NeonEnv, proxy_port: int, auth_enabled: bool):
-        super(NeonProxiedStorageController, self).__init__(env, proxy_port, auth_enabled)
-        self.instances: dict[int, dict[str, Any]] = {}
-
-    def start(
-        self,
-        timeout_in_seconds: Optional[int] = None,
-        instance_id: Optional[int] = None,
-        base_port: Optional[int] = None,
-    ):
-        assert instance_id is not None and base_port is not None
-
-        self.env.neon_cli.storage_controller_start(timeout_in_seconds, instance_id, base_port)
-        self.instances[instance_id] = {"running": True}
-
-        self.running = True
-        return self
-
-    def stop_instance(
-        self, immediate: bool = False, instance_id: Optional[int] = None
-    ) -> "NeonStorageController":
-        assert instance_id in self.instances
-        if self.instances[instance_id]["running"]:
-            self.env.neon_cli.storage_controller_stop(immediate, instance_id)
-            self.instances[instance_id]["running"] = False
-
-        self.running = any(meta["running"] for meta in self.instances.values())
-        return self
-
-    def stop(self, immediate: bool = False) -> "NeonStorageController":
-        for iid, details in self.instances.items():
-            if details["running"]:
-                self.env.neon_cli.storage_controller_stop(immediate, iid)
-                self.instances[iid]["running"] = False
-
-        self.running = False
-        return self
-
-    def assert_no_errors(self):
-        for instance_id in self.instances.keys():
-            assert_no_errors(
-                self.env.repo_dir / f"storage_controller_{instance_id}" / "storage_controller.log",
-                "storage_controller",
-                self.allowed_errors,
-            )
-
-    def log_contains(
-        self, pattern: str, offset: None | LogCursor = None
-    ) -> Optional[Tuple[str, LogCursor]]:
-        raise NotImplementedError()
-
-
@dataclass
 class LogCursor:
    _line_no: int
@@ -4639,7 +4520,7 @@ class StorageScrubber:

        base_args = [
            str(self.env.neon_binpath / "storage_scrubber"),
-            f"--controller-api={self.env.storage_controller.api_root()}",
+            f"--controller-api={self.env.storage_controller_api}",
        ]
        args = base_args + args

--- a/test_runner/fixtures/storage_controller_proxy.py
+++ b/test_runner/fixtures/storage_controller_proxy.py
@@ -1,73 +0,0 @@
-import re
-from typing import Any, Optional
-
-import pytest
-import requests
-from pytest_httpserver import HTTPServer
-from werkzeug.datastructures import Headers
-from werkzeug.wrappers.request import Request
-from werkzeug.wrappers.response import Response
-
-from fixtures.log_helper import log
-
-
-class StorageControllerProxy:
-    def __init__(self, server: HTTPServer):
-        self.server: HTTPServer = server
-        self.listen: str = f"http://{server.host}:{server.port}"
-        self.routing_to: Optional[str] = None
-
-    def route_to(self, storage_controller_api: str):
-        self.routing_to = storage_controller_api
-
-    def port(self) -> int:
-        return self.server.port
-
-    def upcall_api_endpoint(self) -> str:
-        return f"{self.listen}/upcall/v1"
-
-
-def proxy_request(method: str, url: str, **kwargs) -> requests.Response:
-    return requests.request(method, url, **kwargs)
-
-
-@pytest.fixture(scope="function")
-def storage_controller_proxy(make_httpserver):
-    """
-    Proxies requests into the storage controller to the currently
-    selected storage controller instance via `StorageControllerProxy.route_to`.
-
-    This fixture is intended for tests that need to run multiple instances
-    of the storage controller at the same time.
-    """
-    server = make_httpserver
-
-    self = StorageControllerProxy(server)
-
-    log.info(f"Storage controller proxy listening on {self.listen}")
-
-    def handler(request: Request):
-        if self.route_to is None:
-            log.info(f"Storage controller proxy has no routing configured for {request.url}")
-            return Response("Routing not configured", status=503)
-
-        route_to_url = f"{self.routing_to}{request.path}"
-
-        log.info(f"Routing {request.url} to {route_to_url}")
-
-        args: dict[str, Any] = {"headers": request.headers}
-        if request.is_json:
-            args["json"] = request.json
-
-        response = proxy_request(request.method, route_to_url, **args)
-
-        headers = Headers()
-        for key, value in response.headers.items():
-            headers.add(key, value)
-
-        return Response(response.content, headers=headers, status=response.status_code)
-
-    self.server.expect_request(re.compile(".*")).respond_with_handler(handler)
-
-    yield self
-    server.clear()
--- a/test_runner/fixtures/utils.py
+++ b/test_runner/fixtures/utils.py
@@ -403,7 +403,7 @@ def wait_until(
        try:
            res = func()
        except Exception as e:
-            log.info("waiting for %s iteration %s failed: %s", func, i + 1, e)
+            log.info("waiting for %s iteration %s failed", func, i + 1)
            last_exception = e
            if show_intermediate_error:
                log.info(e)
--- a/test_runner/performance/test_logical_replication.py
+++ b/test_runner/performance/test_logical_replication.py
@@ -282,16 +282,15 @@ def test_snap_files(

    env = benchmark_project_pub.pgbench_env
    connstr = benchmark_project_pub.connstr
+    pg_bin.run_capture(["pgbench", "-i", "-s100"], env=env)

    with psycopg2.connect(connstr) as conn:
        conn.autocommit = True
        with conn.cursor() as cur:
            cur.execute("SELECT rolsuper FROM pg_roles WHERE rolname = 'neondb_owner'")
-            is_super = cur.fetchall()[0][0]
+            is_super = cur.fetchall()[0]
            assert is_super, "This benchmark won't work if we don't have superuser"

-    pg_bin.run_capture(["pgbench", "-i", "-s100"], env=env)
-
    conn = psycopg2.connect(connstr)
    conn.autocommit = True
    cur = conn.cursor()
--- a/test_runner/regress/test_lfc_resize.py
+++ b/test_runner/regress/test_lfc_resize.py
@@ -1,7 +1,3 @@
-import os
-import random
-import re
-import subprocess
 import threading
 import time

@@ -21,17 +17,17 @@ def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin):
        "test_lfc_resize",
        config_lines=[
            "neon.file_cache_path='file.cache'",
-            "neon.max_file_cache_size=512MB",
-            "neon.file_cache_size_limit=512MB",
+            "neon.max_file_cache_size=1GB",
+            "neon.file_cache_size_limit=1GB",
        ],
    )
    n_resize = 10
-    scale = 100
+    scale = 10

    def run_pgbench(connstr: str):
        log.info(f"Start a pgbench workload on pg {connstr}")
        pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr])
-        pg_bin.run_capture(["pgbench", "-c10", f"-T{n_resize}", "-Mprepared", "-S", connstr])
+        pg_bin.run_capture(["pgbench", "-c4", f"-T{n_resize}", "-Mprepared", connstr])

    thread = threading.Thread(target=run_pgbench, args=(endpoint.connstr(),), daemon=True)
    thread.start()
@@ -39,21 +35,9 @@ def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin):
    conn = endpoint.connect()
    cur = conn.cursor()

-    for _ in range(n_resize):
-        size = random.randint(1, 512)
-        cur.execute(f"alter system set neon.file_cache_size_limit='{size}MB'")
+    for i in range(n_resize):
+        cur.execute(f"alter system set neon.file_cache_size_limit='{i*10}MB'")
        cur.execute("select pg_reload_conf()")
        time.sleep(1)

-    cur.execute("alter system set neon.file_cache_size_limit='100MB'")
-    cur.execute("select pg_reload_conf()")
-
    thread.join()
-
-    lfc_file_path = f"{endpoint.pg_data_dir_path()}/file.cache"
-    lfc_file_size = os.path.getsize(lfc_file_path)
-    res = subprocess.run(["ls", "-sk", lfc_file_path], check=True, text=True, capture_output=True)
-    lfc_file_blocks = re.findall("([0-9A-F]+)", res.stdout)[0]
-    log.info(f"Size of LFC file {lfc_file_size}, blocks {lfc_file_blocks}")
-    assert lfc_file_size <= 512 * 1024 * 1024
-    assert int(lfc_file_blocks) <= 128 * 1024
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -1,4 +1,3 @@
-import concurrent.futures
 import json
 import threading
 import time
@@ -17,7 +16,6 @@ from fixtures.neon_fixtures import (
    PageserverSchedulingPolicy,
    PgBin,
    StorageControllerApiException,
-    StorageControllerLeadershipStatus,
    TokenScope,
    last_flush_lsn_upload,
 )
@@ -32,9 +30,7 @@ from fixtures.pageserver.utils import (
    timeline_delete_wait_completed,
 )
 from fixtures.pg_version import PgVersion
-from fixtures.port_distributor import PortDistributor
 from fixtures.remote_storage import RemoteStorageKind, s3_storage
-from fixtures.storage_controller_proxy import StorageControllerProxy
 from fixtures.utils import run_pg_bench_small, subprocess_capture, wait_until
 from fixtures.workload import Workload
 from mypy_boto3_s3.type_defs import (
@@ -2097,131 +2093,6 @@ def test_storage_controller_step_down(neon_env_builder: NeonEnvBuilder):
    )


-# This is a copy of NeonEnv.start which injects the instance id and port
-# into the call to NeonStorageController.start
-def start_env(env: NeonEnv, storage_controller_port: int):
-    timeout_in_seconds = 30
-
-    # Storage controller starts first, so that pageserver /re-attach calls don't
-    # bounce through retries on startup
-    env.storage_controller.start(timeout_in_seconds, 1, storage_controller_port)
-
-    # Wait for storage controller readiness to prevent unnecessary post start-up
-    # reconcile.
-    env.storage_controller.wait_until_ready()
-
-    # Start up broker, pageserver and all safekeepers
-    futs = []
-    with concurrent.futures.ThreadPoolExecutor(
-        max_workers=2 + len(env.pageservers) + len(env.safekeepers)
-    ) as executor:
-        futs.append(
-            executor.submit(lambda: env.broker.try_start() or None)
-        )  # The `or None` is for the linter
-
-        for pageserver in env.pageservers:
-            futs.append(
-                executor.submit(
-                    lambda ps=pageserver: ps.start(timeout_in_seconds=timeout_in_seconds)
-                )
-            )
-
-        for safekeeper in env.safekeepers:
-            futs.append(
-                executor.submit(
-                    lambda sk=safekeeper: sk.start(timeout_in_seconds=timeout_in_seconds)
-                )
-            )
-
-    for f in futs:
-        f.result()
-
-
-@pytest.mark.parametrize("step_down_times_out", [False, True])
-def test_storage_controller_leadership_transfer(
-    neon_env_builder: NeonEnvBuilder,
-    storage_controller_proxy: StorageControllerProxy,
-    port_distributor: PortDistributor,
-    step_down_times_out: bool,
-):
-    neon_env_builder.num_pageservers = 3
-
-    neon_env_builder.storage_controller_config = {
-        "database_url": f"127.0.0.1:{port_distributor.get_port()}",
-        "start_as_candidate": True,
-    }
-
-    neon_env_builder.storage_controller_port_override = storage_controller_proxy.port()
-
-    storage_controller_1_port = port_distributor.get_port()
-    storage_controller_2_port = port_distributor.get_port()
-
-    storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_1_port}")
-
-    env = neon_env_builder.init_configs()
-    start_env(env, storage_controller_1_port)
-
-    assert (
-        env.storage_controller.get_leadership_status() == StorageControllerLeadershipStatus.LEADER
-    )
-    leader = env.storage_controller.get_leader()
-    assert leader["address"] == f"http://127.0.0.1:{storage_controller_1_port}/"
-
-    if step_down_times_out:
-        env.storage_controller.configure_failpoints(
-            ("sleep-on-step-down-handling", "return(10000)")
-        )
-        env.storage_controller.allowed_errors.append(".*request was dropped before completing.*")
-
-    tenant_count = 2
-    shard_count = 4
-    tenants = set(TenantId.generate() for _ in range(0, tenant_count))
-
-    for tid in tenants:
-        env.storage_controller.tenant_create(
-            tid, shard_count=shard_count, placement_policy={"Attached": 1}
-        )
-    env.storage_controller.reconcile_until_idle()
-
-    env.storage_controller.start(
-        timeout_in_seconds=30, instance_id=2, base_port=storage_controller_2_port
-    )
-
-    if not step_down_times_out:
-
-        def previous_stepped_down():
-            assert (
-                env.storage_controller.get_leadership_status()
-                == StorageControllerLeadershipStatus.STEPPED_DOWN
-            )
-
-        wait_until(5, 1, previous_stepped_down)
-
-    storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_2_port}")
-
-    def new_becomes_leader():
-        assert (
-            env.storage_controller.get_leadership_status()
-            == StorageControllerLeadershipStatus.LEADER
-        )
-
-    wait_until(15, 1, new_becomes_leader)
-    leader = env.storage_controller.get_leader()
-    assert leader["address"] == f"http://127.0.0.1:{storage_controller_2_port}/"
-
-    env.storage_controller.wait_until_ready()
-    env.storage_controller.consistency_check()
-
-    if step_down_times_out:
-        env.storage_controller.allowed_errors.extend(
-            [
-                ".*Leader.*did not respond to step-down request.*",
-                ".*Send step down request failed.*",
-                ".*Send step down request still failed.*",
-            ]
-        )
-
-
 def test_storage_controller_ps_restarted_during_drain(neon_env_builder: NeonEnvBuilder):
    # single unsharded tenant, two locations
    neon_env_builder.num_pageservers = 2
Author	SHA1	Message	Date
Conrad Ludgate	0e551edb06	rename fn	2024-08-16 08:54:25 +01:00
Conrad Ludgate	484cdccbf2	fix cancellation	2024-08-16 08:44:03 +01:00
Conrad Ludgate	39d1b78817	fix http connect config	2024-08-16 08:42:14 +01:00