the fix: add a shutdown method to WalRedoManager, so that the Arc<> can outlive the process

Revert "WIP: solution approach 1: propagate cancellationtoken from tenant"
This reverts commit 5202d2dc98.
2026-05-28 10:30:40 +00:00 · 2024-06-24 19:29:31 +00:00 · 2024-06-24 19:04:02 +00:00 · 2024-06-24 19:03:30 +00:00 · 2024-06-24 18:46:29 +00:00 · 2024-06-24 18:25:57 +00:00
33 changed files with 1042 additions and 1091 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1014,6 +1014,9 @@ name = "camino"
 version = "1.1.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c59e92b5a388f549b863a7bea62612c09f24c8393560709a54558a9abdfb3b9c"
+dependencies = [
+ "serde",
+]

 [[package]]
 name = "camino-tempfile"
@@ -4002,7 +4005,7 @@ dependencies = [
 [[package]]
 name = "postgres"
 version = "0.19.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#cff6927e4f58b1af6ecc2ee7279df1f2ff537295"
 dependencies = [
 "bytes",
 "fallible-iterator",
@@ -4015,7 +4018,7 @@ dependencies = [
 [[package]]
 name = "postgres-protocol"
 version = "0.6.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#cff6927e4f58b1af6ecc2ee7279df1f2ff537295"
 dependencies = [
 "base64 0.20.0",
 "byteorder",
@@ -4034,7 +4037,7 @@ dependencies = [
 [[package]]
 name = "postgres-types"
 version = "0.2.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#cff6927e4f58b1af6ecc2ee7279df1f2ff537295"
 dependencies = [
 "bytes",
 "fallible-iterator",
@@ -4647,6 +4650,7 @@ dependencies = [
 "futures-util",
 "http-types",
 "humantime",
+ "humantime-serde",
 "hyper 0.14.26",
 "itertools",
 "metrics",
@@ -6206,7 +6210,7 @@ dependencies = [
 [[package]]
 name = "tokio-postgres"
 version = "0.7.7"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#cff6927e4f58b1af6ecc2ee7279df1f2ff537295"
 dependencies = [
 "async-trait",
 "byteorder",
@@ -7367,6 +7371,7 @@ dependencies = [
 "base64 0.21.1",
 "base64ct",
 "bytes",
+ "camino",
 "cc",
 "chrono",
 "clap",
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -467,31 +467,6 @@ RUN case "${PG_VERSION}" in \
    make install -j $(getconf _NPROCESSORS_ONLN) && \
    echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_hint_plan.control

-#########################################################################################
-#
-# Layer "kq-imcx-pg-build"
-# compile kq_imcx extension
-#
-#########################################################################################
-FROM build-deps AS kq-imcx-pg-build
-COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
-
-ENV PATH "/usr/local/pgsql/bin/:$PATH"
-RUN apt-get update && \
-    apt-get install -y git libgtk2.0-dev libpq-dev libpam-dev libxslt-dev libkrb5-dev cmake && \
-    wget https://github.com/ketteq-neon/postgres-exts/archive/e0bd1a9d9313d7120c1b9c7bb15c48c0dede4c4e.tar.gz -O kq_imcx.tar.gz && \
-    echo "dc93a97ff32d152d32737ba7e196d9687041cda15e58ab31344c2f2de8855336 kq_imcx.tar.gz" | sha256sum --check && \
-    mkdir kq_imcx-src && cd kq_imcx-src && tar xzf ../kq_imcx.tar.gz --strip-components=1 -C . && \
-    find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
-    mkdir build && cd build && \
-    cmake -DCMAKE_BUILD_TYPE=Release .. && \
-    make -j $(getconf _NPROCESSORS_ONLN) && \
-    make -j $(getconf _NPROCESSORS_ONLN) install && \
-    echo 'trusted = true' >> /usr/local/pgsql/share/extension/kq_imcx.control && \
-    find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\
-    mkdir -p /extensions/kq_imcx && cp /usr/local/pgsql/share/extension/kq_imcx.control /extensions/kq_imcx && \
-    sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \
-    comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/kq_imcx.tar.zst -T -

 #########################################################################################
 #
@@ -840,7 +815,6 @@ COPY --from=hll-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=plpgsql-check-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=timescaledb-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-hint-plan-pg-build /usr/local/pgsql/ /usr/local/pgsql/
-COPY --from=kq-imcx-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-cron-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-pgx-ulid-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=rdkit-pg-build /usr/local/pgsql/ /usr/local/pgsql/
@@ -961,7 +935,6 @@ COPY --from=plpgsql-check-pg-build /plpgsql_check.tar.gz /ext-src
 #COPY --from=timescaledb-pg-build /timescaledb.tar.gz /ext-src
 COPY --from=pg-hint-plan-pg-build /pg_hint_plan.tar.gz /ext-src
 COPY patches/pg_hintplan.patch /ext-src
-#COPY --from=kq-imcx-pg-build /kq_imcx.tar.gz /ext-src
 COPY --from=pg-cron-pg-build /pg_cron.tar.gz /ext-src
 COPY patches/pg_cron.patch /ext-src
 #COPY --from=pg-pgx-ulid-build /home/nonroot/pgx_ulid.tar.gz /ext-src
--- a/libs/postgres_connection/src/lib.rs
+++ b/libs/postgres_connection/src/lib.rs
@@ -144,20 +144,7 @@ impl PgConnectionConfig {
            // implement and this function is hardly a bottleneck. The function is only called around
            // establishing a new connection.
            #[allow(unstable_name_collisions)]
-            config.options(
-                &self
-                    .options
-                    .iter()
-                    .map(|s| {
-                        if s.contains(['\\', ' ']) {
-                            Cow::Owned(s.replace('\\', "\\\\").replace(' ', "\\ "))
-                        } else {
-                            Cow::Borrowed(s.as_str())
-                        }
-                    })
-                    .intersperse(Cow::Borrowed(" ")) // TODO: use impl from std once it's stabilized
-                    .collect::<String>(),
-            );
+            config.options(&encode_options(&self.options));
        }
        config
    }
@@ -178,6 +165,21 @@ impl PgConnectionConfig {
    }
 }

+#[allow(unstable_name_collisions)]
+fn encode_options(options: &[String]) -> String {
+    options
+        .iter()
+        .map(|s| {
+            if s.contains(['\\', ' ']) {
+                Cow::Owned(s.replace('\\', "\\\\").replace(' ', "\\ "))
+            } else {
+                Cow::Borrowed(s.as_str())
+            }
+        })
+        .intersperse(Cow::Borrowed(" ")) // TODO: use impl from std once it's stabilized
+        .collect::<String>()
+}
+
 impl fmt::Display for PgConnectionConfig {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // The password is intentionally hidden and not part of this display string.
@@ -206,7 +208,7 @@ impl fmt::Debug for PgConnectionConfig {

 #[cfg(test)]
 mod tests_pg_connection_config {
-    use crate::PgConnectionConfig;
+    use crate::{encode_options, PgConnectionConfig};
    use once_cell::sync::Lazy;
    use url::Host;

@@ -255,18 +257,12 @@ mod tests_pg_connection_config {

    #[test]
    fn test_with_options() {
-        let cfg = PgConnectionConfig::new_host_port(STUB_HOST.clone(), 123).extend_options([
-            "hello",
-            "world",
-            "with space",
-            "and \\ backslashes",
+        let options = encode_options(&[
+            "hello".to_owned(),
+            "world".to_owned(),
+            "with space".to_owned(),
+            "and \\ backslashes".to_owned(),
        ]);
-        assert_eq!(cfg.host(), &*STUB_HOST);
-        assert_eq!(cfg.port(), 123);
-        assert_eq!(cfg.raw_address(), "stub.host.example:123");
-        assert_eq!(
-            cfg.to_tokio_postgres_config().get_options(),
-            Some("hello world with\\ space and\\ \\\\\\ backslashes")
-        );
+        assert_eq!(options, "hello world with\\ space and\\ \\\\\\ backslashes");
    }
 }
--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -14,8 +14,9 @@ aws-config.workspace = true
 aws-sdk-s3.workspace = true
 aws-credential-types.workspace = true
 bytes.workspace = true
-camino.workspace = true
+camino = { workspace = true, features = ["serde1"] }
 humantime.workspace = true
+humantime-serde.workspace = true
 hyper = { workspace = true, features = ["stream"] }
 futures.workspace = true
 rand.workspace = true
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -34,7 +34,7 @@ use utils::backoff;

 use crate::metrics::{start_measuring_requests, AttemptOutcome, RequestKind};
 use crate::{
-    error::Cancelled, AzureConfig, ConcurrencyLimiter, Download, DownloadError, Listing,
+    config::AzureConfig, error::Cancelled, ConcurrencyLimiter, Download, DownloadError, Listing,
    ListingMode, RemotePath, RemoteStorage, StorageMetadata, TimeTravelError, TimeoutOrCancel,
 };

--- a/libs/remote_storage/src/config.rs
+++ b/libs/remote_storage/src/config.rs
@@ -0,0 +1,277 @@
+use std::{fmt::Debug, num::NonZeroUsize, str::FromStr, time::Duration};
+
+use anyhow::bail;
+use aws_sdk_s3::types::StorageClass;
+use camino::Utf8PathBuf;
+
+use serde::{Deserialize, Serialize};
+
+use crate::{
+    DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT,
+    DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT,
+};
+
+/// External backup storage configuration, enough for creating a client for that storage.
+#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize)]
+pub struct RemoteStorageConfig {
+    /// The storage connection configuration.
+    #[serde(flatten)]
+    pub storage: RemoteStorageKind,
+    /// A common timeout enforced for all requests after concurrency limiter permit has been
+    /// acquired.
+    #[serde(
+        with = "humantime_serde",
+        default = "default_timeout",
+        skip_serializing_if = "is_default_timeout"
+    )]
+    pub timeout: Duration,
+}
+
+fn default_timeout() -> Duration {
+    RemoteStorageConfig::DEFAULT_TIMEOUT
+}
+
+fn is_default_timeout(d: &Duration) -> bool {
+    *d == RemoteStorageConfig::DEFAULT_TIMEOUT
+}
+
+/// A kind of a remote storage to connect to, with its connection configuration.
+#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize)]
+#[serde(untagged)]
+pub enum RemoteStorageKind {
+    /// Storage based on local file system.
+    /// Specify a root folder to place all stored files into.
+    LocalFs { local_path: Utf8PathBuf },
+    /// AWS S3 based storage, storing all files in the S3 bucket
+    /// specified by the config
+    AwsS3(S3Config),
+    /// Azure Blob based storage, storing all files in the container
+    /// specified by the config
+    AzureContainer(AzureConfig),
+}
+
+/// AWS S3 bucket coordinates and request settings used to manage the bucket contents (read and write).
+#[derive(Clone, PartialEq, Eq, Deserialize, Serialize)]
+pub struct S3Config {
+    /// Name of the bucket to connect to.
+    pub bucket_name: String,
+    /// The region where the bucket is located.
+    pub bucket_region: String,
+    /// A "subfolder" in the bucket, so that multiple remote storage users can use the same bucket separately at once.
+    pub prefix_in_bucket: Option<String>,
+    /// A base URL to send S3 requests to.
+    /// By default, the endpoint is derived from a region name, assuming it's
+    /// an AWS S3 region name, erroring on wrong region name.
+    /// Endpoint provides a way to support other S3 flavors and their regions.
+    ///
+    /// Example: `http://127.0.0.1:5000`
+    pub endpoint: Option<String>,
+    /// AWS S3 has various limits on its API calls, which we must not exceed.
+    /// See [`DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT`] for more details.
+    #[serde(default = "default_remote_storage_s3_concurrency_limit")]
+    pub concurrency_limit: NonZeroUsize,
+    #[serde(default = "default_max_keys_per_list_response")]
+    pub max_keys_per_list_response: Option<i32>,
+    #[serde(
+        deserialize_with = "deserialize_storage_class",
+        serialize_with = "serialize_storage_class",
+        default
+    )]
+    pub upload_storage_class: Option<StorageClass>,
+}
+
+fn default_remote_storage_s3_concurrency_limit() -> NonZeroUsize {
+    DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT
+        .try_into()
+        .unwrap()
+}
+
+fn default_max_keys_per_list_response() -> Option<i32> {
+    DEFAULT_MAX_KEYS_PER_LIST_RESPONSE
+}
+
+impl Debug for S3Config {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("S3Config")
+            .field("bucket_name", &self.bucket_name)
+            .field("bucket_region", &self.bucket_region)
+            .field("prefix_in_bucket", &self.prefix_in_bucket)
+            .field("concurrency_limit", &self.concurrency_limit)
+            .field(
+                "max_keys_per_list_response",
+                &self.max_keys_per_list_response,
+            )
+            .finish()
+    }
+}
+
+/// Azure Blob container coordinates and request settings used to manage the container contents (read and write).
+#[derive(Clone, PartialEq, Eq, Serialize, Deserialize)]
+pub struct AzureConfig {
+    /// Name of the container to connect to.
+    pub container_name: String,
+    /// Name of the storage account the container belongs to.
+    pub storage_account: Option<String>,
+    /// The region where the container is located.
+    pub container_region: String,
+    /// A "subfolder" in the container, so that multiple remote storage users can use the same container separately at once.
+    pub prefix_in_container: Option<String>,
+    /// Azure has various limits on its API calls, which we must not exceed.
+    /// See [`DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT`] for more details.
+    #[serde(default = "default_remote_storage_azure_concurrency_limit")]
+    pub concurrency_limit: NonZeroUsize,
+    #[serde(default = "default_max_keys_per_list_response")]
+    pub max_keys_per_list_response: Option<i32>,
+}
+
+fn default_remote_storage_azure_concurrency_limit() -> NonZeroUsize {
+    NonZeroUsize::new(DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT).unwrap()
+}
+
+impl Debug for AzureConfig {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("AzureConfig")
+            .field("container_name", &self.container_name)
+            .field("storage_account", &self.storage_account)
+            .field("container_region", &self.container_region)
+            .field("prefix_in_container", &self.prefix_in_container)
+            .field("concurrency_limit", &self.concurrency_limit)
+            .field(
+                "max_keys_per_list_response",
+                &self.max_keys_per_list_response,
+            )
+            .finish()
+    }
+}
+
+fn deserialize_storage_class<'de, D: serde::Deserializer<'de>>(
+    deserializer: D,
+) -> Result<Option<StorageClass>, D::Error> {
+    Option::<String>::deserialize(deserializer).and_then(|s| {
+        if let Some(s) = s {
+            use serde::de::Error;
+            let storage_class = StorageClass::from_str(&s).expect("infallible");
+            #[allow(deprecated)]
+            if matches!(storage_class, StorageClass::Unknown(_)) {
+                return Err(D::Error::custom(format!(
+                    "Specified storage class unknown to SDK: '{s}'. Allowed values: {:?}",
+                    StorageClass::values()
+                )));
+            }
+            Ok(Some(storage_class))
+        } else {
+            Ok(None)
+        }
+    })
+}
+
+fn serialize_storage_class<S: serde::Serializer>(
+    val: &Option<StorageClass>,
+    serializer: S,
+) -> Result<S::Ok, S::Error> {
+    let val = val.as_ref().map(StorageClass::as_str);
+    Option::<&str>::serialize(&val, serializer)
+}
+
+impl RemoteStorageConfig {
+    pub const DEFAULT_TIMEOUT: Duration = std::time::Duration::from_secs(120);
+
+    pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result<Option<RemoteStorageConfig>> {
+        let document: toml_edit::Document = match toml {
+            toml_edit::Item::Table(toml) => toml.clone().into(),
+            toml_edit::Item::Value(toml_edit::Value::InlineTable(toml)) => {
+                toml.clone().into_table().into()
+            }
+            _ => bail!("toml not a table or inline table"),
+        };
+
+        if document.is_empty() {
+            return Ok(None);
+        }
+
+        Ok(Some(toml_edit::de::from_document(document)?))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn parse(input: &str) -> anyhow::Result<Option<RemoteStorageConfig>> {
+        let toml = input.parse::<toml_edit::Document>().unwrap();
+        RemoteStorageConfig::from_toml(toml.as_item())
+    }
+
+    #[test]
+    fn parse_localfs_config_with_timeout() {
+        let input = "local_path = '.'
+timeout = '5s'";
+
+        let config = parse(input).unwrap().expect("it exists");
+
+        assert_eq!(
+            config,
+            RemoteStorageConfig {
+                storage: RemoteStorageKind::LocalFs {
+                    local_path: Utf8PathBuf::from(".")
+                },
+                timeout: Duration::from_secs(5)
+            }
+        );
+    }
+
+    #[test]
+    fn test_s3_parsing() {
+        let toml = "\
+    bucket_name = 'foo-bar'
+    bucket_region = 'eu-central-1'
+    upload_storage_class = 'INTELLIGENT_TIERING'
+    timeout = '7s'
+    ";
+
+        let config = parse(toml).unwrap().expect("it exists");
+
+        assert_eq!(
+            config,
+            RemoteStorageConfig {
+                storage: RemoteStorageKind::AwsS3(S3Config {
+                    bucket_name: "foo-bar".into(),
+                    bucket_region: "eu-central-1".into(),
+                    prefix_in_bucket: None,
+                    endpoint: None,
+                    concurrency_limit: default_remote_storage_s3_concurrency_limit(),
+                    max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE,
+                    upload_storage_class: Some(StorageClass::IntelligentTiering),
+                }),
+                timeout: Duration::from_secs(7)
+            }
+        );
+    }
+
+    #[test]
+    fn test_azure_parsing() {
+        let toml = "\
+    container_name = 'foo-bar'
+    container_region = 'westeurope'
+    storage_account = 'foo-account'
+    timeout = '7s'
+    ";
+
+        let config = parse(toml).unwrap().expect("it exists");
+
+        assert_eq!(
+            config,
+            RemoteStorageConfig {
+                storage: RemoteStorageKind::AzureContainer(AzureConfig {
+                    container_name: "foo-bar".into(),
+                    storage_account: Some("foo-account".into()),
+                    container_region: "westeurope".into(),
+                    prefix_in_container: None,
+                    concurrency_limit: default_remote_storage_azure_concurrency_limit(),
+                    max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE,
+                }),
+                timeout: Duration::from_secs(7)
+            }
+        );
+    }
+}
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -10,6 +10,7 @@
 #![deny(clippy::undocumented_unsafe_blocks)]

 mod azure_blob;
+mod config;
 mod error;
 mod local_fs;
 mod metrics;
@@ -18,17 +19,10 @@ mod simulate_failures;
 mod support;

 use std::{
-    collections::HashMap,
-    fmt::Debug,
-    num::{NonZeroU32, NonZeroUsize},
-    pin::Pin,
-    str::FromStr,
-    sync::Arc,
-    time::{Duration, SystemTime},
+    collections::HashMap, fmt::Debug, num::NonZeroU32, pin::Pin, sync::Arc, time::SystemTime,
 };

-use anyhow::{bail, Context};
-use aws_sdk_s3::types::StorageClass;
+use anyhow::Context;
 use camino::{Utf8Path, Utf8PathBuf};

 use bytes::Bytes;
@@ -36,7 +30,6 @@ use futures::stream::Stream;
 use serde::{Deserialize, Serialize};
 use tokio::sync::Semaphore;
 use tokio_util::sync::CancellationToken;
-use toml_edit::Item;
 use tracing::info;

 pub use self::{
@@ -45,6 +38,8 @@ pub use self::{
 };
 use s3_bucket::RequestKind;

+pub use crate::config::{AzureConfig, RemoteStorageConfig, RemoteStorageKind, S3Config};
+
 /// Azure SDK's ETag type is a simple String wrapper: we use this internally instead of repeating it here.
 pub use azure_core::Etag;

@@ -451,7 +446,7 @@ impl GenericRemoteStorage {
    pub fn from_config(storage_config: &RemoteStorageConfig) -> anyhow::Result<Self> {
        let timeout = storage_config.timeout;
        Ok(match &storage_config.storage {
-            RemoteStorageKind::LocalFs(path) => {
+            RemoteStorageKind::LocalFs { local_path: path } => {
                info!("Using fs root '{path}' as a remote storage");
                Self::LocalFs(LocalFs::new(path.clone(), timeout)?)
            }
@@ -526,262 +521,6 @@ impl<const N: usize> From<[(&str, &str); N]> for StorageMetadata {
    }
 }

-/// External backup storage configuration, enough for creating a client for that storage.
-#[derive(Debug, Clone, PartialEq, Eq)]
-pub struct RemoteStorageConfig {
-    /// The storage connection configuration.
-    pub storage: RemoteStorageKind,
-    /// A common timeout enforced for all requests after concurrency limiter permit has been
-    /// acquired.
-    pub timeout: Duration,
-}
-
-/// A kind of a remote storage to connect to, with its connection configuration.
-#[derive(Debug, Clone, PartialEq, Eq)]
-pub enum RemoteStorageKind {
-    /// Storage based on local file system.
-    /// Specify a root folder to place all stored files into.
-    LocalFs(Utf8PathBuf),
-    /// AWS S3 based storage, storing all files in the S3 bucket
-    /// specified by the config
-    AwsS3(S3Config),
-    /// Azure Blob based storage, storing all files in the container
-    /// specified by the config
-    AzureContainer(AzureConfig),
-}
-
-/// AWS S3 bucket coordinates and access credentials to manage the bucket contents (read and write).
-#[derive(Clone, PartialEq, Eq)]
-pub struct S3Config {
-    /// Name of the bucket to connect to.
-    pub bucket_name: String,
-    /// The region where the bucket is located at.
-    pub bucket_region: String,
-    /// A "subfolder" in the bucket, to use the same bucket separately by multiple remote storage users at once.
-    pub prefix_in_bucket: Option<String>,
-    /// A base URL to send S3 requests to.
-    /// By default, the endpoint is derived from a region name, assuming it's
-    /// an AWS S3 region name, erroring on wrong region name.
-    /// Endpoint provides a way to support other S3 flavors and their regions.
-    ///
-    /// Example: `http://127.0.0.1:5000`
-    pub endpoint: Option<String>,
-    /// AWS S3 has various limits on its API calls, we need not to exceed those.
-    /// See [`DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT`] for more details.
-    pub concurrency_limit: NonZeroUsize,
-    pub max_keys_per_list_response: Option<i32>,
-    pub upload_storage_class: Option<StorageClass>,
-}
-
-impl Debug for S3Config {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        f.debug_struct("S3Config")
-            .field("bucket_name", &self.bucket_name)
-            .field("bucket_region", &self.bucket_region)
-            .field("prefix_in_bucket", &self.prefix_in_bucket)
-            .field("concurrency_limit", &self.concurrency_limit)
-            .field(
-                "max_keys_per_list_response",
-                &self.max_keys_per_list_response,
-            )
-            .finish()
-    }
-}
-
-/// Azure  bucket coordinates and access credentials to manage the bucket contents (read and write).
-#[derive(Clone, PartialEq, Eq)]
-pub struct AzureConfig {
-    /// Name of the container to connect to.
-    pub container_name: String,
-    /// Name of the storage account the container is inside of
-    pub storage_account: Option<String>,
-    /// The region where the bucket is located at.
-    pub container_region: String,
-    /// A "subfolder" in the container, to use the same container separately by multiple remote storage users at once.
-    pub prefix_in_container: Option<String>,
-    /// Azure has various limits on its API calls, we need not to exceed those.
-    /// See [`DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT`] for more details.
-    pub concurrency_limit: NonZeroUsize,
-    pub max_keys_per_list_response: Option<i32>,
-}
-
-impl Debug for AzureConfig {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        f.debug_struct("AzureConfig")
-            .field("bucket_name", &self.container_name)
-            .field("storage_account", &self.storage_account)
-            .field("bucket_region", &self.container_region)
-            .field("prefix_in_container", &self.prefix_in_container)
-            .field("concurrency_limit", &self.concurrency_limit)
-            .field(
-                "max_keys_per_list_response",
-                &self.max_keys_per_list_response,
-            )
-            .finish()
-    }
-}
-
-impl RemoteStorageConfig {
-    pub const DEFAULT_TIMEOUT: Duration = std::time::Duration::from_secs(120);
-
-    pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result<Option<RemoteStorageConfig>> {
-        let local_path = toml.get("local_path");
-        let bucket_name = toml.get("bucket_name");
-        let bucket_region = toml.get("bucket_region");
-        let container_name = toml.get("container_name");
-        let container_region = toml.get("container_region");
-
-        let use_azure = container_name.is_some() && container_region.is_some();
-
-        let default_concurrency_limit = if use_azure {
-            DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT
-        } else {
-            DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT
-        };
-        let concurrency_limit = NonZeroUsize::new(
-            parse_optional_integer("concurrency_limit", toml)?.unwrap_or(default_concurrency_limit),
-        )
-        .context("Failed to parse 'concurrency_limit' as a positive integer")?;
-
-        let max_keys_per_list_response =
-            parse_optional_integer::<i32, _>("max_keys_per_list_response", toml)
-                .context("Failed to parse 'max_keys_per_list_response' as a positive integer")?
-                .or(DEFAULT_MAX_KEYS_PER_LIST_RESPONSE);
-
-        let endpoint = toml
-            .get("endpoint")
-            .map(|endpoint| parse_toml_string("endpoint", endpoint))
-            .transpose()?;
-
-        let timeout = toml
-            .get("timeout")
-            .map(|timeout| {
-                timeout
-                    .as_str()
-                    .ok_or_else(|| anyhow::Error::msg("timeout was not a string"))
-            })
-            .transpose()
-            .and_then(|timeout| {
-                timeout
-                    .map(humantime::parse_duration)
-                    .transpose()
-                    .map_err(anyhow::Error::new)
-            })
-            .context("parse timeout")?
-            .unwrap_or(Self::DEFAULT_TIMEOUT);
-
-        if timeout < Duration::from_secs(1) {
-            bail!("timeout was specified as {timeout:?} which is too low");
-        }
-
-        let storage = match (
-            local_path,
-            bucket_name,
-            bucket_region,
-            container_name,
-            container_region,
-        ) {
-            // no 'local_path' nor 'bucket_name' options are provided, consider this remote storage disabled
-            (None, None, None, None, None) => return Ok(None),
-            (_, Some(_), None, ..) => {
-                bail!("'bucket_region' option is mandatory if 'bucket_name' is given ")
-            }
-            (_, None, Some(_), ..) => {
-                bail!("'bucket_name' option is mandatory if 'bucket_region' is given ")
-            }
-            (None, Some(bucket_name), Some(bucket_region), ..) => {
-                RemoteStorageKind::AwsS3(S3Config {
-                    bucket_name: parse_toml_string("bucket_name", bucket_name)?,
-                    bucket_region: parse_toml_string("bucket_region", bucket_region)?,
-                    prefix_in_bucket: toml
-                        .get("prefix_in_bucket")
-                        .map(|prefix_in_bucket| {
-                            parse_toml_string("prefix_in_bucket", prefix_in_bucket)
-                        })
-                        .transpose()?,
-                    endpoint,
-                    concurrency_limit,
-                    max_keys_per_list_response,
-                    upload_storage_class: toml
-                        .get("upload_storage_class")
-                        .map(|prefix_in_bucket| -> anyhow::Result<_> {
-                            let s = parse_toml_string("upload_storage_class", prefix_in_bucket)?;
-                            let storage_class = StorageClass::from_str(&s).expect("infallible");
-                            #[allow(deprecated)]
-                            if matches!(storage_class, StorageClass::Unknown(_)) {
-                                bail!("Specified storage class unknown to SDK: '{s}'. Allowed values: {:?}", StorageClass::values());
-                            }
-                            Ok(storage_class)
-                        })
-                        .transpose()?,
-                })
-            }
-            (_, _, _, Some(_), None) => {
-                bail!("'container_name' option is mandatory if 'container_region' is given ")
-            }
-            (_, _, _, None, Some(_)) => {
-                bail!("'container_name' option is mandatory if 'container_region' is given ")
-            }
-            (None, None, None, Some(container_name), Some(container_region)) => {
-                RemoteStorageKind::AzureContainer(AzureConfig {
-                    container_name: parse_toml_string("container_name", container_name)?,
-                    storage_account: toml
-                        .get("storage_account")
-                        .map(|storage_account| {
-                            parse_toml_string("storage_account", storage_account)
-                        })
-                        .transpose()?,
-                    container_region: parse_toml_string("container_region", container_region)?,
-                    prefix_in_container: toml
-                        .get("prefix_in_container")
-                        .map(|prefix_in_container| {
-                            parse_toml_string("prefix_in_container", prefix_in_container)
-                        })
-                        .transpose()?,
-                    concurrency_limit,
-                    max_keys_per_list_response,
-                })
-            }
-            (Some(local_path), None, None, None, None) => RemoteStorageKind::LocalFs(
-                Utf8PathBuf::from(parse_toml_string("local_path", local_path)?),
-            ),
-            (Some(_), Some(_), ..) => {
-                bail!("'local_path' and 'bucket_name' are mutually exclusive")
-            }
-            (Some(_), _, _, Some(_), Some(_)) => {
-                bail!("local_path and 'container_name' are mutually exclusive")
-            }
-        };
-
-        Ok(Some(RemoteStorageConfig { storage, timeout }))
-    }
-}
-
-// Helper functions to parse a toml Item
-fn parse_optional_integer<I, E>(name: &str, item: &toml_edit::Item) -> anyhow::Result<Option<I>>
-where
-    I: TryFrom<i64, Error = E>,
-    E: std::error::Error + Send + Sync + 'static,
-{
-    let toml_integer = match item.get(name) {
-        Some(item) => item
-            .as_integer()
-            .with_context(|| format!("configure option {name} is not an integer"))?,
-        None => return Ok(None),
-    };
-
-    I::try_from(toml_integer)
-        .map(Some)
-        .with_context(|| format!("configure option {name} is too large"))
-}
-
-fn parse_toml_string(name: &str, item: &Item) -> anyhow::Result<String> {
-    let s = item
-        .as_str()
-        .with_context(|| format!("configure option {name} is not a string"))?;
-    Ok(s.to_string())
-}
-
 struct ConcurrencyLimiter {
    // Every request to S3 can be throttled or cancelled, if a certain number of requests per second is exceeded.
    // Same goes to IAM, which is queried before every S3 request, if enabled. IAM has even lower RPS threshold.
@@ -849,24 +588,4 @@ mod tests {
        let err = RemotePath::new(Utf8Path::new("/")).expect_err("Should fail on absolute paths");
        assert_eq!(err.to_string(), "Path \"/\" is not relative");
    }
-
-    #[test]
-    fn parse_localfs_config_with_timeout() {
-        let input = "local_path = '.'
-timeout = '5s'";
-
-        let toml = input.parse::<toml_edit::Document>().unwrap();
-
-        let config = RemoteStorageConfig::from_toml(toml.as_item())
-            .unwrap()
-            .expect("it exists");
-
-        assert_eq!(
-            config,
-            RemoteStorageConfig {
-                storage: RemoteStorageKind::LocalFs(Utf8PathBuf::from(".")),
-                timeout: Duration::from_secs(5)
-            }
-        );
-    }
 }
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -46,12 +46,12 @@ use utils::backoff;

 use super::StorageMetadata;
 use crate::{
+    config::S3Config,
    error::Cancelled,
    metrics::{start_counting_cancelled_wait, start_measuring_requests},
    support::PermitCarrying,
    ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage,
-    S3Config, TimeTravelError, TimeoutOrCancel, MAX_KEYS_PER_DELETE,
-    REMOTE_STORAGE_PREFIX_SEPARATOR,
+    TimeTravelError, TimeoutOrCancel, MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR,
 };

 use crate::metrics::AttemptOutcome;
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -394,6 +394,10 @@ fn start_pageserver(
        deletion_workers.spawn_with(BACKGROUND_RUNTIME.handle());
    }

+    // Set up global state shared by all walredo processes.
+    let walredo_global_state =
+        BACKGROUND_RUNTIME.block_on(pageserver::walredo::GlobalState::spawn(conf));
+
    // Up to this point no significant I/O has been done: this should have been fast.  Record
    // duration prior to starting I/O intensive phase of startup.
    startup_checkpoint(started_startup_at, "initial", "Starting loading tenants");
@@ -429,6 +433,7 @@ fn start_pageserver(
            broker_client: broker_client.clone(),
            remote_storage: remote_storage.clone(),
            deletion_queue_client,
+            walredo_global_state: Arc::clone(&walredo_global_state),
        },
        order,
        shutdown_pageserver.clone(),
@@ -689,7 +694,13 @@ fn start_pageserver(
            // Right now that tree doesn't reach very far, and `task_mgr` is used instead.
            // The plan is to change that over time.
            shutdown_pageserver.take();
-            pageserver::shutdown_pageserver(&tenant_manager, deletion_queue.clone(), 0).await;
+            pageserver::shutdown_pageserver(
+                &tenant_manager,
+                deletion_queue.clone(),
+                walredo_global_state,
+                0,
+            )
+            .await;
            unreachable!()
        })
    }
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -33,9 +33,7 @@ use utils::{
 use crate::tenant::timeline::GetVectoredImpl;
 use crate::tenant::vectored_blob_io::MaxVectoredReadBytes;
 use crate::tenant::{config::TenantConfOpt, timeline::GetImpl};
-use crate::tenant::{
-    TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME,
-};
+use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
 use crate::{disk_usage_eviction_task::DiskUsageEvictionTaskConfig, virtual_file::io_engine};
 use crate::{tenant::config::TenantConf, virtual_file};
 use crate::{
@@ -855,14 +853,6 @@ impl PageServerConf {
        )
    }

-    pub(crate) fn tenant_deleted_mark_file_path(
-        &self,
-        tenant_shard_id: &TenantShardId,
-    ) -> Utf8PathBuf {
-        self.tenant_path(tenant_shard_id)
-            .join(TENANT_DELETED_MARKER_FILE_NAME)
-    }
-
    pub fn traces_path(&self) -> Utf8PathBuf {
        self.workdir.join("traces")
    }
@@ -1463,7 +1453,7 @@ broker_endpoint = '{broker_endpoint}'
            assert_eq!(
                parsed_remote_storage_config,
                RemoteStorageConfig {
-                    storage: RemoteStorageKind::LocalFs(local_storage_path.clone()),
+                    storage: RemoteStorageKind::LocalFs { local_path: local_storage_path.clone() },
                    timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
                },
                "Remote storage config should correctly parse the local FS config and fill other storage defaults"
--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -850,7 +850,9 @@ mod test {
        std::fs::create_dir_all(remote_fs_dir)?;
        let remote_fs_dir = harness.conf.workdir.join("remote_fs").canonicalize_utf8()?;
        let storage_config = RemoteStorageConfig {
-            storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
+            storage: RemoteStorageKind::LocalFs {
+                local_path: remote_fs_dir.clone(),
+            },
            timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
        };
        let storage = GenericRemoteStorage::from_config(&storage_config).unwrap();
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -329,14 +329,11 @@ impl From<crate::tenant::mgr::DeleteTimelineError> for ApiError {
    }
 }

-impl From<crate::tenant::delete::DeleteTenantError> for ApiError {
-    fn from(value: crate::tenant::delete::DeleteTenantError) -> Self {
-        use crate::tenant::delete::DeleteTenantError::*;
+impl From<crate::tenant::mgr::DeleteTenantError> for ApiError {
+    fn from(value: crate::tenant::mgr::DeleteTenantError) -> Self {
+        use crate::tenant::mgr::DeleteTenantError::*;
        match value {
-            Get(g) => ApiError::from(g),
-            Timeline(t) => ApiError::from(t),
            SlotError(e) => e.into(),
-            SlotUpsertError(e) => e.into(),
            Other(o) => ApiError::InternalServerError(o),
            Cancelled => ApiError::ShuttingDown,
        }
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -11,6 +11,8 @@ pub mod deletion_queue;
 pub mod disk_usage_eviction_task;
 pub mod http;
 pub mod import_datadir;
+use std::sync::Arc;
+
 pub use pageserver_api::keyspace;
 pub mod aux_file;
 pub mod metrics;
@@ -58,6 +60,7 @@ pub use crate::metrics::preinitialize_metrics;
 pub async fn shutdown_pageserver(
    tenant_manager: &TenantManager,
    mut deletion_queue: DeletionQueue,
+    walredo_global_state: Arc<walredo::GlobalState>,
    exit_code: i32,
 ) {
    use std::time::Duration;
@@ -79,6 +82,18 @@ pub async fn shutdown_pageserver(
    )
    .await;

+    // walredo processes are tenant-scoped and should have been shut down after tenant manager shutdown above.
+    //
+    // In practice, we have lingering walredo processes even when pageserver shuts down cleanly, i.e., even when it
+    // does not hit systemd's TimeoutSec timeout (10 seconds in prod).
+    // TODO: understand why the processes aren't gone by the time tenant_manager.shutdown() above returns.
+    timed(
+        walredo_global_state.shutdown(),
+        "wait for all walredo processes to exit",
+        Duration::from_secs(1),
+    )
+    .await;
+
    // Shut down any page service tasks: any in-progress work for particular timelines or tenants
    // should already have been canclled via mgr::shutdown_all_tenants
    timed(
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -55,11 +55,9 @@ use self::config::AttachedLocationConfig;
 use self::config::AttachmentMode;
 use self::config::LocationConf;
 use self::config::TenantConf;
-use self::delete::DeleteTenantFlow;
 use self::metadata::TimelineMetadata;
 use self::mgr::GetActiveTenantError;
 use self::mgr::GetTenantError;
-use self::mgr::TenantsMap;
 use self::remote_timeline_client::upload::upload_index_part;
 use self::remote_timeline_client::RemoteTimelineClient;
 use self::timeline::uninit::TimelineCreateGuard;
@@ -90,6 +88,7 @@ use crate::tenant::remote_timeline_client::MaybeDeletedIndexPart;
 use crate::tenant::remote_timeline_client::INITDB_PATH;
 use crate::tenant::storage_layer::DeltaLayer;
 use crate::tenant::storage_layer::ImageLayer;
+use crate::walredo::Error;
 use crate::InitializationOrder;
 use std::collections::hash_map::Entry;
 use std::collections::BTreeSet;
@@ -137,7 +136,6 @@ pub mod remote_timeline_client;
 pub mod storage_layer;

 pub mod config;
-pub mod delete;
 pub mod mgr;
 pub mod secondary;
 pub mod tasks;
@@ -161,8 +159,6 @@ pub const TENANTS_SEGMENT_NAME: &str = "tenants";
 /// Parts of the `.neon/tenants/<tenant_id>/timelines/<timeline_id>` directory prefix.
 pub const TIMELINES_SEGMENT_NAME: &str = "timelines";

-pub const TENANT_DELETED_MARKER_FILE_NAME: &str = "deleted";
-
 /// References to shared objects that are passed into each tenant, such
 /// as the shared remote storage client and process initialization state.
 #[derive(Clone)]
@@ -170,6 +166,7 @@ pub struct TenantSharedResources {
    pub broker_client: storage_broker::BrokerClientChannel,
    pub remote_storage: GenericRemoteStorage,
    pub deletion_queue_client: DeletionQueueClient,
+    pub walredo_global_state: Arc<crate::walredo::GlobalState>,
 }

 /// A [`Tenant`] is really an _attached_ tenant.  The configuration
@@ -207,7 +204,6 @@ struct TimelinePreload {
 }

 pub(crate) struct TenantPreload {
-    deleting: bool,
    timelines: HashMap<TimelineId, TimelinePreload>,
 }

@@ -286,8 +282,6 @@ pub struct Tenant {
    /// background warmup.
    pub(crate) activate_now_sem: tokio::sync::Semaphore,

-    pub(crate) delete_progress: Arc<tokio::sync::Mutex<DeleteTenantFlow>>,
-
    // Cancellation token fires when we have entered shutdown().  This is a parent of
    // Timelines' cancellation token.
    pub(crate) cancel: CancellationToken,
@@ -331,6 +325,17 @@ impl From<harness::TestRedoManager> for WalRedoManager {
 }

 impl WalRedoManager {
+    /// Shuts down the wrapped production redo manager; a no-op for the test manager.
+    pub(crate) async fn shutdown(&self) {
+        match self {
+            Self::Prod(mgr) => mgr.shutdown().await,
+            #[cfg(test)]
+            Self::Test(_) => {
+                // Not applicable to test redo manager
+            }
+        }
+    }
+
    pub(crate) fn maybe_quiesce(&self, idle_timeout: Duration) {
        match self {
            Self::Prod(mgr) => mgr.maybe_quiesce(idle_timeout),
@@ -351,7 +356,7 @@ impl WalRedoManager {
        base_img: Option<(Lsn, bytes::Bytes)>,
        records: Vec<(Lsn, crate::walrecord::NeonWalRecord)>,
        pg_version: u32,
-    ) -> anyhow::Result<bytes::Bytes> {
+    ) -> Result<bytes::Bytes, Error> {
        match self {
            Self::Prod(mgr) => {
                mgr.request_redo(key, lsn, base_img, records, pg_version)
@@ -654,21 +659,21 @@ impl Tenant {
        attached_conf: AttachedTenantConf,
        shard_identity: ShardIdentity,
        init_order: Option<InitializationOrder>,
-        tenants: &'static std::sync::RwLock<TenantsMap>,
        mode: SpawnMode,
        ctx: &RequestContext,
    ) -> anyhow::Result<Arc<Tenant>> {
-        let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new(
-            conf,
-            tenant_shard_id,
-        )));
-
        let TenantSharedResources {
            broker_client,
            remote_storage,
            deletion_queue_client,
+            walredo_global_state,
        } = resources;

+        let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new(
+            walredo_global_state,
+            tenant_shard_id,
+        )));
+
        let attach_mode = attached_conf.location.attach_mode;
        let generation = attached_conf.location.generation;

@@ -828,52 +833,6 @@ impl Tenant {
                // Remote preload is complete.
                drop(remote_load_completion);

-                let pending_deletion = {
-                    match DeleteTenantFlow::should_resume_deletion(
-                        conf,
-                        preload.as_ref().map(|p| p.deleting).unwrap_or(false),
-                        &tenant_clone,
-                    )
-                    .await
-                    {
-                        Ok(should_resume_deletion) => should_resume_deletion,
-                        Err(err) => {
-                            make_broken(&tenant_clone, anyhow::anyhow!(err), BrokenVerbosity::Error);
-                            return Ok(());
-                        }
-                    }
-                };
-
-                info!("pending_deletion {}", pending_deletion.is_some());
-
-                if let Some(deletion) = pending_deletion {
-                    // as we are no longer loading, signal completion by dropping
-                    // the completion while we resume deletion
-                    drop(_completion);
-                    let background_jobs_can_start =
-                        init_order.as_ref().map(|x| &x.background_jobs_can_start);
-                    if let Some(background) = background_jobs_can_start {
-                        info!("waiting for backgound jobs barrier");
-                        background.clone().wait().await;
-                        info!("ready for backgound jobs barrier");
-                    }
-
-                    let deleted = DeleteTenantFlow::resume_from_attach(
-                        deletion,
-                        &tenant_clone,
-                        preload,
-                        tenants,
-                        &ctx,
-                    )
-                    .await;
-
-                    if let Err(e) = deleted {
-                        make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error);
-                    }
-
-                    return Ok(());
-                }
-
                // We will time the duration of the attach phase unless this is a creation (attach will do no work)
                let attached = {
                    let _attach_timer = match mode {
@@ -931,21 +890,13 @@ impl Tenant {
        )
        .await?;

-        let deleting = other_keys.contains(TENANT_DELETED_MARKER_FILE_NAME);
-        info!(
-            "found {} timelines, deleting={}",
-            remote_timeline_ids.len(),
-            deleting
-        );
+        info!("found {} timelines", remote_timeline_ids.len(),);

        for k in other_keys {
-            if k != TENANT_DELETED_MARKER_FILE_NAME {
-                warn!("Unexpected non timeline key {k}");
-            }
+            warn!("Unexpected non timeline key {k}");
        }

        Ok(TenantPreload {
-            deleting,
            timelines: Self::load_timeline_metadata(
                self,
                remote_timeline_ids,
@@ -974,7 +925,6 @@ impl Tenant {
        let preload = match (preload, mode) {
            (Some(p), _) => p,
            (None, SpawnMode::Create) => TenantPreload {
-                deleting: false,
                timelines: HashMap::new(),
            },
            (None, _) => {
@@ -1941,6 +1891,10 @@ impl Tenant {
        tracing::debug!("Waiting for tasks...");
        task_mgr::shutdown_tasks(None, Some(self.tenant_shard_id), None).await;

+        if let Some(walredo_mgr) = self.walredo_mgr.as_ref() {
+            walredo_mgr.shutdown().await;
+        }
+
        // Wait for any in-flight operations to complete
        self.gate.close().await;

@@ -2215,6 +2169,7 @@ impl Tenant {
            // Upload an index from the parent: this is partly to provide freshness for the
            // child tenants that will copy it, and partly for general ease-of-debugging: there will
            // always be a parent shard index in the same generation as we wrote the child shard index.
+            tracing::info!(timeline_id=%timeline.timeline_id, "Uploading index");
            timeline
                .remote_client
                .schedule_index_upload_for_file_changes()?;
@@ -2222,12 +2177,14 @@ impl Tenant {

            // Shut down the timeline's remote client: this means that the indices we write
            // for child shards will not be invalidated by the parent shard deleting layers.
+            tracing::info!(timeline_id=%timeline.timeline_id, "Shutting down remote storage client");
            timeline.remote_client.shutdown().await;

            // Download methods can still be used after shutdown, as they don't flow through the remote client's
            // queue.  In principal the RemoteTimelineClient could provide this without downloading it, but this
            // operation is rare, so it's simpler to just download it (and robustly guarantees that the index
            // we use here really is the remotely persistent one).
+            tracing::info!(timeline_id=%timeline.timeline_id, "Downloading index_part from parent");
            let result = timeline.remote_client
                .download_index_file(&self.cancel)
                .instrument(info_span!("download_index_file", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline.timeline_id))
@@ -2240,6 +2197,7 @@ impl Tenant {
            };

            for child_shard in child_shards {
+                tracing::info!(timeline_id=%timeline.timeline_id, "Uploading index_part for child {}", child_shard.to_index());
                upload_index_part(
                    &self.remote_storage,
                    child_shard,
@@ -2628,7 +2586,6 @@ impl Tenant {
            cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)),
            eviction_task_tenant_state: tokio::sync::Mutex::new(EvictionTaskTenantState::default()),
            activate_now_sem: tokio::sync::Semaphore::new(0),
-            delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTenantFlow::default())),
            cancel: CancellationToken::default(),
            gate: Gate::default(),
            timeline_get_throttle: Arc::new(throttle::Throttle::new(
@@ -3906,7 +3863,9 @@ pub(crate) mod harness {
            let remote_fs_dir = conf.workdir.join("localfs");
            std::fs::create_dir_all(&remote_fs_dir).unwrap();
            let config = RemoteStorageConfig {
-                storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
+                storage: RemoteStorageKind::LocalFs {
+                    local_path: remote_fs_dir.clone(),
+                },
                timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
            };
            let remote_storage = GenericRemoteStorage::from_config(&config).unwrap();
@@ -4012,7 +3971,7 @@ pub(crate) mod harness {
            base_img: Option<(Lsn, Bytes)>,
            records: Vec<(Lsn, NeonWalRecord)>,
            _pg_version: u32,
-        ) -> anyhow::Result<Bytes> {
+        ) -> Result<Bytes, Error> {
            let records_neon = records.iter().all(|r| apply_neon::can_apply_in_neon(&r.1));
            if records_neon {
                // For Neon wal records, we can decode without spawning postgres, so do so.
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -1,426 +0,0 @@
-use std::sync::Arc;
-
-use anyhow::Context;
-use camino::{Utf8Path, Utf8PathBuf};
-use pageserver_api::{models::TenantState, shard::TenantShardId};
-use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel};
-use tokio::sync::OwnedMutexGuard;
-use tokio_util::sync::CancellationToken;
-use tracing::{error, Instrument};
-
-use utils::{backoff, completion, crashsafe, fs_ext, id::TimelineId, pausable_failpoint};
-
-use crate::{
-    config::PageServerConf,
-    context::RequestContext,
-    task_mgr::{self},
-    tenant::{
-        mgr::{TenantSlot, TenantsMapRemoveResult},
-        remote_timeline_client::remote_heatmap_path,
-    },
-};
-
-use super::{
-    mgr::{GetTenantError, TenantSlotError, TenantSlotUpsertError, TenantsMap},
-    remote_timeline_client::{FAILED_REMOTE_OP_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD},
-    timeline::delete::DeleteTimelineFlow,
-    tree_sort_timelines, DeleteTimelineError, Tenant, TenantPreload,
-};
-
-#[derive(Debug, thiserror::Error)]
-pub(crate) enum DeleteTenantError {
-    #[error("GetTenant {0}")]
-    Get(#[from] GetTenantError),
-
-    #[error("Tenant map slot error {0}")]
-    SlotError(#[from] TenantSlotError),
-
-    #[error("Tenant map slot upsert error {0}")]
-    SlotUpsertError(#[from] TenantSlotUpsertError),
-
-    #[error("Timeline {0}")]
-    Timeline(#[from] DeleteTimelineError),
-
-    #[error("Cancelled")]
-    Cancelled,
-
-    #[error(transparent)]
-    Other(#[from] anyhow::Error),
-}
-
-type DeletionGuard = tokio::sync::OwnedMutexGuard<DeleteTenantFlow>;
-
-fn remote_tenant_delete_mark_path(
-    conf: &PageServerConf,
-    tenant_shard_id: &TenantShardId,
-) -> anyhow::Result<RemotePath> {
-    let tenant_remote_path = conf
-        .tenant_path(tenant_shard_id)
-        .strip_prefix(&conf.workdir)
-        .context("Failed to strip workdir prefix")
-        .and_then(RemotePath::new)
-        .context("tenant path")?;
-    Ok(tenant_remote_path.join(Utf8Path::new("timelines/deleted")))
-}
-
-async fn schedule_ordered_timeline_deletions(
-    tenant: &Arc<Tenant>,
-) -> Result<Vec<(Arc<tokio::sync::Mutex<DeleteTimelineFlow>>, TimelineId)>, DeleteTenantError> {
-    // Tenant is stopping at this point. We know it will be deleted.
-    // No new timelines should be created.
-    // Tree sort timelines to delete from leafs to the root.
-    // NOTE: by calling clone we release the mutex which creates a possibility for a race: pending deletion
-    // can complete and remove timeline from the map in between our call to clone
-    // and `DeleteTimelineFlow::run`, so `run` wont find timeline in `timelines` map.
-    // timelines.lock is currently synchronous so we cant hold it across await point.
-    // So just ignore NotFound error if we get it from `run`.
-    // Beware: in case it becomes async and we try to hold it here, `run` also locks it, which can create a deadlock.
-    let timelines = tenant.timelines.lock().unwrap().clone();
-    let sorted =
-        tree_sort_timelines(timelines, |t| t.get_ancestor_timeline_id()).context("tree sort")?;
-
-    let mut already_running_deletions = vec![];
-
-    for (timeline_id, _) in sorted.into_iter().rev() {
-        let span = tracing::info_span!("timeline_delete", %timeline_id);
-        let res = DeleteTimelineFlow::run(tenant, timeline_id, true)
-            .instrument(span)
-            .await;
-        if let Err(e) = res {
-            match e {
-                DeleteTimelineError::NotFound => {
-                    // Timeline deletion finished after call to clone above but before call
-                    // to `DeleteTimelineFlow::run` and removed timeline from the map.
-                    continue;
-                }
-                DeleteTimelineError::AlreadyInProgress(guard) => {
-                    already_running_deletions.push((guard, timeline_id));
-                    continue;
-                }
-                e => return Err(DeleteTenantError::Timeline(e)),
-            }
-        }
-    }
-
-    Ok(already_running_deletions)
-}
-
-async fn ensure_timelines_dir_empty(timelines_path: &Utf8Path) -> Result<(), DeleteTenantError> {
-    // Assert timelines dir is empty.
-    if !fs_ext::is_directory_empty(timelines_path).await? {
-        // Display first 10 items in directory
-        let list = fs_ext::list_dir(timelines_path).await.context("list_dir")?;
-        let list = &list.into_iter().take(10).collect::<Vec<_>>();
-        return Err(DeleteTenantError::Other(anyhow::anyhow!(
-            "Timelines directory is not empty after all timelines deletion: {list:?}"
-        )));
-    }
-
-    Ok(())
-}
-
-async fn remove_tenant_remote_delete_mark(
-    conf: &PageServerConf,
-    remote_storage: &GenericRemoteStorage,
-    tenant_shard_id: &TenantShardId,
-    cancel: &CancellationToken,
-) -> Result<(), DeleteTenantError> {
-    let path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?;
-    backoff::retry(
-        || async { remote_storage.delete(&path, cancel).await },
-        TimeoutOrCancel::caused_by_cancel,
-        FAILED_UPLOAD_WARN_THRESHOLD,
-        FAILED_REMOTE_OP_RETRIES,
-        "remove_tenant_remote_delete_mark",
-        cancel,
-    )
-    .await
-    .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
-    .and_then(|x| x)
-    .context("remove_tenant_remote_delete_mark")?;
-    Ok(())
-}
-
-// Cleanup fs traces: tenant config, timelines dir local delete mark, tenant dir
-async fn cleanup_remaining_fs_traces(
-    conf: &PageServerConf,
-    tenant_shard_id: &TenantShardId,
-) -> Result<(), DeleteTenantError> {
-    let rm = |p: Utf8PathBuf, is_dir: bool| async move {
-        if is_dir {
-            tokio::fs::remove_dir(&p).await
-        } else {
-            tokio::fs::remove_file(&p).await
-        }
-        .or_else(fs_ext::ignore_not_found)
-        .with_context(|| format!("failed to delete {p}"))
-    };
-
-    rm(conf.tenant_config_path(tenant_shard_id), false).await?;
-    rm(conf.tenant_location_config_path(tenant_shard_id), false).await?;
-
-    fail::fail_point!("tenant-delete-before-remove-timelines-dir", |_| {
-        Err(anyhow::anyhow!(
-            "failpoint: tenant-delete-before-remove-timelines-dir"
-        ))?
-    });
-
-    rm(conf.timelines_path(tenant_shard_id), true).await?;
-
-    fail::fail_point!("tenant-delete-before-remove-deleted-mark", |_| {
-        Err(anyhow::anyhow!(
-            "failpoint: tenant-delete-before-remove-deleted-mark"
-        ))?
-    });
-
-    // Make sure previous deletions are ordered before mark removal.
-    // Otherwise there is no guarantee that they reach the disk before mark deletion.
-    // So its possible for mark to reach disk first and for other deletions
-    // to be reordered later and thus missed if a crash occurs.
-    // Note that we dont need to sync after mark file is removed
-    // because we can tolerate the case when mark file reappears on startup.
-    let tenant_path = &conf.tenant_path(tenant_shard_id);
-    if tenant_path.exists() {
-        crashsafe::fsync_async(&conf.tenant_path(tenant_shard_id))
-            .await
-            .context("fsync_pre_mark_remove")?;
-    }
-
-    rm(conf.tenant_deleted_mark_file_path(tenant_shard_id), false).await?;
-
-    rm(conf.tenant_heatmap_path(tenant_shard_id), false).await?;
-
-    fail::fail_point!("tenant-delete-before-remove-tenant-dir", |_| {
-        Err(anyhow::anyhow!(
-            "failpoint: tenant-delete-before-remove-tenant-dir"
-        ))?
-    });
-
-    rm(conf.tenant_path(tenant_shard_id), true).await?;
-
-    Ok(())
-}
-
-#[derive(Default)]
-pub enum DeleteTenantFlow {
-    #[default]
-    NotStarted,
-    InProgress,
-    Finished,
-}
-
-impl DeleteTenantFlow {
-    pub(crate) async fn should_resume_deletion(
-        conf: &'static PageServerConf,
-        remote_mark_exists: bool,
-        tenant: &Tenant,
-    ) -> Result<Option<DeletionGuard>, DeleteTenantError> {
-        let acquire = |t: &Tenant| {
-            Some(
-                Arc::clone(&t.delete_progress)
-                    .try_lock_owned()
-                    .expect("we're the only owner during init"),
-            )
-        };
-
-        if remote_mark_exists {
-            return Ok(acquire(tenant));
-        }
-
-        // Check local mark first, if its there there is no need to go to s3 to check whether remote one exists.
-        if conf
-            .tenant_deleted_mark_file_path(&tenant.tenant_shard_id)
-            .exists()
-        {
-            Ok(acquire(tenant))
-        } else {
-            Ok(None)
-        }
-    }
-
-    pub(crate) async fn resume_from_attach(
-        guard: DeletionGuard,
-        tenant: &Arc<Tenant>,
-        preload: Option<TenantPreload>,
-        tenants: &'static std::sync::RwLock<TenantsMap>,
-        ctx: &RequestContext,
-    ) -> Result<(), DeleteTenantError> {
-        let (_, progress) = completion::channel();
-
-        tenant
-            .set_stopping(progress, false, true)
-            .await
-            .expect("cant be stopping or broken");
-
-        tenant
-            .attach(preload, super::SpawnMode::Eager, ctx)
-            .await
-            .context("attach")?;
-
-        Self::background(
-            guard,
-            tenant.conf,
-            tenant.remote_storage.clone(),
-            tenants,
-            tenant,
-        )
-        .await
-    }
-
-    async fn background(
-        mut guard: OwnedMutexGuard<Self>,
-        conf: &PageServerConf,
-        remote_storage: GenericRemoteStorage,
-        tenants: &'static std::sync::RwLock<TenantsMap>,
-        tenant: &Arc<Tenant>,
-    ) -> Result<(), DeleteTenantError> {
-        // Tree sort timelines, schedule delete for them. Mention retries from the console side.
-        // Note that if deletion fails we dont mark timelines as broken,
-        // the whole tenant will become broken as by `Self::schedule_background` logic
-        let already_running_timeline_deletions = schedule_ordered_timeline_deletions(tenant)
-            .await
-            .context("schedule_ordered_timeline_deletions")?;
-
-        fail::fail_point!("tenant-delete-before-polling-ongoing-deletions", |_| {
-            Err(anyhow::anyhow!(
-                "failpoint: tenant-delete-before-polling-ongoing-deletions"
-            ))?
-        });
-
-        // Wait for deletions that were already running at the moment when tenant deletion was requested.
-        // When we can lock deletion guard it means that corresponding timeline deletion finished.
-        for (guard, timeline_id) in already_running_timeline_deletions {
-            let flow = guard.lock().await;
-            if !flow.is_finished() {
-                return Err(DeleteTenantError::Other(anyhow::anyhow!(
-                    "already running timeline deletion failed: {timeline_id}"
-                )));
-            }
-        }
-
-        // Remove top-level tenant objects that don't belong to a timeline, such as heatmap
-        let heatmap_path = remote_heatmap_path(&tenant.tenant_shard_id());
-        if let Some(Err(e)) = backoff::retry(
-            || async {
-                remote_storage
-                    .delete(&heatmap_path, &task_mgr::shutdown_token())
-                    .await
-            },
-            TimeoutOrCancel::caused_by_cancel,
-            FAILED_UPLOAD_WARN_THRESHOLD,
-            FAILED_REMOTE_OP_RETRIES,
-            "remove_remote_tenant_heatmap",
-            &task_mgr::shutdown_token(),
-        )
-        .await
-        {
-            tracing::warn!("Failed to delete heatmap at {heatmap_path}: {e}");
-        }
-
-        let timelines_path = conf.timelines_path(&tenant.tenant_shard_id);
-        // May not exist if we fail in cleanup_remaining_fs_traces after removing it
-        if timelines_path.exists() {
-            // sanity check to guard against layout changes
-            ensure_timelines_dir_empty(&timelines_path)
-                .await
-                .context("timelines dir not empty")?;
-        }
-
-        remove_tenant_remote_delete_mark(
-            conf,
-            &remote_storage,
-            &tenant.tenant_shard_id,
-            &task_mgr::shutdown_token(),
-        )
-        .await?;
-
-        pausable_failpoint!("tenant-delete-before-cleanup-remaining-fs-traces-pausable");
-        fail::fail_point!("tenant-delete-before-cleanup-remaining-fs-traces", |_| {
-            Err(anyhow::anyhow!(
-                "failpoint: tenant-delete-before-cleanup-remaining-fs-traces"
-            ))?
-        });
-
-        cleanup_remaining_fs_traces(conf, &tenant.tenant_shard_id)
-            .await
-            .context("cleanup_remaining_fs_traces")?;
-
-        {
-            // This block is simply removing the TenantSlot for this tenant.  It requires a loop because
-            // we might conflict with a TenantSlot::InProgress marker and need to wait for it.
-            //
-            // This complexity will go away when we simplify how deletion works:
-            // https://github.com/neondatabase/neon/issues/5080
-            loop {
-                // Under the TenantMap lock, try to remove the tenant.  We usually succeed, but if
-                // we encounter an InProgress marker, yield the barrier it contains and wait on it.
-                let barrier = {
-                    let mut locked = tenants.write().unwrap();
-                    let removed = locked.remove(tenant.tenant_shard_id);
-
-                    // FIXME: we should not be modifying this from outside of mgr.rs.
-                    // This will go away when we simplify deletion (https://github.com/neondatabase/neon/issues/5080)
-
-                    // Update stats
-                    match &removed {
-                        TenantsMapRemoveResult::Occupied(slot) => {
-                            crate::metrics::TENANT_MANAGER.slot_removed(slot);
-                        }
-                        TenantsMapRemoveResult::InProgress(barrier) => {
-                            crate::metrics::TENANT_MANAGER
-                                .slot_removed(&TenantSlot::InProgress(barrier.clone()));
-                        }
-                        TenantsMapRemoveResult::Vacant => {
-                            // Nothing changed in map, no metric update
-                        }
-                    }
-
-                    match removed {
-                        TenantsMapRemoveResult::Occupied(TenantSlot::Attached(tenant)) => {
-                            match tenant.current_state() {
-                                TenantState::Stopping { .. } | TenantState::Broken { .. } => {
-                                    // Expected: we put the tenant into stopping state before we start deleting it
-                                }
-                                state => {
-                                    // Unexpected state
-                                    tracing::warn!(
-                                        "Tenant in unexpected state {state} after deletion"
-                                    );
-                                }
-                            }
-                            break;
-                        }
-                        TenantsMapRemoveResult::Occupied(TenantSlot::Secondary(_)) => {
-                            // This is unexpected: this secondary tenants should not have been created, and we
-                            // are not in a position to shut it down from here.
-                            tracing::warn!("Tenant transitioned to secondary mode while deleting!");
-                            break;
-                        }
-                        TenantsMapRemoveResult::Occupied(TenantSlot::InProgress(_)) => {
-                            unreachable!("TenantsMap::remove handles InProgress separately, should never return it here");
-                        }
-                        TenantsMapRemoveResult::Vacant => {
-                            tracing::warn!(
-                                "Tenant removed from TenantsMap before deletion completed"
-                            );
-                            break;
-                        }
-                        TenantsMapRemoveResult::InProgress(barrier) => {
-                            // An InProgress entry was found, we must wait on its barrier
-                            barrier
-                        }
-                    }
-                };
-
-                tracing::info!(
-                    "Waiting for competing operation to complete before deleting state for tenant"
-                );
-                barrier.wait().await;
-            }
-        }
-
-        *guard = Self::Finished;
-
-        Ok(())
-    }
-}
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -51,7 +51,6 @@ use utils::fs_ext::PathExt;
 use utils::generation::Generation;
 use utils::id::{TenantId, TimelineId};

-use super::delete::DeleteTenantError;
 use super::remote_timeline_client::remote_tenant_path;
 use super::secondary::SecondaryTenant;
 use super::timeline::detach_ancestor::PreparedTimelineDetach;
@@ -109,12 +108,6 @@ pub(crate) enum TenantsMap {
    ShuttingDown(BTreeMap<TenantShardId, TenantSlot>),
 }

-pub(crate) enum TenantsMapRemoveResult {
-    Occupied(TenantSlot),
-    Vacant,
-    InProgress(utils::completion::Barrier),
-}
-
 /// When resolving a TenantId to a shard, we may be looking for the 0th
 /// shard, or we might be looking for whichever shard holds a particular page.
 #[derive(Copy, Clone)]
@@ -191,26 +184,6 @@ impl TenantsMap {
        }
    }

-    /// Only for use from DeleteTenantFlow.  This method directly removes a TenantSlot from the map.
-    ///
-    /// The normal way to remove a tenant is using a SlotGuard, which will gracefully remove the guarded
-    /// slot if the enclosed tenant is shutdown.
-    pub(crate) fn remove(&mut self, tenant_shard_id: TenantShardId) -> TenantsMapRemoveResult {
-        use std::collections::btree_map::Entry;
-        match self {
-            TenantsMap::Initializing => TenantsMapRemoveResult::Vacant,
-            TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => match m.entry(tenant_shard_id) {
-                Entry::Occupied(entry) => match entry.get() {
-                    TenantSlot::InProgress(barrier) => {
-                        TenantsMapRemoveResult::InProgress(barrier.clone())
-                    }
-                    _ => TenantsMapRemoveResult::Occupied(entry.remove()),
-                },
-                Entry::Vacant(_entry) => TenantsMapRemoveResult::Vacant,
-            },
-        }
-    }
-
    #[cfg(all(debug_assertions, not(test)))]
    pub(crate) fn len(&self) -> usize {
        match self {
@@ -460,6 +433,18 @@ async fn init_load_tenant_configs(
    Ok(configs)
 }

+#[derive(Debug, thiserror::Error)]
+pub(crate) enum DeleteTenantError {
+    #[error("Tenant map slot error {0}")]
+    SlotError(#[from] TenantSlotError),
+
+    #[error("Cancelled")]
+    Cancelled,
+
+    #[error(transparent)]
+    Other(#[from] anyhow::Error),
+}
+
 /// Initialize repositories with locally available timelines.
 /// Timelines that are only partially available locally (remote storage has more data than this pageserver)
 /// are scheduled for download and added to the tenant once download is completed.
@@ -629,7 +614,6 @@ pub async fn init_tenant_mgr(
                    AttachedTenantConf::new(location_conf.tenant_conf, attached_conf),
                    shard_identity,
                    Some(init_order.clone()),
-                    &TENANTS,
                    SpawnMode::Lazy,
                    &ctx,
                ) {
@@ -685,7 +669,6 @@ fn tenant_spawn(
    location_conf: AttachedTenantConf,
    shard_identity: ShardIdentity,
    init_order: Option<InitializationOrder>,
-    tenants: &'static std::sync::RwLock<TenantsMap>,
    mode: SpawnMode,
    ctx: &RequestContext,
 ) -> anyhow::Result<Arc<Tenant>> {
@@ -712,7 +695,6 @@ fn tenant_spawn(
        location_conf,
        shard_identity,
        init_order,
-        tenants,
        mode,
        ctx,
    ) {
@@ -1161,7 +1143,6 @@ impl TenantManager {
                    attached_conf,
                    shard_identity,
                    None,
-                    self.tenants,
                    spawn_mode,
                    ctx,
                )?;
@@ -1283,7 +1264,6 @@ impl TenantManager {
            AttachedTenantConf::try_from(config)?,
            shard_identity,
            None,
-            self.tenants,
            SpawnMode::Eager,
            ctx,
        )?;
@@ -1634,7 +1614,7 @@ impl TenantManager {
        for child_shard_id in &child_shards {
            let child_shard_id = *child_shard_id;
            let child_shard = {
-                let locked = TENANTS.read().unwrap();
+                let locked = self.tenants.read().unwrap();
                let peek_slot =
                    tenant_map_peek_slot(&locked, &child_shard_id, TenantSlotPeekMode::Read)?;
                peek_slot.and_then(|s| s.get_attached()).cloned()
@@ -1735,6 +1715,7 @@ impl TenantManager {
            let timelines = parent_shard.timelines.lock().unwrap().clone();
            let parent_timelines = timelines.keys().cloned().collect::<Vec<_>>();
            for timeline in timelines.values() {
+                tracing::info!(timeline_id=%timeline.timeline_id, "Loading list of layers to hardlink");
                let timeline_layers = timeline
                    .layers
                    .read()
@@ -1774,7 +1755,12 @@ impl TenantManager {

        // Since we will do a large number of small filesystem metadata operations, batch them into
        // spawn_blocking calls rather than doing each one as a tokio::fs round-trip.
+        let span = tracing::Span::current();
        let jh = tokio::task::spawn_blocking(move || -> anyhow::Result<usize> {
+            // Run this synchronous code in the same log context as the outer function that spawned it.
+            let _span = span.enter();
+
+            tracing::info!("Creating {} directories", create_dirs.len());
            for dir in &create_dirs {
                if let Err(e) = std::fs::create_dir_all(dir) {
                    // Ignore AlreadyExists errors, drop out on all other errors
@@ -1788,6 +1774,11 @@ impl TenantManager {
            }

            for child_prefix in child_prefixes {
+                tracing::info!(
+                    "Hard-linking {} parent layers into child path {}",
+                    parent_layers.len(),
+                    child_prefix
+                );
                for relative_layer in &parent_layers {
                    let parent_path = parent_path.join(relative_layer);
                    let child_path = child_prefix.join(relative_layer);
@@ -1813,6 +1804,7 @@ impl TenantManager {
            // Durability is not required for correctness, but if we crashed during split and
            // then came restarted with empty timeline dirs, it would be very inefficient to
            // re-populate from remote storage.
+            tracing::info!("fsyncing {} directories", create_dirs.len());
            for dir in create_dirs {
                if let Err(e) = crashsafe::fsync(&dir) {
                    // Something removed a newly created timeline dir out from underneath us?  Extremely
@@ -1866,7 +1858,7 @@ impl TenantManager {
        deletion_queue_client: &DeletionQueueClient,
    ) -> Result<(), TenantStateError> {
        let tmp_path = self
-            .detach_tenant0(conf, &TENANTS, tenant_shard_id, deletion_queue_client)
+            .detach_tenant0(conf, tenant_shard_id, deletion_queue_client)
            .await?;
        spawn_background_purge(tmp_path);

@@ -1876,7 +1868,6 @@ impl TenantManager {
    async fn detach_tenant0(
        &self,
        conf: &'static PageServerConf,
-        tenants: &std::sync::RwLock<TenantsMap>,
        tenant_shard_id: TenantShardId,
        deletion_queue_client: &DeletionQueueClient,
    ) -> Result<Utf8PathBuf, TenantStateError> {
@@ -1890,7 +1881,7 @@ impl TenantManager {
        };

        let removal_result = remove_tenant_from_memory(
-            tenants,
+            self.tenants,
            tenant_shard_id,
            tenant_dir_rename_operation(tenant_shard_id),
        )
@@ -1906,7 +1897,7 @@ impl TenantManager {
    pub(crate) fn list_tenants(
        &self,
    ) -> Result<Vec<(TenantShardId, TenantState, Generation)>, TenantMapListError> {
-        let tenants = TENANTS.read().unwrap();
+        let tenants = self.tenants.read().unwrap();
        let m = match &*tenants {
            TenantsMap::Initializing => return Err(TenantMapListError::Initializing),
            TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m,
@@ -2007,7 +1998,6 @@ impl TenantManager {
            AttachedTenantConf::try_from(config)?,
            shard_identity,
            None,
-            self.tenants,
            SpawnMode::Eager,
            ctx,
        )?;
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -5235,10 +5235,16 @@ impl Timeline {
                    .map_err(PageReconstructError::WalRedo)?
                    .request_redo(key, request_lsn, data.img, data.records, self.pg_version)
                    .await
-                    .context("reconstruct a page image")
                {
                    Ok(img) => img,
-                    Err(e) => return Err(PageReconstructError::WalRedo(e)),
+                    Err(e) => {
+                        return Err(match e {
+                            crate::walredo::Error::Cancelled => PageReconstructError::Cancelled,
+                            crate::walredo::Error::Other(e) => {
+                                PageReconstructError::WalRedo(e.context("reconstruct a page image"))
+                            }
+                        })
+                    }
                };

                Ok(img)
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -255,7 +255,6 @@ impl DeleteTimelineFlow {
    }

    /// Shortcut to create Timeline in stopping state and spawn deletion task.
-    /// See corresponding parts of [`crate::tenant::delete::DeleteTenantFlow`]
    #[instrument(skip_all, fields(%timeline_id))]
    pub async fn resume_deletion(
        tenant: Arc<Tenant>,
@@ -420,10 +419,6 @@ impl DeleteTimelineFlow {
        Ok(())
    }

-    pub(crate) fn is_finished(&self) -> bool {
-        matches!(self, Self::Finished)
-    }
-
    pub(crate) fn is_not_started(&self) -> bool {
        matches!(self, Self::NotStarted)
    }
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -40,8 +40,30 @@ use std::time::Duration;
 use std::time::Instant;
 use tracing::*;
 use utils::lsn::Lsn;
+use utils::sync::gate::Gate;
 use utils::sync::heavier_once_cell;

+pub struct GlobalState {
+    conf: &'static PageServerConf,
+    pub(self) spawn_gate: Gate,
+}
+
+impl GlobalState {
+    pub async fn spawn(conf: &'static PageServerConf) -> Arc<GlobalState> {
+        let state = Arc::new(GlobalState {
+            conf,
+            spawn_gate: Gate::default(),
+        });
+        state
+    }
+    pub(crate) async fn shutdown(self: &Arc<Self>) {
+        self.spawn_gate.close().await
+        // The destructor of WalRedoProcess SIGKILLs and `wait()`s for the process
+        // The gate guard is stored in WalRedoProcess.
+        // So, we arrive here once all WalRedoProcess structs are gone.
+    }
+}
+
 ///
 /// This is the real implementation that uses a Postgres process to
 /// perform WAL replay. Only one thread can use the process at a time,
@@ -50,8 +72,8 @@ use utils::sync::heavier_once_cell;
 /// records.
 ///
 pub struct PostgresRedoManager {
+    global_state: Arc<GlobalState>,
    tenant_shard_id: TenantShardId,
-    conf: &'static PageServerConf,
    last_redo_at: std::sync::Mutex<Option<Instant>>,
    /// The current [`process::WalRedoProcess`] that is used by new redo requests.
    /// We use [`heavier_once_cell`] for coalescing the spawning, but the redo
@@ -65,9 +87,30 @@ pub struct PostgresRedoManager {
    /// still be using the old redo process. But, those other tasks will most likely
    /// encounter an error as well, and errors are an unexpected condition anyway.
    /// So, probably we could get rid of the `Arc` in the future.
-    redo_process: heavier_once_cell::OnceCell<Arc<process::WalRedoProcess>>,
+    redo_process: heavier_once_cell::OnceCell<RedoProcessState>,
+    launch_process_gate: Gate,
 }

+enum RedoProcessState {
+    Launched(Arc<process::WalRedoProcess>),
+    ManagerShutDown,
+}
+
+#[derive(Debug, thiserror::Error)]
+pub enum Error {
+    #[error("cancelled")]
+    Cancelled,
+    #[error(transparent)]
+    Other(#[from] anyhow::Error),
+}
+
+macro_rules! bail {
+    ($($arg:tt)*) => {
+        return Err($crate::walredo::Error::Other(::anyhow::anyhow!($($arg)*)));
+    }
+}
+pub(self) use bail;
+
 ///
 /// Public interface of WAL redo manager
 ///
@@ -88,9 +131,9 @@ impl PostgresRedoManager {
        base_img: Option<(Lsn, Bytes)>,
        records: Vec<(Lsn, NeonWalRecord)>,
        pg_version: u32,
-    ) -> anyhow::Result<Bytes> {
+    ) -> Result<Bytes, Error> {
        if records.is_empty() {
-            anyhow::bail!("invalid WAL redo request with no records");
+            bail!("invalid WAL redo request with no records");
        }

        let base_img_lsn = base_img.as_ref().map(|p| p.0).unwrap_or(Lsn::INVALID);
@@ -110,7 +153,7 @@ impl PostgresRedoManager {
                        img,
                        base_img_lsn,
                        &records[batch_start..i],
-                        self.conf.wal_redo_timeout,
+                        self.global_state.conf.wal_redo_timeout,
                        pg_version,
                    )
                    .await
@@ -131,7 +174,7 @@ impl PostgresRedoManager {
                img,
                base_img_lsn,
                &records[batch_start..],
-                self.conf.wal_redo_timeout,
+                self.global_state.conf.wal_redo_timeout,
                pg_version,
            )
            .await
@@ -148,10 +191,10 @@ impl PostgresRedoManager {
                    chrono::Utc::now().checked_sub_signed(chrono::Duration::from_std(age).ok()?)
                })
            },
-            process: self
-                .redo_process
-                .get()
-                .map(|p| WalRedoManagerProcessStatus { pid: p.id() }),
+            process: self.redo_process.get().and_then(|p| match &*p {
+                RedoProcessState::Launched(p) => Some(WalRedoManagerProcessStatus { pid: p.id() }),
+                RedoProcessState::ManagerShutDown => None,
+            }),
        }
    }
 }
@@ -161,18 +204,36 @@ impl PostgresRedoManager {
    /// Create a new PostgresRedoManager.
    ///
    pub fn new(
-        conf: &'static PageServerConf,
+        global_state: Arc<GlobalState>,
        tenant_shard_id: TenantShardId,
    ) -> PostgresRedoManager {
        // The actual process is launched lazily, on first request.
        PostgresRedoManager {
+            global_state,
            tenant_shard_id,
-            conf,
            last_redo_at: std::sync::Mutex::default(),
            redo_process: heavier_once_cell::OnceCell::default(),
+            launch_process_gate: Gate::default(),
        }
    }

+    pub async fn shutdown(&self) {
+        // prevent new launches
+        let permit = match self.redo_process.get_or_init_detached().await {
+            Ok(guard) => {
+                let (proc, permit) = guard.take_and_deinit();
+                drop(proc); // this just drops the Arc, its refcount may not be zero yet
+                permit
+            }
+            Err(permit) => permit,
+        };
+        self.redo_process
+            .set(RedoProcessState::ManagerShutDown, permit);
+
+        // wait for all WalRedoProcess objects to get dropped
+        self.launch_process_gate.close().await;
+    }
+
    /// This type doesn't have its own background task to check for idleness: we
    /// rely on our owner calling this function periodically in its own housekeeping
    /// loops.
@@ -203,7 +264,7 @@ impl PostgresRedoManager {
        records: &[(Lsn, NeonWalRecord)],
        wal_redo_timeout: Duration,
        pg_version: u32,
-    ) -> anyhow::Result<Bytes> {
+    ) -> Result<Bytes, Error> {
        *(self.last_redo_at.lock().unwrap()) = Some(Instant::now());

        let (rel, blknum) = key.to_rel_block().context("invalid record")?;
@@ -212,17 +273,27 @@ impl PostgresRedoManager {
        loop {
            let proc: Arc<process::WalRedoProcess> =
                match self.redo_process.get_or_init_detached().await {
-                    Ok(guard) => Arc::clone(&guard),
+                    Ok(guard) => match &*guard {
+                        RedoProcessState::Launched(proc) => Arc::clone(proc),
+                        RedoProcessState::ManagerShutDown => {
+                            return Err(Error::Cancelled);
+                        }
+                    },
                    Err(permit) => {
                        // don't hold poison_guard, the launch code can bail
                        let start = Instant::now();
                        let proc = Arc::new(
                            process::WalRedoProcess::launch(
-                                self.conf,
+                                &self.global_state,
                                self.tenant_shard_id,
                                pg_version,
                            )
-                            .context("launch walredo process")?,
+                            .map_err(|e| match e {
+                                process::LaunchError::Cancelled => Error::Cancelled,
+                                process::LaunchError::Other(e) => {
+                                    Error::Other(e.context("launch walredo process"))
+                                }
+                            })?,
                        );
                        let duration = start.elapsed();
                        WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64());
@@ -231,7 +302,8 @@ impl PostgresRedoManager {
                            pid = proc.id(),
                            "launched walredo process"
                        );
-                        self.redo_process.set(Arc::clone(&proc), permit);
+                        self.redo_process
+                            .set(RedoProcessState::Launched(Arc::clone(&proc)), permit);
                        proc
                    }
                };
@@ -242,7 +314,14 @@ impl PostgresRedoManager {
            let result = proc
                .apply_wal_records(rel, blknum, &base_img, records, wal_redo_timeout)
                .await
-                .context("apply_wal_records");
+                .map_err(|e| match e {
+                    Error::Cancelled => Error::Cancelled,
+                    Error::Other(e) => Error::Other(e.context("apply_wal_records")),
+                });
+            if matches!(result, Err(Error::Cancelled)) {
+                // bail asap and also avoid log noise due to the error reporting below
+                return Err(Error::Cancelled);
+            }

            let duration = started_at.elapsed();

@@ -299,12 +378,17 @@ impl PostgresRedoManager {
                match self.redo_process.get() {
                    None => (),
                    Some(guard) => {
-                        if Arc::ptr_eq(&proc, &*guard) {
-                            // We're the first to observe an error from `proc`, it's our job to take it out of rotation.
-                            guard.take_and_deinit();
-                        } else {
-                            // Another task already spawned another redo process (further up in this method)
-                            // and put it into `redo_process`. Do nothing, our view of the world is behind.
+                        match &*guard {
+                            RedoProcessState::ManagerShutDown => {}
+                            RedoProcessState::Launched(guard_proc) => {
+                                if Arc::ptr_eq(&proc, &*guard_proc) {
+                                    // We're the first to observe an error from `proc`, it's our job to take it out of rotation.
+                                    guard.take_and_deinit();
+                                } else {
+                                    // Another task already spawned another redo process (further up in this method)
+                                    // and put it into `redo_process`. Do nothing, our view of the world is behind.
+                                }
+                            }
                        }
                    }
                }
@@ -329,7 +413,7 @@ impl PostgresRedoManager {
        lsn: Lsn,
        base_img: Option<Bytes>,
        records: &[(Lsn, NeonWalRecord)],
-    ) -> anyhow::Result<Bytes> {
+    ) -> Result<Bytes, Error> {
        let start_time = Instant::now();

        let mut page = BytesMut::new();
@@ -338,7 +422,7 @@ impl PostgresRedoManager {
            page.extend_from_slice(&fpi[..]);
        } else {
            // All the current WAL record types that we can handle require a base image.
-            anyhow::bail!("invalid neon WAL redo request with no base image");
+            bail!("invalid neon WAL redo request with no base image");
        }

        // Apply all the WAL records in the batch
--- a/pageserver/src/walredo/process.rs
+++ b/pageserver/src/walredo/process.rs
@@ -18,13 +18,21 @@ use std::sync::atomic::AtomicUsize;
 use std::{
    collections::VecDeque,
    process::{Command, Stdio},
+    sync::Arc,
    time::Duration,
 };
 use tokio::io::{AsyncReadExt, AsyncWriteExt};
 use tracing::{debug, error, instrument, Instrument};
-use utils::{lsn::Lsn, poison::Poison};
+use utils::{
+    lsn::Lsn,
+    poison::Poison,
+    sync::gate::{GateError, GateGuard},
+};
+
+use super::GlobalState;

 pub struct WalRedoProcess {
+    _spawn_gate_guard: GateGuard,
    #[allow(dead_code)]
    conf: &'static PageServerConf,
    #[cfg(feature = "testing")]
@@ -49,18 +57,33 @@ struct ProcessOutput {
    n_processed_responses: usize,
 }

+#[derive(Debug, thiserror::Error)]
+pub(super) enum LaunchError {
+    #[error("cancelled")]
+    Cancelled,
+    #[error(transparent)]
+    Other(#[from] anyhow::Error),
+}
+
 impl WalRedoProcess {
    //
    // Start postgres binary in special WAL redo mode.
    //
    #[instrument(skip_all,fields(pg_version=pg_version))]
    pub(crate) fn launch(
-        conf: &'static PageServerConf,
+        global_state: &Arc<GlobalState>,
        tenant_shard_id: TenantShardId,
        pg_version: u32,
-    ) -> anyhow::Result<Self> {
+    ) -> Result<Self, LaunchError> {
        crate::span::debug_assert_current_span_has_tenant_id();

+        let conf = global_state.conf;
+
+        let spawn_gate_guard = match global_state.spawn_gate.enter() {
+            Ok(guard) => guard,
+            Err(GateError::GateClosed) => return Err(LaunchError::Cancelled),
+        };
+
        let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible.
        let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?;

@@ -144,6 +167,7 @@ impl WalRedoProcess {
        );

        Ok(Self {
+            _spawn_gate_guard: spawn_gate_guard,
            conf,
            #[cfg(feature = "testing")]
            tenant_shard_id,
@@ -189,7 +213,7 @@ impl WalRedoProcess {
        base_img: &Option<Bytes>,
        records: &[(Lsn, NeonWalRecord)],
        wal_redo_timeout: Duration,
-    ) -> anyhow::Result<Bytes> {
+    ) -> Result<Bytes, super::Error> {
        debug_assert_current_span_has_tenant_id();

        let tag = protocol::BufferTag { rel, blknum };
@@ -216,17 +240,19 @@ impl WalRedoProcess {
            {
                protocol::build_apply_record_msg(*lsn, postgres_rec, &mut writebuf);
            } else {
-                anyhow::bail!("tried to pass neon wal record to postgres WAL redo");
+                super::bail!("tried to pass neon wal record to postgres WAL redo");
            }
        }
        protocol::build_get_page_msg(tag, &mut writebuf);
        WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64);

-        let Ok(res) =
-            tokio::time::timeout(wal_redo_timeout, self.apply_wal_records0(&writebuf)).await
-        else {
-            anyhow::bail!("WAL redo timed out");
-        };
+        let res =
+            // TODO: we should tokio::select! on the self.global_state.shutdown here,
+            // but, that requires thinking through the perf implications
+            tokio::time::timeout(wal_redo_timeout, self.apply_wal_records0(&writebuf))
+                .await
+                .map_err(|_elapsed| anyhow::anyhow!("WAL redo timed out"))?
+                .map_err(super::Error::Other);

        if res.is_err() {
            // not all of these can be caused by this particular input, however these are so rare
@@ -377,6 +403,11 @@ impl Drop for WalRedoProcess {
            .take()
            .expect("we only do this once")
            .kill_and_wait(WalRedoKillCause::WalRedoProcessDrop);
+
+        // The spawn gate is supposed to track running walredo processes.
+        // => must keep guard alive until the process is dead.
+        let _ = &self._spawn_gate_guard;
+
        // no way to wait for stderr_logger_task from Drop because that is async only
    }
 }
--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -103,12 +103,8 @@ impl ConnCfg {

    /// Reuse password or auth keys from the other config.
    pub fn reuse_password(&mut self, other: Self) {
-        if let Some(password) = other.get_password() {
-            self.password(password);
-        }
-
-        if let Some(keys) = other.get_auth_keys() {
-            self.auth_keys(keys);
+        if let Some(password) = other.get_auth() {
+            self.auth(password);
        }
    }

@@ -124,48 +120,64 @@ impl ConnCfg {

    /// Apply startup message params to the connection config.
    pub fn set_startup_params(&mut self, params: &StartupMessageParams) {
-        // Only set `user` if it's not present in the config.
-        // Link auth flow takes username from the console's response.
-        if let (None, Some(user)) = (self.get_user(), params.get("user")) {
-            self.user(user);
-        }
-
-        // Only set `dbname` if it's not present in the config.
-        // Link auth flow takes dbname from the console's response.
-        if let (None, Some(dbname)) = (self.get_dbname(), params.get("database")) {
-            self.dbname(dbname);
-        }
-
-        // Don't add `options` if they were only used for specifying a project.
-        // Connection pools don't support `options`, because they affect backend startup.
-        if let Some(options) = filtered_options(params) {
-            self.options(&options);
-        }
-
-        if let Some(app_name) = params.get("application_name") {
-            self.application_name(app_name);
-        }
-
-        // TODO: This is especially ugly...
-        if let Some(replication) = params.get("replication") {
-            use tokio_postgres::config::ReplicationMode;
-            match replication {
-                "true" | "on" | "yes" | "1" => {
-                    self.replication_mode(ReplicationMode::Physical);
+        let mut client_encoding = false;
+        for (k, v) in params.iter() {
+            match k {
+                "user" => {
+                    // Only set `user` if it's not present in the config.
+                    // Link auth flow takes username from the console's response.
+                    if self.get_user().is_none() {
+                        self.user(v);
+                    }
                }
                "database" => {
-                    self.replication_mode(ReplicationMode::Logical);
+                    // Only set `dbname` if it's not present in the config.
+                    // Link auth flow takes dbname from the console's response.
+                    if self.get_dbname().is_none() {
+                        self.dbname(v);
+                    }
+                }
+                "options" => {
+                    // Don't add `options` if they were only used for specifying a project.
+                    // Connection pools don't support `options`, because they affect backend startup.
+                    if let Some(options) = filtered_options(v) {
+                        self.options(&options);
+                    }
+                }
+
+                // the special ones in tokio-postgres that we don't want being set by the user
+                "dbname" => {}
+                "password" => {}
+                "sslmode" => {}
+                "host" => {}
+                "port" => {}
+                "connect_timeout" => {}
+                "keepalives" => {}
+                "keepalives_idle" => {}
+                "keepalives_interval" => {}
+                "keepalives_retries" => {}
+                "target_session_attrs" => {}
+                "channel_binding" => {}
+                "max_backend_message_size" => {}
+
+                "client_encoding" => {
+                    client_encoding = true;
+                    // only error should be from bad null bytes,
+                    // but we've already checked for those.
+                    _ = self.param("client_encoding", v);
+                }
+
+                _ => {
+                    // only error should be from bad null bytes,
+                    // but we've already checked for those.
+                    _ = self.param(k, v);
                }
-                _other => {}
            }
        }
-
-        // TODO: extend the list of the forwarded startup parameters.
-        // Currently, tokio-postgres doesn't allow us to pass
-        // arbitrary parameters, but the ones above are a good start.
-        //
-        // This and the reverse params problem can be better addressed
-        // in a bespoke connection machinery (a new library for that sake).
+        if !client_encoding {
+            // for compatibility since we removed it from tokio-postgres
+            self.param("client_encoding", "UTF8").unwrap();
+        }
    }
 }

@@ -338,10 +350,9 @@ impl ConnCfg {
 }

 /// Retrieve `options` from a startup message, dropping all proxy-secific flags.
-fn filtered_options(params: &StartupMessageParams) -> Option<String> {
+fn filtered_options(options: &str) -> Option<String> {
    #[allow(unstable_name_collisions)]
-    let options: String = params
-        .options_raw()?
+    let options: String = StartupMessageParams::parse_options_raw(options)
        .filter(|opt| parse_endpoint_param(opt).is_none() && neon_option(opt).is_none())
        .intersperse(" ") // TODO: use impl from std once it's stabilized
        .collect();
@@ -413,27 +424,23 @@ mod tests {
    #[test]
    fn test_filtered_options() {
        // Empty options is unlikely to be useful anyway.
-        let params = StartupMessageParams::new([("options", "")]);
-        assert_eq!(filtered_options(&params), None);
+        assert_eq!(filtered_options(""), None);

        // It's likely that clients will only use options to specify endpoint/project.
-        let params = StartupMessageParams::new([("options", "project=foo")]);
-        assert_eq!(filtered_options(&params), None);
+        let params = "project=foo";
+        assert_eq!(filtered_options(params), None);

        // Same, because unescaped whitespaces are no-op.
-        let params = StartupMessageParams::new([("options", " project=foo ")]);
-        assert_eq!(filtered_options(&params).as_deref(), None);
+        let params = " project=foo ";
+        assert_eq!(filtered_options(params), None);

-        let params = StartupMessageParams::new([("options", r"\  project=foo \ ")]);
-        assert_eq!(filtered_options(&params).as_deref(), Some(r"\  \ "));
+        let params = r"\  project=foo \ ";
+        assert_eq!(filtered_options(params).as_deref(), Some(r"\  \ "));

-        let params = StartupMessageParams::new([("options", "project = foo")]);
-        assert_eq!(filtered_options(&params).as_deref(), Some("project = foo"));
+        let params = "project = foo";
+        assert_eq!(filtered_options(params).as_deref(), Some("project = foo"));

-        let params = StartupMessageParams::new([(
-            "options",
-            "project = foo neon_endpoint_type:read_write   neon_lsn:0/2",
-        )]);
-        assert_eq!(filtered_options(&params).as_deref(), Some("project = foo"));
+        let params = "project = foo neon_endpoint_type:read_write   neon_lsn:0/2";
+        assert_eq!(filtered_options(params).as_deref(), Some("project = foo"));
    }
 }
--- a/proxy/src/context/parquet.rs
+++ b/proxy/src/context/parquet.rs
@@ -543,7 +543,9 @@ mod tests {
        rx: impl Stream<Item = RequestData>,
    ) -> Vec<(u64, usize, i64)> {
        let remote_storage_config = RemoteStorageConfig {
-            storage: RemoteStorageKind::LocalFs(tmpdir.to_path_buf()),
+            storage: RemoteStorageKind::LocalFs {
+                local_path: tmpdir.to_path_buf(),
+            },
            timeout: std::time::Duration::from_secs(120),
        };
        let storage = GenericRemoteStorage::from_config(&remote_storage_config).unwrap();
--- a/proxy/src/serverless/backend.rs
+++ b/proxy/src/serverless/backend.rs
@@ -231,6 +231,10 @@ impl ConnectMechanism for TokioMechanism {
            .dbname(&self.conn_info.dbname)
            .connect_timeout(timeout);

+        config
+            .param("client_encoding", "UTF8")
+            .expect("client encoding UTF8 is always valid");
+
        let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute);
        let res = config.connect(tokio_postgres::NoTls).await;
        drop(pause);
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -202,6 +202,7 @@ fn get_conn_info(
            options = Some(NeonOptions::parse_options_raw(&value));
        }
    }
+    ctx.set_db_options(params.freeze());

    let user_info = ComputeUserInfo {
        endpoint,
--- a/storage_controller/src/compute_hook.rs
+++ b/storage_controller/src/compute_hook.rs
@@ -146,6 +146,9 @@ pub(crate) enum NotifyError {
    // A response indicates we will never succeed, such as 400 or 404
    #[error("Non-retryable error {0}")]
    Fatal(StatusCode),
+
+    #[error("neon_local error: {0}")]
+    NeonLocal(anyhow::Error),
 }

 enum MaybeSendResult {
@@ -278,7 +281,7 @@ impl ComputeHook {
    async fn do_notify_local(
        &self,
        reconfigure_request: &ComputeHookNotifyRequest,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), NotifyError> {
        // neon_local updates are not safe to call concurrently, use a lock to serialize
        // all calls to this function
        let _locked = self.neon_local_lock.lock().await;
@@ -321,7 +324,8 @@ impl ComputeHook {
                tracing::info!("Reconfiguring endpoint {}", endpoint_name,);
                endpoint
                    .reconfigure(compute_pageservers.clone(), *stripe_size)
-                    .await?;
+                    .await
+                    .map_err(NotifyError::NeonLocal)?;
            }
        }

@@ -510,7 +514,7 @@ impl ComputeHook {
        } else {
            self.do_notify_local(&request).await.map_err(|e| {
                // This path is for testing only, so munge the error into our prod-style error type.
-                tracing::error!("Local notification hook failed: {e}");
+                tracing::error!("neon_local notification hook failed: {e}");
                NotifyError::Fatal(StatusCode::INTERNAL_SERVER_ERROR)
            })
        };
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -502,6 +502,17 @@ async fn handle_node_drain(req: Request<Body>) -> Result<Response<Body>, ApiErro
    json_response(StatusCode::ACCEPTED, ())
 }

+async fn handle_cancel_node_drain(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
+    let state = get_state(&req);
+    let node_id: NodeId = parse_request_param(&req, "node_id")?;
+
+    state.service.cancel_node_drain(node_id).await?;
+
+    json_response(StatusCode::ACCEPTED, ())
+}
+
 async fn handle_node_fill(req: Request<Body>) -> Result<Response<Body>, ApiError> {
    check_permissions(&req, Scope::Admin)?;

@@ -513,6 +524,17 @@ async fn handle_node_fill(req: Request<Body>) -> Result<Response<Body>, ApiError
    json_response(StatusCode::ACCEPTED, ())
 }

+async fn handle_cancel_node_fill(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
+    let state = get_state(&req);
+    let node_id: NodeId = parse_request_param(&req, "node_id")?;
+
+    state.service.cancel_node_fill(node_id).await?;
+
+    json_response(StatusCode::ACCEPTED, ())
+}
+
 async fn handle_tenant_shard_split(
    service: Arc<Service>,
    mut req: Request<Body>,
@@ -871,9 +893,23 @@ pub fn make_router(
        .put("/control/v1/node/:node_id/drain", |r| {
            named_request_span(r, handle_node_drain, RequestName("control_v1_node_drain"))
        })
+        .delete("/control/v1/node/:node_id/drain", |r| {
+            named_request_span(
+                r,
+                handle_cancel_node_drain,
+                RequestName("control_v1_cancel_node_drain"),
+            )
+        })
        .put("/control/v1/node/:node_id/fill", |r| {
            named_request_span(r, handle_node_fill, RequestName("control_v1_node_fill"))
        })
+        .delete("/control/v1/node/:node_id/fill", |r| {
+            named_request_span(
+                r,
+                handle_cancel_node_fill,
+                RequestName("control_v1_cancel_node_fill"),
+            )
+        })
        // TODO(vlad): endpoint for cancelling drain and fill
        // Tenant Shard operations
        .put("/control/v1/tenant/:tenant_shard_id/migrate", |r| {
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -4541,7 +4541,8 @@ impl Service {
                self.node_configure(node_id, None, Some(NodeSchedulingPolicy::Draining))
                    .await?;

-                let cancel = CancellationToken::new();
+                let cancel = self.cancel.child_token();
+                let gate_guard = self.gate.enter().map_err(|_| ApiError::ShuttingDown)?;

                self.inner.write().unwrap().ongoing_operation = Some(OperationHandler {
                    operation: Operation::Drain(Drain { node_id }),
@@ -4552,6 +4553,8 @@ impl Service {
                    let service = self.clone();
                    let cancel = cancel.clone();
                    async move {
+                        let _gate_guard = gate_guard;
+
                        scopeguard::defer! {
                            let prev = service.inner.write().unwrap().ongoing_operation.take();

@@ -4593,6 +4596,44 @@ impl Service {
        Ok(())
    }

+    pub(crate) async fn cancel_node_drain(&self, node_id: NodeId) -> Result<(), ApiError> {
+        let (node_available, node_policy) = {
+            let locked = self.inner.read().unwrap();
+            let nodes = &locked.nodes;
+            let node = nodes.get(&node_id).ok_or(ApiError::NotFound(
+                anyhow::anyhow!("Node {} not registered", node_id).into(),
+            ))?;
+
+            (node.is_available(), node.get_scheduling())
+        };
+
+        if !node_available {
+            return Err(ApiError::ResourceUnavailable(
+                format!("Node {node_id} is currently unavailable").into(),
+            ));
+        }
+
+        if !matches!(node_policy, NodeSchedulingPolicy::Draining) {
+            return Err(ApiError::PreconditionFailed(
+                format!("Node {node_id} has no drain in progress").into(),
+            ));
+        }
+
+        if let Some(op_handler) = self.inner.read().unwrap().ongoing_operation.as_ref() {
+            if let Operation::Drain(drain) = op_handler.operation {
+                if drain.node_id == node_id {
+                    tracing::info!("Cancelling background drain operation for node {node_id}");
+                    op_handler.cancel.cancel();
+                    return Ok(());
+                }
+            }
+        }
+
+        Err(ApiError::PreconditionFailed(
+            format!("Node {node_id} has no drain in progress").into(),
+        ))
+    }
+
    pub(crate) async fn start_node_fill(self: &Arc<Self>, node_id: NodeId) -> Result<(), ApiError> {
        let (ongoing_op, node_available, node_policy, total_nodes_count) = {
            let locked = self.inner.read().unwrap();
@@ -4635,7 +4676,8 @@ impl Service {
                self.node_configure(node_id, None, Some(NodeSchedulingPolicy::Filling))
                    .await?;

-                let cancel = CancellationToken::new();
+                let cancel = self.cancel.child_token();
+                let gate_guard = self.gate.enter().map_err(|_| ApiError::ShuttingDown)?;

                self.inner.write().unwrap().ongoing_operation = Some(OperationHandler {
                    operation: Operation::Fill(Fill { node_id }),
@@ -4646,6 +4688,8 @@ impl Service {
                    let service = self.clone();
                    let cancel = cancel.clone();
                    async move {
+                        let _gate_guard = gate_guard;
+
                        scopeguard::defer! {
                            let prev = service.inner.write().unwrap().ongoing_operation.take();

@@ -4687,6 +4731,44 @@ impl Service {
        Ok(())
    }

+    pub(crate) async fn cancel_node_fill(&self, node_id: NodeId) -> Result<(), ApiError> {
+        let (node_available, node_policy) = {
+            let locked = self.inner.read().unwrap();
+            let nodes = &locked.nodes;
+            let node = nodes.get(&node_id).ok_or(ApiError::NotFound(
+                anyhow::anyhow!("Node {} not registered", node_id).into(),
+            ))?;
+
+            (node.is_available(), node.get_scheduling())
+        };
+
+        if !node_available {
+            return Err(ApiError::ResourceUnavailable(
+                format!("Node {node_id} is currently unavailable").into(),
+            ));
+        }
+
+        if !matches!(node_policy, NodeSchedulingPolicy::Filling) {
+            return Err(ApiError::PreconditionFailed(
+                format!("Node {node_id} has no fill in progress").into(),
+            ));
+        }
+
+        if let Some(op_handler) = self.inner.read().unwrap().ongoing_operation.as_ref() {
+            if let Operation::Fill(fill) = op_handler.operation {
+                if fill.node_id == node_id {
+                    tracing::info!("Cancelling background fill operation for node {node_id}");
+                    op_handler.cancel.cancel();
+                    return Ok(());
+                }
+            }
+        }
+
+        Err(ApiError::PreconditionFailed(
+            format!("Node {node_id} has no fill in progress").into(),
+        ))
+    }
+
    /// Helper for methods that will try and call pageserver APIs for
    /// a tenant, such as timeline CRUD: they cannot proceed unless the tenant
    /// is attached somewhere.
@@ -5282,11 +5364,24 @@ impl Service {
        let mut last_inspected_shard: Option<TenantShardId> = None;
        let mut inspected_all_shards = false;
        let mut waiters = Vec::new();
-        let mut schedule_context = ScheduleContext::default();

        while !inspected_all_shards {
            if cancel.is_cancelled() {
-                return Err(OperationError::Cancelled);
+                match self
+                    .node_configure(node_id, None, Some(NodeSchedulingPolicy::Active))
+                    .await
+                {
+                    Ok(()) => return Err(OperationError::Cancelled),
+                    Err(err) => {
+                        return Err(OperationError::FinalizeError(
+                            format!(
+                                "Failed to finalise drain cancel of {} by setting scheduling policy to Active: {}",
+                                node_id, err
+                            )
+                            .into(),
+                        ));
+                    }
+                }
            }

            {
@@ -5323,28 +5418,32 @@ impl Service {
                        }
                    };

-                    if tenant_shard.intent.demote_attached(scheduler, node_id) {
-                        match tenant_shard.schedule(scheduler, &mut schedule_context) {
-                            Err(e) => {
-                                tracing::warn!(
-                                    tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(),
-                                    "Scheduling error when draining pageserver {} : {e}", node_id
-                                );
-                            }
-                            Ok(()) => {
-                                let scheduled_to = tenant_shard.intent.get_attached();
-                                tracing::info!(
-                                    tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(),
-                                    "Rescheduled shard while draining node {}: {} -> {:?}",
-                                    node_id,
-                                    node_id,
-                                    scheduled_to
-                                );
+                    // If the shard is not attached to the node being drained, skip it.
+                    if *tenant_shard.intent.get_attached() != Some(node_id) {
+                        last_inspected_shard = Some(*tid);
+                        continue;
+                    }

-                                let waiter = self.maybe_reconcile_shard(tenant_shard, nodes);
-                                if let Some(some) = waiter {
-                                    waiters.push(some);
-                                }
+                    match tenant_shard.reschedule_to_secondary(None, scheduler) {
+                        Err(e) => {
+                            tracing::warn!(
+                                tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(),
+                                "Scheduling error when draining pageserver {} : {e}", node_id
+                            );
+                        }
+                        Ok(()) => {
+                            let scheduled_to = tenant_shard.intent.get_attached();
+                            tracing::info!(
+                                tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(),
+                                "Rescheduled shard while draining node {}: {} -> {:?}",
+                                node_id,
+                                node_id,
+                                scheduled_to
+                            );
+
+                            let waiter = self.maybe_reconcile_shard(tenant_shard, nodes);
+                            if let Some(some) = waiter {
+                                waiters.push(some);
                            }
                        }
                    }
@@ -5356,9 +5455,29 @@ impl Service {
            waiters = self
                .await_waiters_remainder(waiters, SHORT_RECONCILE_TIMEOUT)
                .await;
+
+            failpoint_support::sleep_millis_async!("sleepy-drain-loop");
        }

        while !waiters.is_empty() {
+            if cancel.is_cancelled() {
+                match self
+                    .node_configure(node_id, None, Some(NodeSchedulingPolicy::Active))
+                    .await
+                {
+                    Ok(()) => return Err(OperationError::Cancelled),
+                    Err(err) => {
+                        return Err(OperationError::FinalizeError(
+                            format!(
+                                "Failed to finalise drain cancel of {} by setting scheduling policy to Active: {}",
+                                node_id, err
+                            )
+                            .into(),
+                        ));
+                    }
+                }
+            }
+
            tracing::info!("Awaiting {} pending drain reconciliations", waiters.len());

            waiters = self
@@ -5487,15 +5606,27 @@ impl Service {
        // secondaries are warm. This is not always true (e.g. we just migrated the
        // tenant). Take that into consideration by checking the secondary status.
        let mut tids_to_promote = self.fill_node_plan(node_id);
-
        let mut waiters = Vec::new();
-        let mut schedule_context = ScheduleContext::default();

        // Execute the plan we've composed above. Before aplying each move from the plan,
        // we validate to ensure that it has not gone stale in the meantime.
        while !tids_to_promote.is_empty() {
            if cancel.is_cancelled() {
-                return Err(OperationError::Cancelled);
+                match self
+                    .node_configure(node_id, None, Some(NodeSchedulingPolicy::Active))
+                    .await
+                {
+                    Ok(()) => return Err(OperationError::Cancelled),
+                    Err(err) => {
+                        return Err(OperationError::FinalizeError(
+                            format!(
+                                "Failed to finalise fill cancel of {} by setting scheduling policy to Active: {}",
+                                node_id, err
+                            )
+                            .into(),
+                        ));
+                    }
+                }
            }

            {
@@ -5525,9 +5656,7 @@ impl Service {
                            }

                            let previously_attached_to = *tenant_shard.intent.get_attached();
-
-                            tenant_shard.intent.promote_attached(scheduler, node_id);
-                            match tenant_shard.schedule(scheduler, &mut schedule_context) {
+                            match tenant_shard.reschedule_to_secondary(Some(node_id), scheduler) {
                                Err(e) => {
                                    tracing::warn!(
                                        tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(),
@@ -5563,6 +5692,24 @@ impl Service {
        }

        while !waiters.is_empty() {
+            if cancel.is_cancelled() {
+                match self
+                    .node_configure(node_id, None, Some(NodeSchedulingPolicy::Active))
+                    .await
+                {
+                    Ok(()) => return Err(OperationError::Cancelled),
+                    Err(err) => {
+                        return Err(OperationError::FinalizeError(
+                            format!(
+                                "Failed to finalise fill cancel of {} by setting scheduling policy to Active: {}",
+                                node_id, err
+                            )
+                            .into(),
+                        ));
+                    }
+                }
+            }
+
            tracing::info!("Awaiting {} pending fill reconciliations", waiters.len());

            waiters = self
--- a/storage_controller/src/tenant_shard.rs
+++ b/storage_controller/src/tenant_shard.rs
@@ -646,6 +646,48 @@ impl TenantShard {
        Ok(())
    }

+    /// Reschedule this tenant shard to one of its secondary locations. Returns a scheduling error
+    /// if the swap is not possible and leaves the intent state in its original state.
+    ///
+    /// Arguments:
+    /// `promote_to`: an optional secondary location of this tenant shard. If set to None, we ask
+    /// the scheduler to recommend a node
+    /// `scheduler`: consulted for a preferred secondary when `promote_to` is None, and passed to
+    /// the intent state's demote/promote calls
+    pub(crate) fn reschedule_to_secondary(
+        &mut self,
+        promote_to: Option<NodeId>,
+        scheduler: &mut Scheduler,
+    ) -> Result<(), ScheduleError> {
+        let promote_to = match promote_to {
+            Some(node) => node,
+            None => match scheduler.node_preferred(self.intent.get_secondary()) {
+                Some(node) => node,
+                None => {
+                    return Err(ScheduleError::ImpossibleConstraint);
+                }
+            },
+        };
+
+        assert!(self.intent.get_secondary().contains(&promote_to));
+
+        if let Some(node) = self.intent.get_attached() {
+            let demoted = self.intent.demote_attached(scheduler, *node);
+            if !demoted {
+                return Err(ScheduleError::ImpossibleConstraint);
+            }
+        }
+
+        self.intent.promote_attached(scheduler, promote_to);
+
+        // Increment the sequence number for the edge case where a
+        // reconciler is already running to avoid waiting on the
+        // current reconcile instead of spawning a new one.
+        self.sequence = self.sequence.next();
+
+        Ok(())
+    }
+
    /// Optimize attachments: if a shard has a secondary location that is preferable to
    /// its primary location based on soft constraints, switch that secondary location
    /// to be attached.
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -2249,6 +2249,14 @@ class NeonStorageController(MetricsGetter, LogUtils):
            headers=self.headers(TokenScope.ADMIN),
        )

+    def cancel_node_drain(self, node_id):
+        log.info(f"cancel_node_drain({node_id})")
+        self.request(
+            "DELETE",
+            f"{self.env.storage_controller_api}/control/v1/node/{node_id}/drain",
+            headers=self.headers(TokenScope.ADMIN),
+        )
+
    def node_fill(self, node_id):
        log.info(f"node_fill({node_id})")
        self.request(
@@ -2257,6 +2265,14 @@ class NeonStorageController(MetricsGetter, LogUtils):
            headers=self.headers(TokenScope.ADMIN),
        )

+    def cancel_node_fill(self, node_id):
+        log.info(f"cancel_node_fill({node_id})")
+        self.request(
+            "DELETE",
+            f"{self.env.storage_controller_api}/control/v1/node/{node_id}/fill",
+            headers=self.headers(TokenScope.ADMIN),
+        )
+
    def node_status(self, node_id):
        response = self.request(
            "GET",
--- a/test_runner/fixtures/pageserver/allowed_errors.py
+++ b/test_runner/fixtures/pageserver/allowed_errors.py
@@ -106,6 +106,11 @@ DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS = [
    ".*startup_reconcile: Could not scan node.*",
    # Tests run in dev mode
    ".*Starting in dev mode.*",
+    # Tests that stop endpoints & use the storage controller's neon_local notification
+    # mechanism might fail (neon_local's stopping an endpoint isn't atomic wrt the storage
+    # controller's attempts to notify the endpoint).
+    ".*reconciler.*neon_local notification hook failed.*",
+    ".*reconciler.*neon_local error.*",
 ]


--- a/test_runner/regress/test_proxy.py
+++ b/test_runner/regress/test_proxy.py
@@ -53,6 +53,25 @@ def test_proxy_select_1(static_proxy: NeonProxy):
    assert out[0][0] == 42


+def test_proxy_server_params(static_proxy: NeonProxy):
+    """
+    Test that server params are passing through to postgres
+    """
+
+    out = static_proxy.safe_psql(
+        "select to_json('0 seconds'::interval)", options="-c intervalstyle=iso_8601"
+    )
+    assert out[0][0] == "PT0S"
+    out = static_proxy.safe_psql(
+        "select to_json('0 seconds'::interval)", options="-c intervalstyle=sql_standard"
+    )
+    assert out[0][0] == "0"
+    out = static_proxy.safe_psql(
+        "select to_json('0 seconds'::interval)", options="-c intervalstyle=postgres"
+    )
+    assert out[0][0] == "00:00:00"
+
+
 def test_password_hack(static_proxy: NeonProxy):
    """
    Check the PasswordHack auth flow: an alternative to SCRAM auth for
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -1518,6 +1518,49 @@ def test_tenant_import(neon_env_builder: NeonEnvBuilder, shard_count, remote_sto
        workload.validate()


+def retryable_node_operation(op, ps_id, max_attempts, backoff):
+    while max_attempts > 0:
+        try:
+            op(ps_id)
+            return
+        except StorageControllerApiException as e:
+            max_attempts -= 1
+            log.info(f"Operation failed ({max_attempts} attempts left): {e}")
+
+            if max_attempts == 0:
+                raise e
+
+            time.sleep(backoff)
+
+
+def poll_node_status(env, node_id, desired_scheduling_policy, max_attempts, backoff):
+    log.info(f"Polling {node_id} for {desired_scheduling_policy} scheduling policy")
+    while max_attempts > 0:
+        try:
+            status = env.storage_controller.node_status(node_id)
+            policy = status["scheduling"]
+            if policy == desired_scheduling_policy:
+                return
+            else:
+                max_attempts -= 1
+                log.info(f"Status call returned {policy=} ({max_attempts} attempts left)")
+
+                if max_attempts == 0:
+                    raise AssertionError(
+                        f"Status for {node_id=} did not reach {desired_scheduling_policy=}"
+                    )
+
+                time.sleep(backoff)
+        except StorageControllerApiException as e:
+            max_attempts -= 1
+            log.info(f"Status call failed ({max_attempts} retries left): {e}")
+
+            if max_attempts == 0:
+                raise e
+
+            time.sleep(backoff)
+
+
 def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder):
    """
    Graceful reststart of storage controller clusters use the drain and
@@ -1546,47 +1589,6 @@ def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder):
    nodes = env.storage_controller.node_list()
    assert len(nodes) == 2

-    def retryable_node_operation(op, ps_id, max_attempts, backoff):
-        while max_attempts > 0:
-            try:
-                op(ps_id)
-                return
-            except StorageControllerApiException as e:
-                max_attempts -= 1
-                log.info(f"Operation failed ({max_attempts} attempts left): {e}")
-
-                if max_attempts == 0:
-                    raise e
-
-                time.sleep(backoff)
-
-    def poll_node_status(node_id, desired_scheduling_policy, max_attempts, backoff):
-        log.info(f"Polling {node_id} for {desired_scheduling_policy} scheduling policy")
-        while max_attempts > 0:
-            try:
-                status = env.storage_controller.node_status(node_id)
-                policy = status["scheduling"]
-                if policy == desired_scheduling_policy:
-                    return
-                else:
-                    max_attempts -= 1
-                    log.info(f"Status call returned {policy=} ({max_attempts} attempts left)")
-
-                    if max_attempts == 0:
-                        raise AssertionError(
-                            f"Status for {node_id=} did not reach {desired_scheduling_policy=}"
-                        )
-
-                    time.sleep(backoff)
-            except StorageControllerApiException as e:
-                max_attempts -= 1
-                log.info(f"Status call failed ({max_attempts} retries left): {e}")
-
-                if max_attempts == 0:
-                    raise e
-
-                time.sleep(backoff)
-
    def assert_shard_counts_balanced(env: NeonEnv, shard_counts, total_shards):
        # Assert that all nodes have some attached shards
        assert len(shard_counts) == len(env.pageservers)
@@ -1602,7 +1604,7 @@ def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder):
        retryable_node_operation(
            lambda ps_id: env.storage_controller.node_drain(ps_id), ps.id, max_attempts=3, backoff=2
        )
-        poll_node_status(ps.id, "PauseForRestart", max_attempts=6, backoff=5)
+        poll_node_status(env, ps.id, "PauseForRestart", max_attempts=6, backoff=5)

        shard_counts = get_node_shard_counts(env, tenant_ids)
        log.info(f"Shard counts after draining node {ps.id}: {shard_counts}")
@@ -1612,12 +1614,12 @@ def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder):
        assert sum(shard_counts.values()) == total_shards

        ps.restart()
-        poll_node_status(ps.id, "Active", max_attempts=10, backoff=1)
+        poll_node_status(env, ps.id, "Active", max_attempts=10, backoff=1)

        retryable_node_operation(
            lambda ps_id: env.storage_controller.node_fill(ps_id), ps.id, max_attempts=3, backoff=2
        )
-        poll_node_status(ps.id, "Active", max_attempts=6, backoff=5)
+        poll_node_status(env, ps.id, "Active", max_attempts=6, backoff=5)

        shard_counts = get_node_shard_counts(env, tenant_ids)
        log.info(f"Shard counts after filling node {ps.id}: {shard_counts}")
@@ -1627,3 +1629,43 @@ def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder):
    shard_counts = get_node_shard_counts(env, tenant_ids)
    log.info(f"Shard counts after rolling restart: {shard_counts}")
    assert_shard_counts_balanced(env, shard_counts, total_shards)
+
+
+def test_background_operation_cancellation(neon_env_builder: NeonEnvBuilder):
+    neon_env_builder.num_pageservers = 2
+    env = neon_env_builder.init_configs()
+    env.start()
+
+    tenant_count = 5
+    shard_count_per_tenant = 8
+    tenant_ids = []
+
+    for _ in range(0, tenant_count):
+        tid = TenantId.generate()
+        tenant_ids.append(tid)
+        env.neon_cli.create_tenant(
+            tid, placement_policy='{"Attached":1}', shard_count=shard_count_per_tenant
+        )
+
+    # See sleep comment in the test above.
+    time.sleep(2)
+
+    nodes = env.storage_controller.node_list()
+    assert len(nodes) == 2
+
+    env.storage_controller.configure_failpoints(("sleepy-drain-loop", "return(2000)"))
+
+    ps_id_to_drain = env.pageservers[0].id
+
+    retryable_node_operation(
+        lambda ps_id: env.storage_controller.node_drain(ps_id),
+        ps_id_to_drain,
+        max_attempts=3,
+        backoff=2,
+    )
+
+    poll_node_status(env, ps_id_to_drain, "Draining", max_attempts=6, backoff=2)
+
+    env.storage_controller.cancel_node_drain(ps_id_to_drain)
+
+    poll_node_status(env, ps_id_to_drain, "Active", max_attempts=6, backoff=2)
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -25,6 +25,7 @@ axum = { version = "0.6", features = ["ws"] }
 base64 = { version = "0.21", features = ["alloc"] }
 base64ct = { version = "1", default-features = false, features = ["std"] }
 bytes = { version = "1", features = ["serde"] }
+camino = { version = "1", default-features = false, features = ["serde1"] }
 chrono = { version = "0.4", default-features = false, features = ["clock", "serde", "wasmbind"] }
 clap = { version = "4", features = ["derive", "string"] }
 clap_builder = { version = "4", default-features = false, features = ["color", "help", "std", "string", "suggestions", "usage"] }
Author	SHA1	Message	Date
Christian Schwarz	c8a3b6f930	the fix: shutdown method for WalRedoManager, so the Arc<> can outlive the process	2024-06-24 19:29:31 +00:00
Christian Schwarz	6779c908eb	Revert "WIP: solution approach 1: propagate cancellationtoken from tenant" This reverts commit `5202d2dc98`.	2024-06-24 19:04:02 +00:00
Christian Schwarz	5202d2dc98	WIP: solution approach 1: propagate cancellationtoken from tenant not enough, we need its gate, and there's no concept of child gate	2024-06-24 19:03:30 +00:00
Christian Schwarz	d7e0c99616	trim down the PR to just keeping track of walredo processes	2024-06-24 18:46:29 +00:00
Christian Schwarz	7507d137de	WIP	2024-06-24 18:25:57 +00:00
Christian Schwarz	34f42669fa	WIP	2024-06-24 17:56:36 +00:00
Christian Schwarz	943220df9b	implement a global gate + cancellation mechanism for live walredo processes, hook up to shutdown	2024-06-24 17:18:54 +00:00
Christian Schwarz	5e0ef715aa	add distinguished "Cancelled" error for walredo (don't use it yet)	2024-06-24 16:05:04 +00:00
John Spray	de05f90735	pageserver: add more info-level logging in shard splits (#8137 ) ## Problem `test_sharding_autosplit` is occasionally failing on warnings about shard splits taking longer than expected (`Exclusive lock by ShardSplit was held for`...) It's not obvious which part is taking the time (I suspect remote storage uploads). Example: https://neon-github-public-dev.s3.amazonaws.com/reports/main/9618788427/index.html#testresult/b395294d5bdeb783/ ## Summary of changes - Since shard splits are infrequent events, we can afford to be very chatty: add a bunch of info-level logging throughout the process.	2024-06-24 11:53:43 +01:00
John Spray	188797f048	pageserver: remove code that resumes tenant deletions after restarts (#8091 ) #8082 removed the legacy deletion path, but retained code for completing deletions that were started before a pageserver restart. This PR cleans up that remaining code, and removes all the pageserver code that dealt with tenant deletion markers and resuming tenant deletions. The release at https://github.com/neondatabase/neon/pull/8138 contains https://github.com/neondatabase/neon/pull/8082, so we can now merge this to `main`	2024-06-24 11:41:11 +01:00
Arpad Müller	5446e08891	Move remote_storage config related code into dedicated module (#8132 ) Moves `RemoteStorageConfig` and related structs and functions into a dedicated module. Also implements `Serialize` for the config structs (requested in #8126). Follow-up of #8126	2024-06-24 12:29:54 +02:00
Conrad Ludgate	78d9059fc7	proxy: update tokio-postgres to allow arbitrary config params (#8076 ) ## Problem Fixes https://github.com/neondatabase/neon/issues/1287 ## Summary of changes tokio-postgres now supports arbitrary server params through the `param(key, value)` method. Some keys are special so we explicitly filter them out.	2024-06-24 10:20:27 +00:00
Arpad Müller	75747cdbff	Use serde for RemoteStorageConfig parsing (#8126 ) Adds a `Deserialize` impl to `RemoteStorageConfig`. We thus achieve the same as #7743 but with less repetitive code, by deriving `Deserialize` impls on `S3Config`, `AzureConfig`, and `RemoteStorageConfig`. The disadvantage is less useful error messages. The git history of this PR contains a state where we go via an intermediate representation, leveraging the `serde_json` crate, without it ever being actual json though. Also, the PR adds deserialization tests. Alternative to #7743 .	2024-06-22 17:57:09 +00:00
Vlad Lazar	8fe3f17c47	storcon: improve drain and fill shard placement (#8119 ) ## Problem While adapting the storage controller scale test to do graceful rolling restarts via drain and fill, I noticed that secondaries are also being rescheduled, which, in turn, caused the storage controller to optimise attachments. ## Summary of changes * Introduce a transactional looking rescheduling primitive (i.e. "try to schedule to this secondary, but leave everything as is if you can't") * Use it for the drain and fill stages to avoid calling into `Scheduler::schedule` and having secondaries move around.	2024-06-22 14:20:58 +00:00
Anastasia Lubennikova	8776089c70	Remove kq_imcx extension support per customer request neondatabase/cloud#13648	2024-06-21 20:22:54 +01:00
John Spray	b74232eb4d	tests: allow-list neon_local endpoint errors from storage controller (#8123 ) ## Problem For testing, the storage controller has a built-in hack that loads neon_local endpoint config from disk, and uses it to reconfigure endpoints when the attached pageserver changes. Some tests that stop an endpoint while the storage controller is running could occasionally fail on log errors from the controller trying to use its special test-mode calls into neon local Endpoint. Example: https://neon-github-public-dev.s3.amazonaws.com/reports/pr-8117/9592392425/index.html#/testresult/9d2bb8623d0d53f8 ## Summary of changes - Give NotifyError an explicit NeonLocal variant, to avoid munging these into generic 500s (I don't want to ignore 500s in general) - Allow-list errors related to the local notification hook. The expectation is that tests using endpoints/workloads should be independently checking that those endpoints work: if neon_local generates an error inside the storage controller, that's ignorable.	2024-06-21 17:23:31 +00:00
Vlad Lazar	ee3081863e	storcon: implement endpoints for cancellation of drain and fill operations (#8029 ) ## Problem There's no way to cancel drain and fill operations. ## Summary of changes Implement HTTP endpoints to allow cancelling of background operations. When the operation is cancelled successfully, the node scheduling policy will revert to `Active`.	2024-06-21 17:13:51 +01:00