proxy: force no retry some errors

2026-02-04 19:20:36 +00:00 · 2023-09-29 17:33:13 +01:00
28 changed files with 247 additions and 562 deletions
--- a/2
+++ b/2
@@ -5,7 +5,7 @@
 /libs/remote_storage/ @neondatabase/storage
 /libs/safekeeper_api/ @neondatabase/safekeepers
 /libs/vm_monitor/ @neondatabase/autoscaling @neondatabase/compute
-/pageserver/ @neondatabase/storage
+/pageserver/ @neondatabase/compute @neondatabase/storage
 /pgxn/ @neondatabase/compute
 /proxy/ @neondatabase/proxy
 /safekeeper/ @neondatabase/safekeepers
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1780,9 +1780,18 @@ checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"

 [[package]]
 name = "hermit-abi"
-version = "0.3.3"
+version = "0.2.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d77f7ec81a6d05a3abb01ab6eb7590f6083d08449fe5a1c8b1e620283546ccb7"
+checksum = "ee512640fe35acbfb4bb779db6f0d80704c2cacfa2e39b601ef3e3f47d1ae4c7"
+dependencies = [
+ "libc",
+]
+
+[[package]]
+name = "hermit-abi"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fed44880c466736ef9a5c5b5facefb5ed0785676d0c02d612db14e54f0d84286"

 [[package]]
 name = "hex"
@@ -2044,7 +2053,7 @@ version = "1.0.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "eae7b9aee968036d54dce06cebaefd919e4472e753296daccd6d344e3e2df0c2"
 dependencies = [
- "hermit-abi",
+ "hermit-abi 0.3.1",
 "libc",
 "windows-sys 0.48.0",
 ]
@@ -2061,7 +2070,7 @@ version = "0.4.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "adcf93614601c8129ddf72e2d5633df827ba6551541c6d8c59520a371475be1f"
 dependencies = [
- "hermit-abi",
+ "hermit-abi 0.3.1",
 "io-lifetimes",
 "rustix 0.37.19",
 "windows-sys 0.48.0",
@@ -2435,11 +2444,11 @@ dependencies = [

 [[package]]
 name = "num_cpus"
-version = "1.16.0"
+version = "1.15.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43"
+checksum = "0fac9e2da13b5eb447a6ce3d392f23a29d8694bff781bf03a16cd9ac8697593b"
 dependencies = [
- "hermit-abi",
+ "hermit-abi 0.2.6",
 "libc",
 ]

--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -615,7 +615,11 @@ RUN wget https://gitlab.com/dalibo/postgresql_anonymizer/-/archive/1.1.0/postgre
 #########################################################################################
 #
 # Layer "rust extensions"
-# This layer is used to build `pgrx` deps
+# This layer is used to build `pgx` deps
+#
+# FIXME: This needs to be updated to latest version of 'pgrx' (it was renamed from
+# 'pgx' to 'pgrx') for PostgreSQL 16. And that in turn requires bumping the pgx
+# dependency on all the rust extension that depend on it, too.
 #
 #########################################################################################
 FROM build-deps AS rust-extensions-build
@@ -631,12 +635,22 @@ USER nonroot
 WORKDIR /home/nonroot
 ARG PG_VERSION

-RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \
+RUN case "${PG_VERSION}" in \
+      "v14" | "v15") \
+        ;; \
+      "v16") \
+        echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \
+	;; \
+      *) \
+        echo "unexpected PostgreSQL version ${PG_VERSION}" && exit 1 \
+        ;; \
+    esac && \
+    curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \
    chmod +x rustup-init && \
    ./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \
    rm rustup-init && \
-    cargo install --locked --version 0.10.2 cargo-pgrx && \
-    /bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config'
+    cargo install --locked --version 0.7.3 cargo-pgx && \
+    /bin/bash -c 'cargo pgx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config'

 USER root

@@ -650,11 +664,23 @@ USER root
 FROM rust-extensions-build AS pg-jsonschema-pg-build
 ARG PG_VERSION

-RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.2.0.tar.gz -O pg_jsonschema.tar.gz && \
-    echo "9118fc508a6e231e7a39acaa6f066fcd79af17a5db757b47d2eefbe14f7794f0 pg_jsonschema.tar.gz" | sha256sum --check && \
+# caeab60d70b2fd3ae421ec66466a3abbb37b7ee6 made on 06/03/2023
+# there is no release tag yet, but we need it due to the superuser fix in the control file, switch to git tag after release >= 0.1.5
+RUN case "${PG_VERSION}" in \
+      "v14" | "v15") \
+        ;; \
+      "v16") \
+        echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \
+	;; \
+      *) \
+        echo "unexpected PostgreSQL version \"${PG_VERSION}\"" && exit 1 \
+        ;; \
+    esac && \
+    wget https://github.com/supabase/pg_jsonschema/archive/caeab60d70b2fd3ae421ec66466a3abbb37b7ee6.tar.gz -O pg_jsonschema.tar.gz && \
+    echo "54129ce2e7ee7a585648dbb4cef6d73f795d94fe72f248ac01119992518469a4 pg_jsonschema.tar.gz" | sha256sum --check && \
    mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xvzf ../pg_jsonschema.tar.gz --strip-components=1 -C . && \
-    sed -i 's/pgrx = "0.10.2"/pgrx = { version = "0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
-    cargo pgrx install --release && \
+    sed -i 's/pgx = "0.7.1"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
+    cargo pgx install --release && \
    echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_jsonschema.control

 #########################################################################################
@@ -667,11 +693,26 @@ RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.2.0.tar.
 FROM rust-extensions-build AS pg-graphql-pg-build
 ARG PG_VERSION

-RUN wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.4.0.tar.gz -O pg_graphql.tar.gz && \
-    echo "bd8dc7230282b3efa9ae5baf053a54151ed0e66881c7c53750e2d0c765776edc pg_graphql.tar.gz" | sha256sum --check && \
+# b4988843647450a153439be367168ed09971af85 made on 22/02/2023 (from remove-pgx-contrib-spiext branch)
+# Currently pgx version bump to >= 0.7.2  causes "call to unsafe function" compliation errors in
+# pgx-contrib-spiext. There is a branch that removes that dependency, so use it. It is on the
+# same 1.1 version we've used before.
+RUN case "${PG_VERSION}" in \
+      "v14" | "v15") \
+        ;; \
+      "v16") \
+        echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \
+	;; \
+      *) \
+        echo "unexpected PostgreSQL version" && exit 1 \
+        ;; \
+    esac && \
+    wget https://github.com/yrashk/pg_graphql/archive/b4988843647450a153439be367168ed09971af85.tar.gz -O pg_graphql.tar.gz && \
+    echo "0c7b0e746441b2ec24187d0e03555faf935c2159e2839bddd14df6dafbc8c9bd pg_graphql.tar.gz" | sha256sum --check && \
    mkdir pg_graphql-src && cd pg_graphql-src && tar xvzf ../pg_graphql.tar.gz --strip-components=1 -C . && \
-    sed -i 's/pgrx = "=0.10.2"/pgrx = { version = "0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
-    cargo pgrx install --release && \
+    sed -i 's/pgx = "~0.7.1"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
+    sed -i 's/pgx-tests = "~0.7.1"/pgx-tests = "0.7.3"/g' Cargo.toml && \
+    cargo pgx install --release && \
    # it's needed to enable extension because it uses untrusted C language
    sed -i 's/superuser = false/superuser = true/g' /usr/local/pgsql/share/extension/pg_graphql.control && \
    echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_graphql.control
@@ -686,11 +727,21 @@ RUN wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.4.0.tar.gz
 FROM rust-extensions-build AS pg-tiktoken-pg-build
 ARG PG_VERSION

-# 26806147b17b60763039c6a6878884c41a262318 made on 26/09/2023
-RUN wget https://github.com/kelvich/pg_tiktoken/archive/26806147b17b60763039c6a6878884c41a262318.tar.gz -O pg_tiktoken.tar.gz && \
-    echo "e64e55aaa38c259512d3e27c572da22c4637418cf124caba904cd50944e5004e pg_tiktoken.tar.gz" | sha256sum --check && \
+# 801f84f08c6881c8aa30f405fafbf00eec386a72 made on 10/03/2023
+RUN case "${PG_VERSION}" in \
+      "v14" | "v15") \
+        ;; \
+      "v16") \
+        echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \
+	;; \
+      *) \
+        echo "unexpected PostgreSQL version" && exit 1 \
+        ;; \
+    esac && \
+    wget https://github.com/kelvich/pg_tiktoken/archive/801f84f08c6881c8aa30f405fafbf00eec386a72.tar.gz -O pg_tiktoken.tar.gz && \
+    echo "52f60ac800993a49aa8c609961842b611b6b1949717b69ce2ec9117117e16e4a pg_tiktoken.tar.gz" | sha256sum --check && \
    mkdir pg_tiktoken-src && cd pg_tiktoken-src && tar xvzf ../pg_tiktoken.tar.gz --strip-components=1 -C . && \
-    cargo pgrx install --release && \
+    cargo pgx install --release && \
    echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_tiktoken.control

 #########################################################################################
@@ -703,15 +754,21 @@ RUN wget https://github.com/kelvich/pg_tiktoken/archive/26806147b17b60763039c6a6
 FROM rust-extensions-build AS pg-pgx-ulid-build
 ARG PG_VERSION

-RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.3.tar.gz -O pgx_ulid.tar.gz && \
-    echo "ee5db82945d2d9f2d15597a80cf32de9dca67b897f605beb830561705f12683c pgx_ulid.tar.gz" | sha256sum --check && \
+RUN case "${PG_VERSION}" in \
+      "v14" | "v15") \
+        ;; \
+      "v16") \
+        echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \
+	;; \
+      *) \
+        echo "unexpected PostgreSQL version" && exit 1 \
+        ;; \
+    esac && \
+    wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.0.tar.gz -O pgx_ulid.tar.gz && \
+    echo "908b7358e6f846e87db508ae5349fb56a88ee6305519074b12f3d5b0ff09f791 pgx_ulid.tar.gz" | sha256sum --check && \
    mkdir pgx_ulid-src && cd pgx_ulid-src && tar xvzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \
-    echo "******************* Apply a patch for Postgres 16 support; delete in the next release ******************" && \
-    wget https://github.com/pksunkara/pgx_ulid/commit/f84954cf63fc8c80d964ac970d9eceed3c791196.patch && \
-    patch -p1 < f84954cf63fc8c80d964ac970d9eceed3c791196.patch && \
-    echo "********************************************************************************************************" && \
-    sed -i 's/pgrx       = "=0.10.2"/pgrx = { version = "=0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
-    cargo pgrx install --release && \
+    sed -i 's/pgx        = "=0.7.3"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
+    cargo pgx install --release && \
    echo "trusted = true" >> /usr/local/pgsql/share/extension/ulid.control

 #########################################################################################
--- a/compute_tools/src/monitor.rs
+++ b/compute_tools/src/monitor.rs
@@ -1,5 +1,5 @@
 use std::sync::Arc;
-use std::{thread, time::Duration};
+use std::{thread, time};

 use chrono::{DateTime, Utc};
 use postgres::{Client, NoTls};
@@ -7,7 +7,7 @@ use tracing::{debug, info};

 use crate::compute::ComputeNode;

-const MONITOR_CHECK_INTERVAL: Duration = Duration::from_millis(500);
+const MONITOR_CHECK_INTERVAL: u64 = 500; // milliseconds

 // Spin in a loop and figure out the last activity time in the Postgres.
 // Then update it in the shared state. This function never errors out.
@@ -17,12 +17,13 @@ fn watch_compute_activity(compute: &ComputeNode) {
    let connstr = compute.connstr.as_str();
    // Define `client` outside of the loop to reuse existing connection if it's active.
    let mut client = Client::connect(connstr, NoTls);
+    let timeout = time::Duration::from_millis(MONITOR_CHECK_INTERVAL);

    info!("watching Postgres activity at {}", connstr);

    loop {
        // Should be outside of the write lock to allow others to read while we sleep.
-        thread::sleep(MONITOR_CHECK_INTERVAL);
+        thread::sleep(timeout);

        match &mut client {
            Ok(cli) => {
--- a/deny.toml
+++ b/deny.toml
@@ -23,7 +23,7 @@ vulnerability = "deny"
 unmaintained = "warn"
 yanked = "warn"
 notice = "warn"
-ignore = []
+ignore = ["RUSTSEC-2023-0052"]

 # This section is considered when running `cargo deny check licenses`
 # More documentation for the licenses section can be found here:
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -47,47 +47,10 @@ pub struct S3Bucket {
    bucket_name: String,
    prefix_in_bucket: Option<String>,
    max_keys_per_list_response: Option<i32>,
-    concurrency_limiter: ConcurrencyLimiter,
-}
-
-struct ConcurrencyLimiter {
    // Every request to S3 can be throttled or cancelled, if a certain number of requests per second is exceeded.
    // Same goes to IAM, which is queried before every S3 request, if enabled. IAM has even lower RPS threshold.
    // The helps to ensure we don't exceed the thresholds.
-    write: Arc<Semaphore>,
-    read: Arc<Semaphore>,
-}
-
-impl ConcurrencyLimiter {
-    fn for_kind(&self, kind: RequestKind) -> &Arc<Semaphore> {
-        match kind {
-            RequestKind::Get => &self.read,
-            RequestKind::Put => &self.write,
-            RequestKind::List => &self.read,
-            RequestKind::Delete => &self.write,
-        }
-    }
-
-    async fn acquire(
-        &self,
-        kind: RequestKind,
-    ) -> Result<tokio::sync::SemaphorePermit<'_>, tokio::sync::AcquireError> {
-        self.for_kind(kind).acquire().await
-    }
-
-    async fn acquire_owned(
-        &self,
-        kind: RequestKind,
-    ) -> Result<tokio::sync::OwnedSemaphorePermit, tokio::sync::AcquireError> {
-        Arc::clone(self.for_kind(kind)).acquire_owned().await
-    }
-
-    fn new(limit: usize) -> ConcurrencyLimiter {
-        Self {
-            read: Arc::new(Semaphore::new(limit)),
-            write: Arc::new(Semaphore::new(limit)),
-        }
-    }
+    concurrency_limiter: Arc<Semaphore>,
 }

 #[derive(Default)]
@@ -154,7 +117,7 @@ impl S3Bucket {
            bucket_name: aws_config.bucket_name.clone(),
            max_keys_per_list_response: aws_config.max_keys_per_list_response,
            prefix_in_bucket,
-            concurrency_limiter: ConcurrencyLimiter::new(aws_config.concurrency_limit.get()),
+            concurrency_limiter: Arc::new(Semaphore::new(aws_config.concurrency_limit.get())),
        })
    }

@@ -193,7 +156,7 @@ impl S3Bucket {
        let started_at = start_counting_cancelled_wait(kind);
        let permit = self
            .concurrency_limiter
-            .acquire(kind)
+            .acquire()
            .await
            .expect("semaphore is never closed");

@@ -209,7 +172,8 @@ impl S3Bucket {
        let started_at = start_counting_cancelled_wait(kind);
        let permit = self
            .concurrency_limiter
-            .acquire_owned(kind)
+            .clone()
+            .acquire_owned()
            .await
            .expect("semaphore is never closed");

--- a/libs/utils/src/http/error.rs
+++ b/libs/utils/src/http/error.rs
@@ -24,9 +24,6 @@ pub enum ApiError {
    #[error("Precondition failed: {0}")]
    PreconditionFailed(Box<str>),

-    #[error("Resource temporarily unavailable: {0}")]
-    ResourceUnavailable(String),
-
    #[error("Shutting down")]
    ShuttingDown,

@@ -62,10 +59,6 @@ impl ApiError {
                "Shutting down".to_string(),
                StatusCode::SERVICE_UNAVAILABLE,
            ),
-            ApiError::ResourceUnavailable(err) => HttpErrorBody::response_from_msg_and_status(
-                err.to_string(),
-                StatusCode::SERVICE_UNAVAILABLE,
-            ),
            ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status(
                err.to_string(),
                StatusCode::INTERNAL_SERVER_ERROR,
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -132,7 +132,7 @@ impl From<PageReconstructError> for ApiError {
                ApiError::InternalServerError(anyhow::anyhow!("request was cancelled"))
            }
            PageReconstructError::AncestorStopping(_) => {
-                ApiError::ResourceUnavailable(format!("{pre}"))
+                ApiError::InternalServerError(anyhow::Error::new(pre))
            }
            PageReconstructError::WalRedo(pre) => {
                ApiError::InternalServerError(anyhow::Error::new(pre))
@@ -145,7 +145,7 @@ impl From<TenantMapInsertError> for ApiError {
    fn from(tmie: TenantMapInsertError) -> ApiError {
        match tmie {
            TenantMapInsertError::StillInitializing | TenantMapInsertError::ShuttingDown => {
-                ApiError::ResourceUnavailable(format!("{tmie}"))
+                ApiError::InternalServerError(anyhow::Error::new(tmie))
            }
            TenantMapInsertError::TenantAlreadyExists(id, state) => {
                ApiError::Conflict(format!("tenant {id} already exists, state: {state:?}"))
@@ -159,12 +159,6 @@ impl From<TenantStateError> for ApiError {
    fn from(tse: TenantStateError) -> ApiError {
        match tse {
            TenantStateError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()),
-            TenantStateError::NotActive(_) => {
-                ApiError::ResourceUnavailable("Tenant not yet active".into())
-            }
-            TenantStateError::IsStopping(_) => {
-                ApiError::ResourceUnavailable("Tenant is stopping".into())
-            }
            _ => ApiError::InternalServerError(anyhow::Error::new(tse)),
        }
    }
@@ -174,17 +168,14 @@ impl From<GetTenantError> for ApiError {
    fn from(tse: GetTenantError) -> ApiError {
        match tse {
            GetTenantError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()),
-            GetTenantError::Broken(reason) => {
-                ApiError::InternalServerError(anyhow!("tenant is broken: {}", reason))
-            }
-            GetTenantError::NotActive(_) => {
+            e @ GetTenantError::NotActive(_) => {
                // Why is this not `ApiError::NotFound`?
                // Because we must be careful to never return 404 for a tenant if it does
                // in fact exist locally. If we did, the caller could draw the conclusion
                // that it can attach the tenant to another PS and we'd be in split-brain.
                //
                // (We can produce this variant only in `mgr::get_tenant(..., active=true)` calls).
-                ApiError::ResourceUnavailable("Tenant not yet active".into())
+                ApiError::InternalServerError(anyhow::Error::new(e))
            }
        }
    }
@@ -631,9 +622,8 @@ async fn tenant_list_handler(
    let response_data = mgr::list_tenants()
        .instrument(info_span!("tenant_list"))
        .await
-        .map_err(|_| {
-            ApiError::ResourceUnavailable("Tenant map is initializing or shutting down".to_string())
-        })?
+        .map_err(anyhow::Error::new)
+        .map_err(ApiError::InternalServerError)?
        .iter()
        .map(|(id, state)| TenantInfo {
            id: *id,
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -264,46 +264,6 @@ pub static PAGE_CACHE_SIZE: Lazy<PageCacheSizeMetrics> = Lazy::new(|| PageCacheS
    },
 });

-pub(crate) static PAGE_CACHE_ACQUIRE_PINNED_SLOT_TIME: Lazy<Histogram> = Lazy::new(|| {
-    register_histogram!(
-        "pageserver_page_cache_acquire_pinned_slot_seconds",
-        "Time spent acquiring a pinned slot in the page cache",
-        CRITICAL_OP_BUCKETS.into(),
-    )
-    .expect("failed to define a metric")
-});
-
-pub(crate) static PAGE_CACHE_FIND_VICTIMS_ITERS_TOTAL: Lazy<IntCounter> = Lazy::new(|| {
-    register_int_counter!(
-        "pageserver_page_cache_find_victim_iters_total",
-        "Counter for the number of iterations in the find_victim loop",
-    )
-    .expect("failed to define a metric")
-});
-
-static PAGE_CACHE_ERRORS: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
-        "page_cache_errors_total",
-        "Number of timeouts while acquiring a pinned slot in the page cache",
-        &["error_kind"]
-    )
-    .expect("failed to define a metric")
-});
-
-#[derive(IntoStaticStr)]
-#[strum(serialize_all = "kebab_case")]
-pub(crate) enum PageCacheErrorKind {
-    AcquirePinnedSlotTimeout,
-    EvictIterLimit,
-}
-
-pub(crate) fn page_cache_errors_inc(error_kind: PageCacheErrorKind) {
-    PAGE_CACHE_ERRORS
-        .get_metric_with_label_values(&[error_kind.into()])
-        .unwrap()
-        .inc();
-}
-
 pub(crate) static WAIT_LSN_TIME: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
        "pageserver_wait_lsn_seconds",
--- a/pageserver/src/page_cache.rs
+++ b/pageserver/src/page_cache.rs
@@ -75,11 +75,7 @@
 use std::{
    collections::{hash_map::Entry, HashMap},
    convert::TryInto,
-    sync::{
-        atomic::{AtomicU64, AtomicU8, AtomicUsize, Ordering},
-        Arc, Weak,
-    },
-    time::Duration,
+    sync::atomic::{AtomicU64, AtomicU8, AtomicUsize, Ordering},
 };

 use anyhow::Context;
@@ -169,8 +165,6 @@ struct Slot {

 struct SlotInner {
    key: Option<CacheKey>,
-    // for `coalesce_readers_permit`
-    permit: std::sync::Mutex<Weak<PinnedSlotsPermit>>,
    buf: &'static mut [u8; PAGE_SZ],
 }

@@ -213,22 +207,6 @@ impl Slot {
    }
 }

-impl SlotInner {
-    /// If there is aready a reader, drop our permit and share its permit, just like we share read access.
-    fn coalesce_readers_permit(&self, permit: PinnedSlotsPermit) -> Arc<PinnedSlotsPermit> {
-        let mut guard = self.permit.lock().unwrap();
-        if let Some(existing_permit) = guard.upgrade() {
-            drop(guard);
-            drop(permit);
-            existing_permit
-        } else {
-            let permit = Arc::new(permit);
-            *guard = Arc::downgrade(&permit);
-            permit
-        }
-    }
-}
-
 pub struct PageCache {
    /// This contains the mapping from the cache key to buffer slot that currently
    /// contains the page, if any.
@@ -246,8 +224,6 @@ pub struct PageCache {
    /// The actual buffers with their metadata.
    slots: Box<[Slot]>,

-    pinned_slots: Arc<tokio::sync::Semaphore>,
-
    /// Index of the next candidate to evict, for the Clock replacement algorithm.
    /// This is interpreted modulo the page cache size.
    next_evict_slot: AtomicUsize,
@@ -255,28 +231,23 @@ pub struct PageCache {
    size_metrics: &'static PageCacheSizeMetrics,
 }

-struct PinnedSlotsPermit(tokio::sync::OwnedSemaphorePermit);
-
 ///
 /// PageReadGuard is a "lease" on a buffer, for reading. The page is kept locked
 /// until the guard is dropped.
 ///
-pub struct PageReadGuard<'i> {
-    _permit: Arc<PinnedSlotsPermit>,
-    slot_guard: tokio::sync::RwLockReadGuard<'i, SlotInner>,
-}
+pub struct PageReadGuard<'i>(tokio::sync::RwLockReadGuard<'i, SlotInner>);

 impl std::ops::Deref for PageReadGuard<'_> {
    type Target = [u8; PAGE_SZ];

    fn deref(&self) -> &Self::Target {
-        self.slot_guard.buf
+        self.0.buf
    }
 }

 impl AsRef<[u8; PAGE_SZ]> for PageReadGuard<'_> {
    fn as_ref(&self) -> &[u8; PAGE_SZ] {
-        self.slot_guard.buf
+        self.0.buf
    }
 }

@@ -293,8 +264,6 @@ impl AsRef<[u8; PAGE_SZ]> for PageReadGuard<'_> {
 pub struct PageWriteGuard<'i> {
    inner: tokio::sync::RwLockWriteGuard<'i, SlotInner>,

-    _permit: PinnedSlotsPermit,
-
    // Are the page contents currently valid?
    // Used to mark pages as invalid that are assigned but not yet filled with data.
    valid: bool,
@@ -379,10 +348,6 @@ impl PageCache {
        lsn: Lsn,
        ctx: &RequestContext,
    ) -> Option<(Lsn, PageReadGuard)> {
-        let Ok(permit) = self.try_get_pinned_slot_permit().await else {
-            return None;
-        };
-
        crate::metrics::PAGE_CACHE
            .for_ctx(ctx)
            .read_accesses_materialized_page
@@ -397,10 +362,7 @@ impl PageCache {
            lsn,
        };

-        if let Some(guard) = self
-            .try_lock_for_read(&mut cache_key, &mut Some(permit))
-            .await
-        {
+        if let Some(guard) = self.try_lock_for_read(&mut cache_key).await {
            if let CacheKey::MaterializedPage {
                hash_key: _,
                lsn: available_lsn,
@@ -483,29 +445,6 @@ impl PageCache {
    // "mappings" after this section. But the routines in this section should
    // not require changes.

-    async fn try_get_pinned_slot_permit(&self) -> anyhow::Result<PinnedSlotsPermit> {
-        let timer = crate::metrics::PAGE_CACHE_ACQUIRE_PINNED_SLOT_TIME.start_timer();
-        match tokio::time::timeout(
-            // Choose small timeout, neon_smgr does its own retries.
-            // https://neondb.slack.com/archives/C04DGM6SMTM/p1694786876476869
-            Duration::from_secs(10),
-            Arc::clone(&self.pinned_slots).acquire_owned(),
-        )
-        .await
-        {
-            Ok(res) => Ok(PinnedSlotsPermit(
-                res.expect("this semaphore is never closed"),
-            )),
-            Err(_timeout) => {
-                timer.stop_and_discard();
-                crate::metrics::page_cache_errors_inc(
-                    crate::metrics::PageCacheErrorKind::AcquirePinnedSlotTimeout,
-                );
-                anyhow::bail!("timeout: there were page guards alive for all page cache slots")
-            }
-        }
-    }
-
    /// Look up a page in the cache.
    ///
    /// If the search criteria is not exact, *cache_key is updated with the key
@@ -515,11 +454,7 @@ impl PageCache {
    ///
    /// If no page is found, returns None and *cache_key is left unmodified.
    ///
-    async fn try_lock_for_read(
-        &self,
-        cache_key: &mut CacheKey,
-        permit: &mut Option<PinnedSlotsPermit>,
-    ) -> Option<PageReadGuard> {
+    async fn try_lock_for_read(&self, cache_key: &mut CacheKey) -> Option<PageReadGuard> {
        let cache_key_orig = cache_key.clone();
        if let Some(slot_idx) = self.search_mapping(cache_key) {
            // The page was found in the mapping. Lock the slot, and re-check
@@ -529,10 +464,7 @@ impl PageCache {
            let inner = slot.inner.read().await;
            if inner.key.as_ref() == Some(cache_key) {
                slot.inc_usage_count();
-                return Some(PageReadGuard {
-                    _permit: inner.coalesce_readers_permit(permit.take().unwrap()),
-                    slot_guard: inner,
-                });
+                return Some(PageReadGuard(inner));
            } else {
                // search_mapping might have modified the search key; restore it.
                *cache_key = cache_key_orig;
@@ -575,8 +507,6 @@ impl PageCache {
        cache_key: &mut CacheKey,
        ctx: &RequestContext,
    ) -> anyhow::Result<ReadBufResult> {
-        let mut permit = Some(self.try_get_pinned_slot_permit().await?);
-
        let (read_access, hit) = match cache_key {
            CacheKey::MaterializedPage { .. } => {
                unreachable!("Materialized pages use lookup_materialized_page")
@@ -593,21 +523,17 @@ impl PageCache {
        let mut is_first_iteration = true;
        loop {
            // First check if the key already exists in the cache.
-            if let Some(read_guard) = self.try_lock_for_read(cache_key, &mut permit).await {
-                debug_assert!(permit.is_none());
+            if let Some(read_guard) = self.try_lock_for_read(cache_key).await {
                if is_first_iteration {
                    hit.inc();
                }
                return Ok(ReadBufResult::Found(read_guard));
            }
-            debug_assert!(permit.is_some());
            is_first_iteration = false;

            // Not found. Find a victim buffer
-            let (slot_idx, mut inner) = self
-                .find_victim(permit.as_ref().unwrap())
-                .await
-                .context("Failed to find evict victim")?;
+            let (slot_idx, mut inner) =
+                self.find_victim().context("Failed to find evict victim")?;

            // Insert mapping for this. At this point, we may find that another
            // thread did the same thing concurrently. In that case, we evicted
@@ -629,16 +555,7 @@ impl PageCache {
            inner.key = Some(cache_key.clone());
            slot.set_usage_count(1);

-            debug_assert!(
-                {
-                    let guard = inner.permit.lock().unwrap();
-                    guard.upgrade().is_none()
-                },
-                "we hold a write lock, so, no one else should have a permit"
-            );
-
            return Ok(ReadBufResult::NotFound(PageWriteGuard {
-                _permit: permit.take().unwrap(),
                inner,
                valid: false,
            }));
@@ -649,11 +566,7 @@ impl PageCache {
    /// found, returns None.
    ///
    /// When locking a page for writing, the search criteria is always "exact".
-    async fn try_lock_for_write(
-        &self,
-        cache_key: &CacheKey,
-        permit: &mut Option<PinnedSlotsPermit>,
-    ) -> Option<PageWriteGuard> {
+    async fn try_lock_for_write(&self, cache_key: &CacheKey) -> Option<PageWriteGuard> {
        if let Some(slot_idx) = self.search_mapping_for_write(cache_key) {
            // The page was found in the mapping. Lock the slot, and re-check
            // that it's still what we expected (because we don't released the mapping
@@ -662,18 +575,7 @@ impl PageCache {
            let inner = slot.inner.write().await;
            if inner.key.as_ref() == Some(cache_key) {
                slot.inc_usage_count();
-                debug_assert!(
-                    {
-                        let guard = inner.permit.lock().unwrap();
-                        guard.upgrade().is_none()
-                    },
-                    "we hold a write lock, so, no one else should have a permit"
-                );
-                return Some(PageWriteGuard {
-                    _permit: permit.take().unwrap(),
-                    inner,
-                    valid: true,
-                });
+                return Some(PageWriteGuard { inner, valid: true });
            }
        }
        None
@@ -684,20 +586,15 @@ impl PageCache {
    /// Similar to lock_for_read(), but the returned buffer is write-locked and
    /// may be modified by the caller even if it's already found in the cache.
    async fn lock_for_write(&self, cache_key: &CacheKey) -> anyhow::Result<WriteBufResult> {
-        let mut permit = Some(self.try_get_pinned_slot_permit().await?);
        loop {
            // First check if the key already exists in the cache.
-            if let Some(write_guard) = self.try_lock_for_write(cache_key, &mut permit).await {
-                debug_assert!(permit.is_none());
+            if let Some(write_guard) = self.try_lock_for_write(cache_key).await {
                return Ok(WriteBufResult::Found(write_guard));
            }
-            debug_assert!(permit.is_some());

            // Not found. Find a victim buffer
-            let (slot_idx, mut inner) = self
-                .find_victim(permit.as_ref().unwrap())
-                .await
-                .context("Failed to find evict victim")?;
+            let (slot_idx, mut inner) =
+                self.find_victim().context("Failed to find evict victim")?;

            // Insert mapping for this. At this point, we may find that another
            // thread did the same thing concurrently. In that case, we evicted
@@ -719,16 +616,7 @@ impl PageCache {
            inner.key = Some(cache_key.clone());
            slot.set_usage_count(1);

-            debug_assert!(
-                {
-                    let guard = inner.permit.lock().unwrap();
-                    guard.upgrade().is_none()
-                },
-                "we hold a write lock, so, no one else should have a permit"
-            );
-
            return Ok(WriteBufResult::NotFound(PageWriteGuard {
-                _permit: permit.take().unwrap(),
                inner,
                valid: false,
            }));
@@ -881,10 +769,7 @@ impl PageCache {
    /// Find a slot to evict.
    ///
    /// On return, the slot is empty and write-locked.
-    async fn find_victim(
-        &self,
-        _permit_witness: &PinnedSlotsPermit,
-    ) -> anyhow::Result<(usize, tokio::sync::RwLockWriteGuard<SlotInner>)> {
+    fn find_victim(&self) -> anyhow::Result<(usize, tokio::sync::RwLockWriteGuard<SlotInner>)> {
        let iter_limit = self.slots.len() * 10;
        let mut iters = 0;
        loop {
@@ -897,40 +782,13 @@ impl PageCache {
                let mut inner = match slot.inner.try_write() {
                    Ok(inner) => inner,
                    Err(_err) => {
+                        // If we have looped through the whole buffer pool 10 times
+                        // and still haven't found a victim buffer, something's wrong.
+                        // Maybe all the buffers were in locked. That could happen in
+                        // theory, if you have more threads holding buffers locked than
+                        // there are buffers in the pool. In practice, with a reasonably
+                        // large buffer pool it really shouldn't happen.
                        if iters > iter_limit {
-                            // NB: Even with the permits, there's no hard guarantee that we will find a slot with
-                            // any particular number of iterations: other threads might race ahead and acquire and
-                            // release pins just as we're scanning the array.
-                            //
-                            // Imagine that nslots is 2, and as starting point, usage_count==1 on all
-                            // slots. There are two threads running concurrently, A and B. A has just
-                            // acquired the permit from the semaphore.
-                            //
-                            //   A: Look at slot 1. Its usage_count == 1, so decrement it to zero, and continue the search
-                            //   B: Acquire permit.
-                            //   B: Look at slot 2, decrement its usage_count to zero and continue the search
-                            //   B: Look at slot 1. Its usage_count is zero, so pin it and bump up its usage_count to 1.
-                            //   B: Release pin and permit again
-                            //   B: Acquire permit.
-                            //   B: Look at slot 2. Its usage_count is zero, so pin it and bump up its usage_count to 1.
-                            //   B: Release pin and permit again
-                            //
-                            // Now we're back in the starting situation that both slots have
-                            // usage_count 1, but A has now been through one iteration of the
-                            // find_victim() loop. This can repeat indefinitely and on each
-                            // iteration, A's iteration count increases by one.
-                            //
-                            // So, even though the semaphore for the permits is fair, the victim search
-                            // itself happens in parallel and is not fair.
-                            // Hence even with a permit, a task can theoretically be starved.
-                            // To avoid this, we'd need tokio to give priority to tasks that are holding
-                            // permits for longer.
-                            // Note that just yielding to tokio during iteration without such
-                            // priority boosting is likely counter-productive. We'd just give more opportunities
-                            // for B to bump usage count, further starving A.
-                            crate::metrics::page_cache_errors_inc(
-                                crate::metrics::PageCacheErrorKind::EvictIterLimit,
-                            );
                            anyhow::bail!("exceeded evict iter limit");
                        }
                        continue;
@@ -941,7 +799,6 @@ impl PageCache {
                    self.remove_mapping(old_key);
                    inner.key = None;
                }
-                crate::metrics::PAGE_CACHE_FIND_VICTIMS_ITERS_TOTAL.inc_by(iters as u64);
                return Ok((slot_idx, inner));
            }
        }
@@ -969,11 +826,7 @@ impl PageCache {
                let buf: &mut [u8; PAGE_SZ] = chunk.try_into().unwrap();

                Slot {
-                    inner: tokio::sync::RwLock::new(SlotInner {
-                        key: None,
-                        buf,
-                        permit: std::sync::Mutex::new(Weak::new()),
-                    }),
+                    inner: tokio::sync::RwLock::new(SlotInner { key: None, buf }),
                    usage_count: AtomicU8::new(0),
                }
            })
@@ -985,7 +838,6 @@ impl PageCache {
            slots,
            next_evict_slot: AtomicUsize::new(0),
            size_metrics,
-            pinned_slots: Arc::new(tokio::sync::Semaphore::new(num_pages)),
        }
    }
 }
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -1265,10 +1265,7 @@ async fn get_active_tenant_with_timeout(
        Ok(tenant) => tenant,
        Err(e @ GetTenantError::NotFound(_)) => return Err(GetActiveTenantError::NotFound(e)),
        Err(GetTenantError::NotActive(_)) => {
-            unreachable!("we're calling get_tenant with active_only=false")
-        }
-        Err(GetTenantError::Broken(_)) => {
-            unreachable!("we're calling get_tenant with active_only=false")
+            unreachable!("we're calling get_tenant with active=false")
        }
    };
    let wait_time = Duration::from_secs(30);
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -130,15 +130,10 @@ pub async fn init_tenant_mgr(
        // deletion list entries may still be valid.  We provide that by pushing a recovery operation into
        // the queue. Sequential processing of te queue ensures that recovery is done before any new tenant deletions
        // are processed, even though we don't block on recovery completing here.
-        //
-        // Must only do this if remote storage is enabled, otherwise deletion queue
-        // is not running and channel push will fail.
-        if resources.remote_storage.is_some() {
-            resources
-                .deletion_queue_client
-                .recover(result.clone())
-                .await?;
-        }
+        resources
+            .deletion_queue_client
+            .recover(result.clone())
+            .await?;

        Some(result)
    } else {
@@ -510,11 +505,6 @@ pub enum GetTenantError {
    NotFound(TenantId),
    #[error("Tenant {0} is not active")]
    NotActive(TenantId),
-    /// Broken is logically a subset of NotActive, but a distinct error is useful as
-    /// NotActive is usually a retryable state for API purposes, whereas Broken
-    /// is a stuck error state
-    #[error("Tenant is broken: {0}")]
-    Broken(String),
 }

 /// Gets the tenant from the in-memory data, erroring if it's absent or is not fitting to the query.
@@ -529,20 +519,10 @@ pub async fn get_tenant(
    let tenant = m
        .get(&tenant_id)
        .ok_or(GetTenantError::NotFound(tenant_id))?;
-
-    match tenant.current_state() {
-        TenantState::Broken {
-            reason,
-            backtrace: _,
-        } if active_only => Err(GetTenantError::Broken(reason)),
-        TenantState::Active => Ok(Arc::clone(tenant)),
-        _ => {
-            if active_only {
-                Err(GetTenantError::NotActive(tenant_id))
-            } else {
-                Ok(Arc::clone(tenant))
-            }
-        }
+    if active_only && !tenant.is_active() {
+        Err(GetTenantError::NotActive(tenant_id))
+    } else {
+        Ok(Arc::clone(tenant))
    }
 }

--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -496,36 +496,13 @@ impl Timeline {
        };

        let timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME.start_timer();
-        let path = self
-            .get_reconstruct_data(key, lsn, &mut reconstruct_state, ctx)
+        self.get_reconstruct_data(key, lsn, &mut reconstruct_state, ctx)
            .await?;
        timer.stop_and_record();

-        let timer = RECONSTRUCT_TIME.start_timer();
-        let res = self.reconstruct_value(key, lsn, reconstruct_state).await;
-        timer.stop_and_record();
-
-        if cfg!(feature = "testing") && res.is_err() {
-            // it can only be walredo issue
-            use std::fmt::Write;
-
-            let mut msg = String::new();
-
-            path.into_iter().for_each(|(res, cont_lsn, layer)| {
-                writeln!(
-                    msg,
-                    "- layer traversal: result {res:?}, cont_lsn {cont_lsn}, layer: {}",
-                    layer(),
-                )
-                .expect("string grows")
-            });
-
-            // this is to rule out or provide evidence that we could in some cases read a duplicate
-            // walrecord
-            tracing::info!("walredo failed, path:\n{msg}");
-        }
-
-        res
+        RECONSTRUCT_TIME
+            .observe_closure_duration(|| self.reconstruct_value(key, lsn, reconstruct_state))
+            .await
    }

    /// Get last or prev record separately. Same as get_last_record_rlsn().last/prev.
@@ -2247,7 +2224,7 @@ impl Timeline {
        request_lsn: Lsn,
        reconstruct_state: &mut ValueReconstructState,
        ctx: &RequestContext,
-    ) -> Result<Vec<TraversalPathItem>, PageReconstructError> {
+    ) -> Result<(), PageReconstructError> {
        // Start from the current timeline.
        let mut timeline_owned;
        let mut timeline = self;
@@ -2278,12 +2255,12 @@ impl Timeline {
            // The function should have updated 'state'
            //info!("CALLED for {} at {}: {:?} with {} records, cached {}", key, cont_lsn, result, reconstruct_state.records.len(), cached_lsn);
            match result {
-                ValueReconstructResult::Complete => return Ok(traversal_path),
+                ValueReconstructResult::Complete => return Ok(()),
                ValueReconstructResult::Continue => {
                    // If we reached an earlier cached page image, we're done.
                    if cont_lsn == cached_lsn + 1 {
                        MATERIALIZED_PAGE_CACHE_HIT.inc_by(1);
-                        return Ok(traversal_path);
+                        return Ok(());
                    }
                    if prev_lsn <= cont_lsn {
                        // Didn't make any progress in last iteration. Error out to avoid
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -177,19 +177,19 @@ impl OpenFiles {
 pub enum CrashsafeOverwriteError {
    #[error("final path has no parent dir")]
    FinalPathHasNoParentDir,
-    #[error("remove tempfile")]
+    #[error("remove tempfile: {0}")]
    RemovePreviousTempfile(#[source] std::io::Error),
-    #[error("create tempfile")]
+    #[error("create tempfile: {0}")]
    CreateTempfile(#[source] std::io::Error),
-    #[error("write tempfile")]
+    #[error("write tempfile: {0}")]
    WriteContents(#[source] std::io::Error),
-    #[error("sync tempfile")]
+    #[error("sync tempfile: {0}")]
    SyncTempfile(#[source] std::io::Error),
-    #[error("rename tempfile to final path")]
+    #[error("rename tempfile to final path: {0}")]
    RenameTempfileToFinalPath(#[source] std::io::Error),
-    #[error("open final path parent dir")]
+    #[error("open final path parent dir: {0}")]
    OpenFinalPathParentDir(#[source] std::io::Error),
-    #[error("sync final path parent dir")]
+    #[error("sync final path parent dir: {0}")]
    SyncFinalPathParentDir(#[source] std::io::Error),
 }
 impl CrashsafeOverwriteError {
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -38,9 +38,6 @@ use tracing::*;
 use utils::crashsafe::path_with_suffix_extension;
 use utils::{bin_ser::BeSer, id::TenantId, lsn::Lsn, nonblock::set_nonblock};

-#[cfg(feature = "testing")]
-use std::sync::atomic::{AtomicUsize, Ordering};
-
 use crate::metrics::{
    WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_RECORD_COUNTER, WAL_REDO_TIME,
    WAL_REDO_WAIT_TIME,
@@ -116,9 +113,6 @@ struct ProcessOutput {
 pub struct PostgresRedoManager {
    tenant_id: TenantId,
    conf: &'static PageServerConf,
-    /// Counter to separate same sized walredo inputs failing at the same millisecond.
-    #[cfg(feature = "testing")]
-    dump_sequence: AtomicUsize,

    stdout: Mutex<Option<ProcessOutput>>,
    stdin: Mutex<Option<ProcessInput>>,
@@ -230,8 +224,6 @@ impl PostgresRedoManager {
        PostgresRedoManager {
            tenant_id,
            conf,
-            #[cfg(feature = "testing")]
-            dump_sequence: AtomicUsize::default(),
            stdin: Mutex::new(None),
            stdout: Mutex::new(None),
            stderr: Mutex::new(None),
@@ -298,25 +290,25 @@ impl PostgresRedoManager {
            WAL_REDO_BYTES_HISTOGRAM.observe(nbytes as f64);

            debug!(
-                "postgres applied {} WAL records ({} bytes) in {} us to reconstruct page image at LSN {}",
-                len,
-                nbytes,
-                duration.as_micros(),
-                lsn
-            );
+				"postgres applied {} WAL records ({} bytes) in {} us to reconstruct page image at LSN {}",
+				len,
+				nbytes,
+				duration.as_micros(),
+				lsn
+			);

            // If something went wrong, don't try to reuse the process. Kill it, and
            // next request will launch a new one.
            if result.is_err() {
                error!(
-                    "error applying {} WAL records {}..{} ({} bytes) to base image with LSN {} to reconstruct page image at LSN {}",
-                    records.len(),
-                    records.first().map(|p| p.0).unwrap_or(Lsn(0)),
-                    records.last().map(|p| p.0).unwrap_or(Lsn(0)),
-                    nbytes,
-                    base_img_lsn,
-                    lsn
-                );
+                "error applying {} WAL records {}..{} ({} bytes) to base image with LSN {} to reconstruct page image at LSN {}",
+                records.len(),
+				records.first().map(|p| p.0).unwrap_or(Lsn(0)),
+				records.last().map(|p| p.0).unwrap_or(Lsn(0)),
+                nbytes,
+				base_img_lsn,
+                lsn
+            );
                // self.stdin only holds stdin & stderr as_raw_fd().
                // Dropping it as part of take() doesn't close them.
                // The owning objects (ChildStdout and ChildStderr) are stored in
@@ -750,7 +742,7 @@ impl PostgresRedoManager {
    #[instrument(skip_all, fields(tenant_id=%self.tenant_id, pid=%input.as_ref().unwrap().child.id()))]
    fn apply_wal_records(
        &self,
-        input: MutexGuard<Option<ProcessInput>>,
+        mut input: MutexGuard<Option<ProcessInput>>,
        tag: BufferTag,
        base_img: &Option<Bytes>,
        records: &[(Lsn, NeonWalRecord)],
@@ -787,23 +779,6 @@ impl PostgresRedoManager {
        build_get_page_msg(tag, &mut writebuf);
        WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64);

-        let res = self.apply_wal_records0(&writebuf, input, wal_redo_timeout);
-
-        if res.is_err() {
-            // not all of these can be caused by this particular input, however these are so rare
-            // in tests so capture all.
-            self.record_and_log(&writebuf);
-        }
-
-        res
-    }
-
-    fn apply_wal_records0(
-        &self,
-        writebuf: &[u8],
-        mut input: MutexGuard<Option<ProcessInput>>,
-        wal_redo_timeout: Duration,
-    ) -> Result<Bytes, std::io::Error> {
        let proc = input.as_mut().unwrap();
        let mut nwrite = 0usize;
        let stdout_fd = proc.stdout_fd;
@@ -1009,38 +984,6 @@ impl PostgresRedoManager {
        }
        Ok(res)
    }
-
-    #[cfg(feature = "testing")]
-    fn record_and_log(&self, writebuf: &[u8]) {
-        let millis = std::time::SystemTime::now()
-            .duration_since(std::time::SystemTime::UNIX_EPOCH)
-            .unwrap()
-            .as_millis();
-
-        let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed);
-
-        // these files will be collected to an allure report
-        let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len());
-
-        let path = self.conf.tenant_path(&self.tenant_id).join(&filename);
-
-        let res = std::fs::OpenOptions::new()
-            .write(true)
-            .create_new(true)
-            .read(true)
-            .open(path)
-            .and_then(|mut f| f.write_all(writebuf));
-
-        // trip up allowed_errors
-        if let Err(e) = res {
-            tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}");
-        } else {
-            tracing::error!(filename, "erroring walredo input saved");
-        }
-    }
-
-    #[cfg(not(feature = "testing"))]
-    fn record_and_log(&self, _: &[u8]) {}
 }

 /// Wrapper type around `std::process::Child` which guarantees that the child
--- a/pgxn/neon/control_plane_connector.c
+++ b/pgxn/neon/control_plane_connector.c
@@ -741,13 +741,6 @@ NeonProcessUtility(
 			break;
 		case T_DropdbStmt:
 			HandleDropDb(castNode(DropdbStmt, parseTree));
-			/*
-			 * We do this here to hack around the fact that Postgres performs the drop
-			 * INSIDE of standard_ProcessUtility, which means that if we try to
-			 * abort the drop normally it'll be too late. DROP DATABASE can't be inside
-			 * of a transaction block anyway, so this should be fine to do.
-			 */
-			NeonXactCallback(XACT_EVENT_PRE_COMMIT, NULL);
 			break;
 		case T_CreateRoleStmt:
 			HandleCreateRole(castNode(CreateRoleStmt, parseTree));
--- a/poetry.lock
+++ b/poetry.lock
@@ -2415,18 +2415,18 @@ files = [

 [[package]]
 name = "urllib3"
-version = "1.26.17"
+version = "1.26.11"
 description = "HTTP library with thread-safe connection pooling, file post, and more."
 optional = false
-python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*"
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, <4"
 files = [
-    {file = "urllib3-1.26.17-py2.py3-none-any.whl", hash = "sha256:94a757d178c9be92ef5539b8840d48dc9cf1b2709c9d6b588232a055c524458b"},
-    {file = "urllib3-1.26.17.tar.gz", hash = "sha256:24d6a242c28d29af46c3fae832c36db3bbebcc533dd1bb549172cd739c82df21"},
+    {file = "urllib3-1.26.11-py2.py3-none-any.whl", hash = "sha256:c33ccba33c819596124764c23a97d25f32b28433ba0dedeb77d873a38722c9bc"},
+    {file = "urllib3-1.26.11.tar.gz", hash = "sha256:ea6e8fb210b19d950fab93b60c9009226c63a28808bc8386e05301e25883ac0a"},
 ]

 [package.extras]
-brotli = ["brotli (==1.0.9)", "brotli (>=1.0.9)", "brotlicffi (>=0.8.0)", "brotlipy (>=0.6.0)"]
-secure = ["certifi", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "ipaddress", "pyOpenSSL (>=0.14)", "urllib3-secure-extra"]
+brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)", "brotlipy (>=0.6.0)"]
+secure = ["certifi", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "ipaddress", "pyOpenSSL (>=0.14)"]
 socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"]

 [[package]]
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -160,19 +160,6 @@ impl BackendType<'_, ClientCredentials<'_>> {
            Test(_) => Some("test".to_owned()),
        }
    }
-
-    /// Get username from the credentials.
-    pub fn get_user(&self) -> &str {
-        use BackendType::*;
-
-        match self {
-            Console(_, creds) => creds.user,
-            Postgres(_, creds) => creds.user,
-            Link(_) => "link",
-            Test(_) => "test",
-        }
-    }
-
    /// Authenticate the client via the requested backend, possibly using credentials.
    #[tracing::instrument(fields(allow_cleartext = allow_cleartext), skip_all)]
    pub async fn authenticate(
--- a/proxy/src/console/provider.rs
+++ b/proxy/src/console/provider.rs
@@ -95,6 +95,20 @@ pub mod errors {
                _ => false,
            }
        }
+
+        fn cannot_retry(&self) -> bool {
+            match self {
+                // retry some transport errors
+                Self::Transport(io) => io.cannot_retry(),
+                // locked can be returned when the endpoint was in transition
+                // or when quotas are exceeded. don't retry when quotas are exceeded
+                Self::Console {
+                    status: http::StatusCode::LOCKED,
+                    ref text,
+                } => text.contains("quota"),
+                _ => false,
+            }
+        }
    }

    impl From<reqwest::Error> for ApiError {
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -414,6 +414,9 @@ where
        Ok(res) => return Ok(res),
        Err(e) => {
            error!(error = ?e, "could not connect to compute node");
+            if e.cannot_retry() {
+                return Err(e.into());
+            }
            (invalidate_cache(node_info), e)
        }
    };
@@ -501,6 +504,8 @@ pub fn handle_try_wake(

 pub trait ShouldRetry {
    fn could_retry(&self) -> bool;
+    /// return true if retrying definitely won't make a difference - or is harmful
+    fn cannot_retry(&self) -> bool;
    fn should_retry(&self, num_retries: u32) -> bool {
        match self {
            _ if num_retries >= NUM_RETRIES_CONNECT => false,
@@ -517,6 +522,9 @@ impl ShouldRetry for io::Error {
            ErrorKind::ConnectionRefused | ErrorKind::AddrNotAvailable | ErrorKind::TimedOut
        )
    }
+    fn cannot_retry(&self) -> bool {
+        false
+    }
 }

 impl ShouldRetry for tokio_postgres::error::DbError {
@@ -530,6 +538,22 @@ impl ShouldRetry for tokio_postgres::error::DbError {
                | &SqlState::SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION,
        )
    }
+    fn cannot_retry(&self) -> bool {
+        use tokio_postgres::error::SqlState;
+        match self.code() {
+            &SqlState::TOO_MANY_CONNECTIONS
+            | &SqlState::INVALID_CATALOG_NAME
+            | &SqlState::INVALID_PASSWORD => true,
+            // pgbouncer errors?
+            &SqlState::PROTOCOL_VIOLATION => matches!(
+                self.message(),
+                "no more connections allowed (max_client_conn)"
+                    | "server login has been failing, try again later (server_login_retry)"
+                    | "query_wait_timeout"
+            ),
+            _ => false,
+        }
+    }
 }

 impl ShouldRetry for tokio_postgres::Error {
@@ -542,6 +566,13 @@ impl ShouldRetry for tokio_postgres::Error {
            false
        }
    }
+    fn cannot_retry(&self) -> bool {
+        if let Some(db_err) = self.source().and_then(|x| x.downcast_ref()) {
+            tokio_postgres::error::DbError::cannot_retry(db_err)
+        } else {
+            false
+        }
+    }
 }

 impl ShouldRetry for compute::ConnectionError {
@@ -552,6 +583,13 @@ impl ShouldRetry for compute::ConnectionError {
            _ => false,
        }
    }
+    fn cannot_retry(&self) -> bool {
+        match self {
+            compute::ConnectionError::Postgres(err) => err.cannot_retry(),
+            compute::ConnectionError::CouldNotConnect(err) => err.cannot_retry(),
+            _ => false,
+        }
+    }
 }

 pub fn retry_after(num_retries: u32) -> time::Duration {
@@ -697,14 +735,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
            .await
        {
            Ok(auth_result) => auth_result,
-            Err(e) => {
-                let user = creds.get_user();
-                let db = params.get("database");
-                let app = params.get("application_name");
-                let params_span = tracing::info_span!("", ?user, ?db, ?app);
-
-                return stream.throw_error(e).instrument(params_span).await;
-            }
+            Err(e) => return stream.throw_error(e).await,
        };

        let AuthSuccess {
--- a/proxy/src/proxy/tests.rs
+++ b/proxy/src/proxy/tests.rs
@@ -365,6 +365,9 @@ impl ShouldRetry for TestConnectError {
    fn could_retry(&self) -> bool {
        self.retryable
    }
+    fn cannot_retry(&self) -> bool {
+        false
+    }
 }

 #[async_trait]
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -37,7 +37,6 @@ from psycopg2.extensions import connection as PgConnection
 from psycopg2.extensions import cursor as PgCursor
 from psycopg2.extensions import make_dsn, parse_dsn
 from typing_extensions import Literal
-from urllib3.util.retry import Retry

 from fixtures.broker import NeonBroker
 from fixtures.log_helper import log
@@ -1652,14 +1651,11 @@ class NeonPageserver(PgProtocol):
        if '"testing"' not in self.version:
            pytest.skip("pageserver was built without 'testing' feature")

-    def http_client(
-        self, auth_token: Optional[str] = None, retries: Optional[Retry] = None
-    ) -> PageserverHttpClient:
+    def http_client(self, auth_token: Optional[str] = None) -> PageserverHttpClient:
        return PageserverHttpClient(
            port=self.service_port.http,
            auth_token=auth_token,
            is_testing_enabled_or_skip=self.is_testing_enabled_or_skip,
-            retries=retries,
        )

    @property
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -7,8 +7,6 @@ from dataclasses import dataclass
 from typing import Any, Dict, List, Optional, Tuple

 import requests
-from requests.adapters import HTTPAdapter
-from urllib3.util.retry import Retry

 from fixtures.log_helper import log
 from fixtures.metrics import Metrics, parse_metrics
@@ -115,40 +113,12 @@ class TenantConfig:


 class PageserverHttpClient(requests.Session):
-    def __init__(
-        self,
-        port: int,
-        is_testing_enabled_or_skip: Fn,
-        auth_token: Optional[str] = None,
-        retries: Optional[Retry] = None,
-    ):
+    def __init__(self, port: int, is_testing_enabled_or_skip: Fn, auth_token: Optional[str] = None):
        super().__init__()
        self.port = port
        self.auth_token = auth_token
        self.is_testing_enabled_or_skip = is_testing_enabled_or_skip

-        if retries is None:
-            # We apply a retry policy that is different to the default `requests` behavior,
-            # because the pageserver has various transiently unavailable states that benefit
-            # from a client retrying on 503
-
-            retries = Retry(
-                # Status retries are for retrying on 503 while e.g. waiting for tenants to activate
-                status=5,
-                # Connection retries are for waiting for the pageserver to come up and listen
-                connect=5,
-                # No read retries: if a request hangs that is not expected behavior
-                # (this may change in future if we do fault injection of a kind that causes
-                #  requests TCP flows to stick)
-                read=False,
-                backoff_factor=0,
-                status_forcelist=[503],
-                allowed_methods=None,
-                remove_headers_on_redirect=[],
-            )
-
-        self.mount("http://", HTTPAdapter(max_retries=retries))
-
        if auth_token is not None:
            self.headers["Authorization"] = f"Bearer {auth_token}"

--- a/test_runner/fixtures/utils.py
+++ b/test_runner/fixtures/utils.py
@@ -222,7 +222,7 @@ def get_scale_for_db(size_mb: int) -> int:


 ATTACHMENT_NAME_REGEX: re.Pattern = re.compile(  # type: ignore[type-arg]
-    r"regression\.diffs|.+\.(?:log|stderr|stdout|filediff|metrics|html|walredo)"
+    r"regression\.diffs|.+\.(?:log|stderr|stdout|filediff|metrics|html)"
 )


@@ -250,9 +250,6 @@ def allure_attach_from_dir(dir: Path):
            elif source.endswith(".html"):
                attachment_type = "text/html"
                extension = "html"
-            elif source.endswith(".walredo"):
-                attachment_type = "application/octet-stream"
-                extension = "walredo"
            else:
                attachment_type = "text/plain"
                extension = attachment.suffix.removeprefix(".")
--- a/test_runner/regress/test_ddl_forwarding.py
+++ b/test_runner/regress/test_ddl_forwarding.py
@@ -211,12 +211,4 @@ def test_ddl_forwarding(ddl: DdlForwardingContext):
        ddl.wait()

    ddl.failures(False)
-    cur.execute("CREATE DATABASE failure WITH OWNER=cork")
-    ddl.wait()
-    with pytest.raises(psycopg2.InternalError):
-        ddl.failures(True)
-        cur.execute("DROP DATABASE failure")
-        ddl.wait()
-    ddl.pg.connect(dbname="failure")  # Ensure we can connect after a failed drop
-
    conn.close()
--- a/test_runner/regress/test_pageserver_restart.py
+++ b/test_runner/regress/test_pageserver_restart.py
@@ -169,6 +169,3 @@ def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder):
        # Check that all the updates are visible
        num_updates = endpoint.safe_psql("SELECT sum(updates) FROM foo")[0][0]
        assert num_updates == i * 100000
-
-    with open(neon_env_builder.test_output_dir / "foobar.walredo", "w") as file:
-        file.write("lets see if this ends in the report")
--- a/test_runner/regress/test_tenant_size.py
+++ b/test_runner/regress/test_tenant_size.py
@@ -15,7 +15,7 @@ from fixtures.pageserver.utils import (
    timeline_delete_wait_completed,
    wait_until_tenant_active,
 )
-from fixtures.pg_version import PgVersion
+from fixtures.pg_version import PgVersion, xfail_on_postgres
 from fixtures.types import Lsn, TenantId, TimelineId


@@ -532,24 +532,7 @@ def test_single_branch_get_tenant_size_grows(
    assert size_after == prev, "size after restarting pageserver should not have changed"


-def assert_size_approx_equal(size_a, size_b):
-    """
-    Tests that evaluate sizes are checking the pageserver space consumption
-    that sits many layers below the user input.  The exact space needed
-    varies slightly depending on postgres behavior.
-
-    Rather than expecting postgres to be determinstic and occasionally
-    failing the test, we permit sizes for the same data to vary by a few pages.
-    """
-
-    # Determined empirically from examples of equality failures: they differ
-    # by page multiples of 8272, and usually by 1-3 pages.  Tolerate 4 to avoid
-    # failing on outliers from that observed range.
-    threshold = 4 * 8272
-
-    assert size_a == pytest.approx(size_b, abs=threshold)
-
-
+@xfail_on_postgres(PgVersion.V15, reason="Test significantly more flaky on Postgres 15")
 def test_get_tenant_size_with_multiple_branches(
    neon_env_builder: NeonEnvBuilder, test_output_dir: Path
 ):
@@ -590,7 +573,7 @@ def test_get_tenant_size_with_multiple_branches(
    )

    size_after_first_branch = http_client.tenant_size(tenant_id)
-    assert_size_approx_equal(size_after_first_branch, size_at_branch)
+    assert size_after_first_branch == size_at_branch

    first_branch_endpoint = env.endpoints.create_start("first-branch", tenant_id=tenant_id)

@@ -616,7 +599,7 @@ def test_get_tenant_size_with_multiple_branches(
        "second-branch", main_branch_name, tenant_id
    )
    size_after_second_branch = http_client.tenant_size(tenant_id)
-    assert_size_approx_equal(size_after_second_branch, size_after_continuing_on_main)
+    assert size_after_second_branch == size_after_continuing_on_main

    second_branch_endpoint = env.endpoints.create_start("second-branch", tenant_id=tenant_id)

@@ -652,7 +635,7 @@ def test_get_tenant_size_with_multiple_branches(
    # tenant_size but so far this has been reliable, even though at least gc
    # and tenant_size race for the same locks
    size_after = http_client.tenant_size(tenant_id)
-    assert_size_approx_equal(size_after, size_after_thinning_branch)
+    assert size_after == size_after_thinning_branch

    size_debug_file_before = open(test_output_dir / "size_debug_before.html", "w")
    size_debug = http_client.tenant_size_debug(tenant_id)
--- a/test_runner/regress/test_timeline_delete.py
+++ b/test_runner/regress/test_timeline_delete.py
@@ -34,7 +34,6 @@ from fixtures.remote_storage import (
 )
 from fixtures.types import Lsn, TenantId, TimelineId
 from fixtures.utils import query_scalar, run_pg_bench_small, wait_until
-from urllib3.util.retry import Retry


 def test_timeline_delete(neon_simple_env: NeonEnv):
@@ -615,7 +614,7 @@ def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder):

    child_timeline_id = env.neon_cli.create_branch("child", "main")

-    ps_http = env.pageserver.http_client(retries=Retry(0, read=False))
+    ps_http = env.pageserver.http_client()

    failpoint_name = "persist_deleted_index_part"
    ps_http.configure_failpoints((failpoint_name, "pause"))
@@ -855,7 +854,7 @@ def test_timeline_delete_resumed_on_attach(
            # error from http response is also logged
            ".*InternalServerError\\(Tenant is marked as deleted on remote storage.*",
            # Polling after attach may fail with this
-            ".*Resource temporarily unavailable.*Tenant not yet active",
+            f".*InternalServerError\\(Tenant {tenant_id} is not active.*",
            '.*shutdown_pageserver{exit_code=0}: stopping left-over name="remote upload".*',
        )
    )