Use GetRedoStartLsn() instead of walproposer epoch start LSN for on-demand WAL download

Remove unnecessary dependencies from postgis-build image (#9211 )
The apt install stage before this commit: 0 upgraded, 391 newly installed, 0 to remove and 9 not upgraded. Need to get 261 MB of archives. after: 0 upgraded, 367 newly installed, 0 to remove and 9 not upgraded. Need to get 220 MB of archives.
2026-05-29 19:10:38 +00:00 · 2024-10-03 18:27:28 +03:00 · 2024-10-03 10:05:23 +03:00 · 2024-10-03 02:33:09 +01:00 · 2024-10-03 00:48:12 +02:00 · 2024-10-03 00:31:19 +03:00
109 changed files with 2821 additions and 1751 deletions
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -557,7 +557,7 @@ jobs:
        ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/psql    /tmp/neon/pg_install/v16/bin/psql
        ln -s /home/nonroot/pg/usr/lib/$(uname -m)-linux-gnu     /tmp/neon/pg_install/v16/lib

-        LD_LIBRARY_PATH="/home/nonroot/pg/usr/lib/$(uname -m)-linux-gnu:${LD_LIBRARY_PATH}"
+        LD_LIBRARY_PATH="/home/nonroot/pg/usr/lib/$(uname -m)-linux-gnu:${LD_LIBRARY_PATH:-}"
        export LD_LIBRARY_PATH
        echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" >> ${GITHUB_ENV}

--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -341,7 +341,7 @@ jobs:
          PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
          TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}"
          PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
-          SYNC_AFTER_EACH_TEST: true
+          SYNC_BETWEEN_TESTS: true
      # XXX: no coverage data handling here, since benchmarks are run on release builds,
      # while coverage is currently collected for the debug ones

@@ -773,7 +773,7 @@ jobs:
      matrix:
        version: [ v14, v15, v16, v17 ]
    env:
-      VM_BUILDER_VERSION: v0.29.3
+      VM_BUILDER_VERSION: v0.35.0

    steps:
      - uses: actions/checkout@v4
--- a/.github/workflows/trigger-e2e-tests.yml
+++ b/.github/workflows/trigger-e2e-tests.yml
@@ -102,12 +102,17 @@ jobs:
          # Default set of platforms to run e2e tests on
          platforms='["docker", "k8s"]'

-          # If the PR changes vendor/, pgxn/ or libs/vm_monitor/ directories, or compute/Dockerfile.compute-node, add k8s-neonvm to the list of platforms.
+          # If a PR changes anything that affects computes, add k8s-neonvm to the list of platforms.
          # If the workflow run is not a pull request, add k8s-neonvm to the list.
          if [ "$GITHUB_EVENT_NAME" == "pull_request" ]; then
            for f in $(gh api "/repos/${GITHUB_REPOSITORY}/pulls/${PR_NUMBER}/files" --paginate --jq '.[].filename'); do
              case "$f" in
-                vendor/*|pgxn/*|libs/vm_monitor/*|compute/Dockerfile.compute-node)
+                # List of directories that contain code which affect compute images.
+                #
+                # This isn't exhaustive, just the paths that are most directly compute-related.
+                # For example, compute_ctl also depends on libs/utils, but we don't trigger
+                # an e2e run on that.
+                vendor/*|pgxn/*|compute_tools/*|libs/vm_monitor/*|compute/Dockerfile.compute-node)
                  platforms=$(echo "${platforms}" | jq --compact-output '. += ["k8s-neonvm"] | unique')
                  ;;
                *)
--- a/Cargo.lock
+++ b/Cargo.lock
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -53,14 +53,14 @@ azure_storage_blobs = { version = "0.19", default-features = false, features = [
 flate2 = "1.0.26"
 async-stream = "0.3"
 async-trait = "0.1"
-aws-config = { version = "1.3", default-features = false, features=["rustls"] }
-aws-sdk-s3 = "1.26"
-aws-sdk-iam = "1.15.0"
+aws-config = { version = "1.5", default-features = false, features=["rustls"] }
+aws-sdk-s3 = "1.52"
+aws-sdk-iam = "1.46.0"
 aws-smithy-async = { version = "1.2.1", default-features = false, features=["rt-tokio"] }
-aws-smithy-types = "1.1.9"
+aws-smithy-types = "1.2"
 aws-credential-types = "1.2.0"
-aws-sigv4 = { version = "1.2.1", features = ["sign-http"] }
-aws-types = "1.2.0"
+aws-sigv4 = { version = "1.2", features = ["sign-http"] }
+aws-types = "1.3"
 axum = { version = "0.6.20", features = ["ws"] }
 base64 = "0.13.0"
 bincode = "1.3"
@@ -96,9 +96,12 @@ hmac = "0.12.1"
 hostname = "0.4"
 http = {version = "1.1.0", features = ["std"]}
 http-types = { version = "2", default-features = false }
+http-body-util = "0.1.2"
 humantime = "2.1"
 humantime-serde = "1.1.1"
 hyper = "0.14"
+hyper_1 = { package = "hyper", version = "1.4" }
+hyper-util = "0.1"
 tokio-tungstenite = "0.20.0"
 indexmap = "2"
 indoc = "2"
@@ -116,9 +119,10 @@ notify = "6.0.0"
 num_cpus = "1.15"
 num-traits = "0.2.15"
 once_cell = "1.13"
-opentelemetry = "0.20.0"
-opentelemetry-otlp = { version = "0.13.0", default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
-opentelemetry-semantic-conventions = "0.12.0"
+opentelemetry = "0.24"
+opentelemetry_sdk = "0.24"
+opentelemetry-otlp = { version = "0.17", default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
+opentelemetry-semantic-conventions = "0.16"
 parking_lot = "0.12"
 parquet = { version = "53", default-features = false, features = ["zstd"] }
 parquet_derive = "53"
@@ -126,12 +130,12 @@ pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
 pin-project-lite = "0.2"
 procfs = "0.16"
 prometheus = {version = "0.13", default-features=false, features = ["process"]} # removes protobuf dependency
-prost = "0.11"
+prost = "0.13"
 rand = "0.8"
 redis = { version = "0.25.2", features = ["tokio-rustls-comp", "keep-alive"] }
 regex = "1.10.2"
 reqwest = { version = "0.12", default-features = false, features = ["rustls-tls"] }
-reqwest-tracing = { version = "0.5", features = ["opentelemetry_0_20"] }
+reqwest-tracing = { version = "0.5", features = ["opentelemetry_0_24"] }
 reqwest-middleware = "0.3.0"
 reqwest-retry = "0.5"
 routerify = "3"
@@ -174,11 +178,11 @@ tokio-tar = "0.3"
 tokio-util = { version = "0.7.10", features = ["io", "rt"] }
 toml = "0.8"
 toml_edit = "0.22"
-tonic = {version = "0.9", features = ["tls", "tls-roots"]}
+tonic = {version = "0.12.3", features = ["tls", "tls-roots"]}
 tower-service = "0.3.2"
 tracing = "0.1"
-tracing-error = "0.2.0"
-tracing-opentelemetry = "0.21.0"
+tracing-error = "0.2"
+tracing-opentelemetry = "0.25"
 tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
 try-lock = "0.2.5"
 twox-hash = { version = "1.6.3", default-features = false }
@@ -242,7 +246,7 @@ criterion = "0.5.1"
 rcgen = "0.12"
 rstest = "0.18"
 camino-tempfile = "1.0.2"
-tonic-build = "0.9"
+tonic-build = "0.12"

 [patch.crates-io]

--- a/compute/Dockerfile.compute-node
+++ b/compute/Dockerfile.compute-node
@@ -12,10 +12,25 @@ ARG DEBIAN_FLAVOR=bullseye-slim
 #########################################################################################
 FROM debian:$DEBIAN_FLAVOR AS build-deps
 ARG DEBIAN_FLAVOR
-RUN apt update &&  \
-    apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \
-    zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev \
-    libicu-dev libxslt1-dev liblz4-dev libzstd-dev zstd
+
+RUN case $DEBIAN_FLAVOR in \
+      # Version-specific installs for Bullseye (PG14-PG16):
+      # The h3_pg extension needs a cmake 3.20+, but Debian bullseye has 3.18.
+      # Install newer version (3.25) from backports.
+      bullseye*) \
+        echo "deb http://deb.debian.org/debian bullseye-backports main" > /etc/apt/sources.list.d/bullseye-backports.list; \
+        VERSION_INSTALLS="cmake/bullseye-backports cmake-data/bullseye-backports"; \
+      ;; \
+      # Version-specific installs for Bookworm (PG17):
+      bookworm*) \
+        VERSION_INSTALLS="cmake"; \
+      ;; \
+    esac && \
+    apt update &&  \
+    apt install --no-install-recommends -y git autoconf automake libtool build-essential bison flex libreadline-dev \
+    zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget ca-certificates pkg-config libssl-dev \
+    libicu-dev libxslt1-dev liblz4-dev libzstd-dev zstd \
+    $VERSION_INSTALLS

 #########################################################################################
 #
@@ -89,7 +104,7 @@ FROM build-deps AS postgis-build
 ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 RUN apt update && \
-    apt install -y cmake gdal-bin libboost-dev libboost-thread-dev libboost-filesystem-dev \
+    apt install --no-install-recommends -y gdal-bin libboost-dev libboost-thread-dev libboost-filesystem-dev \
    libboost-system-dev libboost-iostreams-dev libboost-program-options-dev libboost-timer-dev \
    libcgal-dev libgdal-dev libgmp-dev libmpfr-dev libopenscenegraph-dev libprotobuf-c-dev \
    protobuf-c-compiler xsltproc
@@ -167,7 +182,7 @@ RUN case "${PG_VERSION}" in "v17") \
    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
    esac && \
    apt update && \
-    apt install -y ninja-build python3-dev libncurses5 binutils clang
+    apt install --no-install-recommends -y ninja-build python3-dev libncurses5 binutils clang

 RUN case "${PG_VERSION}" in "v17") \
    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
@@ -200,27 +215,6 @@ FROM build-deps AS h3-pg-build
 ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
-    esac && \
-    case "$(uname -m)" in \
-      "x86_64") \
-        export CMAKE_CHECKSUM=739d372726cb23129d57a539ce1432453448816e345e1545f6127296926b6754 \
-        ;; \
-      "aarch64") \
-        export CMAKE_CHECKSUM=281b42627c9a1beed03e29706574d04c6c53fae4994472e90985ef018dd29c02 \
-        ;; \
-      *) \
-        echo "Unsupported architecture '$(uname -m)'. Supported are x86_64 and aarch64" && exit 1 \
-        ;; \
-    esac && \
-    wget https://github.com/Kitware/CMake/releases/download/v3.24.2/cmake-3.24.2-linux-$(uname -m).sh \
-      -q -O /tmp/cmake-install.sh \
-      && echo "${CMAKE_CHECKSUM} /tmp/cmake-install.sh" | sha256sum --check \
-      && chmod u+x /tmp/cmake-install.sh \
-      && /tmp/cmake-install.sh --skip-license --prefix=/usr/local/ \
-      && rm /tmp/cmake-install.sh
-
 RUN case "${PG_VERSION}" in "v17") \
        mkdir -p /h3/usr/ && \
    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
@@ -506,8 +500,6 @@ RUN case "${PG_VERSION}" in "v17") \
        export TIMESCALEDB_CHECKSUM=584a351c7775f0e067eaa0e7277ea88cab9077cc4c455cbbf09a5d9723dce95d \
        ;; \
    esac && \
-    apt-get update && \
-    apt-get install -y cmake && \
    wget https://github.com/timescale/timescaledb/archive/refs/tags/${TIMESCALEDB_VERSION}.tar.gz -O timescaledb.tar.gz && \
    echo "${TIMESCALEDB_CHECKSUM} timescaledb.tar.gz" | sha256sum --check && \
    mkdir timescaledb-src && cd timescaledb-src && tar xzf ../timescaledb.tar.gz --strip-components=1 -C . && \
@@ -595,8 +587,7 @@ RUN case "${PG_VERSION}" in "v17") \
    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
    esac && \
    apt-get update && \
-    apt-get install -y \
-        cmake \
+    apt-get install --no-install-recommends -y \
        libboost-iostreams1.74-dev \
        libboost-regex1.74-dev \
        libboost-serialization1.74-dev \
@@ -761,7 +752,7 @@ ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 RUN apt-get update && \
-    apt-get install -y curl libclang-dev cmake && \
+    apt-get install --no-install-recommends -y curl libclang-dev && \
    useradd -ms /bin/bash nonroot -b /home

 ENV HOME=/home/nonroot
@@ -871,6 +862,28 @@ RUN case "${PG_VERSION}" in "v17") \
    cargo pgrx install --release && \
    echo "trusted = true" >> /usr/local/pgsql/share/extension/ulid.control

+#########################################################################################
+#
+# Layer "pg-session-jwt-build"
+# Compile "pg_session_jwt" extension
+#
+#########################################################################################
+
+FROM rust-extensions-build AS pg-session-jwt-build
+ARG PG_VERSION
+
+RUN case "${PG_VERSION}" in "v17") \
+    echo "pg_session_jwt does not yet have a release that supports pg17" && exit 0;; \
+    esac && \
+    wget https://github.com/neondatabase/pg_session_jwt/archive/ff0a72440e8ff584dab24b3f9b7c00c56c660b8e.tar.gz -O pg_session_jwt.tar.gz && \
+    echo "1fbb2b5a339263bcf6daa847fad8bccbc0b451cea6a62e6d3bf232b0087f05cb pg_session_jwt.tar.gz" | sha256sum --check && \
+    mkdir pg_session_jwt-src && cd pg_session_jwt-src && tar xzf ../pg_session_jwt.tar.gz --strip-components=1 -C . && \
+    sed -i 's/pgrx = "=0.11.3"/pgrx = { version = "=0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
+    cargo pgrx install --release
+    # it's needed to enable extension because it uses untrusted C language
+    # sed -i 's/superuser = false/superuser = true/g' /usr/local/pgsql/share/extension/pg_session_jwt.control && \
+    # echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_session_jwt.control
+
 #########################################################################################
 #
 # Layer "wal2json-build"
@@ -967,6 +980,7 @@ COPY --from=timescaledb-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-hint-plan-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-cron-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-pgx-ulid-build /usr/local/pgsql/ /usr/local/pgsql/
+COPY --from=pg-session-jwt-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=rdkit-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-uuidv7-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-roaringbitmap-pg-build /usr/local/pgsql/ /usr/local/pgsql/
@@ -1044,9 +1058,12 @@ FROM debian:$DEBIAN_FLAVOR AS pgbouncer
 ARG DEBIAN_FLAVOR
 RUN set -e \
    && apt-get update \
-    && apt-get install -y \
+    && apt-get install --no-install-recommends -y \
        build-essential \
        git \
+        ca-certificates \
+        autoconf \
+        automake \
        libevent-dev \
        libtool \
        pkg-config
@@ -1154,11 +1171,6 @@ RUN case "${PG_VERSION}" in "v17") \
    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
    esac && \
    cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch
-# cmake is required for the h3 test
-RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
-    esac && \
-    apt-get update && apt-get install -y cmake
 RUN case "${PG_VERSION}" in "v17") \
    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
    esac && \
@@ -1185,7 +1197,6 @@ ENV PGDATABASE=postgres
 #########################################################################################
 FROM debian:$DEBIAN_FLAVOR
 ARG DEBIAN_FLAVOR
-ENV DEBIAN_FLAVOR=$DEBIAN_FLAVOR
 # Add user postgres
 RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
    echo "postgres:test_console_pass" | chpasswd && \
@@ -1258,7 +1269,7 @@ RUN apt update && \
        libxml2 \
        libxslt1.1 \
        libzstd1 \
-        libcurl4-openssl-dev \
+        libcurl4 \
        locales \
        procps \
        ca-certificates \
--- a/compute/etc/neon_collector.yml
+++ b/compute/etc/neon_collector.yml
@@ -94,6 +94,68 @@ metrics:
  query: |
    select sum(pg_database_size(datname)) as total from pg_database;

+- metric_name: getpage_wait_seconds_count
+  type: counter
+  help: 'Number of getpage requests'
+  values: [getpage_wait_seconds_count]
+  query_ref: neon_perf_counters
+
+- metric_name: getpage_wait_seconds_sum
+  type: counter
+  help: 'Time spent in getpage requests'
+  values: [getpage_wait_seconds_sum]
+  query_ref: neon_perf_counters
+
+- metric_name: getpage_prefetch_requests_total
+  type: counter
+  help: 'Number of getpage issued for prefetching'
+  values: [getpage_prefetch_requests_total]
+  query_ref: neon_perf_counters
+
+- metric_name: getpage_sync_requests_total
+  type: counter
+  help: 'Number of synchronous getpage issued'
+  values: [getpage_sync_requests_total]
+  query_ref: neon_perf_counters
+
+- metric_name: getpage_prefetch_misses_total
+  type: counter
+  help: 'Total number of readahead misses; consisting of either prefetches that don''t satisfy the LSN bounds once the prefetch got read by the backend, or cases where somehow no readahead was issued for the read'
+  values: [getpage_prefetch_misses_total]
+  query_ref: neon_perf_counters
+
+- metric_name: getpage_prefetch_discards_total
+  type: counter
+  help: 'Number of prefetch responses issued but not used'
+  values: [getpage_prefetch_discards_total]
+  query_ref: neon_perf_counters
+
+- metric_name: pageserver_requests_sent_total
+  type: counter
+  help: 'Number of all requests sent to the pageserver (not just GetPage requests)'
+  values: [pageserver_requests_sent_total]
+  query_ref: neon_perf_counters
+
+- metric_name: pageserver_disconnects_total
+  type: counter
+  help: 'Number of times that the connection to the pageserver was lost'
+  values: [pageserver_disconnects_total]
+  query_ref: neon_perf_counters
+
+- metric_name: pageserver_send_flushes_total
+  type: counter
+  help: 'Number of flushes to the pageserver connection'
+  values: [pageserver_send_flushes_total]
+  query_ref: neon_perf_counters
+
+- metric_name: getpage_wait_seconds_bucket
+  type: counter
+  help: 'Histogram buckets of getpage request latency'
+  key_labels:
+      - bucket_le
+  values: [value]
+  query_ref: getpage_wait_seconds_buckets
+
 # DEPRECATED
 - metric_name: lfc_approximate_working_set_size
  type: gauge
@@ -244,3 +306,26 @@ metrics:
    SELECT slot_name,
           CASE WHEN wal_status = 'lost' THEN 1 ELSE 0 END AS wal_is_lost
    FROM pg_replication_slots;
+
+queries:
+  - query_name: neon_perf_counters
+    query: |
+      WITH c AS (
+        SELECT pg_catalog.jsonb_object_agg(metric, value) jb FROM neon.neon_perf_counters
+      )
+      SELECT d.*
+      FROM pg_catalog.jsonb_to_record((select jb from c)) as d(
+          getpage_wait_seconds_count numeric,
+          getpage_wait_seconds_sum numeric,
+          getpage_prefetch_requests_total numeric,
+          getpage_sync_requests_total numeric,
+          getpage_prefetch_misses_total numeric,
+          getpage_prefetch_discards_total numeric,
+          pageserver_requests_sent_total numeric,
+          pageserver_disconnects_total numeric,
+          pageserver_send_flushes_total numeric
+      );
+
+  - query_name: getpage_wait_seconds_buckets
+    query: |
+      SELECT bucket_le, value FROM neon.neon_perf_counters WHERE metric = 'getpage_wait_seconds_bucket';
--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -21,6 +21,7 @@ nix.workspace = true
 notify.workspace = true
 num_cpus.workspace = true
 opentelemetry.workspace = true
+opentelemetry_sdk.workspace = true
 postgres.workspace = true
 regex.workspace = true
 serde_json.workspace = true
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -218,7 +218,7 @@ fn startup_context_from_env() -> Option<opentelemetry::ContextGuard> {
    }
    if !startup_tracing_carrier.is_empty() {
        use opentelemetry::propagation::TextMapPropagator;
-        use opentelemetry::sdk::propagation::TraceContextPropagator;
+        use opentelemetry_sdk::propagation::TraceContextPropagator;
        let guard = TraceContextPropagator::new()
            .extract(&startup_tracing_carrier)
            .attach();
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -1060,19 +1060,26 @@ impl ComputeNode {
        let pg_process = self.start_postgres(pspec.storage_auth_token.clone())?;

        let config_time = Utc::now();
-        if pspec.spec.mode == ComputeMode::Primary && !pspec.spec.skip_pg_catalog_updates {
-            let pgdata_path = Path::new(&self.pgdata);
-            // temporarily reset max_cluster_size in config
-            // to avoid the possibility of hitting the limit, while we are applying config:
-            // creating new extensions, roles, etc...
-            config::with_compute_ctl_tmp_override(pgdata_path, "neon.max_cluster_size=-1", || {
+        if pspec.spec.mode == ComputeMode::Primary {
+            if !pspec.spec.skip_pg_catalog_updates {
+                let pgdata_path = Path::new(&self.pgdata);
+                // temporarily reset max_cluster_size in config
+                // to avoid the possibility of hitting the limit, while we are applying config:
+                // creating new extensions, roles, etc...
+                config::with_compute_ctl_tmp_override(
+                    pgdata_path,
+                    "neon.max_cluster_size=-1",
+                    || {
+                        self.pg_reload_conf()?;
+
+                        self.apply_config(&compute_state)?;
+
+                        Ok(())
+                    },
+                )?;
                self.pg_reload_conf()?;
-
-                self.apply_config(&compute_state)?;
-
-                Ok(())
-            })?;
-            self.pg_reload_conf()?;
+            }
+            self.post_apply_config()?;
        }

        let startup_end_time = Utc::now();
--- a/compute_tools/src/logger.rs
+++ b/compute_tools/src/logger.rs
@@ -1,4 +1,3 @@
-use tracing_opentelemetry::OpenTelemetryLayer;
 use tracing_subscriber::layer::SubscriberExt;
 use tracing_subscriber::prelude::*;

@@ -23,8 +22,7 @@ pub fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result<()> {
        .with_writer(std::io::stderr);

    // Initialize OpenTelemetry
-    let otlp_layer =
-        tracing_utils::init_tracing_without_runtime("compute_ctl").map(OpenTelemetryLayer::new);
+    let otlp_layer = tracing_utils::init_tracing_without_runtime("compute_ctl");

    // Put it all together
    tracing_subscriber::registry()
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -168,6 +168,9 @@ pub struct NeonStorageControllerConf {

    #[serde(with = "humantime_serde")]
    pub heartbeat_interval: Duration,
+
+    #[serde(with = "humantime_serde")]
+    pub long_reconcile_threshold: Option<Duration>,
 }

 impl NeonStorageControllerConf {
@@ -190,6 +193,7 @@ impl Default for NeonStorageControllerConf {
            split_threshold: None,
            max_secondary_lag_bytes: None,
            heartbeat_interval: Self::DEFAULT_HEARTBEAT_INTERVAL,
+            long_reconcile_threshold: None,
        }
    }
 }
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -517,6 +517,13 @@ impl StorageController {
            args.push(format!("--max-secondary-lag-bytes={lag}"))
        }

+        if let Some(threshold) = self.config.long_reconcile_threshold {
+            args.push(format!(
+                "--long-reconcile-threshold={}",
+                humantime::Duration::from(threshold)
+            ))
+        }
+
        args.push(format!(
            "--neon-local-repo-dir={}",
            self.env.base_data_dir.display()
--- a/docs/rfcs/038-aux-file-v2.md
+++ b/docs/rfcs/038-aux-file-v2.md
@@ -0,0 +1,112 @@
+# AUX file v2
+
+## Summary
+
+This is a retrospective RFC describing a new storage strategy for AUX files.
+
+## Motivation
+
+The original aux file storage strategy stores everything in a single `AUX_FILES_KEY`.
+Every time the compute node streams a `neon-file` record to the pageserver, it will
+update the aux file hash map, and then write the serialized hash map into the key.
+This creates serious space bloat. There was a fix to log delta records (i.e., update
+a key in the hash map) to the aux file key. In this way, the pageserver only stores
+the deltas at each of the LSNs. However, this improved v1 storage strategy still
+requires us to store everything in an aux file cache in memory, because we cannot
+fetch a single key (or file) from the compound `AUX_FILES_KEY`.
+
+### Prior art
+
+For storing a large number of small files, we can use a key-value store where the key
+is the filename and the value is the file content.
+
+## Requirements
+
+- No space bloat, fixed space amplification.
+- No write bloat, fixed write amplification.
+
+## Impacted Components
+
+pageserver
+
+## Sparse Keyspace
+
+In pageserver, we had assumed the keyspaces are always contiguous. For example, if the keyspace 0x0000-0xFFFF
+exists in the pageserver, every single key in the key range would exist in the storage. Based on the prior
+assumption, there is code that traverses the keyspace by iterating over every single key.
+
+```rust
+loop {
+    // do something
+    key = key.next();
+}
+```
+
+If a keyspace is very large, for example, containing `2^64` keys, this loop will take an impractically long time to run.
+Therefore, we introduce the concept of sparse keyspace in this RFC. For a sparse keyspace, not every key would
+exist in the key range. Developers should not attempt to iterate every single key in the keyspace. Instead,
+they should fetch all the layer files in the key range, and then do a merge of them.
+
+In aux file v2, we store aux files within the sparse keyspace of the prefix `AUX_KEY_PREFIX`.
+
+## AUX v2 Keyspace and Key Mapping
+
+Pageserver uses fixed-size keys. The key is 128b. In order to store files of arbitrary filenames into the
+keyspace, we assign a predetermined prefix based on the directory storing the aux file, and use the FNV hash
+of the filename for the remaining bits of the key. The encoding scheme is defined in `encode_aux_file_key`.
+
+For example, `pg_logical/mappings/test1` will be encoded as:
+
+```
+62 0000 01 01 7F8B83D94F7081693471ABF91C
+^ aux prefix
+        ^ assigned prefix of pg_logical/
+           ^ assigned prefix of mappings/
+              ^ 13B FNV hash of test1
+   ^ not used due to key representation
+```
+
+A directory prefix should be assigned in `aux_file.rs` every time we add a new type of aux file to the storage. Files in directories without an assigned prefix are put into the `0xFFFF` keyspace.
+
+Note that inside pageserver, there are two representations of the keys: the 18B full key representation
+and the 16B compact key representation. For the 18B representation, some fields have restricted ranges
+of values. Therefore, the aux keys only use the 16B compact portion of the full key.
+
+It is possible that two files get mapped to the same key due to a hash collision. Therefore, the value of
+each aux key is an array that contains all filenames and file contents that should be stored in
+this key.
+
+We use `Value::Image` to store the aux keys. Therefore, page reconstruction works in the same way as before,
+and we do not need additional code to support reconstructing the value. We simply get the latest image from
+the storage.
+
+## Inbound Logical Replication Key Mapping
+
+For inbound logical replication, Postgres needs the `replorigin_checkpoint` file to store the data.
+This file is not directly stored in the pageserver using the aux v2 mechanism. It is constructed while
+generating the basebackup by scanning the `REPL_ORIGIN_KEY_PREFIX` keyspace.
+
+## Sparse Keyspace Read Path
+
+There are two places we need to read the aux files from the pageserver:
+
+* On the write path, when the compute node adds an aux file to the pageserver, we will retrieve the key from the storage, append the file to the hashed key, and write it back. The current `get` API already supports that.
+* While generating the basebackup, we use the vectored get API to retrieve all aux files. Because we need to scan a sparse keyspace, we slightly modified the vectored get path. The vectored get API attempts to retrieve every single key within the requested key range; we therefore modified it so that keys within `NON_INHERITED_SPARSE_RANGE` do not trigger a missing-key error.
+
+## Compaction and Image Layer Generation
+
+With the addition of sparse keyspaces, we also modified the compaction code to accommodate the fact that sparse keyspaces do not have every single key stored in the storage.
+
+* L0 compaction: we modified the hole computation code so that it can handle sparse keyspaces when computing holes.
+* Image layer creation: instead of calling `key.next()` and getting/reconstructing images for every single key, we use the vectored get API to scan all keys in the keyspace at a given LSN. Image layers are only created if there are too many delta layers between the latest LSN and the last image layer we generated for sparse keyspaces. The created image layer always covers the full aux key range for now; this could be optimized later.
+
+## Migration
+
+We decided not to make the new aux storage strategy (v2) compatible with the original one (v1). One feasible way of doing a seamless migration is to store new data in aux v2 while keeping old data in aux v1, but this complicates file deletions. We want all users to start with a clean state with no aux files in the storage, and therefore, we need to do manual migrations for users using aux v1 by using the [migration script](https://github.com/neondatabase/aux_v2_migration).
+
+During the period of migration, we store the aux policy in the `index_part.json` file. When a tenant is attached
+with no policy set, the pageserver will scan the aux file keyspaces to identify the current aux policy being used (v1 or v2).
+
+If a timeline has aux v1 files stored, it will use aux file policy v1 unless we do a manual migration for them. Otherwise, the default aux file policy for new timelines is aux v2. Users enrolled in logical replication before we set aux v2 as default use aux v1 policy. Users who tried setting up inbound replication (which was not supported at that time) may also create some file entries in aux v1 store, even if they did not enroll in the logical replication testing program.
+
+The code for aux v2 migration is in https://github.com/neondatabase/aux_v2_migration. The toolkit scans all projects with logical replication enabled. For each of these projects, it puts the computes into maintenance mode (suspending all of them), calls the migration API to switch the aux file policy on the pageserver (which drops all replication states), and restarts all the computes.
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -104,7 +104,7 @@ pub struct ConfigToml {
    pub image_compression: ImageCompressionAlgorithm,
    pub ephemeral_bytes_per_memory_kb: usize,
    pub l0_flush: Option<crate::models::L0FlushConfig>,
-    pub virtual_file_io_mode: Option<crate::models::virtual_file::IoMode>,
+    pub virtual_file_direct_io: crate::models::virtual_file::DirectIoMode,
    pub io_buffer_alignment: usize,
 }

@@ -381,7 +381,7 @@ impl Default for ConfigToml {
            image_compression: (DEFAULT_IMAGE_COMPRESSION),
            ephemeral_bytes_per_memory_kb: (DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
            l0_flush: None,
-            virtual_file_io_mode: None,
+            virtual_file_direct_io: crate::models::virtual_file::DirectIoMode::default(),

            io_buffer_alignment: DEFAULT_IO_BUFFER_ALIGNMENT,

--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -972,6 +972,8 @@ pub struct TopTenantShardsResponse {
 }

 pub mod virtual_file {
+    use std::path::PathBuf;
+
    #[derive(
        Copy,
        Clone,
@@ -992,49 +994,50 @@ pub mod virtual_file {
    }

    /// Direct IO modes for a pageserver.
-    #[derive(
-        Copy,
-        Clone,
-        PartialEq,
-        Eq,
-        Hash,
-        strum_macros::EnumString,
-        strum_macros::Display,
-        serde_with::DeserializeFromStr,
-        serde_with::SerializeDisplay,
-        Debug,
-    )]
-    #[strum(serialize_all = "kebab-case")]
-    #[repr(u8)]
-    pub enum IoMode {
-        /// Uses buffered IO.
-        Buffered,
-        /// Uses direct IO, error out if the operation fails.
-        #[cfg(target_os = "linux")]
-        Direct,
+    #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)]
+    #[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
+    pub enum DirectIoMode {
+        /// Direct IO disabled (uses usual buffered IO).
+        #[default]
+        Disabled,
+        /// Direct IO disabled (performs checks and perf simulations).
+        Evaluate {
+            /// Alignment check level
+            alignment_check: DirectIoAlignmentCheckLevel,
+            /// Latency padded for performance simulation.
+            latency_padding: DirectIoLatencyPadding,
+        },
+        /// Direct IO enabled.
+        Enabled {
+            /// Actions to perform on alignment error.
+            on_alignment_error: DirectIoOnAlignmentErrorAction,
+        },
    }

-    impl IoMode {
-        pub const fn preferred() -> Self {
-            if cfg!(target_os = "linux") {
-                Self::Direct
-            } else {
-                Self::Buffered
-            }
-        }
+    #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)]
+    #[serde(rename_all = "kebab-case")]
+    pub enum DirectIoAlignmentCheckLevel {
+        #[default]
+        Error,
+        Log,
+        None,
    }

-    impl TryFrom<u8> for IoMode {
-        type Error = u8;
+    #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)]
+    #[serde(rename_all = "kebab-case")]
+    pub enum DirectIoOnAlignmentErrorAction {
+        Error,
+        #[default]
+        FallbackToBuffered,
+    }

-        fn try_from(value: u8) -> Result<Self, Self::Error> {
-            Ok(match value {
-                v if v == (IoMode::Buffered as u8) => IoMode::Buffered,
-                #[cfg(target_os = "linux")]
-                v if v == (IoMode::Direct as u8) => IoMode::Direct,
-                x => return Err(x),
-            })
-        }
+    #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)]
+    #[serde(tag = "type", rename_all = "kebab-case")]
+    pub enum DirectIoLatencyPadding {
+        /// Pad virtual file operations with IO to a fake file.
+        FakeFileRW { path: PathBuf },
+        #[default]
+        None,
    }
 }

--- a/libs/tracing-utils/Cargo.toml
+++ b/libs/tracing-utils/Cargo.toml
@@ -6,12 +6,14 @@ license.workspace = true

 [dependencies]
 hyper.workspace = true
-opentelemetry = { workspace = true, features=["rt-tokio"] }
-opentelemetry-otlp = { workspace = true, default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
+opentelemetry = { workspace = true, features = ["trace"] }
+opentelemetry_sdk = { workspace = true, features = ["rt-tokio"] }
+opentelemetry-otlp = { workspace = true, default-features = false, features = ["http-proto", "trace", "http", "reqwest-client"] }
 opentelemetry-semantic-conventions.workspace = true
 tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
 tracing.workspace = true
 tracing-opentelemetry.workspace = true
+tracing-subscriber.workspace = true

 [dev-dependencies]
 tracing-subscriber.workspace = true    # For examples in docs
--- a/libs/tracing-utils/src/lib.rs
+++ b/libs/tracing-utils/src/lib.rs
@@ -10,7 +10,6 @@
 //!
 //! ```rust,no_run
 //! use tracing_subscriber::prelude::*;
-//! use tracing_opentelemetry::OpenTelemetryLayer;
 //!
 //! #[tokio::main]
 //! async fn main() {
@@ -22,7 +21,7 @@
 //!         .with_writer(std::io::stderr);
 //!
 //!     // Initialize OpenTelemetry. Exports tracing spans as OpenTelemetry traces
-//!     let otlp_layer = tracing_utils::init_tracing("my_application").await.map(OpenTelemetryLayer::new);
+//!     let otlp_layer = tracing_utils::init_tracing("my_application").await;
 //!
 //!     // Put it all together
 //!     tracing_subscriber::registry()
@@ -35,15 +34,15 @@
 #![deny(unsafe_code)]
 #![deny(clippy::undocumented_unsafe_blocks)]

-use opentelemetry::sdk::Resource;
-use opentelemetry::KeyValue;
-use opentelemetry_otlp::WithExportConfig;
-use opentelemetry_otlp::{OTEL_EXPORTER_OTLP_ENDPOINT, OTEL_EXPORTER_OTLP_TRACES_ENDPOINT};
-
-pub use tracing_opentelemetry::OpenTelemetryLayer;
-
 pub mod http;

+use opentelemetry::trace::TracerProvider;
+use opentelemetry::KeyValue;
+use opentelemetry_sdk::Resource;
+use tracing::Subscriber;
+use tracing_subscriber::registry::LookupSpan;
+use tracing_subscriber::Layer;
+
 /// Set up OpenTelemetry exporter, using configuration from environment variables.
 ///
 /// `service_name` is set as the OpenTelemetry 'service.name' resource (see
@@ -71,7 +70,10 @@ pub mod http;
 ///
 /// This doesn't block, but is marked as 'async' to hint that this must be called in
 /// asynchronous execution context.
-pub async fn init_tracing(service_name: &str) -> Option<opentelemetry::sdk::trace::Tracer> {
+pub async fn init_tracing<S>(service_name: &str) -> Option<impl Layer<S>>
+where
+    S: Subscriber + for<'span> LookupSpan<'span>,
+{
    if std::env::var("OTEL_SDK_DISABLED") == Ok("true".to_string()) {
        return None;
    };
@@ -80,9 +82,10 @@ pub async fn init_tracing(service_name: &str) -> Option<opentelemetry::sdk::trac

 /// Like `init_tracing`, but creates a separate tokio Runtime for the tracing
 /// tasks.
-pub fn init_tracing_without_runtime(
-    service_name: &str,
-) -> Option<opentelemetry::sdk::trace::Tracer> {
+pub fn init_tracing_without_runtime<S>(service_name: &str) -> Option<impl Layer<S>>
+where
+    S: Subscriber + for<'span> LookupSpan<'span>,
+{
    if std::env::var("OTEL_SDK_DISABLED") == Ok("true".to_string()) {
        return None;
    };
@@ -113,54 +116,36 @@ pub fn init_tracing_without_runtime(
    Some(init_tracing_internal(service_name.to_string()))
 }

-fn init_tracing_internal(service_name: String) -> opentelemetry::sdk::trace::Tracer {
-    // Set up exporter from the OTEL_EXPORTER_* environment variables
-    let mut exporter = opentelemetry_otlp::new_exporter().http().with_env();
+fn init_tracing_internal<S>(service_name: String) -> impl Layer<S>
+where
+    S: Subscriber + for<'span> LookupSpan<'span>,
+{
+    // Sets up exporter from the OTEL_EXPORTER_* environment variables.
+    let exporter = opentelemetry_otlp::new_exporter().http();

-    // XXX opentelemetry-otlp v0.18.0 has a bug in how it uses the
-    // OTEL_EXPORTER_OTLP_ENDPOINT env variable. According to the
-    // OpenTelemetry spec at
-    // <https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/protocol/exporter.md#endpoint-urls-for-otlphttp>,
-    // the full exporter URL is formed by appending "/v1/traces" to the value
-    // of OTEL_EXPORTER_OTLP_ENDPOINT. However, opentelemetry-otlp only does
-    // that with the grpc-tonic exporter. Other exporters, like the HTTP
-    // exporter, use the URL from OTEL_EXPORTER_OTLP_ENDPOINT as is, without
-    // appending "/v1/traces".
-    //
-    // See https://github.com/open-telemetry/opentelemetry-rust/pull/950
-    //
-    // Work around that by checking OTEL_EXPORTER_OTLP_ENDPOINT, and setting
-    // the endpoint url with the "/v1/traces" path ourselves. If the bug is
-    // fixed in a later version, we can remove this code. But if we don't
-    // remember to remove this, it won't do any harm either, as the crate will
-    // just ignore the OTEL_EXPORTER_OTLP_ENDPOINT setting when the endpoint
-    // is set directly with `with_endpoint`.
-    if std::env::var(OTEL_EXPORTER_OTLP_TRACES_ENDPOINT).is_err() {
-        if let Ok(mut endpoint) = std::env::var(OTEL_EXPORTER_OTLP_ENDPOINT) {
-            if !endpoint.ends_with('/') {
-                endpoint.push('/');
-            }
-            endpoint.push_str("v1/traces");
-            exporter = exporter.with_endpoint(endpoint);
-        }
-    }
+    // TODO: opentelemetry::global::set_error_handler() with custom handler that
+    //       bypasses default tracing layers, but logs regular looking log
+    //       messages.

    // Propagate trace information in the standard W3C TraceContext format.
    opentelemetry::global::set_text_map_propagator(
-        opentelemetry::sdk::propagation::TraceContextPropagator::new(),
+        opentelemetry_sdk::propagation::TraceContextPropagator::new(),
    );

-    opentelemetry_otlp::new_pipeline()
+    let tracer = opentelemetry_otlp::new_pipeline()
        .tracing()
        .with_exporter(exporter)
-        .with_trace_config(
-            opentelemetry::sdk::trace::config().with_resource(Resource::new(vec![KeyValue::new(
+        .with_trace_config(opentelemetry_sdk::trace::Config::default().with_resource(
+            Resource::new(vec![KeyValue::new(
                opentelemetry_semantic_conventions::resource::SERVICE_NAME,
                service_name,
-            )])),
-        )
-        .install_batch(opentelemetry::runtime::Tokio)
+            )]),
+        ))
+        .install_batch(opentelemetry_sdk::runtime::Tokio)
        .expect("could not initialize opentelemetry exporter")
+        .tracer("global");
+
+    tracing_opentelemetry::layer().with_tracer(tracer)
 }

 // Shutdown trace pipeline gracefully, so that it has a chance to send any
--- a/pageserver/benches/bench_ingest.rs
+++ b/pageserver/benches/bench_ingest.rs
@@ -164,10 +164,12 @@ fn criterion_benchmark(c: &mut Criterion) {
    let conf: &'static PageServerConf = Box::leak(Box::new(
        pageserver::config::PageServerConf::dummy_conf(temp_dir.path().to_path_buf()),
    ));
-
-    let align = pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
-    virtual_file::init(16384, virtual_file::io_engine_for_bench(), align);
-    page_cache::init(conf.page_cache_size, align);
+    virtual_file::init(
+        16384,
+        virtual_file::io_engine_for_bench(),
+        pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT,
+    );
+    page_cache::init(conf.page_cache_size);

    {
        let mut group = c.benchmark_group("ingest-small-values");
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -550,19 +550,6 @@ impl Client {
            .map_err(Error::ReceiveBody)
    }

-    /// Configs io mode at runtime.
-    pub async fn put_io_mode(
-        &self,
-        mode: &pageserver_api::models::virtual_file::IoMode,
-    ) -> Result<()> {
-        let uri = format!("{}/v1/io_mode", self.mgmt_api_endpoint);
-        self.request(Method::PUT, uri, mode)
-            .await?
-            .json()
-            .await
-            .map_err(Error::ReceiveBody)
-    }
-
    pub async fn get_utilization(&self) -> Result<PageserverUtilization> {
        let uri = format!("{}/v1/utilization", self.mgmt_api_endpoint);
        self.get(uri)
--- a/pageserver/ctl/src/layer_map_analyzer.rs
+++ b/pageserver/ctl/src/layer_map_analyzer.rs
@@ -151,10 +151,13 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
    let max_holes = cmd.max_holes.unwrap_or(DEFAULT_MAX_HOLES);
    let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);

-    let align = pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
    // Initialize virtual_file (file desriptor cache) and page cache which are needed to access layer persistent B-Tree.
-    pageserver::virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs, align);
-    pageserver::page_cache::init(100, align);
+    pageserver::virtual_file::init(
+        10,
+        virtual_file::api::IoEngineKind::StdFs,
+        pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT,
+    );
+    pageserver::page_cache::init(100);

    let mut total_delta_layers = 0usize;
    let mut total_image_layers = 0usize;
--- a/pageserver/ctl/src/layers.rs
+++ b/pageserver/ctl/src/layers.rs
@@ -59,9 +59,8 @@ pub(crate) enum LayerCmd {

 async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result<()> {
    let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path");
-    let align = pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
-    virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs, align);
-    page_cache::init(100, align);
+    virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs, 1);
+    page_cache::init(100);
    let file = VirtualFile::open(path, ctx).await?;
    let file_id = page_cache::next_file_id();
    let block_reader = FileBlockReader::new(&file, file_id);
@@ -191,10 +190,12 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
            new_tenant_id,
            new_timeline_id,
        } => {
-            let align = pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
-
-            pageserver::virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs, align);
-            pageserver::page_cache::init(100, align);
+            pageserver::virtual_file::init(
+                10,
+                virtual_file::api::IoEngineKind::StdFs,
+                pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT,
+            );
+            pageserver::page_cache::init(100);

            let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);

--- a/pageserver/ctl/src/main.rs
+++ b/pageserver/ctl/src/main.rs
@@ -205,9 +205,12 @@ fn read_pg_control_file(control_file_path: &Utf8Path) -> anyhow::Result<()> {

 async fn print_layerfile(path: &Utf8Path) -> anyhow::Result<()> {
    // Basic initialization of things that don't change after startup
-    let align = DEFAULT_IO_BUFFER_ALIGNMENT;
-    virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs, align);
-    page_cache::init(100, align);
+    virtual_file::init(
+        10,
+        virtual_file::api::IoEngineKind::StdFs,
+        DEFAULT_IO_BUFFER_ALIGNMENT,
+    );
+    page_cache::init(100);
    let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
    dump_layerfile_from_path(path, true, &ctx).await
 }
--- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
+++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
@@ -63,10 +63,6 @@ pub(crate) struct Args {
    #[clap(long)]
    set_io_alignment: Option<usize>,

-    /// Before starting the benchmark, live-reconfigure the pageserver to use specified io mode (buffered vs. direct).
-    #[clap(long)]
-    set_io_mode: Option<pageserver_api::models::virtual_file::IoMode>,
-
    targets: Option<Vec<TenantTimelineId>>,
 }

@@ -137,10 +133,6 @@ async fn main_impl(
        mgmt_api_client.put_io_alignment(align).await?;
    }

-    if let Some(mode) = &args.set_io_mode {
-        mgmt_api_client.put_io_mode(mode).await?;
-    }
-
    // discover targets
    let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(
        &mgmt_api_client,
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -125,7 +125,7 @@ fn main() -> anyhow::Result<()> {

    // after setting up logging, log the effective IO engine choice and read path implementations
    info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine");
-    info!(?conf.virtual_file_io_mode, "starting with virtual_file Direct IO settings");
+    info!(?conf.virtual_file_direct_io, "starting with virtual_file Direct IO settings");
    info!(?conf.io_buffer_alignment, "starting with setting for IO buffer alignment");

    // The tenants directory contains all the pageserver local disk state.
@@ -173,7 +173,7 @@ fn main() -> anyhow::Result<()> {
        conf.virtual_file_io_engine,
        conf.io_buffer_alignment,
    );
-    page_cache::init(conf.page_cache_size, conf.io_buffer_alignment);
+    page_cache::init(conf.page_cache_size);

    start_pageserver(launch_ts, conf).context("Failed to start pageserver")?;

--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -174,7 +174,7 @@ pub struct PageServerConf {
    pub l0_flush: crate::l0_flush::L0FlushConfig,

    /// Direct IO settings
-    pub virtual_file_io_mode: virtual_file::IoMode,
+    pub virtual_file_direct_io: virtual_file::DirectIoMode,

    pub io_buffer_alignment: usize,
 }
@@ -325,7 +325,7 @@ impl PageServerConf {
            image_compression,
            ephemeral_bytes_per_memory_kb,
            l0_flush,
-            virtual_file_io_mode,
+            virtual_file_direct_io,
            concurrent_tenant_warmup,
            concurrent_tenant_size_logical_size_queries,
            virtual_file_io_engine,
@@ -368,6 +368,7 @@ impl PageServerConf {
            max_vectored_read_bytes,
            image_compression,
            ephemeral_bytes_per_memory_kb,
+            virtual_file_direct_io,
            io_buffer_alignment,

            // ------------------------------------------------------------
@@ -407,7 +408,6 @@ impl PageServerConf {
            l0_flush: l0_flush
                .map(crate::l0_flush::L0FlushConfig::from)
                .unwrap_or_default(),
-            virtual_file_io_mode: virtual_file_io_mode.unwrap_or(virtual_file::IoMode::preferred()),
        };

        // ------------------------------------------------------------
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -17,7 +17,6 @@ use hyper::header;
 use hyper::StatusCode;
 use hyper::{Body, Request, Response, Uri};
 use metrics::launch_timestamp::LaunchTimestamp;
-use pageserver_api::models::virtual_file::IoMode;
 use pageserver_api::models::AuxFilePolicy;
 use pageserver_api::models::DownloadRemoteLayersTaskSpawnRequest;
 use pageserver_api::models::IngestAuxFilesRequest;
@@ -57,6 +56,7 @@ use utils::http::endpoint::request_span;
 use utils::http::request::must_parse_query_param;
 use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};

+use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::deletion_queue::DeletionQueueClient;
 use crate::pgdatadir_mapping::LsnForTimestamp;
@@ -81,7 +81,6 @@ use crate::tenant::timeline::CompactionError;
 use crate::tenant::timeline::Timeline;
 use crate::tenant::GetTimelineError;
 use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError};
-use crate::{config::PageServerConf, tenant::mgr};
 use crate::{disk_usage_eviction_task, tenant};
 use pageserver_api::models::{
    StatusResponse, TenantConfigRequest, TenantInfo, TimelineCreateRequest, TimelineGcRequest,
@@ -1720,8 +1719,13 @@ async fn timeline_gc_handler(

    let gc_req: TimelineGcRequest = json_request(&mut request).await?;

+    let state = get_state(&request);
+
    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-    let gc_result = mgr::immediate_gc(tenant_shard_id, timeline_id, gc_req, cancel, &ctx).await?;
+    let gc_result = state
+        .tenant_manager
+        .immediate_gc(tenant_shard_id, timeline_id, gc_req, cancel, &ctx)
+        .await?;

    json_response(StatusCode::OK, gc_result)
 }
@@ -1738,6 +1742,10 @@ async fn timeline_compact_handler(
    let state = get_state(&request);

    let mut flags = EnumSet::empty();
+
+    if Some(true) == parse_query_param::<_, bool>(&request, "force_l0_compaction")? {
+        flags |= CompactFlags::ForceL0Compaction;
+    }
    if Some(true) == parse_query_param::<_, bool>(&request, "force_repartition")? {
        flags |= CompactFlags::ForceRepartition;
    }
@@ -1784,6 +1792,9 @@ async fn timeline_checkpoint_handler(
    let state = get_state(&request);

    let mut flags = EnumSet::empty();
+    if Some(true) == parse_query_param::<_, bool>(&request, "force_l0_compaction")? {
+        flags |= CompactFlags::ForceL0Compaction;
+    }
    if Some(true) == parse_query_param::<_, bool>(&request, "force_repartition")? {
        flags |= CompactFlags::ForceRepartition;
    }
@@ -2382,16 +2393,6 @@ async fn put_io_alignment_handler(
    json_response(StatusCode::OK, ())
 }

-async fn put_io_mode_handler(
-    mut r: Request<Body>,
-    _cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    check_permission(&r, None)?;
-    let mode: IoMode = json_request(&mut r).await?;
-    crate::virtual_file::set_io_mode(mode);
-    json_response(StatusCode::OK, ())
-}
-
 /// Polled by control plane.
 ///
 /// See [`crate::utilization`].
@@ -3082,7 +3083,6 @@ pub fn make_router(
        .put("/v1/io_alignment", |r| {
            api_handler(r, put_io_alignment_handler)
        })
-        .put("/v1/io_mode", |r| api_handler(r, put_io_mode_handler))
        .put(
            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/force_aux_policy_switch",
            |r| api_handler(r, force_aux_policy_switch_handler),
--- a/pageserver/src/page_cache.rs
+++ b/pageserver/src/page_cache.rs
@@ -82,7 +82,6 @@ use once_cell::sync::OnceCell;
 use crate::{
    context::RequestContext,
    metrics::{page_cache_eviction_metrics, PageCacheSizeMetrics},
-    virtual_file::{self, dio::IoBufferMut},
 };

 static PAGE_CACHE: OnceCell<PageCache> = OnceCell::new();
@@ -91,8 +90,8 @@ const TEST_PAGE_CACHE_SIZE: usize = 50;
 ///
 /// Initialize the page cache. This must be called once at page server startup.
 ///
-pub fn init(size: usize, align: usize) {
-    if PAGE_CACHE.set(PageCache::new(size, align)).is_err() {
+pub fn init(size: usize) {
+    if PAGE_CACHE.set(PageCache::new(size)).is_err() {
        panic!("page cache already initialized");
    }
 }
@@ -107,12 +106,7 @@ pub fn get() -> &'static PageCache {
    // page cache is usable in unit tests.
    //
    if cfg!(test) {
-        PAGE_CACHE.get_or_init(|| {
-            PageCache::new(
-                TEST_PAGE_CACHE_SIZE,
-                virtual_file::get_io_buffer_alignment(),
-            )
-        })
+        PAGE_CACHE.get_or_init(|| PageCache::new(TEST_PAGE_CACHE_SIZE))
    } else {
        PAGE_CACHE.get().expect("page cache not initialized")
    }
@@ -643,11 +637,13 @@ impl PageCache {
    /// Initialize a new page cache
    ///
    /// This should be called only once at page server startup.
-    fn new(num_pages: usize, align: usize) -> Self {
+    fn new(num_pages: usize) -> Self {
        assert!(num_pages > 0, "page cache size must be > 0");

-        let page_buffer =
-            IoBufferMut::with_capacity_aligned_zeroed(num_pages * PAGE_SZ, align).leak();
+        // We could use Vec::leak here, but that potentially also leaks
+        // uninitialized reserved capacity. With into_boxed_slice and Box::leak
+        // this is avoided.
+        let page_buffer = Box::leak(vec![0u8; num_pages * PAGE_SZ].into_boxed_slice());

        let size_metrics = &crate::metrics::PAGE_CACHE_SIZE;
        size_metrics.max_bytes.set_page_sz(num_pages);
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -97,6 +97,7 @@ use crate::tenant::remote_timeline_client::MaybeDeletedIndexPart;
 use crate::tenant::remote_timeline_client::INITDB_PATH;
 use crate::tenant::storage_layer::DeltaLayer;
 use crate::tenant::storage_layer::ImageLayer;
+use crate::walingest::WalLagCooldown;
 use crate::walredo;
 use crate::InitializationOrder;
 use std::collections::hash_map::Entry;
@@ -319,6 +320,9 @@ pub struct Tenant {
    /// background warmup.
    pub(crate) activate_now_sem: tokio::sync::Semaphore,

+    /// Time it took for the tenant to activate. Zero if not active yet.
+    attach_wal_lag_cooldown: Arc<std::sync::OnceLock<WalLagCooldown>>,
+
    // Cancellation token fires when we have entered shutdown().  This is a parent of
    // Timelines' cancellation token.
    pub(crate) cancel: CancellationToken,
@@ -1000,11 +1004,15 @@ impl Tenant {
                // Remote preload is complete.
                drop(remote_load_completion);

+
                // We will time the duration of the attach phase unless this is a creation (attach will do no work)
+                let attach_start = std::time::Instant::now();
                let attached = {
                    let _attach_timer = Some(TENANT.attach.start_timer());
                    tenant_clone.attach(preload, &ctx).await
                };
+                let attach_duration = attach_start.elapsed();
+                _ = tenant_clone.attach_wal_lag_cooldown.set(WalLagCooldown::new(attach_start, attach_duration));

                match attached {
                    Ok(()) => {
@@ -2754,6 +2762,7 @@ impl Tenant {
            pg_version,
            state,
            last_aux_file_policy,
+            self.attach_wal_lag_cooldown.clone(),
            self.cancel.child_token(),
        );

@@ -2860,6 +2869,7 @@ impl Tenant {
                Some(Duration::from_secs(3600 * 24)),
            )),
            activate_now_sem: tokio::sync::Semaphore::new(0),
+            attach_wal_lag_cooldown: Arc::new(std::sync::OnceLock::new()),
            cancel: CancellationToken::default(),
            gate: Gate::default(),
            timeline_get_throttle: Arc::new(throttle::Throttle::new(
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -8,7 +8,6 @@
 //! We cannot use global or default config instead, because wrong settings
 //! may lead to a data loss.
 //!
-use anyhow::bail;
 pub(crate) use pageserver_api::config::TenantConfigToml as TenantConf;
 use pageserver_api::models::AuxFilePolicy;
 use pageserver_api::models::CompactionAlgorithmSettings;
@@ -441,29 +440,6 @@ impl TryFrom<&'_ models::TenantConfig> for TenantConfOpt {
    }
 }

-impl TryFrom<toml_edit::Item> for TenantConfOpt {
-    type Error = anyhow::Error;
-
-    fn try_from(item: toml_edit::Item) -> Result<Self, Self::Error> {
-        match item {
-            toml_edit::Item::Value(value) => {
-                let d = value.into_deserializer();
-                return serde_path_to_error::deserialize(d)
-                    .map_err(|e| anyhow::anyhow!("{}: {}", e.path(), e.inner().message()));
-            }
-            toml_edit::Item::Table(table) => {
-                let deserializer =
-                    toml_edit::de::Deserializer::from(toml_edit::DocumentMut::from(table));
-                return serde_path_to_error::deserialize(deserializer)
-                    .map_err(|e| anyhow::anyhow!("{}: {}", e.path(), e.inner().message()));
-            }
-            _ => {
-                bail!("expected non-inline table but found {item}")
-            }
-        }
-    }
-}
-
 /// This is a conversion from our internal tenant config object to the one used
 /// in external APIs.
 impl From<TenantConfOpt> for models::TenantConfig {
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -84,7 +84,7 @@ impl Drop for EphemeralFile {
    fn drop(&mut self) {
        // unlink the file
        // we are clear to do this, because we have entered a gate
-        let path = self.buffered_writer.as_inner().as_inner().path();
+        let path = &self.buffered_writer.as_inner().as_inner().path;
        let res = std::fs::remove_file(path);
        if let Err(e) = res {
            if e.kind() != std::io::ErrorKind::NotFound {
@@ -356,7 +356,7 @@ mod tests {
        }

        let file_contents =
-            std::fs::read(file.buffered_writer.as_inner().as_inner().path()).unwrap();
+            std::fs::read(&file.buffered_writer.as_inner().as_inner().path).unwrap();
        assert_eq!(file_contents, &content[0..cap]);

        let buffer_contents = file.buffered_writer.inspect_buffer();
@@ -392,7 +392,7 @@ mod tests {
            .buffered_writer
            .as_inner()
            .as_inner()
-            .path()
+            .path
            .metadata()
            .unwrap();
        assert_eq!(
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -2197,6 +2197,82 @@ impl TenantManager {

        Ok((wanted_bytes, shard_count as u32))
    }
+
+    #[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id))]
+    pub(crate) async fn immediate_gc(
+        &self,
+        tenant_shard_id: TenantShardId,
+        timeline_id: TimelineId,
+        gc_req: TimelineGcRequest,
+        cancel: CancellationToken,
+        ctx: &RequestContext,
+    ) -> Result<GcResult, ApiError> {
+        let tenant = {
+            let guard = self.tenants.read().unwrap();
+            guard
+                .get(&tenant_shard_id)
+                .cloned()
+                .with_context(|| format!("tenant {tenant_shard_id}"))
+                .map_err(|e| ApiError::NotFound(e.into()))?
+        };
+
+        let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon());
+        // Use tenant's pitr setting
+        let pitr = tenant.get_pitr_interval();
+
+        tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
+
+        // Run in task_mgr to avoid race with tenant_detach operation
+        let ctx: RequestContext =
+            ctx.detached_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
+
+        let _gate_guard = tenant.gate.enter().map_err(|_| ApiError::ShuttingDown)?;
+
+        fail::fail_point!("immediate_gc_task_pre");
+
+        #[allow(unused_mut)]
+        let mut result = tenant
+            .gc_iteration(Some(timeline_id), gc_horizon, pitr, &cancel, &ctx)
+            .await;
+        // FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it
+        // better once the types support it.
+
+        #[cfg(feature = "testing")]
+        {
+            // we need to synchronize with drop completion for python tests without polling for
+            // log messages
+            if let Ok(result) = result.as_mut() {
+                let mut js = tokio::task::JoinSet::new();
+                for layer in std::mem::take(&mut result.doomed_layers) {
+                    js.spawn(layer.wait_drop());
+                }
+                tracing::info!(
+                    total = js.len(),
+                    "starting to wait for the gc'd layers to be dropped"
+                );
+                while let Some(res) = js.join_next().await {
+                    res.expect("wait_drop should not panic");
+                }
+            }
+
+            let timeline = tenant.get_timeline(timeline_id, false).ok();
+            let rtc = timeline.as_ref().map(|x| &x.remote_client);
+
+            if let Some(rtc) = rtc {
+                // layer drops schedule actions on remote timeline client to actually do the
+                // deletions; don't care about the shutdown error, just exit fast
+                drop(rtc.wait_completion().await);
+            }
+        }
+
+        result.map_err(|e| match e {
+            GcError::TenantCancelled | GcError::TimelineCancelled => ApiError::ShuttingDown,
+            GcError::TimelineNotFound => {
+                ApiError::NotFound(anyhow::anyhow!("Timeline not found").into())
+            }
+            other => ApiError::InternalServerError(anyhow::anyhow!(other)),
+        })
+    }
 }

 #[derive(Debug, thiserror::Error)]
@@ -2341,7 +2417,7 @@ enum TenantSlotDropError {
 /// Errors that can happen any time we are walking the tenant map to try and acquire
 /// the TenantSlot for a particular tenant.
 #[derive(Debug, thiserror::Error)]
-pub enum TenantMapError {
+pub(crate) enum TenantMapError {
    // Tried to read while initializing
    #[error("tenant map is still initializing")]
    StillInitializing,
@@ -2371,7 +2447,7 @@ pub enum TenantMapError {
 /// The `old_value` may be dropped before the SlotGuard is dropped, by calling
 /// `drop_old_value`.  It is an error to call this without shutting down
 /// the contents of `old_value`.
-pub struct SlotGuard {
+pub(crate) struct SlotGuard {
    tenant_shard_id: TenantShardId,
    old_value: Option<TenantSlot>,
    upserted: bool,
@@ -2764,81 +2840,6 @@ use {
    utils::http::error::ApiError,
 };

-#[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id))]
-pub(crate) async fn immediate_gc(
-    tenant_shard_id: TenantShardId,
-    timeline_id: TimelineId,
-    gc_req: TimelineGcRequest,
-    cancel: CancellationToken,
-    ctx: &RequestContext,
-) -> Result<GcResult, ApiError> {
-    let tenant = {
-        let guard = TENANTS.read().unwrap();
-        guard
-            .get(&tenant_shard_id)
-            .cloned()
-            .with_context(|| format!("tenant {tenant_shard_id}"))
-            .map_err(|e| ApiError::NotFound(e.into()))?
-    };
-
-    let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon());
-    // Use tenant's pitr setting
-    let pitr = tenant.get_pitr_interval();
-
-    tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
-
-    // Run in task_mgr to avoid race with tenant_detach operation
-    let ctx: RequestContext =
-        ctx.detached_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
-
-    let _gate_guard = tenant.gate.enter().map_err(|_| ApiError::ShuttingDown)?;
-
-    fail::fail_point!("immediate_gc_task_pre");
-
-    #[allow(unused_mut)]
-    let mut result = tenant
-        .gc_iteration(Some(timeline_id), gc_horizon, pitr, &cancel, &ctx)
-        .await;
-    // FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it
-    // better once the types support it.
-
-    #[cfg(feature = "testing")]
-    {
-        // we need to synchronize with drop completion for python tests without polling for
-        // log messages
-        if let Ok(result) = result.as_mut() {
-            let mut js = tokio::task::JoinSet::new();
-            for layer in std::mem::take(&mut result.doomed_layers) {
-                js.spawn(layer.wait_drop());
-            }
-            tracing::info!(
-                total = js.len(),
-                "starting to wait for the gc'd layers to be dropped"
-            );
-            while let Some(res) = js.join_next().await {
-                res.expect("wait_drop should not panic");
-            }
-        }
-
-        let timeline = tenant.get_timeline(timeline_id, false).ok();
-        let rtc = timeline.as_ref().map(|x| &x.remote_client);
-
-        if let Some(rtc) = rtc {
-            // layer drops schedule actions on remote timeline client to actually do the
-            // deletions; don't care about the shutdown error, just exit fast
-            drop(rtc.wait_completion().await);
-        }
-    }
-
-    result.map_err(|e| match e {
-        GcError::TenantCancelled | GcError::TimelineCancelled => ApiError::ShuttingDown,
-        GcError::TimelineNotFound => {
-            ApiError::NotFound(anyhow::anyhow!("Timeline not found").into())
-        }
-        other => ApiError::InternalServerError(anyhow::anyhow!(other)),
-    })
-}
-
 #[cfg(test)]
 mod tests {
    use std::collections::BTreeMap;
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -43,12 +43,12 @@ use crate::tenant::vectored_blob_io::{
    VectoredReadPlanner,
 };
 use crate::tenant::PageReconstructError;
-use crate::virtual_file::dio::IoBufferMut;
 use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt};
 use crate::virtual_file::{self, MaybeFatalIo, VirtualFile};
 use crate::{walrecord, TEMP_FILE_SUFFIX};
 use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
 use anyhow::{anyhow, bail, ensure, Context, Result};
+use bytes::BytesMut;
 use camino::{Utf8Path, Utf8PathBuf};
 use futures::StreamExt;
 use itertools::Itertools;
@@ -572,7 +572,7 @@ impl DeltaLayerWriterInner {
        ensure!(
            metadata.len() <= S3_UPLOAD_LIMIT,
            "Created delta layer file at {} of size {} above limit {S3_UPLOAD_LIMIT}!",
-            file.path(),
+            file.path,
            metadata.len()
        );

@@ -790,7 +790,7 @@ impl DeltaLayerInner {
        max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
        ctx: &RequestContext,
    ) -> anyhow::Result<Self> {
-        let file = VirtualFile::open_v2(path, ctx)
+        let file = VirtualFile::open(path, ctx)
            .await
            .context("open layer file")?;

@@ -991,8 +991,7 @@ impl DeltaLayerInner {
            .0
            .into();
        let buf_size = Self::get_min_read_buffer_size(&reads, max_vectored_read_bytes);
-        let align = virtual_file::get_io_buffer_alignment();
-        let mut buf = Some(IoBufferMut::with_capacity_aligned(buf_size, align));
+        let mut buf = Some(BytesMut::with_capacity(buf_size));

        // Note that reads are processed in reverse order (from highest key+lsn).
        // This is the order that `ReconstructState` requires such that it can
@@ -1011,7 +1010,7 @@ impl DeltaLayerInner {
                            blob_meta.key,
                            PageReconstructError::Other(anyhow!(
                                "Failed to read blobs from virtual file {}: {}",
-                                self.file.path(),
+                                self.file.path,
                                kind
                            )),
                        );
@@ -1019,7 +1018,7 @@ impl DeltaLayerInner {

                    // We have "lost" the buffer since the lower level IO api
                    // doesn't return the buffer on error. Allocate a new one.
-                    buf = Some(IoBufferMut::with_capacity_aligned(buf_size, align));
+                    buf = Some(BytesMut::with_capacity(buf_size));

                    continue;
                }
@@ -1037,7 +1036,7 @@ impl DeltaLayerInner {
                            meta.meta.key,
                            PageReconstructError::Other(anyhow!(e).context(format!(
                                "Failed to decompress blob from virtual file {}",
-                                self.file.path(),
+                                self.file.path,
                            ))),
                        );

@@ -1055,7 +1054,7 @@ impl DeltaLayerInner {
                            meta.meta.key,
                            PageReconstructError::Other(anyhow!(e).context(format!(
                                "Failed to deserialize blob from virtual file {}",
-                                self.file.path(),
+                                self.file.path,
                            ))),
                        );

@@ -1187,14 +1186,14 @@ impl DeltaLayerInner {
        let mut prev: Option<(Key, Lsn, BlobRef)> = None;

        let mut read_builder: Option<ChunkedVectoredReadBuilder> = None;
+        let align = virtual_file::get_io_buffer_alignment();

        let max_read_size = self
            .max_vectored_read_bytes
            .map(|x| x.0.get())
            .unwrap_or(8192);

-        let align = virtual_file::get_io_buffer_alignment();
-        let mut buffer = Some(IoBufferMut::with_capacity_aligned(max_read_size, align));
+        let mut buffer = Some(BytesMut::with_capacity(max_read_size));

        // FIXME: buffering of DeltaLayerWriter
        let mut per_blob_copy = Vec::new();
@@ -1553,12 +1552,12 @@ impl<'a> DeltaLayerIterator<'a> {
        let vectored_blob_reader = VectoredBlobReader::new(&self.delta_layer.file);
        let mut next_batch = std::collections::VecDeque::new();
        let buf_size = plan.size();
-        let align = virtual_file::get_io_buffer_alignment();
-        let buf = IoBufferMut::with_capacity_aligned(buf_size, align);
+        let buf = BytesMut::with_capacity(buf_size);
        let blobs_buf = vectored_blob_reader
            .read_blobs(&plan, buf, self.ctx)
            .await?;
-        let view = BufView::new_slice(&blobs_buf.buf);
+        let frozen_buf = blobs_buf.buf.freeze();
+        let view = BufView::new_bytes(frozen_buf);
        for meta in blobs_buf.blobs.iter() {
            let blob_read = meta.read(&view).await?;
            let value = Value::des(&blob_read)?;
@@ -1933,9 +1932,7 @@ pub(crate) mod test {
                &vectored_reads,
                constants::MAX_VECTORED_READ_BYTES,
            );
-
-            let align = virtual_file::get_io_buffer_alignment();
-            let mut buf = Some(IoBufferMut::with_capacity_aligned(buf_size, align));
+            let mut buf = Some(BytesMut::with_capacity(buf_size));

            for read in vectored_reads {
                let blobs_buf = vectored_blob_reader
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -40,12 +40,11 @@ use crate::tenant::vectored_blob_io::{
    VectoredReadPlanner,
 };
 use crate::tenant::PageReconstructError;
-use crate::virtual_file::dio::IoBufferMut;
 use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
 use crate::virtual_file::{self, MaybeFatalIo, VirtualFile};
 use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
 use anyhow::{anyhow, bail, ensure, Context, Result};
-use bytes::Bytes;
+use bytes::{Bytes, BytesMut};
 use camino::{Utf8Path, Utf8PathBuf};
 use hex;
 use itertools::Itertools;
@@ -389,7 +388,7 @@ impl ImageLayerInner {
        max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
        ctx: &RequestContext,
    ) -> anyhow::Result<Self> {
-        let file = VirtualFile::open_v2(path, ctx)
+        let file = VirtualFile::open(path, ctx)
            .await
            .context("open layer file")?;
        let file_id = page_cache::next_file_id();
@@ -543,15 +542,14 @@ impl ImageLayerInner {
            .await?;

        let vectored_blob_reader = VectoredBlobReader::new(&self.file);
-        let align = virtual_file::get_io_buffer_alignment();
        let mut key_count = 0;
        for read in plan.into_iter() {
            let buf_size = read.size();

-            let buf = IoBufferMut::with_capacity_aligned(buf_size, align);
+            let buf = BytesMut::with_capacity(buf_size);
            let blobs_buf = vectored_blob_reader.read_blobs(&read, buf, ctx).await?;
-
-            let view = BufView::new_slice(&blobs_buf.buf);
+            let frozen_buf = blobs_buf.buf.freeze();
+            let view = BufView::new_bytes(frozen_buf);

            for meta in blobs_buf.blobs.iter() {
                let img_buf = meta.read(&view).await?;
@@ -599,13 +597,13 @@ impl ImageLayerInner {
                );
            }

-            let align = virtual_file::get_io_buffer_alignment();
-            let buf = IoBufferMut::with_capacity_aligned(buf_size, align);
+            let buf = BytesMut::with_capacity(buf_size);
            let res = vectored_blob_reader.read_blobs(&read, buf, ctx).await;

            match res {
                Ok(blobs_buf) => {
-                    let view = BufView::new_slice(&blobs_buf.buf);
+                    let frozen_buf = blobs_buf.buf.freeze();
+                    let view = BufView::new_bytes(frozen_buf);
                    for meta in blobs_buf.blobs.iter() {
                        let img_buf = meta.read(&view).await;

@@ -616,7 +614,7 @@ impl ImageLayerInner {
                                    meta.meta.key,
                                    PageReconstructError::Other(anyhow!(e).context(format!(
                                        "Failed to decompress blob from virtual file {}",
-                                        self.file.path(),
+                                        self.file.path,
                                    ))),
                                );

@@ -637,7 +635,7 @@ impl ImageLayerInner {
                            blob_meta.key,
                            PageReconstructError::from(anyhow!(
                                "Failed to read blobs from virtual file {}: {}",
-                                self.file.path(),
+                                self.file.path,
                                kind
                            )),
                        );
@@ -1041,12 +1039,12 @@ impl<'a> ImageLayerIterator<'a> {
        let vectored_blob_reader = VectoredBlobReader::new(&self.image_layer.file);
        let mut next_batch = std::collections::VecDeque::new();
        let buf_size = plan.size();
-        let align = virtual_file::get_io_buffer_alignment();
-        let buf = IoBufferMut::with_capacity_aligned(buf_size, align);
+        let buf = BytesMut::with_capacity(buf_size);
        let blobs_buf = vectored_blob_reader
            .read_blobs(&plan, buf, self.ctx)
            .await?;
-        let view = BufView::new_slice(&blobs_buf.buf);
+        let frozen_buf = blobs_buf.buf.freeze();
+        let view = BufView::new_bytes(frozen_buf);
        for meta in blobs_buf.blobs.iter() {
            let img_buf = meta.read(&view).await?;
            next_batch.push_back((
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -442,11 +442,13 @@ impl Layer {
            // Visibility was modified to Visible: maybe log about this
            match ctx.task_kind() {
                TaskKind::CalculateSyntheticSize
+                | TaskKind::OndemandLogicalSizeCalculation
                | TaskKind::GarbageCollector
                | TaskKind::MgmtRequest => {
                     // This situation is expected in code paths that do binary searches of the LSN space to resolve
                    // an LSN to a timestamp, which happens during GC, during GC cutoff calculations in synthetic size,
-                    // and on-demand for certain HTTP API requests.
+                    // and on-demand for certain HTTP API requests. On-demand logical size calculation is also included
+                    // because it is run as a sub-task of synthetic size.
                }
                _ => {
                    // In all other contexts, it is unusual to do I/O involving layers which are not visible at
@@ -457,7 +459,7 @@ impl Layer {
                    // which was covered by a concurrent compaction.
                    tracing::info!(
                        "Layer {} became visible as a result of access",
-                        self.0.desc.key()
+                        self.0.desc.layer_name()
                    );
                }
            }
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -48,7 +48,6 @@ use utils::{
    sync::gate::{Gate, GateGuard},
 };

-use std::pin::pin;
 use std::sync::atomic::Ordering as AtomicOrdering;
 use std::sync::{Arc, Mutex, RwLock, Weak};
 use std::time::{Duration, Instant, SystemTime};
@@ -62,6 +61,7 @@ use std::{
    collections::btree_map::Entry,
    ops::{Deref, Range},
 };
+use std::{pin::pin, sync::OnceLock};

 use crate::{
    aux_file::AuxFileSizeEstimator,
@@ -71,6 +71,7 @@ use crate::{
        metadata::TimelineMetadata,
        storage_layer::{inmemory_layer::IndexEntry, PersistentLayerDesc},
    },
+    walingest::WalLagCooldown,
    walredo,
 };
 use crate::{
@@ -429,6 +430,8 @@ pub struct Timeline {
    pub(crate) l0_flush_global_state: L0FlushGlobalState,

    pub(crate) handles: handle::PerTimelineState<crate::page_service::TenantManagerTypes>,
+
+    pub(crate) attach_wal_lag_cooldown: Arc<OnceLock<WalLagCooldown>>,
 }

 pub struct WalReceiverInfo {
@@ -737,6 +740,7 @@ pub enum GetLogicalSizePriority {
 pub(crate) enum CompactFlags {
    ForceRepartition,
    ForceImageLayerCreation,
+    ForceL0Compaction,
    EnhancedGcBottomMostCompaction,
    DryRun,
 }
@@ -2130,6 +2134,7 @@ impl Timeline {
        pg_version: u32,
        state: TimelineState,
        aux_file_policy: Option<AuxFilePolicy>,
+        attach_wal_lag_cooldown: Arc<OnceLock<WalLagCooldown>>,
        cancel: CancellationToken,
    ) -> Arc<Self> {
        let disk_consistent_lsn = metadata.disk_consistent_lsn();
@@ -2271,6 +2276,8 @@ impl Timeline {
                l0_flush_global_state: resources.l0_flush_global_state,

                handles: Default::default(),
+
+                attach_wal_lag_cooldown,
            };

            if aux_file_policy == Some(AuxFilePolicy::V1) {
--- a/pageserver/src/tenant/timeline/analysis.rs
+++ b/pageserver/src/tenant/timeline/analysis.rs
@@ -11,6 +11,7 @@ pub(crate) struct RangeAnalysis {
    has_image: bool,
    num_of_deltas_above_image: usize,
    total_num_of_deltas: usize,
+    num_of_l0: usize,
 }

 impl Timeline {
@@ -20,8 +21,10 @@ impl Timeline {
        let mut delta_ranges = Vec::new();
        let mut image_ranges = Vec::new();

+        let num_of_l0;
        let all_layer_files = {
            let guard = self.layers.read().await;
+            num_of_l0 = guard.layer_map().unwrap().level0_deltas().len();
            guard.all_persistent_layers()
        };
        let lsn = self.get_last_record_lsn();
@@ -82,6 +85,7 @@ impl Timeline {
                has_image: image_layer.is_some(),
                num_of_deltas_above_image: maybe_delta_layers.len(),
                total_num_of_deltas: pitr_delta_layers.len(),
+                num_of_l0,
            });
        }

--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -353,7 +353,13 @@ impl Timeline {

                // 2. Compact
                let timer = self.metrics.compact_time_histo.start_timer();
-                let fully_compacted = self.compact_level0(target_file_size, ctx).await?;
+                let fully_compacted = self
+                    .compact_level0(
+                        target_file_size,
+                        flags.contains(CompactFlags::ForceL0Compaction),
+                        ctx,
+                    )
+                    .await?;
                timer.stop_and_record();

                let mut partitioning = dense_partitioning;
@@ -658,6 +664,7 @@ impl Timeline {
    async fn compact_level0(
        self: &Arc<Self>,
        target_file_size: u64,
+        force_compaction_ignore_threshold: bool,
        ctx: &RequestContext,
    ) -> Result<bool, CompactionError> {
        let CompactLevel0Phase1Result {
@@ -679,9 +686,15 @@ impl Timeline {
            let now = tokio::time::Instant::now();
            stats.read_lock_acquisition_micros =
                DurationRecorder::Recorded(RecordedDuration(now - begin), now);
-            self.compact_level0_phase1(phase1_layers_locked, stats, target_file_size, &ctx)
-                .instrument(phase1_span)
-                .await?
+            self.compact_level0_phase1(
+                phase1_layers_locked,
+                stats,
+                target_file_size,
+                force_compaction_ignore_threshold,
+                &ctx,
+            )
+            .instrument(phase1_span)
+            .await?
        };

        if new_layers.is_empty() && deltas_to_compact.is_empty() {
@@ -700,6 +713,7 @@ impl Timeline {
        guard: tokio::sync::RwLockReadGuard<'a, LayerManager>,
        mut stats: CompactLevel0Phase1StatsBuilder,
        target_file_size: u64,
+        force_compaction_ignore_threshold: bool,
        ctx: &RequestContext,
    ) -> Result<CompactLevel0Phase1Result, CompactionError> {
        stats.read_lock_held_spawn_blocking_startup_micros =
@@ -711,11 +725,26 @@ impl Timeline {
        // Only compact if enough layers have accumulated.
        let threshold = self.get_compaction_threshold();
        if level0_deltas.is_empty() || level0_deltas.len() < threshold {
-            debug!(
-                level0_deltas = level0_deltas.len(),
-                threshold, "too few deltas to compact"
-            );
-            return Ok(CompactLevel0Phase1Result::default());
+            if force_compaction_ignore_threshold {
+                if !level0_deltas.is_empty() {
+                    info!(
+                        level0_deltas = level0_deltas.len(),
+                        threshold, "too few deltas to compact, but forcing compaction"
+                    );
+                } else {
+                    info!(
+                        level0_deltas = level0_deltas.len(),
+                        threshold, "too few deltas to compact, cannot force compaction"
+                    );
+                    return Ok(CompactLevel0Phase1Result::default());
+                }
+            } else {
+                debug!(
+                    level0_deltas = level0_deltas.len(),
+                    threshold, "too few deltas to compact"
+                );
+                return Ok(CompactLevel0Phase1Result::default());
+            }
        }

        let mut level0_deltas = level0_deltas
--- a/pageserver/src/tenant/vectored_blob_io.rs
+++ b/pageserver/src/tenant/vectored_blob_io.rs
@@ -18,7 +18,7 @@
 use std::collections::BTreeMap;
 use std::ops::Deref;

-use bytes::Bytes;
+use bytes::{Bytes, BytesMut};
 use pageserver_api::key::Key;
 use tokio::io::AsyncWriteExt;
 use tokio_epoll_uring::BoundedBuf;
@@ -27,7 +27,6 @@ use utils::vec_map::VecMap;

 use crate::context::RequestContext;
 use crate::tenant::blob_io::{BYTE_UNCOMPRESSED, BYTE_ZSTD, LEN_COMPRESSION_BIT_MASK};
-use crate::virtual_file::dio::IoBufferMut;
 use crate::virtual_file::{self, VirtualFile};

 /// Metadata bundled with the start and end offset of a blob.
@@ -159,7 +158,7 @@ impl std::fmt::Display for VectoredBlob {
 /// Return type of [`VectoredBlobReader::read_blobs`]
 pub struct VectoredBlobsBuf {
    /// Buffer for all blobs in this read
-    pub buf: IoBufferMut,
+    pub buf: BytesMut,
    /// Offsets into the buffer and metadata for all blobs in this read
    pub blobs: Vec<VectoredBlob>,
 }
@@ -461,7 +460,7 @@ impl<'a> VectoredBlobReader<'a> {
    pub async fn read_blobs(
        &self,
        read: &VectoredRead,
-        buf: IoBufferMut,
+        buf: BytesMut,
        ctx: &RequestContext,
    ) -> Result<VectoredBlobsBuf, std::io::Error> {
        assert!(read.size() > 0);
@@ -946,8 +945,7 @@ mod tests {

        // Multiply by two (compressed data might need more space), and add a few bytes for the header
        let reserved_bytes = blobs.iter().map(|bl| bl.len()).max().unwrap() * 2 + 16;
-        let align = virtual_file::get_io_buffer_alignment();
-        let mut buf = IoBufferMut::with_capacity_aligned(reserved_bytes, align);
+        let mut buf = BytesMut::with_capacity(reserved_bytes);

        let align = virtual_file::get_io_buffer_alignment();
        let vectored_blob_reader = VectoredBlobReader::new(&file);
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -23,12 +23,10 @@ use pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
 use pageserver_api::shard::TenantShardId;
 use std::fs::File;
 use std::io::{Error, ErrorKind, Seek, SeekFrom};
-#[cfg(target_os = "linux")]
-use std::os::unix::fs::OpenOptionsExt;
 use tokio_epoll_uring::{BoundedBuf, IoBuf, IoBufMut, Slice};

 use std::os::fd::{AsRawFd, FromRawFd, IntoRawFd, OwnedFd, RawFd};
-use std::sync::atomic::{AtomicBool, AtomicU8, AtomicUsize, Ordering};
+use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
 use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
 use tokio::time::Instant;

@@ -40,11 +38,10 @@ pub use io_engine::FeatureTestResult as IoEngineFeatureTestResult;
 mod metadata;
 mod open_options;
 use self::owned_buffers_io::write::OwnedAsyncWriter;
-pub(crate) use api::IoMode;
+pub(crate) use api::DirectIoMode;
 pub(crate) use io_engine::IoEngineKind;
 pub(crate) use metadata::Metadata;
 pub(crate) use open_options::*;
-pub(crate) mod dio;

 pub(crate) mod owned_buffers_io {
    //! Abstractions for IO with owned buffers.
@@ -56,7 +53,6 @@ pub(crate) mod owned_buffers_io {
    //! but for the time being we're proving out the primitives in the neon.git repo
    //! for faster iteration.

-    pub(crate) mod io_buf_aligned;
    pub(crate) mod io_buf_ext;
    pub(crate) mod slice;
    pub(crate) mod write;
@@ -65,176 +61,6 @@ pub(crate) mod owned_buffers_io {
    }
 }

-#[derive(Debug)]
-pub enum VirtualFile {
-    Buffered(VirtualFileInner),
-    Direct(VirtualFileInner),
-}
-
-impl VirtualFile {
-    fn inner(&self) -> &VirtualFileInner {
-        match self {
-            Self::Buffered(file) => file,
-            Self::Direct(file) => file,
-        }
-    }
-
-    fn inner_mut(&mut self) -> &mut VirtualFileInner {
-        match self {
-            Self::Buffered(file) => file,
-            Self::Direct(file) => file,
-        }
-    }
-
-    fn into_inner(self) -> VirtualFileInner {
-        match self {
-            Self::Buffered(file) => file,
-            Self::Direct(file) => file,
-        }
-    }
-    /// Open a file in read-only mode. Like File::open.
-    pub async fn open<P: AsRef<Utf8Path>>(
-        path: P,
-        ctx: &RequestContext,
-    ) -> Result<Self, std::io::Error> {
-        let file = VirtualFileInner::open(path, ctx).await?;
-        Ok(Self::Buffered(file))
-    }
-
-    /// Open a file in read-only mode. Like File::open.
-    ///
-    /// `O_DIRECT` will be enabled base on `virtual_file_io_mode`.
-    pub async fn open_v2<P: AsRef<Utf8Path>>(
-        path: P,
-        ctx: &RequestContext,
-    ) -> Result<Self, std::io::Error> {
-        Self::open_with_options_v2(path.as_ref(), OpenOptions::new().read(true), ctx).await
-    }
-
-    pub async fn create<P: AsRef<Utf8Path>>(
-        path: P,
-        ctx: &RequestContext,
-    ) -> Result<Self, std::io::Error> {
-        let file = VirtualFileInner::create(path, ctx).await?;
-        Ok(Self::Buffered(file))
-    }
-
-    pub async fn create_v2<P: AsRef<Utf8Path>>(
-        path: P,
-        ctx: &RequestContext,
-    ) -> Result<Self, std::io::Error> {
-        VirtualFile::open_with_options_v2(
-            path.as_ref(),
-            OpenOptions::new().write(true).create(true).truncate(true),
-            ctx,
-        )
-        .await
-    }
-
-    pub async fn open_with_options<P: AsRef<Utf8Path>>(
-        path: P,
-        open_options: &OpenOptions,
-        ctx: &RequestContext, /* TODO: carry a pointer to the metrics in the RequestContext instead of the parsing https://github.com/neondatabase/neon/issues/6107 */
-    ) -> Result<Self, std::io::Error> {
-        let file = VirtualFileInner::open_with_options(path, open_options, ctx).await?;
-        Ok(Self::Buffered(file))
-    }
-
-    pub async fn open_with_options_v2<P: AsRef<Utf8Path>>(
-        path: P,
-        open_options: &mut OpenOptions, // Uses `&mut` here to add `O_DIRECT`.
-        ctx: &RequestContext, /* TODO: carry a pointer to the metrics in the RequestContext instead of the parsing https://github.com/neondatabase/neon/issues/6107 */
-    ) -> Result<Self, std::io::Error> {
-        let file = match get_io_mode() {
-            IoMode::Buffered => {
-                let file = VirtualFileInner::open_with_options(path, open_options, ctx).await?;
-                Self::Buffered(file)
-            }
-            #[cfg(target_os = "linux")]
-            IoMode::Direct => {
-                let file = VirtualFileInner::open_with_options(
-                    path,
-                    open_options.custom_flags(nix::libc::O_DIRECT),
-                    ctx,
-                )
-                .await?;
-                Self::Direct(file)
-            }
-        };
-        Ok(file)
-    }
-
-    pub fn path(&self) -> &Utf8Path {
-        self.inner().path.as_path()
-    }
-
-    pub async fn crashsafe_overwrite<B: BoundedBuf<Buf = Buf> + Send, Buf: IoBuf + Send>(
-        final_path: Utf8PathBuf,
-        tmp_path: Utf8PathBuf,
-        content: B,
-    ) -> std::io::Result<()> {
-        VirtualFileInner::crashsafe_overwrite(final_path, tmp_path, content).await
-    }
-
-    pub async fn sync_all(&self) -> Result<(), Error> {
-        self.inner().sync_all().await
-    }
-
-    pub async fn sync_data(&self) -> Result<(), Error> {
-        self.inner().sync_data().await
-    }
-
-    pub async fn metadata(&self) -> Result<Metadata, Error> {
-        self.inner().metadata().await
-    }
-
-    pub fn remove(self) {
-        self.into_inner().remove();
-    }
-
-    pub async fn seek(&mut self, pos: SeekFrom) -> Result<u64, Error> {
-        self.inner_mut().seek(pos).await
-    }
-
-    pub async fn read_exact_at<Buf>(
-        &self,
-        slice: Slice<Buf>,
-        offset: u64,
-        ctx: &RequestContext,
-    ) -> Result<Slice<Buf>, Error>
-    where
-        Buf: IoBufMut + Send,
-    {
-        self.inner().read_exact_at(slice, offset, ctx).await
-    }
-
-    pub async fn read_exact_at_page(
-        &self,
-        page: PageWriteGuard<'static>,
-        offset: u64,
-        ctx: &RequestContext,
-    ) -> Result<PageWriteGuard<'static>, Error> {
-        self.inner().read_exact_at_page(page, offset, ctx).await
-    }
-
-    pub async fn write_all_at<Buf: IoBuf + Send>(
-        &self,
-        buf: FullSlice<Buf>,
-        offset: u64,
-        ctx: &RequestContext,
-    ) -> (FullSlice<Buf>, Result<(), Error>) {
-        self.inner().write_all_at(buf, offset, ctx).await
-    }
-
-    pub async fn write_all<Buf: IoBuf + Send>(
-        &mut self,
-        buf: FullSlice<Buf>,
-        ctx: &RequestContext,
-    ) -> (FullSlice<Buf>, Result<usize, Error>) {
-        self.inner_mut().write_all(buf, ctx).await
-    }
-}
-
 ///
 /// A virtual file descriptor. You can use this just like std::fs::File, but internally
 /// the underlying file is closed if the system is low on file descriptors,
@@ -251,7 +77,7 @@ impl VirtualFile {
 /// 'tag' field is used to detect whether the handle still is valid or not.
 ///
 #[derive(Debug)]
-pub struct VirtualFileInner {
+pub struct VirtualFile {
    /// Lazy handle to the global file descriptor cache. The slot that this points to
    /// might contain our File, or it may be empty, or it may contain a File that
    /// belongs to a different VirtualFile.
@@ -524,12 +350,12 @@ macro_rules! with_file {
    }};
 }

-impl VirtualFileInner {
+impl VirtualFile {
    /// Open a file in read-only mode. Like File::open.
    pub async fn open<P: AsRef<Utf8Path>>(
        path: P,
        ctx: &RequestContext,
-    ) -> Result<VirtualFileInner, std::io::Error> {
+    ) -> Result<VirtualFile, std::io::Error> {
        Self::open_with_options(path.as_ref(), OpenOptions::new().read(true), ctx).await
    }

@@ -538,7 +364,7 @@ impl VirtualFileInner {
    pub async fn create<P: AsRef<Utf8Path>>(
        path: P,
        ctx: &RequestContext,
-    ) -> Result<VirtualFileInner, std::io::Error> {
+    ) -> Result<VirtualFile, std::io::Error> {
        Self::open_with_options(
            path.as_ref(),
            OpenOptions::new().write(true).create(true).truncate(true),
@@ -556,7 +382,7 @@ impl VirtualFileInner {
        path: P,
        open_options: &OpenOptions,
        _ctx: &RequestContext, /* TODO: carry a pointer to the metrics in the RequestContext instead of the parsing https://github.com/neondatabase/neon/issues/6107 */
-    ) -> Result<VirtualFileInner, std::io::Error> {
+    ) -> Result<VirtualFile, std::io::Error> {
        let path_ref = path.as_ref();
        let path_str = path_ref.to_string();
        let parts = path_str.split('/').collect::<Vec<&str>>();
@@ -587,7 +413,7 @@ impl VirtualFileInner {
            open_options.open(path_ref.as_std_path()).await?
        });

-        // Strip all options other than read and write (O_DIRECT).
+        // Strip all options other than read and write.
        //
        // It would perhaps be nicer to check just for the read and write flags
        // explicitly, but OpenOptions doesn't contain any functions to read flags,
@@ -597,7 +423,7 @@ impl VirtualFileInner {
        reopen_options.create_new(false);
        reopen_options.truncate(false);

-        let vfile = VirtualFileInner {
+        let vfile = VirtualFile {
            handle: RwLock::new(handle),
            pos: 0,
            path: path_ref.to_path_buf(),
@@ -1208,21 +1034,6 @@ impl tokio_epoll_uring::IoFd for FileGuard {

 #[cfg(test)]
 impl VirtualFile {
-    pub(crate) async fn read_blk(
-        &self,
-        blknum: u32,
-        ctx: &RequestContext,
-    ) -> Result<crate::tenant::block_io::BlockLease<'_>, std::io::Error> {
-        self.inner().read_blk(blknum, ctx).await
-    }
-
-    async fn read_to_end(&mut self, buf: &mut Vec<u8>, ctx: &RequestContext) -> Result<(), Error> {
-        self.inner_mut().read_to_end(buf, ctx).await
-    }
-}
-
-#[cfg(test)]
-impl VirtualFileInner {
    pub(crate) async fn read_blk(
        &self,
        blknum: u32,
@@ -1256,7 +1067,7 @@ impl VirtualFileInner {
    }
 }

-impl Drop for VirtualFileInner {
+impl Drop for VirtualFile {
    /// If a VirtualFile is dropped, close the underlying file if it was open.
    fn drop(&mut self) {
        let handle = self.handle.get_mut();
@@ -1405,15 +1216,6 @@ pub(crate) fn get_io_buffer_alignment() -> usize {
    }
 }

-static IO_MODE: AtomicU8 = AtomicU8::new(IoMode::preferred() as u8);
-
-pub(crate) fn set_io_mode(mode: IoMode) {
-    IO_MODE.store(mode as u8, std::sync::atomic::Ordering::Relaxed);
-}
-
-pub(crate) fn get_io_mode() -> IoMode {
-    IoMode::try_from(IO_MODE.load(Ordering::Relaxed)).unwrap()
-}
 #[cfg(test)]
 mod tests {
    use crate::context::DownloadBehavior;
@@ -1722,7 +1524,7 @@ mod tests {
        // Open the file many times.
        let mut files = Vec::new();
        for _ in 0..VIRTUAL_FILES {
-            let f = VirtualFileInner::open_with_options(
+            let f = VirtualFile::open_with_options(
                &test_file_path,
                OpenOptions::new().read(true),
                &ctx,
@@ -1774,7 +1576,7 @@ mod tests {
        let path = testdir.join("myfile");
        let tmp_path = testdir.join("myfile.tmp");

-        VirtualFileInner::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"foo".to_vec())
+        VirtualFile::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"foo".to_vec())
            .await
            .unwrap();
        let mut file = MaybeVirtualFile::from(VirtualFile::open(&path, &ctx).await.unwrap());
@@ -1783,7 +1585,7 @@ mod tests {
        assert!(!tmp_path.exists());
        drop(file);

-        VirtualFileInner::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"bar".to_vec())
+        VirtualFile::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"bar".to_vec())
            .await
            .unwrap();
        let mut file = MaybeVirtualFile::from(VirtualFile::open(&path, &ctx).await.unwrap());
@@ -1806,7 +1608,7 @@ mod tests {
        std::fs::write(&tmp_path, "some preexisting junk that should be removed").unwrap();
        assert!(tmp_path.exists());

-        VirtualFileInner::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"foo".to_vec())
+        VirtualFile::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"foo".to_vec())
            .await
            .unwrap();

--- a/pageserver/src/virtual_file/dio.rs
+++ b/pageserver/src/virtual_file/dio.rs
@@ -1,410 +0,0 @@
-#![allow(unused)]
-
-use core::slice;
-use std::{
-    alloc::{self, Layout},
-    cmp,
-    mem::{ManuallyDrop, MaybeUninit},
-    ops::{Deref, DerefMut},
-    ptr::{addr_of_mut, NonNull},
-};
-
-use bytes::buf::UninitSlice;
-
-struct IoBufferPtr(*mut u8);
-
-// SAFETY: We gurantees no one besides `IoBufferPtr` itself has the raw pointer.
-unsafe impl Send for IoBufferPtr {}
-
-/// An aligned buffer type used for I/O.
-pub struct IoBufferMut {
-    ptr: IoBufferPtr,
-    capacity: usize,
-    len: usize,
-    align: usize,
-}
-
-impl IoBufferMut {
-    /// Constructs a new, empty `IoBufferMut` with at least the specified capacity and alignment.
-    ///
-    /// The buffer will be able to hold at most `capacity` elements and will never resize.
-    ///
-    ///
-    /// # Panics
-    ///
-    /// Panics if the new capacity exceeds `isize::MAX` _bytes_, or if the following alignment requirement is not met:
-    /// * `align` must not be zero,
-    ///
-    /// * `align` must be a power of two,
-    ///
-    /// * `capacity`, when rounded up to the nearest multiple of `align`,
-    ///    must not overflow isize (i.e., the rounded value must be
-    ///    less than or equal to `isize::MAX`).
-    pub fn with_capacity_aligned(capacity: usize, align: usize) -> Self {
-        let layout = Layout::from_size_align(capacity, align).expect("Invalid layout");
-
-        // SAFETY:  Making an allocation with a sized and aligned layout. The memory is manually freed with the same layout.
-        let ptr = unsafe {
-            let ptr = alloc::alloc(layout);
-            if ptr.is_null() {
-                alloc::handle_alloc_error(layout);
-            }
-            IoBufferPtr(ptr)
-        };
-
-        IoBufferMut {
-            ptr,
-            capacity,
-            len: 0,
-            align,
-        }
-    }
-
-
-    /// Constructs a new `IoBufferMut` with at least the specified capacity and alignment, filled with zeros.
-    pub fn with_capacity_aligned_zeroed(capacity: usize, align: usize) -> Self {
-        use bytes::BufMut;
-        let mut buf = Self::with_capacity_aligned(capacity, align);
-        buf.put_bytes(0, capacity);
-        buf.len = capacity;
-        buf
-    }
-
-    /// Returns the total number of bytes the buffer can hold.
-    #[inline]
-    pub fn capacity(&self) -> usize {
-        self.capacity
-    }
-
-    /// Returns the alignment of the buffer.
-    #[inline]
-    pub fn align(&self) -> usize {
-        self.align
-    }
-
-    /// Returns the number of bytes in the buffer, also referred to as its 'length'.
-    #[inline]
-    pub fn len(&self) -> usize {
-        self.len
-    }
-
-    /// Force the length of the buffer to `new_len`.
-    #[inline]
-    unsafe fn set_len(&mut self, new_len: usize) {
-        debug_assert!(new_len <= self.capacity());
-        self.len = new_len;
-    }
-
-    #[inline]
-    fn as_ptr(&self) -> *const u8 {
-        self.ptr.0
-    }
-
-    #[inline]
-    fn as_mut_ptr(&mut self) -> *mut u8 {
-        self.ptr.0
-    }
-
-    /// Extracts a slice containing the entire buffer.
-    ///
-    /// Equivalent to `&s[..]`.
-    #[inline]
-    fn as_slice(&self) -> &[u8] {
-        // SAFETY: The pointer is valid and `len` bytes are initialized.
-        unsafe { slice::from_raw_parts(self.as_ptr(), self.len) }
-    }
-
-    /// Extracts a mutable slice of the entire buffer.
-    ///
-    /// Equivalent to `&mut s[..]`.
-    fn as_mut_slice(&mut self) -> &mut [u8] {
-        // SAFETY: The pointer is valid and `len` bytes are initialized.
-        unsafe { slice::from_raw_parts_mut(self.as_mut_ptr(), self.len) }
-    }
-
-    /// Drops the all the contents of the buffer, setting its length to `0`.
-    #[inline]
-    pub fn clear(&mut self) {
-        self.len = 0;
-    }
-
-    /// Reserves capacity for at least `additional` more bytes to be inserted
-    /// in the given `IoBufferMut`. The collection may reserve more space to
-    /// speculatively avoid frequent reallocations. After calling `reserve`,
-    /// capacity will be greater than or equal to `self.len() + additional`.
-    /// Does nothing if capacity is already sufficient.
-    ///
-    /// # Panics
-    ///
-    /// Panics if the new capacity exceeds `isize::MAX` _bytes_.
-    pub fn reserve(&mut self, additional: usize) {
-        if additional > self.capacity() - self.len() {
-            self.reserve_inner(additional);
-        }
-    }
-
-    fn reserve_inner(&mut self, additional: usize) {
-        let Some(required_cap) = self.len().checked_add(additional) else {
-            capacity_overflow()
-        };
-
-        let old_capacity = self.capacity();
-        let align = self.align();
-        // This guarantees exponential growth. The doubling cannot overflow
-        // because `cap <= isize::MAX` and the type of `cap` is `usize`.
-        let cap = cmp::max(old_capacity * 2, required_cap);
-
-        if !is_valid_alloc(cap) {
-            capacity_overflow()
-        }
-        let new_layout = Layout::from_size_align(cap, self.align()).expect("Invalid layout");
-
-        let old_ptr = self.as_mut_ptr();
-
-        // SAFETY: old allocation was allocated with std::alloc::alloc with the same layout,
-        // and we panics on null pointer.
-        let (ptr, cap) = unsafe {
-            let old_layout = Layout::from_size_align_unchecked(old_capacity, align);
-            let ptr = alloc::realloc(old_ptr, old_layout, new_layout.size());
-            if ptr.is_null() {
-                alloc::handle_alloc_error(new_layout);
-            }
-            (IoBufferPtr(ptr), cap)
-        };
-
-        self.ptr = ptr;
-        self.capacity = cap;
-    }
-
-
-    /// Consumes and leaks the `IoBufferMut`, returning a mutable reference to the contents, &'a mut [u8].
-    pub fn leak<'a>(self) -> &'a mut [u8] {
-        let mut buf = ManuallyDrop::new(self);
-        // SAFETY: leaking the buffer as intended.
-        unsafe { slice::from_raw_parts_mut(buf.as_mut_ptr(), buf.len) }
-    }
-}
-
-fn capacity_overflow() -> ! {
-    panic!("capacity overflow")
-}
-
-// We need to guarantee the following:
-// * We don't ever allocate `> isize::MAX` byte-size objects.
-// * We don't overflow `usize::MAX` and actually allocate too little.
-//
-// On 64-bit we just need to check for overflow since trying to allocate
-// `> isize::MAX` bytes will surely fail. On 32-bit and 16-bit we need to add
-// an extra guard for this in case we're running on a platform which can use
-// all 4GB in user-space, e.g., PAE or x32.
-#[inline]
-fn is_valid_alloc(alloc_size: usize) -> bool {
-    !(usize::BITS < 64 && alloc_size > isize::MAX as usize)
-}
-
-impl Drop for IoBufferMut {
-    fn drop(&mut self) {
-        // SAFETY: memory was allocated with std::alloc::alloc with the same layout.
-        unsafe {
-            alloc::dealloc(
-                self.as_mut_ptr(),
-                Layout::from_size_align_unchecked(self.capacity, self.align),
-            )
-        }
-    }
-}
-
-impl Deref for IoBufferMut {
-    type Target = [u8];
-
-    fn deref(&self) -> &Self::Target {
-        self.as_slice()
-    }
-}
-
-impl DerefMut for IoBufferMut {
-    fn deref_mut(&mut self) -> &mut Self::Target {
-        self.as_mut_slice()
-    }
-}
-
-/// SAFETY: When advancing the internal cursor, the caller needs to make sure the bytes advcanced past have been initialized.
-unsafe impl bytes::BufMut for IoBufferMut {
-    #[inline]
-    fn remaining_mut(&self) -> usize {
-        // Although a `Vec` can have at most isize::MAX bytes, we never want to grow `IoBufferMut`.
-        // Thus, it can have at most `self.capacity` bytes.
-        self.capacity() - self.len()
-    }
-
-    // SAFETY: Caller needs to make sure the bytes being advanced past have been initialized.
-    #[inline]
-    unsafe fn advance_mut(&mut self, cnt: usize) {
-        let len: usize = self.len();
-        let remaining = self.remaining_mut();
-
-        if remaining < cnt {
-            panic_advance(cnt, remaining);
-        }
-
-        // Addition will not overflow since the sum is at most the capacity.
-        self.set_len(len + cnt);
-    }
-
-    #[inline]
-    fn chunk_mut(&mut self) -> &mut bytes::buf::UninitSlice {
-        let cap = self.capacity();
-        let len = self.len();
-
-        // SAFETY: Since `self.ptr` is valid for `cap` bytes, `self.ptr.add(len)` must be
-        // valid for `cap - len` bytes. The subtraction will not underflow since
-        // `len <= cap`.
-        unsafe { UninitSlice::from_raw_parts_mut(self.as_mut_ptr().add(len), cap - len) }
-    }
-}
-
-/// Panic with a nice error message.
-#[cold]
-fn panic_advance(idx: usize, len: usize) -> ! {
-    panic!(
-        "advance out of bounds: the len is {} but advancing by {}",
-        len, idx
-    );
-}
-
-/// Safety: [`IoBufferMut`] has exclusive ownership of the io buffer,
-/// and the location remains stable even if [`Self`] is moved.
-unsafe impl tokio_epoll_uring::IoBuf for IoBufferMut {
-    fn stable_ptr(&self) -> *const u8 {
-        self.as_ptr()
-    }
-
-    fn bytes_init(&self) -> usize {
-        self.len()
-    }
-
-    fn bytes_total(&self) -> usize {
-        self.capacity()
-    }
-}
-
-// SAFETY: See above.
-unsafe impl tokio_epoll_uring::IoBufMut for IoBufferMut {
-    fn stable_mut_ptr(&mut self) -> *mut u8 {
-        self.as_mut_ptr()
-    }
-
-    unsafe fn set_init(&mut self, init_len: usize) {
-        if self.len() < init_len {
-            self.set_len(init_len);
-        }
-    }
-}
-
-#[cfg(test)]
-mod tests {
-
-    use super::*;
-
-    #[test]
-    fn test_with_capacity_aligned() {
-        const ALIGN: usize = 4 * 1024;
-        let v = IoBufferMut::with_capacity_aligned(ALIGN * 4, ALIGN);
-        assert_eq!(v.len(), 0);
-        assert_eq!(v.capacity(), ALIGN * 4);
-        assert_eq!(v.align(), ALIGN);
-        assert_eq!(v.as_ptr().align_offset(ALIGN), 0);
-
-        let v = IoBufferMut::with_capacity_aligned(ALIGN / 2, ALIGN);
-        assert_eq!(v.len(), 0);
-        assert_eq!(v.capacity(), ALIGN / 2);
-        assert_eq!(v.align(), ALIGN);
-        assert_eq!(v.as_ptr().align_offset(ALIGN), 0);
-    }
-
-    #[test]
-    fn test_with_capacity_aligned_zeroed() {
-        const ALIGN: usize = 4 * 1024;
-        let v = IoBufferMut::with_capacity_aligned_zeroed(ALIGN, ALIGN);
-        assert_eq!(v.len(), ALIGN);
-        assert_eq!(v.capacity(), ALIGN);
-        assert_eq!(v.align(), ALIGN);
-        assert_eq!(v.as_ptr().align_offset(ALIGN), 0);
-        assert_eq!(&v[..], &[0; ALIGN])
-    }
-
-    #[test]
-    fn test_reserve() {
-        use bytes::BufMut;
-        const ALIGN: usize = 4 * 1024;
-        let mut v = IoBufferMut::with_capacity_aligned(ALIGN, ALIGN);
-        let capacity = v.capacity();
-        v.reserve(capacity);
-        assert_eq!(v.capacity(), capacity);
-        let data = [b'a'; ALIGN];
-        v.put(&data[..]);
-        v.reserve(capacity);
-        assert!(v.capacity() >= capacity * 2);
-        assert_eq!(&v[..], &data[..]);
-        let capacity = v.capacity();
-        v.clear();
-        v.reserve(capacity);
-        assert_eq!(capacity, v.capacity());
-    }
-
-    #[test]
-    fn test_bytes_put() {
-        use bytes::BufMut;
-        const ALIGN: usize = 4 * 1024;
-        let mut v = IoBufferMut::with_capacity_aligned(ALIGN * 4, ALIGN);
-        let x = [b'a'; ALIGN];
-
-        for _ in 0..2 {
-            for _ in 0..4 {
-                v.put(&x[..]);
-            }
-            assert_eq!(v.len(), ALIGN * 4);
-            assert_eq!(v.capacity(), ALIGN * 4);
-            assert_eq!(v.align(), ALIGN);
-            assert_eq!(v.as_ptr().align_offset(ALIGN), 0);
-            v.clear()
-        }
-        assert_eq!(v.len(), 0);
-        assert_eq!(v.capacity(), ALIGN * 4);
-        assert_eq!(v.align(), ALIGN);
-        assert_eq!(v.as_ptr().align_offset(ALIGN), 0);
-    }
-
-    #[test]
-    #[should_panic]
-    fn test_bytes_put_panic() {
-        use bytes::BufMut;
-        const ALIGN: usize = 4 * 1024;
-        let mut v = IoBufferMut::with_capacity_aligned(ALIGN * 4, ALIGN);
-        let x = [b'a'; ALIGN];
-        for _ in 0..5 {
-            v.put_slice(&x[..]);
-        }
-    }
-
-    #[test]
-    fn test_io_buf_put_slice() {
-        use tokio_epoll_uring::BoundedBufMut;
-        const ALIGN: usize = 4 * 1024;
-        let mut v = IoBufferMut::with_capacity_aligned(ALIGN, ALIGN);
-        let x = [b'a'; ALIGN];
-
-        for _ in 0..2 {
-            v.put_slice(&x[..]);
-            assert_eq!(v.len(), ALIGN);
-            assert_eq!(v.capacity(), ALIGN);
-            assert_eq!(v.align(), ALIGN);
-            assert_eq!(v.as_ptr().align_offset(ALIGN), 0);
-            v.clear()
-        }
-        assert_eq!(v.len(), 0);
-        assert_eq!(v.capacity(), ALIGN);
-        assert_eq!(v.align(), ALIGN);
-        assert_eq!(v.as_ptr().align_offset(ALIGN), 0);
-    }
-}
--- a/pageserver/src/virtual_file/owned_buffers_io/io_buf_aligned.rs
+++ b/pageserver/src/virtual_file/owned_buffers_io/io_buf_aligned.rs
@@ -1,9 +0,0 @@
-#![allow(unused)]
-
-use tokio_epoll_uring::IoBufMut;
-
-use crate::virtual_file::dio::IoBufferMut;
-
-pub(crate) trait IoBufAlignedMut: IoBufMut {}
-
-impl IoBufAlignedMut for IoBufferMut {}
--- a/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs
+++ b/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs
@@ -1,6 +1,5 @@
 //! See [`FullSlice`].

-use crate::virtual_file::dio::IoBufferMut;
 use bytes::{Bytes, BytesMut};
 use std::ops::{Deref, Range};
 use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice};
@@ -77,4 +76,3 @@ macro_rules! impl_io_buf_ext {
 impl_io_buf_ext!(Bytes);
 impl_io_buf_ext!(BytesMut);
 impl_io_buf_ext!(Vec<u8>);
-impl_io_buf_ext!(IoBufferMut);
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -21,7 +21,10 @@
 //! redo Postgres process, but some records it can handle directly with
 //! bespoken Rust code.

+use std::sync::Arc;
+use std::sync::OnceLock;
 use std::time::Duration;
+use std::time::Instant;
 use std::time::SystemTime;

 use pageserver_api::shard::ShardIdentity;
@@ -69,7 +72,29 @@ impl CheckPoint {
    }
 }

+/// Grace period for WAL lag warnings right after a tenant attach.
+///
+/// The WAL is not looked at until the attach has completed, and that
+/// can take a while, so lag built up during that window is expected
+/// and should not be reported as a warning.
+pub struct WalLagCooldown {
+    /// Point in time after which the suppression no longer applies
+    active_until: Instant,
+    /// Largest lag that is suppressed. Lags above this are reported anyway.
+    max_lag: Duration,
+}
+
+impl WalLagCooldown {
+    /// Derive the cooldown window from when the attach started and how long it took.
+    pub fn new(attach_start: Instant, attach_duration: Duration) -> Self {
+        let active_until = attach_start + attach_duration * 3 + Duration::from_secs(120);
+        let max_lag = attach_duration * 2 + Duration::from_secs(60);
+        Self { active_until, max_lag }
+    }
+}
+
 pub struct WalIngest {
+    attach_wal_lag_cooldown: Arc<OnceLock<WalLagCooldown>>,
    shard: ShardIdentity,
    checkpoint: CheckPoint,
    checkpoint_modified: bool,
@@ -103,6 +128,7 @@ impl WalIngest {
            shard: *timeline.get_shard_identity(),
            checkpoint,
            checkpoint_modified: false,
+            attach_wal_lag_cooldown: timeline.attach_wal_lag_cooldown.clone(),
            warn_ingest_lag: WarnIngestLag {
                lag_msg_ratelimit: RateLimit::new(std::time::Duration::from_secs(10)),
                future_lsn_msg_ratelimit: RateLimit::new(std::time::Duration::from_secs(10)),
@@ -1429,6 +1455,13 @@ impl WalIngest {
                    Ok(lag) => {
                        if lag > conf.wait_lsn_timeout {
                            rate_limits.lag_msg_ratelimit.call2(|rate_limit_stats| {
+                                if let Some(cooldown) = self.attach_wal_lag_cooldown.get() {
+                                    if std::time::Instant::now() < cooldown.active_until && lag <= cooldown.max_lag {
+                                        return;
+                                    }
+                                } else {
+                                    // Still loading? We shouldn't be here
+                                }
                                let lag = humantime::format_duration(lag);
                                warn!(%rate_limit_stats, %lag, "ingesting record with timestamp lagging more than wait_lsn_timeout");
                            })
--- a/pgxn/neon/file_cache.c
+++ b/pgxn/neon/file_cache.c
@@ -42,6 +42,7 @@

 #include "hll.h"
 #include "bitmap.h"
+#include "neon.h"

 #define CriticalAssert(cond) do if (!(cond)) elog(PANIC, "Assertion %s failed at %s:%d: ", #cond, __FILE__, __LINE__); while (0)

@@ -173,7 +174,9 @@ lfc_disable(char const *op)
 			 * If the reason of error is ENOSPC, then truncation of file may
 			 * help to reclaim some space
 			 */
+			pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_TRUNCATE);
 			int			rc = ftruncate(lfc_desc, 0);
+			pgstat_report_wait_end();

 			if (rc < 0)
 				elog(WARNING, "Failed to truncate local file cache %s: %m", lfc_path);
@@ -769,8 +772,10 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,

 		if (iteration_hits != 0)
 		{
+			pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_READ);
 			rc = preadv(lfc_desc, iov, blocks_in_chunk,
 						((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ);
+			pgstat_report_wait_end();

 			if (rc != (BLCKSZ * blocks_in_chunk))
 			{
@@ -944,8 +949,11 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 		lfc_ctl->writes += blocks_in_chunk;
 		LWLockRelease(lfc_lock);

+		pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_WRITE);
 		rc = pwritev(lfc_desc, iov, blocks_in_chunk,
 					 ((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ);
+		pgstat_report_wait_end();
+
 		if (rc != BLCKSZ * blocks_in_chunk)
 		{
 			lfc_disable("write");
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -490,7 +490,7 @@ pageserver_connect(shardno_t shard_no, int elevel)
 											   WL_EXIT_ON_PM_DEATH | WL_LATCH_SET | WL_SOCKET_READABLE,
 											   PQsocket(shard->conn),
 											   0,
-											   PG_WAIT_EXTENSION);
+											   WAIT_EVENT_NEON_PS_STARTING);
 					elog(DEBUG5, "PGRES_POLLING_READING=>%d", rc);
 					if (rc & WL_LATCH_SET)
 					{
@@ -512,7 +512,7 @@ pageserver_connect(shardno_t shard_no, int elevel)
 											   WL_EXIT_ON_PM_DEATH | WL_LATCH_SET | WL_SOCKET_WRITEABLE,
 											   PQsocket(shard->conn),
 											   0,
-											   PG_WAIT_EXTENSION);
+											   WAIT_EVENT_NEON_PS_STARTING);
 					elog(DEBUG5, "PGRES_POLLING_WRITING=>%d", rc);
 					if (rc & WL_LATCH_SET)
 					{
@@ -608,7 +608,8 @@ pageserver_connect(shardno_t shard_no, int elevel)
 			WaitEvent	event;

 			/* Sleep until there's something to do */
-			(void) WaitEventSetWait(shard->wes_read, -1L, &event, 1, PG_WAIT_EXTENSION);
+			(void) WaitEventSetWait(shard->wes_read, -1L, &event, 1,
+									WAIT_EVENT_NEON_PS_CONFIGURING);
 			ResetLatch(MyLatch);

 			CHECK_FOR_INTERRUPTS();
@@ -656,7 +657,8 @@ static int
 call_PQgetCopyData(shardno_t shard_no, char **buffer)
 {
 	int			ret;
-	PGconn	   *pageserver_conn = page_servers[shard_no].conn;
+	PageServer *shard = &page_servers[shard_no];
+	PGconn	   *pageserver_conn = shard->conn;

 retry:
 	ret = PQgetCopyData(pageserver_conn, buffer, 1 /* async */ );
@@ -666,7 +668,8 @@ retry:
 		WaitEvent	event;

 		/* Sleep until there's something to do */
-		(void) WaitEventSetWait(page_servers[shard_no].wes_read, -1L, &event, 1, PG_WAIT_EXTENSION);
+		(void) WaitEventSetWait(shard->wes_read, -1L, &event, 1,
+								WAIT_EVENT_NEON_PS_READ);
 		ResetLatch(MyLatch);

 		CHECK_FOR_INTERRUPTS();
@@ -937,7 +940,7 @@ PagestoreShmemInit(void)

 	LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
 	pagestore_shared = ShmemInitStruct("libpagestore shared state",
-									   PagestoreShmemSize(),
+									   sizeof(PagestoreShmemState),
 									   &found);
 	if (!found)
 	{
--- a/pgxn/neon/neon.c
+++ b/pgxn/neon/neon.c
@@ -41,6 +41,9 @@
 #include "pagestore_client.h"
 #include "control_plane_connector.h"
 #include "walsender_hooks.h"
+#if PG_MAJORVERSION_NUM >= 16
+#include "storage/ipc.h"
+#endif

 PG_MODULE_MAGIC;
 void		_PG_init(void);
@@ -49,6 +52,23 @@ static int	logical_replication_max_snap_files = 300;

 static int  running_xacts_overflow_policy;

+#if PG_MAJORVERSION_NUM >= 16
+static shmem_startup_hook_type prev_shmem_startup_hook;
+
+static void neon_shmem_startup_hook(void);
+#endif
+#if PG_MAJORVERSION_NUM >= 17
+uint32		WAIT_EVENT_NEON_LFC_MAINTENANCE;
+uint32		WAIT_EVENT_NEON_LFC_READ;
+uint32		WAIT_EVENT_NEON_LFC_TRUNCATE;
+uint32		WAIT_EVENT_NEON_LFC_WRITE;
+uint32		WAIT_EVENT_NEON_PS_STARTING;
+uint32		WAIT_EVENT_NEON_PS_CONFIGURING;
+uint32		WAIT_EVENT_NEON_PS_SEND;
+uint32		WAIT_EVENT_NEON_PS_READ;
+uint32		WAIT_EVENT_NEON_WAL_DL;
+#endif
+
 enum RunningXactsOverflowPolicies {
 	OP_IGNORE,
 	OP_SKIP,
@@ -635,6 +655,9 @@ _PG_init(void)
 	 */
 #if PG_VERSION_NUM >= 160000
 	load_file("$libdir/neon_rmgr", false);
+
+	prev_shmem_startup_hook = shmem_startup_hook;
+	shmem_startup_hook = neon_shmem_startup_hook;
 #endif

 	pg_init_libpagestore();
@@ -721,3 +744,25 @@ backpressure_throttling_time(PG_FUNCTION_ARGS)
 {
 	PG_RETURN_UINT64(BackpressureThrottlingTime());
 }
+
+#if PG_MAJORVERSION_NUM >= 16
+static void
+neon_shmem_startup_hook(void)
+{
+	/* Run any previously installed shmem startup hook first */
+	if (prev_shmem_startup_hook)
+		prev_shmem_startup_hook();
+	/* Named wait events need v17; older versions use the neon.h fallbacks */
+#if PG_MAJORVERSION_NUM >= 17
+	WAIT_EVENT_NEON_LFC_MAINTENANCE = WaitEventExtensionNew("Neon/FileCache_Maintenance");
+	WAIT_EVENT_NEON_LFC_READ = WaitEventExtensionNew("Neon/FileCache_Read");
+	WAIT_EVENT_NEON_LFC_TRUNCATE = WaitEventExtensionNew("Neon/FileCache_Truncate");
+	WAIT_EVENT_NEON_LFC_WRITE = WaitEventExtensionNew("Neon/FileCache_Write");
+	WAIT_EVENT_NEON_PS_STARTING = WaitEventExtensionNew("Neon/PS_Starting");
+	WAIT_EVENT_NEON_PS_CONFIGURING = WaitEventExtensionNew("Neon/PS_Configuring");
+	WAIT_EVENT_NEON_PS_SEND = WaitEventExtensionNew("Neon/PS_SendIO");
+	WAIT_EVENT_NEON_PS_READ = WaitEventExtensionNew("Neon/PS_ReadIO");
+	WAIT_EVENT_NEON_WAL_DL = WaitEventExtensionNew("Neon/WAL_Download");
+#endif
+}
+#endif
--- a/pgxn/neon/neon.control
+++ b/pgxn/neon/neon.control
@@ -1,8 +1,6 @@
 # neon extension
 comment = 'cloud storage for PostgreSQL'
-# TODO: bump default version to 1.5, after we are certain that we don't
-# need to rollback the compute image
-default_version = '1.4'
+default_version = '1.5'
 module_pathname = '$libdir/neon'
 relocatable = true
 trusted = true
--- a/pgxn/neon/neon.h
+++ b/pgxn/neon/neon.h
@@ -12,6 +12,7 @@
 #ifndef NEON_H
 #define NEON_H
 #include "access/xlogreader.h"
+#include "utils/wait_event.h"

 /* GUCs */
 extern char *neon_auth_token;
@@ -22,6 +23,28 @@ extern char *wal_acceptors_list;
 extern int	wal_acceptor_reconnect_timeout;
 extern int	wal_acceptor_connection_timeout;

+#if PG_MAJORVERSION_NUM >= 17
+extern uint32		WAIT_EVENT_NEON_LFC_MAINTENANCE;
+extern uint32		WAIT_EVENT_NEON_LFC_READ;
+extern uint32		WAIT_EVENT_NEON_LFC_TRUNCATE;
+extern uint32		WAIT_EVENT_NEON_LFC_WRITE;
+extern uint32		WAIT_EVENT_NEON_PS_STARTING;
+extern uint32		WAIT_EVENT_NEON_PS_CONFIGURING;
+extern uint32		WAIT_EVENT_NEON_PS_SEND;
+extern uint32		WAIT_EVENT_NEON_PS_READ;
+extern uint32		WAIT_EVENT_NEON_WAL_DL;
+#else
+#define WAIT_EVENT_NEON_LFC_MAINTENANCE	PG_WAIT_EXTENSION
+#define WAIT_EVENT_NEON_LFC_READ		WAIT_EVENT_BUFFILE_READ
+#define WAIT_EVENT_NEON_LFC_TRUNCATE	WAIT_EVENT_BUFFILE_TRUNCATE
+#define WAIT_EVENT_NEON_LFC_WRITE		WAIT_EVENT_BUFFILE_WRITE
+#define WAIT_EVENT_NEON_PS_STARTING		PG_WAIT_EXTENSION
+#define WAIT_EVENT_NEON_PS_CONFIGURING	PG_WAIT_EXTENSION
+#define WAIT_EVENT_NEON_PS_SEND			PG_WAIT_EXTENSION
+#define WAIT_EVENT_NEON_PS_READ			PG_WAIT_EXTENSION
+#define WAIT_EVENT_NEON_WAL_DL			WAIT_EVENT_WAL_READ
+#endif
+
 extern void pg_init_libpagestore(void);
 extern void pg_init_walproposer(void);

--- a/pgxn/neon/neon_perf_counters.c
+++ b/pgxn/neon/neon_perf_counters.c
@@ -27,7 +27,8 @@ NeonPerfCountersShmemSize(void)
 {
 	Size		size = 0;

-	size = add_size(size, mul_size(MaxBackends, sizeof(neon_per_backend_counters)));
+	size = add_size(size, mul_size(NUM_NEON_PERF_COUNTER_SLOTS,
+								   sizeof(neon_per_backend_counters)));

 	return size;
 }
@@ -39,7 +40,7 @@ NeonPerfCountersShmemInit(void)

 	neon_per_backend_counters_shared =
 		ShmemInitStruct("Neon perf counters",
-						mul_size(MaxBackends,
+						mul_size(NUM_NEON_PERF_COUNTER_SLOTS,
 								 sizeof(neon_per_backend_counters)),
 						&found);
 	Assert(found == IsUnderPostmaster);
@@ -137,7 +138,7 @@ neon_perf_counters_to_metrics(neon_per_backend_counters *counters)
 	metrics[i].is_bucket = false;
 	metrics[i].value = (double) counters->pageserver_requests_sent_total;
 	i++;
-	metrics[i].name = "pageserver_requests_disconnects_total";
+	metrics[i].name = "pageserver_disconnects_total";
 	metrics[i].is_bucket = false;
 	metrics[i].value = (double) counters->pageserver_disconnects_total;
 	i++;
@@ -192,7 +193,7 @@ neon_get_backend_perf_counters(PG_FUNCTION_ARGS)
 	/* We put all the tuples into a tuplestore in one go. */
 	InitMaterializedSRF(fcinfo, 0);

-	for (int procno = 0; procno < MaxBackends; procno++)
+	for (int procno = 0; procno < NUM_NEON_PERF_COUNTER_SLOTS; procno++)
 	{
 		PGPROC	   *proc = GetPGProcByNumber(procno);
 		int			pid = proc->pid;
@@ -231,7 +232,7 @@ neon_get_perf_counters(PG_FUNCTION_ARGS)
 	InitMaterializedSRF(fcinfo, 0);

 	/* Aggregate the counters across all backends */
-	for (int procno = 0; procno < MaxBackends; procno++)
+	for (int procno = 0; procno < NUM_NEON_PERF_COUNTER_SLOTS; procno++)
 	{
 		neon_per_backend_counters *counters = &neon_per_backend_counters_shared[procno];

--- a/pgxn/neon/neon_perf_counters.h
+++ b/pgxn/neon/neon_perf_counters.h
@@ -96,6 +96,14 @@ typedef struct
 /* Pointer to the shared memory array of neon_per_backend_counters structs */
 extern neon_per_backend_counters *neon_per_backend_counters_shared;

+/*
+ * Size of the perf counters array in shared memory. One slot for each backend
+ * and aux process. IOW one for each PGPROC slot, except for slots reserved
+ * for prepared transactions, because they're not real processes and cannot do
+ * I/O.
+ */
+#define NUM_NEON_PERF_COUNTER_SLOTS (MaxBackends + NUM_AUXILIARY_PROCS)
+
 #if PG_VERSION_NUM >= 170000
 #define MyNeonCounters (&neon_per_backend_counters_shared[MyProcNumber])
 #else
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -1773,6 +1773,20 @@ neon_init(void)
 	if (MyPState != NULL)
 		return;

+	/*
+	 * Sanity check that the perf counters array is sized correctly. We got
+	 * this wrong once, and the formula for max number of backends and aux
+	 * processes might well change in the future, so better safe than sorry.
+	 * This is a very cheap check so we do it even without assertions.  On
+	 * v14, this gets called before initializing MyProc, so we cannot perform
+	 * the check here. That's OK, we don't expect the logic to change in old
+	 * releases.
+	 */
+#if PG_VERSION_NUM>=150000
+	if (MyNeonCounters >= &neon_per_backend_counters_shared[NUM_NEON_PERF_COUNTER_SLOTS])
+		elog(ERROR, "MyNeonCounters points past end of array");
+#endif
+
 	prfs_size = offsetof(PrefetchState, prf_buffer) +
 		sizeof(PrefetchRequest) * readahead_buffer_size;

--- a/pgxn/neon/walproposer.c
+++ b/pgxn/neon/walproposer.c
@@ -213,7 +213,7 @@ WalProposerPoll(WalProposer *wp)
 		rc = wp->api.wait_event_set(wp, timeout, &sk, &events);

 		/* Exit loop if latch is set (we got new WAL) */
-		if ((rc == 1 && events & WL_LATCH_SET))
+		if (rc == 1 && (events & WL_LATCH_SET))
 			break;

 		/*
--- a/pgxn/neon/walproposer_pg.c
+++ b/pgxn/neon/walproposer_pg.c
@@ -422,6 +422,9 @@ backpressure_throttling_impl(void)
 	TimestampTz start,
 				stop;
 	bool		retry = false;
+	char	   *new_status = NULL;
+	const char *old_status;
+	int			len;

 	if (PointerIsValid(PrevProcessInterruptsCallback))
 		retry = PrevProcessInterruptsCallback();
@@ -442,14 +445,24 @@ backpressure_throttling_impl(void)
 	if (lag == 0)
 		return retry;

-	/* Suspend writers until replicas catch up */
-	set_ps_display("backpressure throttling");
+
+	old_status = get_ps_display(&len);
+	new_status = (char *) palloc(len + 64 + 1);
+	memcpy(new_status, old_status, len);
+	snprintf(new_status + len, 64, "backpressure throttling: lag %lu", lag);
+	set_ps_display(new_status);
+	new_status[len] = '\0'; /* truncate off " backpressure ..." to later reset the ps */

 	elog(DEBUG2, "backpressure throttling: lag %lu", lag);
 	start = GetCurrentTimestamp();
 	pg_usleep(BACK_PRESSURE_DELAY);
 	stop = GetCurrentTimestamp();
 	pg_atomic_add_fetch_u64(&walprop_shared->backpressureThrottlingTime, stop - start);
+
+	/* Reset ps display */
+	set_ps_display(new_status);
+	pfree(new_status);
+
 	return true;
 }

@@ -1801,7 +1814,7 @@ walprop_pg_wait_event_set(WalProposer *wp, long timeout, Safekeeper **sk, uint32
 	 * If wait is terminated by latch set (walsenders' latch is set on each
 	 * wal flush). (no need for pm death check due to WL_EXIT_ON_PM_DEATH)
 	 */
-	if ((rc == 1 && event.events & WL_LATCH_SET) || late_cv_trigger)
+	if ((rc == 1 && (event.events & WL_LATCH_SET)) || late_cv_trigger)
 	{
 		/* Reset our latch */
 		ResetLatch(MyLatch);
@@ -1813,7 +1826,7 @@ walprop_pg_wait_event_set(WalProposer *wp, long timeout, Safekeeper **sk, uint32
 	 * If the event contains something about the socket, it means we got an
 	 * event from a safekeeper socket.
 	 */
-	if (rc == 1 && (event.events & (WL_SOCKET_MASK)))
+	if (rc == 1 && (event.events & WL_SOCKET_MASK))
 	{
 		*sk = (Safekeeper *) event.user_data;
 		*events = event.events;
--- a/pgxn/neon/walsender_hooks.c
+++ b/pgxn/neon/walsender_hooks.c
@@ -160,7 +160,7 @@ NeonWALPageRead(
 							  WL_LATCH_SET | WL_EXIT_ON_PM_DEATH | reader_events,
 							  sock,
 							  timeout_ms,
-							  WAIT_EVENT_WAL_SENDER_MAIN);
+							  WAIT_EVENT_NEON_WAL_DL);
 		}
 	}
 }
@@ -191,13 +191,7 @@ NeonOnDemandXLogReaderRoutines(XLogReaderRoutine *xlr)

 	if (!wal_reader)
 	{
-		XLogRecPtr	epochStartLsn = pg_atomic_read_u64(&GetWalpropShmemState()->propEpochStartLsn);
-
-		if (epochStartLsn == 0)
-		{
-			elog(ERROR, "Unable to start walsender when propEpochStartLsn is 0!");
-		}
-		wal_reader = NeonWALReaderAllocate(wal_segment_size, epochStartLsn, "[walsender] ");
+		wal_reader = NeonWALReaderAllocate(wal_segment_size, GetRedoStartLsn(), "[walsender] ");
 	}
 	xlr->page_read = NeonWALPageRead;
 	xlr->segment_open = NeonWALReadSegmentOpen;
--- a/poetry.lock
+++ b/poetry.lock
@@ -2064,73 +2064,80 @@ test = ["enum34", "ipaddress", "mock", "pywin32", "wmi"]

 [[package]]
 name = "psycopg2-binary"
-version = "2.9.6"
+version = "2.9.9"
 description = "psycopg2 - Python-PostgreSQL Database Adapter"
 optional = false
-python-versions = ">=3.6"
+python-versions = ">=3.7"
 files = [
-    {file = "psycopg2-binary-2.9.6.tar.gz", hash = "sha256:1f64dcfb8f6e0c014c7f55e51c9759f024f70ea572fbdef123f85318c297947c"},
-    {file = "psycopg2_binary-2.9.6-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d26e0342183c762de3276cca7a530d574d4e25121ca7d6e4a98e4f05cb8e4df7"},
-    {file = "psycopg2_binary-2.9.6-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c48d8f2db17f27d41fb0e2ecd703ea41984ee19362cbce52c097963b3a1b4365"},
-    {file = "psycopg2_binary-2.9.6-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ffe9dc0a884a8848075e576c1de0290d85a533a9f6e9c4e564f19adf8f6e54a7"},
-    {file = "psycopg2_binary-2.9.6-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8a76e027f87753f9bd1ab5f7c9cb8c7628d1077ef927f5e2446477153a602f2c"},
-    {file = "psycopg2_binary-2.9.6-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6460c7a99fc939b849431f1e73e013d54aa54293f30f1109019c56a0b2b2ec2f"},
-    {file = "psycopg2_binary-2.9.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ae102a98c547ee2288637af07393dd33f440c25e5cd79556b04e3fca13325e5f"},
-    {file = "psycopg2_binary-2.9.6-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:9972aad21f965599ed0106f65334230ce826e5ae69fda7cbd688d24fa922415e"},
-    {file = "psycopg2_binary-2.9.6-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:7a40c00dbe17c0af5bdd55aafd6ff6679f94a9be9513a4c7e071baf3d7d22a70"},
-    {file = "psycopg2_binary-2.9.6-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:cacbdc5839bdff804dfebc058fe25684cae322987f7a38b0168bc1b2df703fb1"},
-    {file = "psycopg2_binary-2.9.6-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:7f0438fa20fb6c7e202863e0d5ab02c246d35efb1d164e052f2f3bfe2b152bd0"},
-    {file = "psycopg2_binary-2.9.6-cp310-cp310-win32.whl", hash = "sha256:b6c8288bb8a84b47e07013bb4850f50538aa913d487579e1921724631d02ea1b"},
-    {file = "psycopg2_binary-2.9.6-cp310-cp310-win_amd64.whl", hash = "sha256:61b047a0537bbc3afae10f134dc6393823882eb263088c271331602b672e52e9"},
-    {file = "psycopg2_binary-2.9.6-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:964b4dfb7c1c1965ac4c1978b0f755cc4bd698e8aa2b7667c575fb5f04ebe06b"},
-    {file = "psycopg2_binary-2.9.6-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:afe64e9b8ea66866a771996f6ff14447e8082ea26e675a295ad3bdbffdd72afb"},
-    {file = "psycopg2_binary-2.9.6-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:15e2ee79e7cf29582ef770de7dab3d286431b01c3bb598f8e05e09601b890081"},
-    {file = "psycopg2_binary-2.9.6-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dfa74c903a3c1f0d9b1c7e7b53ed2d929a4910e272add6700c38f365a6002820"},
-    {file = "psycopg2_binary-2.9.6-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b83456c2d4979e08ff56180a76429263ea254c3f6552cd14ada95cff1dec9bb8"},
-    {file = "psycopg2_binary-2.9.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0645376d399bfd64da57148694d78e1f431b1e1ee1054872a5713125681cf1be"},
-    {file = "psycopg2_binary-2.9.6-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e99e34c82309dd78959ba3c1590975b5d3c862d6f279f843d47d26ff89d7d7e1"},
-    {file = "psycopg2_binary-2.9.6-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:4ea29fc3ad9d91162c52b578f211ff1c931d8a38e1f58e684c45aa470adf19e2"},
-    {file = "psycopg2_binary-2.9.6-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:4ac30da8b4f57187dbf449294d23b808f8f53cad6b1fc3623fa8a6c11d176dd0"},
-    {file = "psycopg2_binary-2.9.6-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e78e6e2a00c223e164c417628572a90093c031ed724492c763721c2e0bc2a8df"},
-    {file = "psycopg2_binary-2.9.6-cp311-cp311-win32.whl", hash = "sha256:1876843d8e31c89c399e31b97d4b9725a3575bb9c2af92038464231ec40f9edb"},
-    {file = "psycopg2_binary-2.9.6-cp311-cp311-win_amd64.whl", hash = "sha256:b4b24f75d16a89cc6b4cdff0eb6a910a966ecd476d1e73f7ce5985ff1328e9a6"},
-    {file = "psycopg2_binary-2.9.6-cp36-cp36m-win32.whl", hash = "sha256:498807b927ca2510baea1b05cc91d7da4718a0f53cb766c154c417a39f1820a0"},
-    {file = "psycopg2_binary-2.9.6-cp36-cp36m-win_amd64.whl", hash = "sha256:0d236c2825fa656a2d98bbb0e52370a2e852e5a0ec45fc4f402977313329174d"},
-    {file = "psycopg2_binary-2.9.6-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:34b9ccdf210cbbb1303c7c4db2905fa0319391bd5904d32689e6dd5c963d2ea8"},
-    {file = "psycopg2_binary-2.9.6-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:84d2222e61f313c4848ff05353653bf5f5cf6ce34df540e4274516880d9c3763"},
-    {file = "psycopg2_binary-2.9.6-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:30637a20623e2a2eacc420059be11527f4458ef54352d870b8181a4c3020ae6b"},
-    {file = "psycopg2_binary-2.9.6-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8122cfc7cae0da9a3077216528b8bb3629c43b25053284cc868744bfe71eb141"},
-    {file = "psycopg2_binary-2.9.6-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:38601cbbfe600362c43714482f43b7c110b20cb0f8172422c616b09b85a750c5"},
-    {file = "psycopg2_binary-2.9.6-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:c7e62ab8b332147a7593a385d4f368874d5fe4ad4e341770d4983442d89603e3"},
-    {file = "psycopg2_binary-2.9.6-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:2ab652e729ff4ad76d400df2624d223d6e265ef81bb8aa17fbd63607878ecbee"},
-    {file = "psycopg2_binary-2.9.6-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:c83a74b68270028dc8ee74d38ecfaf9c90eed23c8959fca95bd703d25b82c88e"},
-    {file = "psycopg2_binary-2.9.6-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:d4e6036decf4b72d6425d5b29bbd3e8f0ff1059cda7ac7b96d6ac5ed34ffbacd"},
-    {file = "psycopg2_binary-2.9.6-cp37-cp37m-win32.whl", hash = "sha256:a8c28fd40a4226b4a84bdf2d2b5b37d2c7bd49486b5adcc200e8c7ec991dfa7e"},
-    {file = "psycopg2_binary-2.9.6-cp37-cp37m-win_amd64.whl", hash = "sha256:51537e3d299be0db9137b321dfb6a5022caaab275775680e0c3d281feefaca6b"},
-    {file = "psycopg2_binary-2.9.6-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:cf4499e0a83b7b7edcb8dabecbd8501d0d3a5ef66457200f77bde3d210d5debb"},
-    {file = "psycopg2_binary-2.9.6-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:7e13a5a2c01151f1208d5207e42f33ba86d561b7a89fca67c700b9486a06d0e2"},
-    {file = "psycopg2_binary-2.9.6-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0e0f754d27fddcfd74006455b6e04e6705d6c31a612ec69ddc040a5468e44b4e"},
-    {file = "psycopg2_binary-2.9.6-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d57c3fd55d9058645d26ae37d76e61156a27722097229d32a9e73ed54819982a"},
-    {file = "psycopg2_binary-2.9.6-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:71f14375d6f73b62800530b581aed3ada394039877818b2d5f7fc77e3bb6894d"},
-    {file = "psycopg2_binary-2.9.6-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:441cc2f8869a4f0f4bb408475e5ae0ee1f3b55b33f350406150277f7f35384fc"},
-    {file = "psycopg2_binary-2.9.6-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:65bee1e49fa6f9cf327ce0e01c4c10f39165ee76d35c846ade7cb0ec6683e303"},
-    {file = "psycopg2_binary-2.9.6-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:af335bac6b666cc6aea16f11d486c3b794029d9df029967f9938a4bed59b6a19"},
-    {file = "psycopg2_binary-2.9.6-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:cfec476887aa231b8548ece2e06d28edc87c1397ebd83922299af2e051cf2827"},
-    {file = "psycopg2_binary-2.9.6-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:65c07febd1936d63bfde78948b76cd4c2a411572a44ac50719ead41947d0f26b"},
-    {file = "psycopg2_binary-2.9.6-cp38-cp38-win32.whl", hash = "sha256:4dfb4be774c4436a4526d0c554af0cc2e02082c38303852a36f6456ece7b3503"},
-    {file = "psycopg2_binary-2.9.6-cp38-cp38-win_amd64.whl", hash = "sha256:02c6e3cf3439e213e4ee930308dc122d6fb4d4bea9aef4a12535fbd605d1a2fe"},
-    {file = "psycopg2_binary-2.9.6-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:e9182eb20f41417ea1dd8e8f7888c4d7c6e805f8a7c98c1081778a3da2bee3e4"},
-    {file = "psycopg2_binary-2.9.6-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:8a6979cf527e2603d349a91060f428bcb135aea2be3201dff794813256c274f1"},
-    {file = "psycopg2_binary-2.9.6-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8338a271cb71d8da40b023a35d9c1e919eba6cbd8fa20a54b748a332c355d896"},
-    {file = "psycopg2_binary-2.9.6-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e3ed340d2b858d6e6fb5083f87c09996506af483227735de6964a6100b4e6a54"},
-    {file = "psycopg2_binary-2.9.6-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f81e65376e52f03422e1fb475c9514185669943798ed019ac50410fb4c4df232"},
-    {file = "psycopg2_binary-2.9.6-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bfb13af3c5dd3a9588000910178de17010ebcccd37b4f9794b00595e3a8ddad3"},
-    {file = "psycopg2_binary-2.9.6-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:4c727b597c6444a16e9119386b59388f8a424223302d0c06c676ec8b4bc1f963"},
-    {file = "psycopg2_binary-2.9.6-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:4d67fbdaf177da06374473ef6f7ed8cc0a9dc640b01abfe9e8a2ccb1b1402c1f"},
-    {file = "psycopg2_binary-2.9.6-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:0892ef645c2fabb0c75ec32d79f4252542d0caec1d5d949630e7d242ca4681a3"},
-    {file = "psycopg2_binary-2.9.6-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:02c0f3757a4300cf379eb49f543fb7ac527fb00144d39246ee40e1df684ab514"},
-    {file = "psycopg2_binary-2.9.6-cp39-cp39-win32.whl", hash = "sha256:c3dba7dab16709a33a847e5cd756767271697041fbe3fe97c215b1fc1f5c9848"},
-    {file = "psycopg2_binary-2.9.6-cp39-cp39-win_amd64.whl", hash = "sha256:f6a88f384335bb27812293fdb11ac6aee2ca3f51d3c7820fe03de0a304ab6249"},
+    {file = "psycopg2-binary-2.9.9.tar.gz", hash = "sha256:7f01846810177d829c7692f1f5ada8096762d9172af1b1a28d4ab5b77c923c1c"},
+    {file = "psycopg2_binary-2.9.9-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c2470da5418b76232f02a2fcd2229537bb2d5a7096674ce61859c3229f2eb202"},
+    {file = "psycopg2_binary-2.9.9-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c6af2a6d4b7ee9615cbb162b0738f6e1fd1f5c3eda7e5da17861eacf4c717ea7"},
+    {file = "psycopg2_binary-2.9.9-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:75723c3c0fbbf34350b46a3199eb50638ab22a0228f93fb472ef4d9becc2382b"},
+    {file = "psycopg2_binary-2.9.9-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:83791a65b51ad6ee6cf0845634859d69a038ea9b03d7b26e703f94c7e93dbcf9"},
+    {file = "psycopg2_binary-2.9.9-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0ef4854e82c09e84cc63084a9e4ccd6d9b154f1dbdd283efb92ecd0b5e2b8c84"},
+    {file = "psycopg2_binary-2.9.9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ed1184ab8f113e8d660ce49a56390ca181f2981066acc27cf637d5c1e10ce46e"},
+    {file = "psycopg2_binary-2.9.9-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:d2997c458c690ec2bc6b0b7ecbafd02b029b7b4283078d3b32a852a7ce3ddd98"},
+    {file = "psycopg2_binary-2.9.9-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:b58b4710c7f4161b5e9dcbe73bb7c62d65670a87df7bcce9e1faaad43e715245"},
+    {file = "psycopg2_binary-2.9.9-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:0c009475ee389757e6e34611d75f6e4f05f0cf5ebb76c6037508318e1a1e0d7e"},
+    {file = "psycopg2_binary-2.9.9-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8dbf6d1bc73f1d04ec1734bae3b4fb0ee3cb2a493d35ede9badbeb901fb40f6f"},
+    {file = "psycopg2_binary-2.9.9-cp310-cp310-win32.whl", hash = "sha256:3f78fd71c4f43a13d342be74ebbc0666fe1f555b8837eb113cb7416856c79682"},
+    {file = "psycopg2_binary-2.9.9-cp310-cp310-win_amd64.whl", hash = "sha256:876801744b0dee379e4e3c38b76fc89f88834bb15bf92ee07d94acd06ec890a0"},
+    {file = "psycopg2_binary-2.9.9-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ee825e70b1a209475622f7f7b776785bd68f34af6e7a46e2e42f27b659b5bc26"},
+    {file = "psycopg2_binary-2.9.9-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1ea665f8ce695bcc37a90ee52de7a7980be5161375d42a0b6c6abedbf0d81f0f"},
+    {file = "psycopg2_binary-2.9.9-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:143072318f793f53819048fdfe30c321890af0c3ec7cb1dfc9cc87aa88241de2"},
+    {file = "psycopg2_binary-2.9.9-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c332c8d69fb64979ebf76613c66b985414927a40f8defa16cf1bc028b7b0a7b0"},
+    {file = "psycopg2_binary-2.9.9-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f7fc5a5acafb7d6ccca13bfa8c90f8c51f13d8fb87d95656d3950f0158d3ce53"},
+    {file = "psycopg2_binary-2.9.9-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:977646e05232579d2e7b9c59e21dbe5261f403a88417f6a6512e70d3f8a046be"},
+    {file = "psycopg2_binary-2.9.9-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:b6356793b84728d9d50ead16ab43c187673831e9d4019013f1402c41b1db9b27"},
+    {file = "psycopg2_binary-2.9.9-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:bc7bb56d04601d443f24094e9e31ae6deec9ccb23581f75343feebaf30423359"},
+    {file = "psycopg2_binary-2.9.9-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:77853062a2c45be16fd6b8d6de2a99278ee1d985a7bd8b103e97e41c034006d2"},
+    {file = "psycopg2_binary-2.9.9-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:78151aa3ec21dccd5cdef6c74c3e73386dcdfaf19bced944169697d7ac7482fc"},
+    {file = "psycopg2_binary-2.9.9-cp311-cp311-win32.whl", hash = "sha256:dc4926288b2a3e9fd7b50dc6a1909a13bbdadfc67d93f3374d984e56f885579d"},
+    {file = "psycopg2_binary-2.9.9-cp311-cp311-win_amd64.whl", hash = "sha256:b76bedd166805480ab069612119ea636f5ab8f8771e640ae103e05a4aae3e417"},
+    {file = "psycopg2_binary-2.9.9-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:8532fd6e6e2dc57bcb3bc90b079c60de896d2128c5d9d6f24a63875a95a088cf"},
+    {file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8f8544b092a29a6ddd72f3556a9fcf249ec412e10ad28be6a0c0d948924f2212"},
+    {file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2d423c8d8a3c82d08fe8af900ad5b613ce3632a1249fd6a223941d0735fce493"},
+    {file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2e5afae772c00980525f6d6ecf7cbca55676296b580c0e6abb407f15f3706996"},
+    {file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6e6f98446430fdf41bd36d4faa6cb409f5140c1c2cf58ce0bbdaf16af7d3f119"},
+    {file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:c77e3d1862452565875eb31bdb45ac62502feabbd53429fdc39a1cc341d681ba"},
+    {file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:cb16c65dcb648d0a43a2521f2f0a2300f40639f6f8c1ecbc662141e4e3e1ee07"},
+    {file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:911dda9c487075abd54e644ccdf5e5c16773470a6a5d3826fda76699410066fb"},
+    {file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:57fede879f08d23c85140a360c6a77709113efd1c993923c59fde17aa27599fe"},
+    {file = "psycopg2_binary-2.9.9-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:2293b001e319ab0d869d660a704942c9e2cce19745262a8aba2115ef41a0a42a"},
+    {file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:03ef7df18daf2c4c07e2695e8cfd5ee7f748a1d54d802330985a78d2a5a6dca9"},
+    {file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0a602ea5aff39bb9fac6308e9c9d82b9a35c2bf288e184a816002c9fae930b77"},
+    {file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8359bf4791968c5a78c56103702000105501adb557f3cf772b2c207284273984"},
+    {file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:275ff571376626195ab95a746e6a04c7df8ea34638b99fc11160de91f2fef503"},
+    {file = "psycopg2_binary-2.9.9-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:f9b5571d33660d5009a8b3c25dc1db560206e2d2f89d3df1cb32d72c0d117d52"},
+    {file = "psycopg2_binary-2.9.9-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:420f9bbf47a02616e8554e825208cb947969451978dceb77f95ad09c37791dae"},
+    {file = "psycopg2_binary-2.9.9-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:4154ad09dac630a0f13f37b583eae260c6aa885d67dfbccb5b02c33f31a6d420"},
+    {file = "psycopg2_binary-2.9.9-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:a148c5d507bb9b4f2030a2025c545fccb0e1ef317393eaba42e7eabd28eb6041"},
+    {file = "psycopg2_binary-2.9.9-cp37-cp37m-win32.whl", hash = "sha256:68fc1f1ba168724771e38bee37d940d2865cb0f562380a1fb1ffb428b75cb692"},
+    {file = "psycopg2_binary-2.9.9-cp37-cp37m-win_amd64.whl", hash = "sha256:281309265596e388ef483250db3640e5f414168c5a67e9c665cafce9492eda2f"},
+    {file = "psycopg2_binary-2.9.9-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:60989127da422b74a04345096c10d416c2b41bd7bf2a380eb541059e4e999980"},
+    {file = "psycopg2_binary-2.9.9-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:246b123cc54bb5361588acc54218c8c9fb73068bf227a4a531d8ed56fa3ca7d6"},
+    {file = "psycopg2_binary-2.9.9-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:34eccd14566f8fe14b2b95bb13b11572f7c7d5c36da61caf414d23b91fcc5d94"},
+    {file = "psycopg2_binary-2.9.9-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:18d0ef97766055fec15b5de2c06dd8e7654705ce3e5e5eed3b6651a1d2a9a152"},
+    {file = "psycopg2_binary-2.9.9-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d3f82c171b4ccd83bbaf35aa05e44e690113bd4f3b7b6cc54d2219b132f3ae55"},
+    {file = "psycopg2_binary-2.9.9-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ead20f7913a9c1e894aebe47cccf9dc834e1618b7aa96155d2091a626e59c972"},
+    {file = "psycopg2_binary-2.9.9-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:ca49a8119c6cbd77375ae303b0cfd8c11f011abbbd64601167ecca18a87e7cdd"},
+    {file = "psycopg2_binary-2.9.9-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:323ba25b92454adb36fa425dc5cf6f8f19f78948cbad2e7bc6cdf7b0d7982e59"},
+    {file = "psycopg2_binary-2.9.9-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:1236ed0952fbd919c100bc839eaa4a39ebc397ed1c08a97fc45fee2a595aa1b3"},
+    {file = "psycopg2_binary-2.9.9-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:729177eaf0aefca0994ce4cffe96ad3c75e377c7b6f4efa59ebf003b6d398716"},
+    {file = "psycopg2_binary-2.9.9-cp38-cp38-win32.whl", hash = "sha256:804d99b24ad523a1fe18cc707bf741670332f7c7412e9d49cb5eab67e886b9b5"},
+    {file = "psycopg2_binary-2.9.9-cp38-cp38-win_amd64.whl", hash = "sha256:a6cdcc3ede532f4a4b96000b6362099591ab4a3e913d70bcbac2b56c872446f7"},
+    {file = "psycopg2_binary-2.9.9-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:72dffbd8b4194858d0941062a9766f8297e8868e1dd07a7b36212aaa90f49472"},
+    {file = "psycopg2_binary-2.9.9-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:30dcc86377618a4c8f3b72418df92e77be4254d8f89f14b8e8f57d6d43603c0f"},
+    {file = "psycopg2_binary-2.9.9-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:31a34c508c003a4347d389a9e6fcc2307cc2150eb516462a7a17512130de109e"},
+    {file = "psycopg2_binary-2.9.9-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:15208be1c50b99203fe88d15695f22a5bed95ab3f84354c494bcb1d08557df67"},
+    {file = "psycopg2_binary-2.9.9-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1873aade94b74715be2246321c8650cabf5a0d098a95bab81145ffffa4c13876"},
+    {file = "psycopg2_binary-2.9.9-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a58c98a7e9c021f357348867f537017057c2ed7f77337fd914d0bedb35dace7"},
+    {file = "psycopg2_binary-2.9.9-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:4686818798f9194d03c9129a4d9a702d9e113a89cb03bffe08c6cf799e053291"},
+    {file = "psycopg2_binary-2.9.9-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:ebdc36bea43063116f0486869652cb2ed7032dbc59fbcb4445c4862b5c1ecf7f"},
+    {file = "psycopg2_binary-2.9.9-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:ca08decd2697fdea0aea364b370b1249d47336aec935f87b8bbfd7da5b2ee9c1"},
+    {file = "psycopg2_binary-2.9.9-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:ac05fb791acf5e1a3e39402641827780fe44d27e72567a000412c648a85ba860"},
+    {file = "psycopg2_binary-2.9.9-cp39-cp39-win32.whl", hash = "sha256:9dba73be7305b399924709b91682299794887cbbd88e38226ed9f6712eabee90"},
+    {file = "psycopg2_binary-2.9.9-cp39-cp39-win_amd64.whl", hash = "sha256:f7ae5d65ccfbebdfa761585228eb4d0df3a8b15cfb53bd953e713e09fbb12957"},
 ]

 [[package]]
@@ -2577,7 +2584,6 @@ files = [
    {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"},
    {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"},
    {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"},
-    {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"},
    {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"},
    {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"},
    {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"},
@@ -2702,13 +2708,13 @@ files = [

 [[package]]
 name = "requests"
-version = "2.32.0"
+version = "2.32.3"
 description = "Python HTTP for Humans."
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "requests-2.32.0-py3-none-any.whl", hash = "sha256:f2c3881dddb70d056c5bd7600a4fae312b2a300e39be6a118d30b90bd27262b5"},
-    {file = "requests-2.32.0.tar.gz", hash = "sha256:fa5490319474c82ef1d2c9bc459d3652e3ae4ef4c4ebdd18a21145a47ca4b6b8"},
+    {file = "requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6"},
+    {file = "requests-2.32.3.tar.gz", hash = "sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760"},
 ]

 [package.dependencies]
@@ -3131,16 +3137,6 @@ files = [
    {file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"},
    {file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"},
    {file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"},
-    {file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"},
-    {file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"},
-    {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"},
-    {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"},
-    {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"},
-    {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"},
-    {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"},
-    {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"},
-    {file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"},
-    {file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"},
    {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"},
    {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"},
    {file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"},
@@ -3378,4 +3374,4 @@ cffi = ["cffi (>=1.11)"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "c09bcb333ab550958b33dbf4fec968c500d8e701fd4c96402cddbd9bb8048055"
+content-hash = "9055b73352f1534f664cd8af6ebf8d93cf3bf857f115756f312ff2e3ae1bbbc1"
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -82,7 +82,6 @@ tokio-postgres-rustls.workspace = true
 tokio-rustls.workspace = true
 tokio-util.workspace = true
 tokio = { workspace = true, features = ["signal"] }
-tracing-opentelemetry.workspace = true
 tracing-subscriber.workspace = true
 tracing-utils.workspace = true
 tracing.workspace = true
--- a/proxy/src/auth.rs
+++ b/proxy/src/auth.rs
@@ -73,6 +73,9 @@ pub(crate) enum AuthErrorImpl {

    #[error("Authentication timed out")]
    UserTimeout(Elapsed),
+
+    #[error("Disconnected due to inactivity after {0}.")]
+    ConfirmationTimeout(humantime::Duration),
 }

 #[derive(Debug, Error)]
@@ -103,6 +106,10 @@ impl AuthError {
    pub(crate) fn user_timeout(elapsed: Elapsed) -> Self {
        AuthErrorImpl::UserTimeout(elapsed).into()
    }
+
+    pub(crate) fn confirmation_timeout(timeout: humantime::Duration) -> Self {
+        AuthErrorImpl::ConfirmationTimeout(timeout).into()
+    }
 }

 impl<E: Into<AuthErrorImpl>> From<E> for AuthError {
@@ -125,6 +132,7 @@ impl UserFacingError for AuthError {
            AuthErrorImpl::IpAddressNotAllowed(_) => self.to_string(),
            AuthErrorImpl::TooManyConnections => self.to_string(),
            AuthErrorImpl::UserTimeout(_) => self.to_string(),
+            AuthErrorImpl::ConfirmationTimeout(_) => self.to_string(),
        }
    }
 }
@@ -143,6 +151,7 @@ impl ReportableError for AuthError {
            AuthErrorImpl::IpAddressNotAllowed(_) => crate::error::ErrorKind::User,
            AuthErrorImpl::TooManyConnections => crate::error::ErrorKind::RateLimit,
            AuthErrorImpl::UserTimeout(_) => crate::error::ErrorKind::User,
+            AuthErrorImpl::ConfirmationTimeout(_) => crate::error::ErrorKind::User,
        }
    }
 }
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -565,7 +565,7 @@ mod tests {
        stream::{PqStream, Stream},
    };

-    use super::{auth_quirks, AuthRateLimiter};
+    use super::{auth_quirks, jwt::JwkCache, AuthRateLimiter};

    struct Auth {
        ips: Vec<IpPattern>,
@@ -611,12 +611,16 @@ mod tests {
    }

    static CONFIG: Lazy<AuthenticationConfig> = Lazy::new(|| AuthenticationConfig {
+        jwks_cache: JwkCache::default(),
        thread_pool: ThreadPool::new(1),
        scram_protocol_timeout: std::time::Duration::from_secs(5),
        rate_limiter_enabled: true,
        rate_limiter: AuthRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET),
        rate_limit_ip_subnet: 64,
        ip_allowlist_check_enabled: true,
+        is_auth_broker: false,
+        accept_jwts: false,
+        webauth_confirmation_timeout: std::time::Duration::from_secs(5),
    });

    async fn read_message(r: &mut (impl AsyncRead + Unpin), b: &mut BytesMut) -> PgMessage {
--- a/proxy/src/auth/backend/jwt.rs
+++ b/proxy/src/auth/backend/jwt.rs
@@ -8,7 +8,7 @@ use anyhow::{bail, ensure, Context};
 use arc_swap::ArcSwapOption;
 use dashmap::DashMap;
 use jose_jwk::crypto::KeyInfo;
-use serde::{Deserialize, Deserializer};
+use serde::{de::Visitor, Deserialize, Deserializer};
 use signature::Verifier;
 use tokio::time::Instant;

@@ -261,10 +261,6 @@ impl JwkCacheEntryLock {
        let sig = base64::decode_config(signature, base64::URL_SAFE_NO_PAD)
            .context("Provided authentication token is not a valid JWT encoding")?;

-        ensure!(
-            header.typ == "JWT",
-            "Provided authentication token is not a valid JWT encoding"
-        );
        let kid = header.key_id.context("missing key id")?;

        let mut guard = self
@@ -299,7 +295,7 @@ impl JwkCacheEntryLock {
                verify_ec_signature(header_payload.as_bytes(), &sig, key)?;
            }
            jose_jwk::Key::Rsa(key) => {
-                verify_rsa_signature(header_payload.as_bytes(), &sig, key, &jwk.prm.alg)?;
+                verify_rsa_signature(header_payload.as_bytes(), &sig, key, &header.algorithm)?;
            }
            key => bail!("unsupported key type {key:?}"),
        };
@@ -311,13 +307,11 @@ impl JwkCacheEntryLock {

        tracing::debug!(?payload, "JWT signature valid with claims");

-        match (expected_audience, payload.audience) {
-            // check the audience matches
-            (Some(aud1), Some(aud2)) => ensure!(aud1 == aud2, "invalid JWT token audience"),
-            // the audience is expected but is missing
-            (Some(_), None) => bail!("invalid JWT token audience"),
-            // we don't care for the audience field
-            (None, _) => {}
+        if let Some(aud) = expected_audience {
+            ensure!(
+                payload.audience.0.iter().any(|s| s == aud),
+                "invalid JWT token audience"
+            );
        }

        let now = SystemTime::now();
@@ -383,7 +377,7 @@ fn verify_rsa_signature(
    data: &[u8],
    sig: &[u8],
    key: &jose_jwk::Rsa,
-    alg: &Option<jose_jwa::Algorithm>,
+    alg: &jose_jwa::Algorithm,
 ) -> anyhow::Result<()> {
    use jose_jwa::{Algorithm, Signing};
    use rsa::{
@@ -394,7 +388,7 @@ fn verify_rsa_signature(
    let key = RsaPublicKey::try_from(key).map_err(|_| anyhow::anyhow!("invalid RSA key"))?;

    match alg {
-        Some(Algorithm::Signing(Signing::Rs256)) => {
+        Algorithm::Signing(Signing::Rs256) => {
            let key = VerifyingKey::<sha2::Sha256>::new(key);
            let sig = Signature::try_from(sig)?;
            key.verify(data, &sig)?;
@@ -408,9 +402,6 @@ fn verify_rsa_signature(
 /// <https://datatracker.ietf.org/doc/html/rfc7515#section-4.1>
 #[derive(serde::Deserialize, serde::Serialize)]
 struct JwtHeader<'a> {
-    /// must be "JWT"
-    #[serde(rename = "typ")]
-    typ: &'a str,
    /// must be a supported alg
    #[serde(rename = "alg")]
    algorithm: jose_jwa::Algorithm,
@@ -420,11 +411,12 @@ struct JwtHeader<'a> {
 }

 /// <https://datatracker.ietf.org/doc/html/rfc7519#section-4.1>
-#[derive(serde::Deserialize, serde::Serialize, Debug)]
+#[derive(serde::Deserialize, Debug)]
+#[allow(dead_code)]
 struct JwtPayload<'a> {
    /// Audience - Recipient for which the JWT is intended
-    #[serde(rename = "aud")]
-    audience: Option<&'a str>,
+    #[serde(rename = "aud", default)]
+    audience: OneOrMany,
    /// Expiration - Time after which the JWT expires
    #[serde(deserialize_with = "numeric_date_opt", rename = "exp", default)]
    expiration: Option<SystemTime>,
@@ -447,6 +439,59 @@ struct JwtPayload<'a> {
    session_id: Option<&'a str>,
 }

+/// `OneOrMany` supports parsing either a single item or an array of items.
+///
+/// Needed for <https://datatracker.ietf.org/doc/html/rfc7519#section-4.1.3>
+///
+/// > The "aud" (audience) claim identifies the recipients that the JWT is
+/// > intended for.  Each principal intended to process the JWT MUST
+/// > identify itself with a value in the audience claim.  If the principal
+/// > processing the claim does not identify itself with a value in the
+/// > "aud" claim when this claim is present, then the JWT MUST be
+/// > rejected.  In the general case, the "aud" value is **an array of case-
+/// > sensitive strings**, each containing a StringOrURI value.  In the
+/// > special case when the JWT has one audience, the "aud" value MAY be a
+/// > **single case-sensitive string** containing a StringOrURI value.  The
+/// > interpretation of audience values is generally application specific.
+/// > Use of this claim is OPTIONAL.
+#[derive(Default, Debug)]
+struct OneOrMany(Vec<String>);
+
+impl<'de> Deserialize<'de> for OneOrMany {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: Deserializer<'de>,
+    {
+        struct OneOrManyVisitor;
+        impl<'de> Visitor<'de> for OneOrManyVisitor {
+            type Value = OneOrMany;
+
+            fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
+                formatter.write_str("a single string or an array of strings")
+            }
+
+            fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
+            where
+                E: serde::de::Error,
+            {
+                Ok(OneOrMany(vec![v.to_owned()]))
+            }
+
+            fn visit_seq<A>(self, mut seq: A) -> Result<Self::Value, A::Error>
+            where
+                A: serde::de::SeqAccess<'de>,
+            {
+                let mut v = vec![];
+                while let Some(s) = seq.next_element()? {
+                    v.push(s);
+                }
+                Ok(OneOrMany(v))
+            }
+        }
+        deserializer.deserialize_any(OneOrManyVisitor)
+    }
+}
+
 fn numeric_date_opt<'de, D: Deserializer<'de>>(d: D) -> Result<Option<SystemTime>, D::Error> {
    let d = <Option<u64>>::deserialize(d)?;
    Ok(d.map(|n| SystemTime::UNIX_EPOCH + Duration::from_secs(n)))
@@ -540,7 +585,6 @@ mod tests {
            key: jose_jwk::Key::Ec(pk),
            prm: jose_jwk::Parameters {
                kid: Some(kid),
-                alg: Some(jose_jwa::Algorithm::Signing(jose_jwa::Signing::Es256)),
                ..Default::default()
            },
        };
@@ -554,7 +598,6 @@ mod tests {
            key: jose_jwk::Key::Rsa(pk),
            prm: jose_jwk::Parameters {
                kid: Some(kid),
-                alg: Some(jose_jwa::Algorithm::Signing(jose_jwa::Signing::Rs256)),
                ..Default::default()
            },
        };
@@ -563,7 +606,6 @@ mod tests {

    fn build_jwt_payload(kid: String, sig: jose_jwa::Signing) -> String {
        let header = JwtHeader {
-            typ: "JWT",
            algorithm: jose_jwa::Algorithm::Signing(sig),
            key_id: Some(&kid),
        };
--- a/proxy/src/auth/backend/local.rs
+++ b/proxy/src/auth/backend/local.rs
@@ -14,17 +14,15 @@ use crate::{
    EndpointId,
 };

-use super::jwt::{AuthRule, FetchAuthRules, JwkCache};
+use super::jwt::{AuthRule, FetchAuthRules};

 pub struct LocalBackend {
-    pub(crate) jwks_cache: JwkCache,
    pub(crate) node_info: NodeInfo,
 }

 impl LocalBackend {
    pub fn new(postgres_addr: SocketAddr) -> Self {
        LocalBackend {
-            jwks_cache: JwkCache::default(),
            node_info: NodeInfo {
                config: {
                    let mut cfg = ConnCfg::new();
--- a/proxy/src/auth/backend/web.rs
+++ b/proxy/src/auth/backend/web.rs
@@ -89,7 +89,12 @@ pub(super) async fn authenticate(

    // Wait for web console response (see `mgmt`).
    info!(parent: &span, "waiting for console's reply...");
-    let db_info = waiter.await.map_err(WebAuthError::from)?;
+    let db_info = tokio::time::timeout(auth_config.webauth_confirmation_timeout, waiter)
+        .await
+        .map_err(|_elapsed| {
+            auth::AuthError::confirmation_timeout(auth_config.webauth_confirmation_timeout.into())
+        })?
+        .map_err(WebAuthError::from)?;

    if auth_config.ip_allowlist_check_enabled {
        if let Some(allowed_ips) = &db_info.allowed_ips {
--- a/proxy/src/bin/local_proxy.rs
+++ b/proxy/src/bin/local_proxy.rs
@@ -6,7 +6,10 @@ use compute_api::spec::LocalProxySpec;
 use dashmap::DashMap;
 use futures::future::Either;
 use proxy::{
-    auth::backend::local::{LocalBackend, JWKS_ROLE_MAP},
+    auth::backend::{
+        jwt::JwkCache,
+        local::{LocalBackend, JWKS_ROLE_MAP},
+    },
    cancellation::CancellationHandlerMain,
    config::{self, AuthenticationConfig, HttpConfig, ProxyConfig, RetryConfig},
    console::{
@@ -267,14 +270,18 @@ fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig
        allow_self_signed_compute: false,
        http_config,
        authentication_config: AuthenticationConfig {
+            jwks_cache: JwkCache::default(),
            thread_pool: ThreadPool::new(0),
            scram_protocol_timeout: Duration::from_secs(10),
            rate_limiter_enabled: false,
            rate_limiter: BucketRateLimiter::new(vec![]),
            rate_limit_ip_subnet: 64,
            ip_allowlist_check_enabled: true,
+            is_auth_broker: false,
+            accept_jwts: true,
+            webauth_confirmation_timeout: Duration::ZERO,
        },
-        require_client_ip: false,
+        proxy_protocol_v2: config::ProxyProtocolV2::Rejected,
        handshake_timeout: Duration::from_secs(10),
        region: "local".into(),
        wake_compute_retry_config: RetryConfig::parse(RetryConfig::WAKE_COMPUTE_DEFAULT_VALUES)?,
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -8,6 +8,7 @@ use aws_config::web_identity_token::WebIdentityTokenCredentialsProvider;
 use aws_config::Region;
 use futures::future::Either;
 use proxy::auth;
+use proxy::auth::backend::jwt::JwkCache;
 use proxy::auth::backend::AuthRateLimiter;
 use proxy::auth::backend::MaybeOwned;
 use proxy::cancellation::CancelMap;
@@ -17,6 +18,7 @@ use proxy::config::AuthenticationConfig;
 use proxy::config::CacheOptions;
 use proxy::config::HttpConfig;
 use proxy::config::ProjectInfoCacheOptions;
+use proxy::config::ProxyProtocolV2;
 use proxy::console;
 use proxy::context::parquet::ParquetUploadArgs;
 use proxy::http;
@@ -102,6 +104,9 @@ struct ProxyCliArgs {
        default_value = "http://localhost:3000/authenticate_proxy_request/"
    )]
    auth_endpoint: String,
+    /// if this is not local proxy, this toggles whether we accept jwt or passwords for http
+    #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
+    is_auth_broker: bool,
    /// path to TLS key for client postgres connections
    ///
    /// tls-key and tls-cert are for backwards compatibility, we can put all certs in one dir
@@ -144,9 +149,6 @@ struct ProxyCliArgs {
    /// size of the threadpool for password hashing
    #[clap(long, default_value_t = 4)]
    scram_thread_pool_size: u8,
-    /// Require that all incoming requests have a Proxy Protocol V2 packet **and** have an IP address associated.
-    #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
-    require_client_ip: bool,
    /// Disable dynamic rate limiter and store the metrics to ensure its production behaviour.
    #[clap(long, default_value_t = true, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
    disable_dynamic_rate_limiter: bool,
@@ -229,6 +231,15 @@ struct ProxyCliArgs {
    /// Configure if this is a private access proxy for the POC: In that case the proxy will ignore the IP allowlist
    #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
    is_private_access_proxy: bool,
+
+    /// Configure whether all incoming requests have a Proxy Protocol V2 packet.
+    // TODO(conradludgate): switch default to rejected or required once we've updated all deployments
+    #[clap(value_enum, long, default_value_t = ProxyProtocolV2::Supported)]
+    proxy_protocol_v2: ProxyProtocolV2,
+
+    /// Time the proxy waits for the webauth session to be confirmed by the control plane.
+    #[clap(long, default_value = "2m", value_parser = humantime::parse_duration)]
+    webauth_confirmation_timeout: std::time::Duration,
 }

 #[derive(clap::Args, Clone, Copy, Debug)]
@@ -382,9 +393,27 @@ async fn main() -> anyhow::Result<()> {
    info!("Starting mgmt on {mgmt_address}");
    let mgmt_listener = TcpListener::bind(mgmt_address).await?;

-    let proxy_address: SocketAddr = args.proxy.parse()?;
-    info!("Starting proxy on {proxy_address}");
-    let proxy_listener = TcpListener::bind(proxy_address).await?;
+    let proxy_listener = if !args.is_auth_broker {
+        let proxy_address: SocketAddr = args.proxy.parse()?;
+        info!("Starting proxy on {proxy_address}");
+
+        Some(TcpListener::bind(proxy_address).await?)
+    } else {
+        None
+    };
+
+    // TODO: rename the argument to something like serverless.
+    // It now covers more than just websockets, it also covers SQL over HTTP.
+    let serverless_listener = if let Some(serverless_address) = args.wss {
+        let serverless_address: SocketAddr = serverless_address.parse()?;
+        info!("Starting wss on {serverless_address}");
+        Some(TcpListener::bind(serverless_address).await?)
+    } else if args.is_auth_broker {
+        bail!("wss arg must be present for auth-broker")
+    } else {
+        None
+    };
+
    let cancellation_token = CancellationToken::new();

    let cancel_map = CancelMap::default();
@@ -430,21 +459,17 @@ async fn main() -> anyhow::Result<()> {
    // client facing tasks. these will exit on error or on cancellation
    // cancellation returns Ok(())
    let mut client_tasks = JoinSet::new();
-    client_tasks.spawn(proxy::proxy::task_main(
-        config,
-        proxy_listener,
-        cancellation_token.clone(),
-        cancellation_handler.clone(),
-        endpoint_rate_limiter.clone(),
-    ));
-
-    // TODO: rename the argument to something like serverless.
-    // It now covers more than just websockets, it also covers SQL over HTTP.
-    if let Some(serverless_address) = args.wss {
-        let serverless_address: SocketAddr = serverless_address.parse()?;
-        info!("Starting wss on {serverless_address}");
-        let serverless_listener = TcpListener::bind(serverless_address).await?;
+    if let Some(proxy_listener) = proxy_listener {
+        client_tasks.spawn(proxy::proxy::task_main(
+            config,
+            proxy_listener,
+            cancellation_token.clone(),
+            cancellation_handler.clone(),
+            endpoint_rate_limiter.clone(),
+        ));
+    }

+    if let Some(serverless_listener) = serverless_listener {
        client_tasks.spawn(serverless::task_main(
            config,
            serverless_listener,
@@ -674,7 +699,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
    )?;

    let http_config = HttpConfig {
-        accept_websockets: true,
+        accept_websockets: !args.is_auth_broker,
        pool_options: GlobalConnPoolOptions {
            max_conns_per_endpoint: args.sql_over_http.sql_over_http_pool_max_conns_per_endpoint,
            gc_epoch: args.sql_over_http.sql_over_http_pool_gc_epoch,
@@ -689,12 +714,16 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
        max_response_size_bytes: args.sql_over_http.sql_over_http_max_response_size_bytes,
    };
    let authentication_config = AuthenticationConfig {
+        jwks_cache: JwkCache::default(),
        thread_pool,
        scram_protocol_timeout: args.scram_protocol_timeout,
        rate_limiter_enabled: args.auth_rate_limit_enabled,
        rate_limiter: AuthRateLimiter::new(args.auth_rate_limit.clone()),
        rate_limit_ip_subnet: args.auth_rate_limit_ip_subnet,
        ip_allowlist_check_enabled: !args.is_private_access_proxy,
+        is_auth_broker: args.is_auth_broker,
+        accept_jwts: args.is_auth_broker,
+        webauth_confirmation_timeout: args.webauth_confirmation_timeout,
    };

    let config = Box::leak(Box::new(ProxyConfig {
@@ -704,7 +733,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
        allow_self_signed_compute: args.allow_self_signed_compute,
        http_config,
        authentication_config,
-        require_client_ip: args.require_client_ip,
+        proxy_protocol_v2: args.proxy_protocol_v2,
        handshake_timeout: args.handshake_timeout,
        region: args.region.clone(),
        wake_compute_retry_config: config::RetryConfig::parse(&args.wake_compute_retry)?,
--- a/proxy/src/config.rs
+++ b/proxy/src/config.rs
@@ -1,5 +1,8 @@
 use crate::{
-    auth::{self, backend::AuthRateLimiter},
+    auth::{
+        self,
+        backend::{jwt::JwkCache, AuthRateLimiter},
+    },
    console::locks::ApiLocks,
    rate_limiter::{RateBucketInfo, RateLimitAlgorithm, RateLimiterConfig},
    scram::threadpool::ThreadPool,
@@ -7,6 +10,7 @@ use crate::{
    Host,
 };
 use anyhow::{bail, ensure, Context, Ok};
+use clap::ValueEnum;
 use itertools::Itertools;
 use remote_storage::RemoteStorageConfig;
 use rustls::{
@@ -30,7 +34,7 @@ pub struct ProxyConfig {
    pub allow_self_signed_compute: bool,
    pub http_config: HttpConfig,
    pub authentication_config: AuthenticationConfig,
-    pub require_client_ip: bool,
+    pub proxy_protocol_v2: ProxyProtocolV2,
    pub region: String,
    pub handshake_timeout: Duration,
    pub wake_compute_retry_config: RetryConfig,
@@ -38,6 +42,16 @@ pub struct ProxyConfig {
    pub connect_to_compute_retry_config: RetryConfig,
 }

+#[derive(Copy, Clone, Debug, ValueEnum, PartialEq)]
+pub enum ProxyProtocolV2 {
+    /// Connection will error if PROXY protocol v2 header is missing
+    Required,
+    /// Connection will parse PROXY protocol v2 header, but accept the connection if it's missing.
+    Supported,
+    /// Connection will error if PROXY protocol v2 header is provided
+    Rejected,
+}
+
 #[derive(Debug)]
 pub struct MetricCollectionConfig {
    pub endpoint: reqwest::Url,
@@ -67,6 +81,10 @@ pub struct AuthenticationConfig {
    pub rate_limiter: AuthRateLimiter,
    pub rate_limit_ip_subnet: u8,
    pub ip_allowlist_check_enabled: bool,
+    pub jwks_cache: JwkCache,
+    pub is_auth_broker: bool,
+    pub accept_jwts: bool,
+    pub webauth_confirmation_timeout: tokio::time::Duration,
 }

 impl TlsConfig {
@@ -250,18 +268,26 @@ impl CertResolver {

        let common_name = pem.subject().to_string();

-        // We only use non-wildcard certificates in web auth proxy so it seems okay to treat them the same as
-        // wildcard ones as we don't use SNI there. That treatment only affects certificate selection, so
-        // verify-full will still check wildcard match. Old coding here just ignored non-wildcard common names
-        // and passed None instead, which blows up number of cases downstream code should handle. Proper coding
-        // here should better avoid Option for common_names, and do wildcard-based certificate selection instead
-        // of cutting off '*.' parts.
-        let common_name = if common_name.starts_with("CN=*.") {
-            common_name.strip_prefix("CN=*.").map(|s| s.to_string())
+        // We need to get the canonical name for this certificate so we can match them against any domain names
+        // seen within the proxy codebase.
+        //
+        // In scram-proxy we use wildcard certificates only, with the database endpoint as the wildcard subdomain, taken from SNI.
+        // We need to remove the wildcard prefix for the purposes of certificate selection.
+        //
+        // auth-broker does not use SNI and instead uses the Neon-Connection-String header.
+        // Auth broker has the subdomain `apiauth`, which we need to remove for the purposes of validating the Neon-Connection-String.
+        //
+        // Console Web proxy does not use any wildcard domains and does not need any certificate selection or conn string
+        // validation, so we can continue with any common name.
+        let common_name = if let Some(s) = common_name.strip_prefix("CN=*.") {
+            s.to_string()
+        } else if let Some(s) = common_name.strip_prefix("CN=apiauth.") {
+            s.to_string()
+        } else if let Some(s) = common_name.strip_prefix("CN=") {
+            s.to_string()
        } else {
-            common_name.strip_prefix("CN=").map(|s| s.to_string())
-        }
-        .context("Failed to parse common name from certificate")?;
+            bail!("Failed to parse common name from certificate")
+        };

        let cert = Arc::new(rustls::sign::CertifiedKey::new(cert_chain, key));

--- a/proxy/src/logging.rs
+++ b/proxy/src/logging.rs
@@ -1,4 +1,3 @@
-use tracing_opentelemetry::OpenTelemetryLayer;
 use tracing_subscriber::{
    filter::{EnvFilter, LevelFilter},
    prelude::*,
@@ -23,9 +22,7 @@ pub async fn init() -> anyhow::Result<LoggingGuard> {
        .with_writer(std::io::stderr)
        .with_target(false);

-    let otlp_layer = tracing_utils::init_tracing("proxy")
-        .await
-        .map(OpenTelemetryLayer::new);
+    let otlp_layer = tracing_utils::init_tracing("proxy").await;

    tracing_subscriber::registry()
        .with(env_filter)
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -10,6 +10,7 @@ pub(crate) mod wake_compute;
 pub use copy_bidirectional::copy_bidirectional_client_compute;
 pub use copy_bidirectional::ErrorSource;

+use crate::config::ProxyProtocolV2;
 use crate::{
    auth,
    cancellation::{self, CancellationHandlerMain, CancellationHandlerMainInternal},
@@ -93,15 +94,19 @@ pub async fn task_main(

        connections.spawn(async move {
            let (socket, peer_addr) = match read_proxy_protocol(socket).await {
-                Ok((socket, Some(addr))) => (socket, addr.ip()),
                Err(e) => {
                    error!("per-client task finished with an error: {e:#}");
                    return;
                }
-                Ok((_socket, None)) if config.require_client_ip => {
-                    error!("missing required client IP");
+                Ok((_socket, None)) if config.proxy_protocol_v2 == ProxyProtocolV2::Required => {
+                    error!("missing required proxy protocol header");
                    return;
                }
+                Ok((_socket, Some(_))) if config.proxy_protocol_v2 == ProxyProtocolV2::Rejected => {
+                    error!("proxy protocol header not supported");
+                    return;
+                }
+                Ok((socket, Some(addr))) => (socket, addr.ip()),
                Ok((socket, None)) => (socket, peer_addr.ip()),
            };

--- a/proxy/src/serverless.rs
+++ b/proxy/src/serverless.rs
@@ -5,6 +5,7 @@
 mod backend;
 pub mod cancel_set;
 mod conn_pool;
+mod http_conn_pool;
 mod http_util;
 mod json;
 mod sql_over_http;
@@ -19,7 +20,8 @@ use anyhow::Context;
 use futures::future::{select, Either};
 use futures::TryFutureExt;
 use http::{Method, Response, StatusCode};
-use http_body_util::Full;
+use http_body_util::combinators::BoxBody;
+use http_body_util::{BodyExt, Empty};
 use hyper1::body::Incoming;
 use hyper_util::rt::TokioExecutor;
 use hyper_util::server::conn::auto::Builder;
@@ -81,7 +83,28 @@ pub async fn task_main(
        }
    });

+    let http_conn_pool = http_conn_pool::GlobalConnPool::new(&config.http_config);
+    {
+        let http_conn_pool = Arc::clone(&http_conn_pool);
+        tokio::spawn(async move {
+            http_conn_pool.gc_worker(StdRng::from_entropy()).await;
+        });
+    }
+
+    // shut down the connection pool once the cancellation token fires
+    tokio::spawn({
+        let cancellation_token = cancellation_token.clone();
+        let http_conn_pool = http_conn_pool.clone();
+        async move {
+            cancellation_token.cancelled().await;
+            tokio::task::spawn_blocking(move || http_conn_pool.shutdown())
+                .await
+                .unwrap();
+        }
+    });
+
    let backend = Arc::new(PoolingBackend {
+        http_conn_pool: Arc::clone(&http_conn_pool),
        pool: Arc::clone(&conn_pool),
        config,
        endpoint_rate_limiter: Arc::clone(&endpoint_rate_limiter),
@@ -342,7 +365,7 @@ async fn request_handler(
    // used to cancel in-flight HTTP requests. not used to cancel websockets
    http_cancellation_token: CancellationToken,
    endpoint_rate_limiter: Arc<EndpointRateLimiter>,
-) -> Result<Response<Full<Bytes>>, ApiError> {
+) -> Result<Response<BoxBody<Bytes, hyper1::Error>>, ApiError> {
    let host = request
        .headers()
        .get("host")
@@ -386,7 +409,7 @@ async fn request_handler(
        );

        // Return the response so the spawned future can continue.
-        Ok(response.map(|_: http_body_util::Empty<Bytes>| Full::new(Bytes::new())))
+        Ok(response.map(|b| b.map_err(|x| match x {}).boxed()))
    } else if request.uri().path() == "/sql" && *request.method() == Method::POST {
        let ctx = RequestMonitoring::new(
            session_id,
@@ -409,7 +432,7 @@ async fn request_handler(
            )
            .header("Access-Control-Max-Age", "86400" /* 24 hours */)
            .status(StatusCode::OK) // 204 is also valid, but see: https://developer.mozilla.org/en-US/docs/Web/HTTP/Methods/OPTIONS#status_code
-            .body(Full::new(Bytes::new()))
+            .body(Empty::new().map_err(|x| match x {}).boxed())
            .map_err(|e| ApiError::InternalServerError(e.into()))
    } else {
        json_response(StatusCode::BAD_REQUEST, "query is not supported")
--- a/proxy/src/serverless/backend.rs
+++ b/proxy/src/serverless/backend.rs
@@ -1,6 +1,8 @@
-use std::{sync::Arc, time::Duration};
+use std::{io, sync::Arc, time::Duration};

 use async_trait::async_trait;
+use hyper_util::rt::{TokioExecutor, TokioIo, TokioTimer};
+use tokio::net::{lookup_host, TcpStream};
 use tracing::{field::display, info};

 use crate::{
@@ -27,9 +29,13 @@ use crate::{
    Host,
 };

-use super::conn_pool::{poll_client, Client, ConnInfo, GlobalConnPool};
+use super::{
+    conn_pool::{poll_client, Client, ConnInfo, GlobalConnPool},
+    http_conn_pool::{self, poll_http2_client},
+};

 pub(crate) struct PoolingBackend {
+    pub(crate) http_conn_pool: Arc<super::http_conn_pool::GlobalConnPool>,
    pub(crate) pool: Arc<GlobalConnPool<tokio_postgres::Client>>,
    pub(crate) config: &'static ProxyConfig,
    pub(crate) endpoint_rate_limiter: Arc<EndpointRateLimiter>,
@@ -103,32 +109,44 @@ impl PoolingBackend {
    pub(crate) async fn authenticate_with_jwt(
        &self,
        ctx: &RequestMonitoring,
+        config: &AuthenticationConfig,
        user_info: &ComputeUserInfo,
-        jwt: &str,
-    ) -> Result<ComputeCredentials, AuthError> {
+        jwt: String,
+    ) -> Result<(), AuthError> {
        match &self.config.auth_backend {
-            crate::auth::Backend::Console(_, ()) => {
-                Err(AuthError::auth_failed("JWT login is not yet supported"))
+            crate::auth::Backend::Console(console, ()) => {
+                config
+                    .jwks_cache
+                    .check_jwt(
+                        ctx,
+                        user_info.endpoint.clone(),
+                        &user_info.user,
+                        &**console,
+                        &jwt,
+                    )
+                    .await
+                    .map_err(|e| AuthError::auth_failed(e.to_string()))?;
+
+                Ok(())
            }
            crate::auth::Backend::Web(_, ()) => Err(AuthError::auth_failed(
                "JWT login over web auth proxy is not supported",
            )),
-            crate::auth::Backend::Local(cache) => {
-                cache
+            crate::auth::Backend::Local(_) => {
+                config
                    .jwks_cache
                    .check_jwt(
                        ctx,
                        user_info.endpoint.clone(),
                        &user_info.user,
                        &StaticAuthRules,
-                        jwt,
+                        &jwt,
                    )
                    .await
                    .map_err(|e| AuthError::auth_failed(e.to_string()))?;
-                Ok(ComputeCredentials {
-                    info: user_info.clone(),
-                    keys: crate::auth::backend::ComputeCredentialKeys::None,
-                })
+
+                // todo: rewrite JWT signature with key shared somehow between local proxy and postgres
+                Ok(())
            }
        }
    }
@@ -174,14 +192,55 @@ impl PoolingBackend {
        )
        .await
    }
+
+    // Wake up the destination if needed
+    #[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)]
+    pub(crate) async fn connect_to_local_proxy(
+        &self,
+        ctx: &RequestMonitoring,
+        conn_info: ConnInfo,
+    ) -> Result<http_conn_pool::Client, HttpConnError> {
+        info!("pool: looking for an existing connection");
+        if let Some(client) = self.http_conn_pool.get(ctx, &conn_info) {
+            return Ok(client);
+        }
+
+        let conn_id = uuid::Uuid::new_v4();
+        tracing::Span::current().record("conn_id", display(conn_id));
+        info!(%conn_id, "pool: opening a new connection '{conn_info}'");
+        let backend = self
+            .config
+            .auth_backend
+            .as_ref()
+            .map(|()| ComputeCredentials {
+                info: conn_info.user_info.clone(),
+                keys: crate::auth::backend::ComputeCredentialKeys::None,
+            });
+        crate::proxy::connect_compute::connect_to_compute(
+            ctx,
+            &HyperMechanism {
+                conn_id,
+                conn_info,
+                pool: self.http_conn_pool.clone(),
+                locks: &self.config.connect_compute_locks,
+            },
+            &backend,
+            false, // do not allow self signed compute for http flow
+            self.config.wake_compute_retry_config,
+            self.config.connect_to_compute_retry_config,
+        )
+        .await
+    }
 }

 #[derive(Debug, thiserror::Error)]
 pub(crate) enum HttpConnError {
    #[error("pooled connection closed at inconsistent state")]
    ConnectionClosedAbruptly(#[from] tokio::sync::watch::error::SendError<uuid::Uuid>),
-    #[error("could not connection to compute")]
-    ConnectionError(#[from] tokio_postgres::Error),
+    #[error("could not connection to postgres in compute")]
+    PostgresConnectionError(#[from] tokio_postgres::Error),
+    #[error("could not connection to local-proxy in compute")]
+    LocalProxyConnectionError(#[from] LocalProxyConnError),

    #[error("could not get auth info")]
    GetAuthInfo(#[from] GetAuthInfoError),
@@ -193,11 +252,20 @@ pub(crate) enum HttpConnError {
    TooManyConnectionAttempts(#[from] ApiLockError),
 }

+#[derive(Debug, thiserror::Error)]
+pub(crate) enum LocalProxyConnError {
+    #[error("error with connection to local-proxy")]
+    Io(#[source] std::io::Error),
+    #[error("could not establish h2 connection")]
+    H2(#[from] hyper1::Error),
+}
+
 impl ReportableError for HttpConnError {
    fn get_error_kind(&self) -> ErrorKind {
        match self {
            HttpConnError::ConnectionClosedAbruptly(_) => ErrorKind::Compute,
-            HttpConnError::ConnectionError(p) => p.get_error_kind(),
+            HttpConnError::PostgresConnectionError(p) => p.get_error_kind(),
+            HttpConnError::LocalProxyConnectionError(_) => ErrorKind::Compute,
            HttpConnError::GetAuthInfo(a) => a.get_error_kind(),
            HttpConnError::AuthError(a) => a.get_error_kind(),
            HttpConnError::WakeCompute(w) => w.get_error_kind(),
@@ -210,7 +278,8 @@ impl UserFacingError for HttpConnError {
    fn to_string_client(&self) -> String {
        match self {
            HttpConnError::ConnectionClosedAbruptly(_) => self.to_string(),
-            HttpConnError::ConnectionError(p) => p.to_string(),
+            HttpConnError::PostgresConnectionError(p) => p.to_string(),
+            HttpConnError::LocalProxyConnectionError(p) => p.to_string(),
            HttpConnError::GetAuthInfo(c) => c.to_string_client(),
            HttpConnError::AuthError(c) => c.to_string_client(),
            HttpConnError::WakeCompute(c) => c.to_string_client(),
@@ -224,7 +293,8 @@ impl UserFacingError for HttpConnError {
 impl CouldRetry for HttpConnError {
    fn could_retry(&self) -> bool {
        match self {
-            HttpConnError::ConnectionError(e) => e.could_retry(),
+            HttpConnError::PostgresConnectionError(e) => e.could_retry(),
+            HttpConnError::LocalProxyConnectionError(e) => e.could_retry(),
            HttpConnError::ConnectionClosedAbruptly(_) => false,
            HttpConnError::GetAuthInfo(_) => false,
            HttpConnError::AuthError(_) => false,
@@ -236,7 +306,7 @@ impl CouldRetry for HttpConnError {
 impl ShouldRetryWakeCompute for HttpConnError {
    fn should_retry_wake_compute(&self) -> bool {
        match self {
-            HttpConnError::ConnectionError(e) => e.should_retry_wake_compute(),
+            HttpConnError::PostgresConnectionError(e) => e.should_retry_wake_compute(),
            // we never checked cache validity
            HttpConnError::TooManyConnectionAttempts(_) => false,
            _ => true,
@@ -244,6 +314,38 @@ impl ShouldRetryWakeCompute for HttpConnError {
    }
 }

+impl ReportableError for LocalProxyConnError {
+    /// Every local-proxy connection failure is attributed to the compute.
+    fn get_error_kind(&self) -> ErrorKind {
+        // The pattern stays exhaustive so that a new variant forces a decision here.
+        match self {
+            Self::Io(_) | Self::H2(_) => ErrorKind::Compute,
+        }
+    }
+}
+
+impl UserFacingError for LocalProxyConnError {
+    /// Clients receive one generic message regardless of the variant, so the
+    /// underlying I/O or hyper error text is not part of the client-facing string.
+    fn to_string_client(&self) -> String {
+        String::from("Could not establish HTTP connection to the database")
+    }
+}
+
+impl CouldRetry for LocalProxyConnError {
+    /// No local-proxy connection failure is considered retryable.
+    fn could_retry(&self) -> bool {
+        // Exhaustive on purpose: a new variant must pick its retry policy.
+        match self {
+            Self::Io(_) | Self::H2(_) => false,
+        }
+    }
+}
+impl ShouldRetryWakeCompute for LocalProxyConnError {
+    /// No local-proxy connection failure triggers another wake-compute attempt.
+    fn should_retry_wake_compute(&self) -> bool {
+        // Exhaustive on purpose: a new variant must pick its wake-compute policy.
+        match self {
+            Self::Io(_) | Self::H2(_) => false,
+        }
+    }
+}
+
 struct TokioMechanism {
    pool: Arc<GlobalConnPool<tokio_postgres::Client>>,
    conn_info: ConnInfo,
@@ -293,3 +395,99 @@ impl ConnectMechanism for TokioMechanism {

    fn update_connect_config(&self, _config: &mut compute::ConnCfg) {}
 }
+
+/// [`ConnectMechanism`] that opens HTTP/2 connections to a compute's
+/// local-proxy and hands them to the HTTP connection pool.
+struct HyperMechanism {
+    /// Pool that newly established connections are registered in.
+    pool: Arc<http_conn_pool::GlobalConnPool>,
+    /// Connection parameters of the request being served.
+    conn_info: ConnInfo,
+    /// Identifier passed along with each new connection when it is registered.
+    conn_id: uuid::Uuid,
+
+    /// connect_to_compute concurrency lock
+    locks: &'static ApiLocks<Host>,
+}
+
+#[async_trait]
+impl ConnectMechanism for HyperMechanism {
+    type Connection = http_conn_pool::Client;
+    type ConnectError = HttpConnError;
+    type Error = HttpConnError;
+
+    /// Makes a single attempt to open an HTTP/2 connection to the host in
+    /// `node_info` and registers the resulting connection in the pool.
+    ///
+    /// A per-host permit is acquired before connecting and released together
+    /// with the connection result. The latency timer is paused only for the
+    /// duration of the connection attempt itself.
+    async fn connect_once(
+        &self,
+        ctx: &RequestMonitoring,
+        node_info: &CachedNodeInfo,
+        timeout: Duration,
+    ) -> Result<Self::Connection, Self::ConnectError> {
+        // TODO: take the port from `node_info.config` instead of hard-coding it.
+        const LOCAL_PROXY_PORT: u16 = 10432;
+
+        let host = node_info.config.get_host()?;
+        let permit = self.locks.get_permit(&host).await?;
+
+        let res = {
+            // The pause guard is dropped as soon as the attempt has finished.
+            let _pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
+            connect_http2(&host, LOCAL_PROXY_PORT, timeout).await
+        };
+        let (client, connection) = permit.release_result(res)?;
+
+        let pooled = poll_http2_client(
+            Arc::clone(&self.pool),
+            ctx,
+            &self.conn_info,
+            client,
+            connection,
+            self.conn_id,
+            node_info.aux.clone(),
+        );
+        Ok(pooled)
+    }
+
+    /// Nothing to adjust: this mechanism does not use the postgres connect config.
+    fn update_connect_config(&self, _config: &mut compute::ConnCfg) {}
+}
+
+/// Opens a TCP connection to `host:port` and performs the HTTP/2 handshake on it.
+///
+/// Each resolved address is tried in order, with `timeout` applied to every
+/// individual TCP connect attempt. The first successful stream is used. When
+/// all addresses fail, the error of the last attempt is returned; when no
+/// address was resolved at all, an `InvalidInput` I/O error is returned.
+///
+/// Returns the request sender together with the connection future, which the
+/// caller must drive.
+async fn connect_http2(
+    host: &str,
+    port: u16,
+    timeout: Duration,
+) -> Result<(http_conn_pool::Send, http_conn_pool::Connect), LocalProxyConnError> {
+    // assumption: host is an ip address so this should not actually perform any requests.
+    // todo: add that assumption as a guarantee in the control-plane API.
+    let candidates = lookup_host((host, port))
+        .await
+        .map_err(LocalProxyConnError::Io)?;
+
+    let mut last_err = None;
+    let mut connected = None;
+
+    for addr in candidates {
+        match tokio::time::timeout(timeout, TcpStream::connect(addr)).await {
+            Ok(Ok(tcp)) => {
+                // A failure to configure the socket aborts immediately, without
+                // trying the remaining addresses.
+                tcp.set_nodelay(true).map_err(LocalProxyConnError::Io)?;
+                connected = Some(tcp);
+                break;
+            }
+            Ok(Err(connect_err)) => {
+                last_err = Some(LocalProxyConnError::Io(connect_err));
+            }
+            Err(elapsed) => {
+                last_err = Some(LocalProxyConnError::Io(io::Error::new(
+                    io::ErrorKind::TimedOut,
+                    elapsed,
+                )));
+            }
+        }
+    }
+
+    let Some(stream) = connected else {
+        return Err(last_err.unwrap_or_else(|| {
+            LocalProxyConnError::Io(io::Error::new(
+                io::ErrorKind::InvalidInput,
+                "could not resolve any addresses",
+            ))
+        }));
+    };
+
+    let builder = hyper1::client::conn::http2::Builder::new(TokioExecutor::new())
+        .timer(TokioTimer::new())
+        .keep_alive_interval(Duration::from_secs(20))
+        .keep_alive_while_idle(true)
+        .keep_alive_timeout(Duration::from_secs(5));
+
+    // A hyper error converts into `LocalProxyConnError::H2` through `?`.
+    let (client, connection) = builder.handshake(TokioIo::new(stream)).await?;
+
+    Ok((client, connection))
+}
--- a/proxy/src/serverless/http_conn_pool.rs
+++ b/proxy/src/serverless/http_conn_pool.rs
@@ -0,0 +1,342 @@
+use dashmap::DashMap;
+use hyper1::client::conn::http2;
+use hyper_util::rt::{TokioExecutor, TokioIo};
+use parking_lot::RwLock;
+use rand::Rng;
+use std::collections::VecDeque;
+use std::sync::atomic::{self, AtomicUsize};
+use std::{sync::Arc, sync::Weak};
+use tokio::net::TcpStream;
+
+use crate::console::messages::{ColdStartInfo, MetricsAuxInfo};
+use crate::metrics::{HttpEndpointPoolsGuard, Metrics};
+use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS};
+use crate::{context::RequestMonitoring, EndpointCacheKey};
+
+use tracing::{debug, error};
+use tracing::{info, info_span, Instrument};
+
+use super::conn_pool::ConnInfo;
+
+/// Request-sending handle of an HTTP/2 client connection.
+pub(crate) type Send = http2::SendRequest<hyper1::body::Incoming>;
+/// The HTTP/2 client connection itself, running over a TCP stream.
+pub(crate) type Connect =
+    http2::Connection<TokioIo<TcpStream>, hyper1::body::Incoming, TokioExecutor>;
+
+/// One pooled HTTP/2 connection.
+#[derive(Clone)]
+struct ConnPoolEntry {
+    /// Handle used to send requests over the connection.
+    conn: Send,
+    /// Identifier used to find and remove the entry once the connection ends.
+    conn_id: uuid::Uuid,
+    /// Metrics metadata handed to every `Client` created from this entry.
+    aux: MetricsAuxInfo,
+}
+
+// Per-endpoint connection pool
+// Number of open connections is limited by the `max_conns_per_endpoint`.
+// NOTE(review): no such limit is enforced by the code visible here - confirm.
+pub(crate) struct EndpointConnPool {
+    // TODO(conrad):
+    // either we should open more connections depending on stream count
+    // (not exposed by hyper, need our own counter)
+    // or we can change this to an Option rather than a VecDeque.
+    //
+    // Opening more connections to the same db because we run out of streams
+    // seems somewhat redundant though.
+    //
+    // Probably we should run a semaphore and just the single conn. TBD.
+    conns: VecDeque<ConnPoolEntry>,
+    // Guard obtained from the `http_endpoint_pools` metric; held for as long as the pool lives.
+    _guard: HttpEndpointPoolsGuard<'static>,
+    // Counter shared with `GlobalConnPool`; decremented whenever entries leave `conns`
+    // through `remove_conn` or `Drop`.
+    global_connections_count: Arc<AtomicUsize>,
+}
+
+impl EndpointConnPool {
+    /// Returns a clone of the first connection that is still open, moving it
+    /// to the back of the queue. Closed connections found in front of it are
+    /// discarded. Returns `None` once the queue is exhausted.
+    fn get_conn_entry(&mut self) -> Option<ConnPoolEntry> {
+        while let Some(entry) = self.conns.pop_front() {
+            if entry.conn.is_closed() {
+                continue;
+            }
+            self.conns.push_back(entry.clone());
+            return Some(entry);
+        }
+        None
+    }
+
+    /// Drops every entry registered under `conn_id` and lowers the shared
+    /// connection counter and the opened-connections gauge accordingly.
+    ///
+    /// Returns `true` when at least one entry was removed.
+    fn remove_conn(&mut self, conn_id: uuid::Uuid) -> bool {
+        let before = self.conns.len();
+        self.conns.retain(|entry| entry.conn_id != conn_id);
+        let dropped = before - self.conns.len();
+
+        if dropped == 0 {
+            return false;
+        }
+
+        self.global_connections_count
+            .fetch_sub(dropped, atomic::Ordering::Relaxed);
+        Metrics::get()
+            .proxy
+            .http_pool_opened_connections
+            .get_metric()
+            .dec_by(dropped as i64);
+        true
+    }
+}
+
+impl Drop for EndpointConnPool {
+    /// Subtracts the connections still held by this pool from the shared
+    /// counter and from the opened-connections gauge.
+    fn drop(&mut self) {
+        let remaining = self.conns.len();
+        if remaining == 0 {
+            return;
+        }
+
+        self.global_connections_count
+            .fetch_sub(remaining, atomic::Ordering::Relaxed);
+        Metrics::get()
+            .proxy
+            .http_pool_opened_connections
+            .get_metric()
+            .dec_by(remaining as i64);
+    }
+}
+
+/// Pool of HTTP/2 connections, keyed by endpoint.
+pub(crate) struct GlobalConnPool {
+    // endpoint -> per-endpoint connection pool
+    //
+    // That should be a fairly contended map, so return reference to the per-endpoint
+    // pool as early as possible and release the lock.
+    global_pool: DashMap<EndpointCacheKey, Arc<RwLock<EndpointConnPool>>>,
+
+    /// Number of endpoint-connection pools
+    ///
+    /// [`DashMap::len`] iterates over all inner pools and acquires a read lock on each.
+    /// That seems like far too much effort, so we're using a relaxed increment counter instead.
+    /// It's only used for diagnostics.
+    global_pool_size: AtomicUsize,
+
+    /// Total number of connections in the pool
+    global_connections_count: Arc<AtomicUsize>,
+
+    /// Static HTTP configuration; `pool_options` supplies the shard count and the GC epoch.
+    config: &'static crate::config::HttpConfig,
+}
+
+impl GlobalConnPool {
+    /// Creates an empty pool whose map uses `config.pool_options.pool_shards` shards.
+    pub(crate) fn new(config: &'static crate::config::HttpConfig) -> Arc<Self> {
+        let pool = Self {
+            global_pool: DashMap::with_shard_amount(config.pool_options.pool_shards),
+            global_pool_size: AtomicUsize::new(0),
+            global_connections_count: Arc::new(AtomicUsize::new(0)),
+            config,
+        };
+        Arc::new(pool)
+    }
+
+    /// Empties the endpoint map, releasing the pool's own references to every
+    /// per-endpoint pool.
+    pub(crate) fn shutdown(&self) {
+        // drops all strong references to endpoint-pools
+        self.global_pool.clear();
+    }
+
+    /// Runs forever, garbage-collecting one randomly chosen shard per tick.
+    ///
+    /// The tick period is the configured GC epoch divided by the number of
+    /// shards.
+    pub(crate) async fn gc_worker(&self, mut rng: impl Rng) {
+        let shard_count = self.global_pool.shards().len();
+        let period = self.config.pool_options.gc_epoch / shard_count as u32;
+        let mut ticker = tokio::time::interval(period);
+
+        loop {
+            ticker.tick().await;
+            self.gc(rng.gen_range(0..shard_count));
+        }
+    }
+
+    /// Reclaims closed connections and empty endpoint pools from one shard.
+    ///
+    /// `shard` is an index into `self.global_pool.shards()`. Endpoint pools
+    /// that are referenced from elsewhere are left untouched. The shared
+    /// counters and the gauge are adjusted only after the shard lock has been
+    /// released.
+    fn gc(&self, shard: usize) {
+        debug!(shard, "pool: performing epoch reclamation");
+
+        // acquire a random shard lock
+        let mut shard = self.global_pool.shards()[shard].write();
+
+        let timer = Metrics::get()
+            .proxy
+            .http_pool_reclaimation_lag_seconds
+            .start_timer();
+        let current_len = shard.len();
+        let mut clients_removed = 0;
+        shard.retain(|endpoint, x| {
+            // if the current endpoint pool is unique (no other strong or weak references)
+            // then it is currently not in use by any connections.
+            if let Some(pool) = Arc::get_mut(x.get_mut()) {
+                let EndpointConnPool { conns, .. } = pool.get_mut();
+
+                let old_len = conns.len();
+
+                conns.retain(|conn| !conn.conn.is_closed());
+
+                let new_len = conns.len();
+                let removed = old_len - new_len;
+                clients_removed += removed;
+
+                // we only remove this pool if it has no active connections
+                if conns.is_empty() {
+                    info!("pool: discarding pool for endpoint {endpoint}");
+                    return false;
+                }
+            }
+
+            true
+        });
+
+        let new_len = shard.len();
+        drop(shard);
+        timer.observe();
+
+        // Do logging outside of the lock.
+        if clients_removed > 0 {
+            // `fetch_sub` returns the previous value, hence the extra subtraction
+            // to obtain the count after this GC pass.
+            let size = self
+                .global_connections_count
+                .fetch_sub(clients_removed, atomic::Ordering::Relaxed)
+                - clients_removed;
+            Metrics::get()
+                .proxy
+                .http_pool_opened_connections
+                .get_metric()
+                .dec_by(clients_removed as i64);
+            info!("pool: performed global pool gc. removed {clients_removed} clients, total number of clients in pool is {size}");
+        }
+        // Number of endpoint pools dropped from this shard.
+        let removed = current_len - new_len;
+
+        if removed > 0 {
+            // Same previous-value adjustment as for the connection counter above.
+            let global_pool_size = self
+                .global_pool_size
+                .fetch_sub(removed, atomic::Ordering::Relaxed)
+                - removed;
+            info!("pool: performed global pool gc. size now {global_pool_size}");
+        }
+    }
+
+    /// Looks up a reusable connection for the endpoint named by `conn_info`.
+    ///
+    /// Returns `None` when `conn_info` has no endpoint cache key or when the
+    /// endpoint's pool holds no open connection. On a hit, the connection id is
+    /// recorded on the current span and `ctx` is marked as a successful pool hit.
+    pub(crate) fn get(
+        self: &Arc<Self>,
+        ctx: &RequestMonitoring,
+        conn_info: &ConnInfo,
+    ) -> Option<Client> {
+        let key = conn_info.endpoint_cache_key()?;
+        let pool = self.get_or_create_endpoint_pool(&key);
+        // The write guard is a temporary and is released at the end of this statement.
+        let entry = pool.write().get_conn_entry()?;
+
+        let span = tracing::Span::current();
+        span.record("conn_id", tracing::field::display(entry.conn_id));
+        info!(
+            cold_start_info = ColdStartInfo::HttpPoolHit.as_str(),
+            "pool: reusing connection '{conn_info}'"
+        );
+        ctx.set_cold_start_info(ColdStartInfo::HttpPoolHit);
+        ctx.success();
+
+        let ConnPoolEntry { conn, aux, .. } = entry;
+        Some(Client::new(conn, aux))
+    }
+
+    /// Returns the pool registered for `endpoint`, inserting an empty one when
+    /// none exists yet. `global_pool_size` is bumped only when this call
+    /// actually performed the insertion.
+    fn get_or_create_endpoint_pool(
+        self: &Arc<Self>,
+        endpoint: &EndpointCacheKey,
+    ) -> Arc<RwLock<EndpointConnPool>> {
+        // fast path
+        if let Some(pool) = self.global_pool.get(endpoint) {
+            return pool.clone();
+        }
+
+        // slow path
+        // The candidate pool (including its metrics guard) is built before the
+        // entry lookup; if the entry turns out to be occupied, the candidate is
+        // dropped unused.
+        let new_pool = Arc::new(RwLock::new(EndpointConnPool {
+            conns: VecDeque::new(),
+            _guard: Metrics::get().proxy.http_endpoint_pools.guard(),
+            global_connections_count: self.global_connections_count.clone(),
+        }));
+
+        // find or create a pool for this endpoint
+        let mut created = false;
+        let pool = self
+            .global_pool
+            .entry(endpoint.clone())
+            .or_insert_with(|| {
+                created = true;
+                new_pool
+            })
+            .clone();
+
+        // log new global pool size
+        if created {
+            // `fetch_add` returns the previous value, hence the `+ 1`.
+            let global_pool_size = self
+                .global_pool_size
+                .fetch_add(1, atomic::Ordering::Relaxed)
+                + 1;
+            info!(
+                "pool: created new pool for '{endpoint}', global pool size now {global_pool_size}"
+            );
+        }
+
+        pool
+    }
+}
+
+/// Registers a freshly established HTTP/2 connection in the pool and spawns
+/// the task that drives it.
+///
+/// When `conn_info` yields an endpoint cache key, a clone of `client` is stored
+/// in that endpoint's pool under `conn_id`, and the pool-wide connection
+/// counter and the `http_pool_opened_connections` gauge are incremented to
+/// match. Every removal path (`EndpointConnPool::remove_conn`,
+/// `GlobalConnPool::gc` and `Drop for EndpointConnPool`) decrements both, so
+/// omitting the increment would make the counter wrap below zero and push the
+/// gauge negative.
+///
+/// The spawned task awaits `connection` until it terminates, logs the outcome
+/// and then removes the entry from the endpoint pool, provided that pool is
+/// still alive.
+///
+/// Returns a [`Client`] wrapping the original `client` handle.
+pub(crate) fn poll_http2_client(
+    global_pool: Arc<GlobalConnPool>,
+    ctx: &RequestMonitoring,
+    conn_info: &ConnInfo,
+    client: Send,
+    connection: Connect,
+    conn_id: uuid::Uuid,
+    aux: MetricsAuxInfo,
+) -> Client {
+    let conn_gauge = Metrics::get().proxy.db_connections.guard(ctx.protocol());
+    let session_id = ctx.session_id();
+
+    let span = info_span!(parent: None, "connection", %conn_id);
+    let cold_start_info = ctx.cold_start_info();
+    span.in_scope(|| {
+        info!(cold_start_info = cold_start_info.as_str(), %conn_info, %session_id, "new connection");
+    });
+
+    let pool = match conn_info.endpoint_cache_key() {
+        Some(endpoint) => {
+            let pool = global_pool.get_or_create_endpoint_pool(&endpoint);
+
+            {
+                let mut endpoint_pool = pool.write();
+                endpoint_pool.conns.push_back(ConnPoolEntry {
+                    conn: client.clone(),
+                    conn_id,
+                    aux: aux.clone(),
+                });
+
+                // Account for the new entry while still holding the pool lock, so
+                // that a concurrent removal can never observe the entry before
+                // the counters include it.
+                global_pool
+                    .global_connections_count
+                    .fetch_add(1, atomic::Ordering::Relaxed);
+                Metrics::get()
+                    .proxy
+                    .http_pool_opened_connections
+                    .get_metric()
+                    .inc();
+            }
+
+            // Only a weak reference is kept by the task, so the endpoint pool
+            // can still be reclaimed while the connection is alive.
+            Arc::downgrade(&pool)
+        }
+        None => Weak::new(),
+    };
+
+    tokio::spawn(
+        async move {
+            // Keep the db-connections gauge raised for as long as the task runs.
+            let _conn_gauge = conn_gauge;
+            let res = connection.await;
+            match res {
+                Ok(()) => info!("connection closed"),
+                Err(e) => error!(%session_id, "connection error: {}", e),
+            }
+
+            // remove from connection pool
+            if let Some(pool) = pool.upgrade() {
+                if pool.write().remove_conn(conn_id) {
+                    info!("closed connection removed");
+                }
+            }
+        }
+        .instrument(span),
+    );
+
+    Client::new(client, aux)
+}
+
+/// Handle to a pooled HTTP/2 connection, as handed out to request handlers.
+pub(crate) struct Client {
+    /// Request sender of the underlying connection.
+    pub(crate) inner: Send,
+    /// Supplies the endpoint and branch ids used by [`Client::metrics`].
+    aux: MetricsAuxInfo,
+}
+
+impl Client {
+    /// Bundles a request sender with the metrics metadata of its connection.
+    pub(self) fn new(inner: Send, aux: MetricsAuxInfo) -> Self {
+        Client { inner, aux }
+    }
+
+    /// Registers this client's endpoint/branch pair with the usage metrics and
+    /// returns the resulting counter.
+    pub(crate) fn metrics(&self) -> Arc<MetricCounter> {
+        let ids = Ids {
+            endpoint_id: self.aux.endpoint_id,
+            branch_id: self.aux.branch_id,
+        };
+        USAGE_METRICS.register(ids)
+    }
+}
--- a/proxy/src/serverless/http_util.rs
+++ b/proxy/src/serverless/http_util.rs
@@ -5,13 +5,13 @@ use bytes::Bytes;

 use anyhow::Context;
 use http::{Response, StatusCode};
-use http_body_util::Full;
+use http_body_util::{combinators::BoxBody, BodyExt, Full};

 use serde::Serialize;
 use utils::http::error::ApiError;

 /// Like [`ApiError::into_response`]
-pub(crate) fn api_error_into_response(this: ApiError) -> Response<Full<Bytes>> {
+pub(crate) fn api_error_into_response(this: ApiError) -> Response<BoxBody<Bytes, hyper1::Error>> {
    match this {
        ApiError::BadRequest(err) => HttpErrorBody::response_from_msg_and_status(
            format!("{err:#?}"), // use debug printing so that we give the cause
@@ -64,17 +64,24 @@ struct HttpErrorBody {

 impl HttpErrorBody {
    /// Same as [`utils::http::error::HttpErrorBody::response_from_msg_and_status`]
-    fn response_from_msg_and_status(msg: String, status: StatusCode) -> Response<Full<Bytes>> {
+    fn response_from_msg_and_status(
+        msg: String,
+        status: StatusCode,
+    ) -> Response<BoxBody<Bytes, hyper1::Error>> {
        HttpErrorBody { msg }.to_response(status)
    }

    /// Same as [`utils::http::error::HttpErrorBody::to_response`]
-    fn to_response(&self, status: StatusCode) -> Response<Full<Bytes>> {
+    fn to_response(&self, status: StatusCode) -> Response<BoxBody<Bytes, hyper1::Error>> {
        Response::builder()
            .status(status)
            .header(http::header::CONTENT_TYPE, "application/json")
            // we do not have nested maps with non string keys so serialization shouldn't fail
-            .body(Full::new(Bytes::from(serde_json::to_string(self).unwrap())))
+            .body(
+                Full::new(Bytes::from(serde_json::to_string(self).unwrap()))
+                    .map_err(|x| match x {})
+                    .boxed(),
+            )
            .unwrap()
    }
 }
@@ -83,14 +90,14 @@ impl HttpErrorBody {
 pub(crate) fn json_response<T: Serialize>(
    status: StatusCode,
    data: T,
-) -> Result<Response<Full<Bytes>>, ApiError> {
+) -> Result<Response<BoxBody<Bytes, hyper1::Error>>, ApiError> {
    let json = serde_json::to_string(&data)
        .context("Failed to serialize JSON response")
        .map_err(ApiError::InternalServerError)?;
    let response = Response::builder()
        .status(status)
        .header(http::header::CONTENT_TYPE, "application/json")
-        .body(Full::new(Bytes::from(json)))
+        .body(Full::new(Bytes::from(json)).map_err(|x| match x {}).boxed())
        .map_err(|e| ApiError::InternalServerError(e.into()))?;
    Ok(response)
 }
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -8,6 +8,8 @@ use futures::future::Either;
 use futures::StreamExt;
 use futures::TryFutureExt;
 use http::header::AUTHORIZATION;
+use http::Method;
+use http_body_util::combinators::BoxBody;
 use http_body_util::BodyExt;
 use http_body_util::Full;
 use hyper1::body::Body;
@@ -38,9 +40,11 @@ use url::Url;
 use urlencoding;
 use utils::http::error::ApiError;

+use crate::auth::backend::ComputeCredentials;
 use crate::auth::backend::ComputeUserInfo;
 use crate::auth::endpoint_sni;
 use crate::auth::ComputeUserInfoParseError;
+use crate::config::AuthenticationConfig;
 use crate::config::ProxyConfig;
 use crate::config::TlsConfig;
 use crate::context::RequestMonitoring;
@@ -56,6 +60,7 @@ use crate::usage_metrics::MetricCounterRecorder;
 use crate::DbName;
 use crate::RoleName;

+use super::backend::LocalProxyConnError;
 use super::backend::PoolingBackend;
 use super::conn_pool::AuthData;
 use super::conn_pool::Client;
@@ -123,8 +128,8 @@ pub(crate) enum ConnInfoError {
    MissingUsername,
    #[error("invalid username: {0}")]
    InvalidUsername(#[from] std::string::FromUtf8Error),
-    #[error("missing password")]
-    MissingPassword,
+    #[error("missing authentication credentials: {0}")]
+    MissingCredentials(Credentials),
    #[error("missing hostname")]
    MissingHostname,
    #[error("invalid hostname: {0}")]
@@ -133,6 +138,14 @@ pub(crate) enum ConnInfoError {
    MalformedEndpoint,
 }

+/// The kind of credential a request was expected to carry. Its display text is
+/// embedded into the `ConnInfoError::MissingCredentials` message.
+#[derive(Debug, thiserror::Error)]
+pub(crate) enum Credentials {
+    /// A password was expected.
+    #[error("required password")]
+    Password,
+    /// An `Authorization: Bearer <jwt>` header was expected.
+    #[error("required authorization bearer token in JWT format")]
+    BearerJwt,
+}
+
 impl ReportableError for ConnInfoError {
    fn get_error_kind(&self) -> ErrorKind {
        ErrorKind::User
@@ -146,6 +159,7 @@ impl UserFacingError for ConnInfoError {
 }

 fn get_conn_info(
+    config: &'static AuthenticationConfig,
    ctx: &RequestMonitoring,
    headers: &HeaderMap,
    tls: Option<&TlsConfig>,
@@ -181,21 +195,32 @@ fn get_conn_info(
    ctx.set_user(username.clone());

    let auth = if let Some(auth) = headers.get(&AUTHORIZATION) {
+        if !config.accept_jwts {
+            return Err(ConnInfoError::MissingCredentials(Credentials::Password));
+        }
+
        let auth = auth
            .to_str()
            .map_err(|_| ConnInfoError::InvalidHeader(&AUTHORIZATION))?;
        AuthData::Jwt(
            auth.strip_prefix("Bearer ")
-                .ok_or(ConnInfoError::MissingPassword)?
+                .ok_or(ConnInfoError::MissingCredentials(Credentials::BearerJwt))?
                .into(),
        )
    } else if let Some(pass) = connection_url.password() {
+        // wrong credentials provided
+        if config.accept_jwts {
+            return Err(ConnInfoError::MissingCredentials(Credentials::BearerJwt));
+        }
+
        AuthData::Password(match urlencoding::decode_binary(pass.as_bytes()) {
            std::borrow::Cow::Borrowed(b) => b.into(),
            std::borrow::Cow::Owned(b) => b.into(),
        })
+    } else if config.accept_jwts {
+        return Err(ConnInfoError::MissingCredentials(Credentials::BearerJwt));
    } else {
-        return Err(ConnInfoError::MissingPassword);
+        return Err(ConnInfoError::MissingCredentials(Credentials::Password));
    };

    let endpoint = match connection_url.host() {
@@ -247,7 +272,7 @@ pub(crate) async fn handle(
    request: Request<Incoming>,
    backend: Arc<PoolingBackend>,
    cancel: CancellationToken,
-) -> Result<Response<Full<Bytes>>, ApiError> {
+) -> Result<Response<BoxBody<Bytes, hyper1::Error>>, ApiError> {
    let result = handle_inner(cancel, config, &ctx, request, backend).await;

    let mut response = match result {
@@ -279,7 +304,7 @@ pub(crate) async fn handle(

            let mut message = e.to_string_client();
            let db_error = match &e {
-                SqlOverHttpError::ConnectCompute(HttpConnError::ConnectionError(e))
+                SqlOverHttpError::ConnectCompute(HttpConnError::PostgresConnectionError(e))
                | SqlOverHttpError::Postgres(e) => e.as_db_error(),
                _ => None,
            };
@@ -504,7 +529,7 @@ async fn handle_inner(
    ctx: &RequestMonitoring,
    request: Request<Incoming>,
    backend: Arc<PoolingBackend>,
-) -> Result<Response<Full<Bytes>>, SqlOverHttpError> {
+) -> Result<Response<BoxBody<Bytes, hyper1::Error>>, SqlOverHttpError> {
    let _requeset_gauge = Metrics::get()
        .proxy
        .connection_requests
@@ -514,18 +539,50 @@ async fn handle_inner(
        "handling interactive connection from client"
    );

-    //
-    // Determine the destination and connection params
-    //
-    let headers = request.headers();
-
-    // TLS config should be there.
-    let conn_info = get_conn_info(ctx, headers, config.tls_config.as_ref())?;
+    let conn_info = get_conn_info(
+        &config.authentication_config,
+        ctx,
+        request.headers(),
+        config.tls_config.as_ref(),
+    )?;
    info!(
        user = conn_info.conn_info.user_info.user.as_str(),
        "credentials"
    );

+    match conn_info.auth {
+        AuthData::Jwt(jwt) if config.authentication_config.is_auth_broker => {
+            handle_auth_broker_inner(config, ctx, request, conn_info.conn_info, jwt, backend).await
+        }
+        auth => {
+            handle_db_inner(
+                cancel,
+                config,
+                ctx,
+                request,
+                conn_info.conn_info,
+                auth,
+                backend,
+            )
+            .await
+        }
+    }
+}
+
+async fn handle_db_inner(
+    cancel: CancellationToken,
+    config: &'static ProxyConfig,
+    ctx: &RequestMonitoring,
+    request: Request<Incoming>,
+    conn_info: ConnInfo,
+    auth: AuthData,
+    backend: Arc<PoolingBackend>,
+) -> Result<Response<BoxBody<Bytes, hyper1::Error>>, SqlOverHttpError> {
+    //
+    // Determine the destination and connection params
+    //
+    let headers = request.headers();
+
    // Allow connection pooling only if explicitly requested
    // or if we have decided that http pool is no longer opt-in
    let allow_pool = !config.http_config.pool_options.opt_in
@@ -563,26 +620,36 @@ async fn handle_inner(

    let authenticate_and_connect = Box::pin(
        async {
-            let keys = match &conn_info.auth {
+            let keys = match auth {
                AuthData::Password(pw) => {
                    backend
                        .authenticate_with_password(
                            ctx,
                            &config.authentication_config,
-                            &conn_info.conn_info.user_info,
-                            pw,
+                            &conn_info.user_info,
+                            &pw,
                        )
                        .await?
                }
                AuthData::Jwt(jwt) => {
                    backend
-                        .authenticate_with_jwt(ctx, &conn_info.conn_info.user_info, jwt)
-                        .await?
+                        .authenticate_with_jwt(
+                            ctx,
+                            &config.authentication_config,
+                            &conn_info.user_info,
+                            jwt,
+                        )
+                        .await?;
+
+                    ComputeCredentials {
+                        info: conn_info.user_info.clone(),
+                        keys: crate::auth::backend::ComputeCredentialKeys::None,
+                    }
                }
            };

            let client = backend
-                .connect_to_compute(ctx, conn_info.conn_info, keys, !allow_pool)
+                .connect_to_compute(ctx, conn_info, keys, !allow_pool)
                .await?;
            // not strictly necessary to mark success here,
            // but it's just insurance for if we forget it somewhere else
@@ -640,7 +707,11 @@ async fn handle_inner(

    let len = json_output.len();
    let response = response
-        .body(Full::new(Bytes::from(json_output)))
+        .body(
+            Full::new(Bytes::from(json_output))
+                .map_err(|x| match x {})
+                .boxed(),
+        )
        // only fails if invalid status code or invalid header/values are given.
        // these are not user configurable so it cannot fail dynamically
        .expect("building response payload should not fail");
@@ -656,6 +727,65 @@ async fn handle_inner(
    Ok(response)
 }

+/// Request headers that the auth-broker path copies onto the request it sends
+/// to the local-proxy; headers not listed here are not forwarded.
+static HEADERS_TO_FORWARD: &[&HeaderName] = &[
+    &AUTHORIZATION,
+    &CONN_STRING,
+    &RAW_TEXT_OUTPUT,
+    &ARRAY_MODE,
+    &TXN_ISOLATION_LEVEL,
+    &TXN_READ_ONLY,
+    &TXN_DEFERRABLE,
+];
+
+/// Auth-broker path: validates the JWT, then relays the request to the
+/// compute's local-proxy over HTTP/2 and returns its response.
+///
+/// The outgoing request is always a `POST` to `http://proxy.local/sql` that
+/// carries the original body and only the headers listed in
+/// `HEADERS_TO_FORWARD`.
+async fn handle_auth_broker_inner(
+    config: &'static ProxyConfig,
+    ctx: &RequestMonitoring,
+    request: Request<Incoming>,
+    conn_info: ConnInfo,
+    jwt: String,
+    backend: Arc<PoolingBackend>,
+) -> Result<Response<BoxBody<Bytes, hyper1::Error>>, SqlOverHttpError> {
+    let jwt_check = backend
+        .authenticate_with_jwt(
+            ctx,
+            &config.authentication_config,
+            &conn_info.user_info,
+            jwt,
+        )
+        .await;
+    jwt_check.map_err(HttpConnError::from)?;
+
+    let mut client = backend.connect_to_local_proxy(ctx, conn_info).await?;
+
+    let (mut parts, body) = request.into_parts();
+    let builder = Request::builder()
+        .method(Method::POST)
+        .uri(::http::Uri::from_static("http://proxy.local/sql"));
+
+    // todo(conradludgate): maybe auth-broker should parse these and re-serialize
+    // these instead just to ensure they remain normalised.
+    let builder = HEADERS_TO_FORWARD.iter().fold(builder, |acc, &name| {
+        match parts.headers.remove(name) {
+            Some(value) => acc.header(name, value),
+            None => acc,
+        }
+    });
+
+    let outbound = builder
+        .body(body)
+        .expect("all headers and params received via hyper should be valid for request");
+
+    // todo: map body to count egress
+    let _metrics = client.metrics();
+
+    let upstream = client
+        .inner
+        .send_request(outbound)
+        .await
+        .map_err(LocalProxyConnError::from)
+        .map_err(HttpConnError::from)?;
+
+    Ok(upstream.map(|incoming| incoming.boxed()))
+}
+
 impl QueryData {
    async fn process(
        self,
@@ -705,7 +835,9 @@ impl QueryData {
                    // query failed or was cancelled.
                    Ok(Err(error)) => {
                        let db_error = match &error {
-                            SqlOverHttpError::ConnectCompute(HttpConnError::ConnectionError(e))
+                            SqlOverHttpError::ConnectCompute(
+                                HttpConnError::PostgresConnectionError(e),
+                            )
                            | SqlOverHttpError::Postgres(e) => e.as_db_error(),
                            _ => None,
                        };
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,10 +6,10 @@ package-mode = false
 [tool.poetry.dependencies]
 python = "^3.9"
 pytest = "^7.4.4"
-psycopg2-binary = "^2.9.6"
+psycopg2-binary = "^2.9.9"
 typing-extensions = "^4.6.1"
 PyJWT = {version = "^2.1.0", extras = ["crypto"]}
-requests = "^2.32.0"
+requests = "^2.32.3"
 pytest-xdist = "^3.3.1"
 asyncpg = "^0.29.0"
 aiopg = "^1.4.0"
--- a/safekeeper/Cargo.toml
+++ b/safekeeper/Cargo.toml
@@ -23,6 +23,7 @@ crc32c.workspace = true
 fail.workspace = true
 hex.workspace = true
 humantime.workspace = true
+http.workspace = true
 hyper.workspace = true
 futures.workspace = true
 once_cell.workspace = true
--- a/safekeeper/src/lib.rs
+++ b/safekeeper/src/lib.rs
@@ -161,7 +161,7 @@ pub static HTTP_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
        .thread_name("HTTP worker")
        .enable_all()
        .build()
-        .expect("Failed to create WAL service runtime")
+        .expect("Failed to create HTTP runtime")
 });

 pub static BROKER_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
--- a/safekeeper/tests/walproposer_sim/safekeeper.rs
+++ b/safekeeper/tests/walproposer_sim/safekeeper.rs
@@ -13,7 +13,7 @@ use desim::{
    node_os::NodeOs,
    proto::{AnyMessage, NetEvent, NodeEvent},
 };
-use hyper::Uri;
+use http::Uri;
 use safekeeper::{
    safekeeper::{ProposerAcceptorMessage, SafeKeeper, ServerInfo, UNKNOWN_SERVER_VERSION},
    state::{TimelinePersistentState, TimelineState},
--- a/storage_broker/Cargo.toml
+++ b/storage_broker/Cargo.toml
@@ -10,13 +10,16 @@ bench = []
 [dependencies]
 anyhow.workspace = true
 async-stream.workspace = true
+bytes.workspace = true
 clap = { workspace = true, features = ["derive"] }
 const_format.workspace = true
 futures.workspace = true
 futures-core.workspace = true
 futures-util.workspace = true
 humantime.workspace = true
-hyper = { workspace = true, features = ["full"] }
+hyper_1 = { workspace = true, features = ["full"] }
+http-body-util.workspace = true
+hyper-util = "0.1"
 once_cell.workspace = true
 parking_lot.workspace = true
 prost.workspace = true
--- a/storage_broker/src/bin/storage_broker.rs
+++ b/storage_broker/src/bin/storage_broker.rs
@@ -13,10 +13,13 @@
 use clap::{command, Parser};
 use futures_core::Stream;
 use futures_util::StreamExt;
+use http_body_util::Full;
 use hyper::header::CONTENT_TYPE;
-use hyper::server::conn::AddrStream;
-use hyper::service::{make_service_fn, service_fn};
-use hyper::{Body, Method, StatusCode};
+use hyper::service::service_fn;
+use hyper::{Method, StatusCode};
+use hyper_1 as hyper;
+use hyper_1::body::Incoming;
+use hyper_util::rt::{TokioExecutor, TokioIo, TokioTimer};
 use parking_lot::RwLock;
 use std::collections::HashMap;
 use std::convert::Infallible;
@@ -24,9 +27,11 @@ use std::net::SocketAddr;
 use std::pin::Pin;
 use std::sync::Arc;
 use std::time::Duration;
+use tokio::net::TcpListener;
 use tokio::sync::broadcast;
 use tokio::sync::broadcast::error::RecvError;
 use tokio::time;
+use tonic::body::{self, empty_body, BoxBody};
 use tonic::codegen::Service;
 use tonic::transport::server::Connected;
 use tonic::Code;
@@ -45,9 +50,7 @@ use storage_broker::proto::{
    FilterTenantTimelineId, MessageType, SafekeeperDiscoveryRequest, SafekeeperDiscoveryResponse,
    SafekeeperTimelineInfo, SubscribeByFilterRequest, SubscribeSafekeeperInfoRequest, TypedMessage,
 };
-use storage_broker::{
-    parse_proto_ttid, EitherBody, DEFAULT_KEEPALIVE_INTERVAL, DEFAULT_LISTEN_ADDR,
-};
+use storage_broker::{parse_proto_ttid, DEFAULT_KEEPALIVE_INTERVAL, DEFAULT_LISTEN_ADDR};
 use utils::id::TenantTimelineId;
 use utils::logging::{self, LogFormat};
 use utils::sentry_init::init_sentry;
@@ -599,8 +602,8 @@ impl BrokerService for Broker {

 // We serve only metrics and healthcheck through http1.
 async fn http1_handler(
-    req: hyper::Request<hyper::body::Body>,
-) -> Result<hyper::Response<Body>, Infallible> {
+    req: hyper::Request<Incoming>,
+) -> Result<hyper::Response<BoxBody>, Infallible> {
    let resp = match (req.method(), req.uri().path()) {
        (&Method::GET, "/metrics") => {
            let mut buffer = vec![];
@@ -611,16 +614,16 @@ async fn http1_handler(
            hyper::Response::builder()
                .status(StatusCode::OK)
                .header(CONTENT_TYPE, encoder.format_type())
-                .body(Body::from(buffer))
+                .body(body::boxed(Full::new(bytes::Bytes::from(buffer))))
                .unwrap()
        }
        (&Method::GET, "/status") => hyper::Response::builder()
            .status(StatusCode::OK)
-            .body(Body::empty())
+            .body(empty_body())
            .unwrap(),
        _ => hyper::Response::builder()
            .status(StatusCode::NOT_FOUND)
-            .body(Body::empty())
+            .body(empty_body())
            .unwrap(),
    };
    Ok(resp)
@@ -662,52 +665,74 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
    };
    let storage_broker_server = BrokerServiceServer::new(storage_broker_impl);

-    info!("listening on {}", &args.listen_addr);
-
    // grpc is served along with http1 for metrics on a single port, hence we
    // don't use tonic's Server.
-    hyper::Server::bind(&args.listen_addr)
-        .http2_keep_alive_interval(Some(args.http2_keepalive_interval))
-        .serve(make_service_fn(move |conn: &AddrStream| {
-            let storage_broker_server_cloned = storage_broker_server.clone();
-            let connect_info = conn.connect_info();
-            async move {
-                Ok::<_, Infallible>(service_fn(move |mut req| {
-                    // That's what tonic's MakeSvc.call does to pass conninfo to
-                    // the request handler (and where its request.remote_addr()
-                    // expects it to find).
-                    req.extensions_mut().insert(connect_info.clone());
-
-                    // Technically this second clone is not needed, but consume
-                    // by async block is apparently unavoidable. BTW, error
-                    // message is enigmatic, see
-                    // https://github.com/rust-lang/rust/issues/68119
-                    //
-                    // We could get away without async block at all, but then we
-                    // need to resort to futures::Either to merge the result,
-                    // which doesn't caress an eye as well.
-                    let mut storage_broker_server_svc = storage_broker_server_cloned.clone();
-                    async move {
-                        if req.headers().get("content-type").map(|x| x.as_bytes())
-                            == Some(b"application/grpc")
-                        {
-                            let res_resp = storage_broker_server_svc.call(req).await;
-                            // Grpc and http1 handlers have slightly different
-                            // Response types: it is UnsyncBoxBody for the
-                            // former one (not sure why) and plain hyper::Body
-                            // for the latter. Both implement HttpBody though,
-                            // and EitherBody is used to merge them.
-                            res_resp.map(|resp| resp.map(EitherBody::Left))
-                        } else {
-                            let res_resp = http1_handler(req).await;
-                            res_resp.map(|resp| resp.map(EitherBody::Right))
-                        }
-                    }
-                }))
+    let tcp_listener = TcpListener::bind(&args.listen_addr).await?;
+    info!("listening on {}", &args.listen_addr);
+    loop {
+        let (stream, addr) = match tcp_listener.accept().await {
+            Ok(v) => v,
+            Err(e) => {
+                info!("couldn't accept connection: {e}");
+                continue;
            }
-        }))
-        .await?;
-    Ok(())
+        };
+
+        let mut builder = hyper_util::server::conn::auto::Builder::new(TokioExecutor::new());
+        builder.http1().timer(TokioTimer::new());
+        builder
+            .http2()
+            .timer(TokioTimer::new())
+            .keep_alive_interval(Some(args.http2_keepalive_interval));
+
+        let storage_broker_server_cloned = storage_broker_server.clone();
+        let connect_info = stream.connect_info();
+        let service_fn_ = async move {
+            service_fn(move |mut req| {
+                // That's what tonic's MakeSvc.call does to pass conninfo to
+                // the request handler (and where its request.remote_addr()
+                // expects it to find).
+                req.extensions_mut().insert(connect_info.clone());
+
+                // Technically this second clone should not be needed, but
+                // the async block below consumes the service, which seems
+                // to be unavoidable. BTW, the error message is enigmatic, see
+                // https://github.com/rust-lang/rust/issues/68119
+                //
+                // We could get away without an async block at all, but then
+                // we would need to resort to futures::Either to merge the
+                // result, which is not as easy on the eye.
+                let mut storage_broker_server_svc = storage_broker_server_cloned.clone();
+                async move {
+                    if req.headers().get("content-type").map(|x| x.as_bytes())
+                        == Some(b"application/grpc")
+                    {
+                        let res_resp = storage_broker_server_svc.call(req).await;
+                        // Both branches must produce the same response
+                        // type, so each body is wrapped in `Either`.
+                        // NOTE(review): http1_handler now returns tonic's
+                        // BoxBody, not hyper::Body; if the grpc service
+                        // does too, `Either` may be unnecessary - confirm.
+                        res_resp.map(|resp| resp.map(http_body_util::Either::Left))
+                    } else {
+                        let res_resp = http1_handler(req).await;
+                        res_resp.map(|resp| resp.map(http_body_util::Either::Right))
+                    }
+                }
+            })
+        }
+        .await;
+
+        tokio::task::spawn(async move {
+            let res = builder
+                .serve_connection(TokioIo::new(stream), service_fn_)
+                .await;
+
+            if let Err(e) = res {
+                info!("error serving connection from {addr}: {e}");
+            }
+        });
+    }
 }

 #[cfg(test)]
--- a/storage_broker/src/lib.rs
+++ b/storage_broker/src/lib.rs
@@ -1,6 +1,4 @@
-use hyper::body::HttpBody;
-use std::pin::Pin;
-use std::task::{Context, Poll};
+use hyper_1 as hyper;
 use std::time::Duration;
 use tonic::codegen::StdError;
 use tonic::transport::{ClientTlsConfig, Endpoint};
@@ -94,56 +92,3 @@ pub fn parse_proto_ttid(proto_ttid: &ProtoTenantTimelineId) -> Result<TenantTime
        timeline_id,
    })
 }
-
-// These several usages don't justify anyhow dependency, though it would work as
-// well.
-type AnyError = Box<dyn std::error::Error + Send + Sync + 'static>;
-
-// Provides impl HttpBody for two different types implementing it. Inspired by
-// https://github.com/hyperium/tonic/blob/master/examples/src/hyper_warp/server.rs
-pub enum EitherBody<A, B> {
-    Left(A),
-    Right(B),
-}
-
-impl<A, B> HttpBody for EitherBody<A, B>
-where
-    A: HttpBody + Send + Unpin,
-    B: HttpBody<Data = A::Data> + Send + Unpin,
-    A::Error: Into<AnyError>,
-    B::Error: Into<AnyError>,
-{
-    type Data = A::Data;
-    type Error = Box<dyn std::error::Error + Send + Sync + 'static>;
-
-    fn is_end_stream(&self) -> bool {
-        match self {
-            EitherBody::Left(b) => b.is_end_stream(),
-            EitherBody::Right(b) => b.is_end_stream(),
-        }
-    }
-
-    fn poll_data(
-        self: Pin<&mut Self>,
-        cx: &mut Context<'_>,
-    ) -> Poll<Option<Result<Self::Data, Self::Error>>> {
-        match self.get_mut() {
-            EitherBody::Left(b) => Pin::new(b).poll_data(cx).map(map_option_err),
-            EitherBody::Right(b) => Pin::new(b).poll_data(cx).map(map_option_err),
-        }
-    }
-
-    fn poll_trailers(
-        self: Pin<&mut Self>,
-        cx: &mut Context<'_>,
-    ) -> Poll<Result<Option<hyper::HeaderMap>, Self::Error>> {
-        match self.get_mut() {
-            EitherBody::Left(b) => Pin::new(b).poll_trailers(cx).map_err(Into::into),
-            EitherBody::Right(b) => Pin::new(b).poll_trailers(cx).map_err(Into::into),
-        }
-    }
-}
-
-fn map_option_err<T, U: Into<AnyError>>(err: Option<Result<T, U>>) -> Option<Result<T, AnyError>> {
-    err.map(|e| e.map_err(Into::into))
-}
--- a/storage_controller/src/main.rs
+++ b/storage_controller/src/main.rs
@@ -11,8 +11,8 @@ use storage_controller::metrics::preinitialize_metrics;
 use storage_controller::persistence::Persistence;
 use storage_controller::service::chaos_injector::ChaosInjector;
 use storage_controller::service::{
-    Config, Service, HEARTBEAT_INTERVAL_DEFAULT, MAX_OFFLINE_INTERVAL_DEFAULT,
-    MAX_WARMING_UP_INTERVAL_DEFAULT, RECONCILER_CONCURRENCY_DEFAULT,
+    Config, Service, HEARTBEAT_INTERVAL_DEFAULT, LONG_RECONCILE_THRESHOLD_DEFAULT,
+    MAX_OFFLINE_INTERVAL_DEFAULT, MAX_WARMING_UP_INTERVAL_DEFAULT, RECONCILER_CONCURRENCY_DEFAULT,
 };
 use tokio::signal::unix::SignalKind;
 use tokio_util::sync::CancellationToken;
@@ -108,6 +108,9 @@ struct Cli {
    // Period with which to send heartbeats to registered nodes
    #[arg(long)]
    heartbeat_interval: Option<humantime::Duration>,
+
+    #[arg(long)]
+    long_reconcile_threshold: Option<humantime::Duration>,
 }

 enum StrictMode {
@@ -293,6 +296,10 @@ async fn async_main() -> anyhow::Result<()> {
            .heartbeat_interval
            .map(humantime::Duration::into)
            .unwrap_or(HEARTBEAT_INTERVAL_DEFAULT),
+        long_reconcile_threshold: args
+            .long_reconcile_threshold
+            .map(humantime::Duration::into)
+            .unwrap_or(LONG_RECONCILE_THRESHOLD_DEFAULT),
        address_for_peers: args.address_for_peers,
        start_as_candidate: args.start_as_candidate,
        http_service_port: args.listen.port() as i32,
--- a/storage_controller/src/metrics.rs
+++ b/storage_controller/src/metrics.rs
@@ -87,6 +87,10 @@ pub(crate) struct StorageControllerMetricGroup {
        measured::HistogramVec<DatabaseQueryLatencyLabelGroupSet, 5>,

    pub(crate) storage_controller_leadership_status: measured::GaugeVec<LeadershipStatusGroupSet>,
+
+    /// Reconciles that have run past the long-running threshold (entry is removed once the reconcile finishes)
+    pub(crate) storage_controller_reconcile_long_running:
+        measured::CounterVec<ReconcileLongRunningLabelGroupSet>,
 }

 impl StorageControllerMetrics {
@@ -168,6 +172,17 @@ pub(crate) struct LeadershipStatusGroup {
    pub(crate) status: LeadershipStatus,
 }

+#[derive(measured::LabelGroup, Clone)]
+#[label(set = ReconcileLongRunningLabelGroupSet)]
+pub(crate) struct ReconcileLongRunningLabelGroup<'a> {
+    #[label(dynamic_with = lasso::ThreadedRodeo, default)]
+    pub(crate) tenant_id: &'a str,
+    #[label(dynamic_with = lasso::ThreadedRodeo, default)]
+    pub(crate) shard_number: &'a str,
+    #[label(dynamic_with = lasso::ThreadedRodeo, default)]
+    pub(crate) sequence: &'a str,
+}
+
 #[derive(FixedCardinalityLabel, Clone, Copy)]
 pub(crate) enum ReconcileOutcome {
    #[label(rename = "ok")]
--- a/storage_controller/src/reconciler.rs
+++ b/storage_controller/src/reconciler.rs
@@ -454,7 +454,7 @@ impl Reconciler {
                Ok(l) => l,
                Err(e) => {
                    tracing::info!("🕑 Can't get LSNs on node {node} yet, waiting ({e})",);
-                    std::thread::sleep(Duration::from_millis(500));
+                    tokio::time::sleep(Duration::from_millis(500)).await;
                    continue;
                }
            };
@@ -469,10 +469,7 @@ impl Reconciler {
                        }
                    }
                    None => {
-                        // Expected timeline isn't yet visible on migration destination.
-                        // (IRL we would have to account for timeline deletion, but this
-                        //  is just test helper)
-                        any_behind = true;
+                        // Timeline was deleted in the meantime - ignore it
                    }
                }
            }
@@ -481,7 +478,7 @@ impl Reconciler {
                tracing::info!("✅ LSN caught up.  Proceeding...");
                break;
            } else {
-                std::thread::sleep(Duration::from_millis(500));
+                tokio::time::sleep(Duration::from_millis(500)).await;
            }
        }

@@ -562,6 +559,8 @@ impl Reconciler {
        self.location_config(&dest_ps, dest_conf, None, false)
            .await?;

+        pausable_failpoint!("reconciler-live-migrate-pre-await-lsn");
+
        if let Some(baseline) = baseline_lsns {
            tracing::info!("🕑 Waiting for LSN to catch up...");
            self.await_lsn(self.tenant_shard_id, &dest_ps, baseline)
@@ -572,30 +571,7 @@ impl Reconciler {

        // During a live migration it is unhelpful to proceed if we couldn't notify compute: if we detach
        // the origin without notifying compute, we will render the tenant unavailable.
-        let mut notify_attempts = 0;
-        while let Err(e) = self.compute_notify().await {
-            match e {
-                NotifyError::Fatal(_) => return Err(ReconcileError::Notify(e)),
-                NotifyError::ShuttingDown => return Err(ReconcileError::Cancel),
-                _ => {
-                    tracing::warn!(
-                        "Live migration blocked by compute notification error, retrying: {e}"
-                    );
-                }
-            }
-
-            exponential_backoff(
-                notify_attempts,
-                // Generous waits: control plane operations which might be blocking us usually complete on the order
-                // of hundreds to thousands of milliseconds, so no point busy polling.
-                1.0,
-                10.0,
-                &self.cancel,
-            )
-            .await;
-            notify_attempts += 1;
-        }
-
+        self.compute_notify_blocking(&origin_ps).await?;
        pausable_failpoint!("reconciler-live-migrate-post-notify");

        // Downgrade the origin to secondary.  If the tenant's policy is PlacementPolicy::Attached(0), then
@@ -869,6 +845,117 @@ impl Reconciler {
            Ok(())
        }
    }
+
+    /// Keep trying to notify the compute indefinitely, only dropping out if:
+    /// - the node `origin` becomes unavailable -> Ok(())
+    /// - the node `origin` no longer has our tenant shard attached -> Ok(())
+    /// - our cancellation token fires -> Err(ReconcileError::Cancel)
+    ///
+    /// This is used during live migration, where we do not wish to detach
+    /// an origin location until the compute definitely knows about the new
+    /// location.
+    ///
+    /// In cases where the origin node becomes unavailable, we return success, indicating
+    /// to the caller that they should continue irrespective of whether the compute was notified,
+    /// because the origin node is unusable anyway.  Notification will be retried later via the
+    /// [`Self::compute_notify_failure`] flag.
+    async fn compute_notify_blocking(&mut self, origin: &Node) -> Result<(), ReconcileError> {
+        let mut notify_attempts = 0;
+        while let Err(e) = self.compute_notify().await {
+            match e {
+                NotifyError::Fatal(_) => return Err(ReconcileError::Notify(e)),
+                NotifyError::ShuttingDown => return Err(ReconcileError::Cancel),
+                _ => {
+                    tracing::warn!(
+                        "Live migration blocked by compute notification error, retrying: {e}"
+                    );
+                }
+            }
+
+            // Did the origin pageserver become unavailable?
+            if !origin.is_available() {
+                tracing::info!("Giving up on compute notification because {origin} is unavailable");
+                break;
+            }
+
+            // Does the origin pageserver still host the shard we are interested in?  We should only
+            // continue waiting for compute notification to be acked if the old location is still usable.
+            let tenant_shard_id = self.tenant_shard_id;
+            match origin
+                .with_client_retries(
+                    |client| async move { client.get_location_config(tenant_shard_id).await },
+                    &self.service_config.jwt_token,
+                    1,
+                    3,
+                    Duration::from_secs(5),
+                    &self.cancel,
+                )
+                .await
+            {
+                Some(Ok(Some(location_conf))) => {
+                    if matches!(
+                        location_conf.mode,
+                        LocationConfigMode::AttachedMulti
+                            | LocationConfigMode::AttachedSingle
+                            | LocationConfigMode::AttachedStale
+                    ) {
+                        tracing::debug!(
+                            "Still attached to {origin}, will wait & retry compute notification"
+                        );
+                    } else {
+                        tracing::info!(
+                            "Giving up on compute notification because {origin} is in state {:?}",
+                            location_conf.mode
+                        );
+                        return Ok(());
+                    }
+                    // Fall through
+                }
+                Some(Ok(None)) => {
+                    tracing::info!(
+                        "No longer attached to {origin}, giving up on compute notification"
+                    );
+                    return Ok(());
+                }
+                Some(Err(e)) => {
+                    match e {
+                        mgmt_api::Error::Cancelled => {
+                            tracing::info!(
+                                "Giving up on compute notification because {origin} is unavailable"
+                            );
+                            return Ok(());
+                        }
+                        mgmt_api::Error::ApiError(StatusCode::NOT_FOUND, _) => {
+                            tracing::info!(
+                                "No longer attached to {origin}, giving up on compute notification"
+                            );
+                            return Ok(());
+                        }
+                        e => {
+                            // Other API errors are unexpected here.
+                            tracing::warn!("Unexpected error checking location on {origin}: {e}");
+
+                            // Fall through, we will retry compute notification.
+                        }
+                    }
+                }
+                None => return Err(ReconcileError::Cancel),
+            };
+
+            exponential_backoff(
+                notify_attempts,
+                // Generous waits: control plane operations which might be blocking us usually complete on the order
+                // of hundreds to thousands of milliseconds, so no point busy polling.
+                1.0,
+                10.0,
+                &self.cancel,
+            )
+            .await;
+            notify_attempts += 1;
+        }
+
+        Ok(())
+    }
 }

 /// We tweak the externally-set TenantConfig while configuring
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -128,6 +128,9 @@ pub const MAX_WARMING_UP_INTERVAL_DEFAULT: Duration = Duration::from_secs(300);
 /// How often to send heartbeats to registered nodes?
 pub const HEARTBEAT_INTERVAL_DEFAULT: Duration = Duration::from_secs(5);

+/// How long is too long for a reconciliation?
+pub const LONG_RECONCILE_THRESHOLD_DEFAULT: Duration = Duration::from_secs(120);
+
 #[derive(Clone, strum_macros::Display)]
 enum TenantOperations {
    Create,
@@ -348,6 +351,8 @@ pub struct Config {
    pub start_as_candidate: bool,

    pub http_service_port: i32,
+
+    pub long_reconcile_threshold: Duration,
 }

 impl From<DatabaseError> for ApiError {
@@ -4974,7 +4979,12 @@ impl Service {

            {
                let mut nodes_mut = (**nodes).clone();
-                nodes_mut.remove(&node_id);
+                if let Some(mut removed_node) = nodes_mut.remove(&node_id) {
+                    // Ensure that any reconciler holding an Arc<> to this node will
+                    // drop out when trying to RPC to it (setting Offline state sets the
+                    // cancellation token on the Node object).
+                    removed_node.set_availability(NodeAvailability::Offline);
+                }
                *nodes = Arc::new(nodes_mut);
            }
        }
--- a/storage_controller/src/tenant_shard.rs
+++ b/storage_controller/src/tenant_shard.rs
@@ -5,7 +5,9 @@ use std::{
 };

 use crate::{
-    metrics::{self, ReconcileCompleteLabelGroup, ReconcileOutcome},
+    metrics::{
+        self, ReconcileCompleteLabelGroup, ReconcileLongRunningLabelGroup, ReconcileOutcome,
+    },
    persistence::TenantShardPersistence,
    reconciler::{ReconcileUnits, ReconcilerConfig},
    scheduler::{
@@ -14,6 +16,7 @@ use crate::{
    },
    service::ReconcileResultRequest,
 };
+use futures::future::{self, Either};
 use pageserver_api::controller_api::{
    AvailabilityZone, NodeSchedulingPolicy, PlacementPolicy, ShardSchedulingPolicy,
 };
@@ -1083,6 +1086,47 @@ impl TenantShard {
        }
    }

+    async fn reconcile(
+        sequence: Sequence,
+        mut reconciler: Reconciler,
+        must_notify: bool,
+    ) -> ReconcileResult {
+        // Attempt to make observed state match intent state
+        let result = reconciler.reconcile().await;
+
+        // If we know we had a pending compute notification from some previous action, send a notification irrespective
+        // of whether the above reconcile() did any work
+        if result.is_ok() && must_notify {
+            // If this fails we will send the need to retry in [`ReconcileResult::pending_compute_notification`]
+            reconciler.compute_notify().await.ok();
+        }
+
+        // Update result counter
+        let outcome_label = match &result {
+            Ok(_) => ReconcileOutcome::Success,
+            Err(ReconcileError::Cancel) => ReconcileOutcome::Cancel,
+            Err(_) => ReconcileOutcome::Error,
+        };
+
+        metrics::METRICS_REGISTRY
+            .metrics_group
+            .storage_controller_reconcile_complete
+            .inc(ReconcileCompleteLabelGroup {
+                status: outcome_label,
+            });
+
+        // Constructing result implicitly drops Reconciler, freeing any ReconcileUnits before the Service might
+        // try and schedule more work in response to our result.
+        ReconcileResult {
+            sequence,
+            result,
+            tenant_shard_id: reconciler.tenant_shard_id,
+            generation: reconciler.generation,
+            observed: reconciler.observed,
+            pending_compute_notification: reconciler.compute_notify_failure,
+        }
+    }
+
    #[allow(clippy::too_many_arguments)]
    #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
    pub(crate) fn spawn_reconciler(
@@ -1122,7 +1166,7 @@ impl TenantShard {

        let reconciler_cancel = cancel.child_token();
        let reconciler_intent = TargetState::from_intent(pageservers, &self.intent);
-        let mut reconciler = Reconciler {
+        let reconciler = Reconciler {
            tenant_shard_id: self.tenant_shard_id,
            shard: self.shard,
            placement_policy: self.policy.clone(),
@@ -1142,6 +1186,7 @@ impl TenantShard {
        };

        let reconcile_seq = self.sequence;
+        let long_reconcile_threshold = service_config.long_reconcile_threshold;

        tracing::info!(seq=%reconcile_seq, "Spawning Reconciler for sequence {}", self.sequence);
        let must_notify = self.pending_compute_notification;
@@ -1178,41 +1223,55 @@ impl TenantShard {
                    return;
                }

-                // Attempt to make observed state match intent state
-                let result = reconciler.reconcile().await;
+                let (tenant_id_label, shard_number_label, sequence_label) = {
+                    (
+                        reconciler.tenant_shard_id.tenant_id.to_string(),
+                        reconciler.tenant_shard_id.shard_number.0.to_string(),
+                        reconcile_seq.to_string(),
+                    )
+                };

-                // If we know we had a pending compute notification from some previous action, send a notification irrespective
-                // of whether the above reconcile() did any work
-                if result.is_ok() && must_notify {
-                    // If this fails we will send the need to retry in [`ReconcileResult::pending_compute_notification`]
-                    reconciler.compute_notify().await.ok();
+                let label_group = ReconcileLongRunningLabelGroup {
+                    tenant_id: &tenant_id_label,
+                    shard_number: &shard_number_label,
+                    sequence: &sequence_label,
+                };
+
+                let reconcile_fut = Self::reconcile(reconcile_seq, reconciler, must_notify);
+                let long_reconcile_fut = {
+                    let label_group = label_group.clone();
+                    async move {
+                        tokio::time::sleep(long_reconcile_threshold).await;
+
+                        tracing::warn!("Reconcile passed the long running threshold of {long_reconcile_threshold:?}");
+
+                        metrics::METRICS_REGISTRY
+                            .metrics_group
+                            .storage_controller_reconcile_long_running
+                            .inc(label_group);
+                    }
+                };
+
+                let reconcile_fut = std::pin::pin!(reconcile_fut);
+                let long_reconcile_fut = std::pin::pin!(long_reconcile_fut);
+
+                let (was_long, result) =
+                    match future::select(reconcile_fut, long_reconcile_fut).await {
+                        Either::Left((reconcile_result, _)) => (false, reconcile_result),
+                        Either::Right((_, reconcile_fut)) => (true, reconcile_fut.await),
+                    };
+
+                if was_long {
+                    let id = metrics::METRICS_REGISTRY
+                        .metrics_group
+                        .storage_controller_reconcile_long_running
+                        .with_labels(label_group);
+                    metrics::METRICS_REGISTRY
+                        .metrics_group
+                        .storage_controller_reconcile_long_running
+                        .remove_metric(id);
                }

-                // Update result counter
-                let outcome_label = match &result {
-                    Ok(_) => ReconcileOutcome::Success,
-                    Err(ReconcileError::Cancel) => ReconcileOutcome::Cancel,
-                    Err(_) => ReconcileOutcome::Error,
-                };
-
-                metrics::METRICS_REGISTRY
-                    .metrics_group
-                    .storage_controller_reconcile_complete
-                    .inc(ReconcileCompleteLabelGroup {
-                        status: outcome_label,
-                    });
-
-                // Constructing result implicitly drops Reconciler, freeing any ReconcileUnits before the Service might
-                // try and schedule more work in response to our result.
-                let result = ReconcileResult {
-                    sequence: reconcile_seq,
-                    result,
-                    tenant_shard_id: reconciler.tenant_shard_id,
-                    generation: reconciler.generation,
-                    observed: reconciler.observed,
-                    pending_compute_notification: reconciler.compute_notify_failure,
-                };
-
                result_tx
                    .send(ReconcileResultRequest::ReconcileResult(result))
                    .ok();
--- a/storage_scrubber/src/pageserver_physical_gc.rs
+++ b/storage_scrubber/src/pageserver_physical_gc.rs
@@ -4,7 +4,7 @@ use std::time::Duration;

 use crate::checks::{list_timeline_blobs, BlobDataParseResult};
 use crate::metadata_stream::{stream_tenant_timelines, stream_tenants};
-use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId};
+use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId, MAX_RETRIES};
 use futures_util::{StreamExt, TryStreamExt};
 use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
 use pageserver::tenant::remote_timeline_client::{parse_remote_index_path, remote_layer_path};
@@ -18,6 +18,7 @@ use serde::Serialize;
 use storage_controller_client::control_api;
 use tokio_util::sync::CancellationToken;
 use tracing::{info_span, Instrument};
+use utils::backoff;
 use utils::generation::Generation;
 use utils::id::{TenantId, TenantTimelineId};

@@ -326,15 +327,25 @@ async fn maybe_delete_index(
    }

    // All validations passed: erase the object
-    match remote_client
-        .delete(&obj.key, &CancellationToken::new())
-        .await
+    let cancel = CancellationToken::new();
+    match backoff::retry(
+        || remote_client.delete(&obj.key, &cancel),
+        |_| false,
+        3,
+        MAX_RETRIES as u32,
+        "maybe_delete_index",
+        &cancel,
+    )
+    .await
    {
-        Ok(_) => {
+        None => {
+            unreachable!("Using a dummy cancellation token");
+        }
+        Some(Ok(_)) => {
            tracing::info!("Successfully deleted index");
            summary.indices_deleted += 1;
        }
-        Err(e) => {
+        Some(Err(e)) => {
            tracing::warn!("Failed to delete index: {e}");
            summary.remote_storage_errors += 1;
        }
--- a/test_runner/fixtures/compare_fixtures.py
+++ b/test_runner/fixtures/compare_fixtures.py
@@ -340,23 +340,27 @@ def neon_with_baseline(request: FixtureRequest) -> PgCompare:


@pytest.fixture(scope="function", autouse=True)
-def sync_after_each_test():
-    # The fixture calls `sync(2)` after each test if `SYNC_AFTER_EACH_TEST` env var is `true`
+def sync_between_tests():
+    # The fixture calls `sync(2)` before and after each test if `SYNC_BETWEEN_TESTS` env var is `true`
    #
-    # In CI, `SYNC_AFTER_EACH_TEST` is set to `true` only for benchmarks (`test_runner/performance`)
+    # In CI, `SYNC_BETWEEN_TESTS` is set to `true` only for benchmarks (`test_runner/performance`)
    # that are run on self-hosted runners because some of these tests are pretty write-heavy
    # and create issues to start the processes within 10s
-    key = "SYNC_AFTER_EACH_TEST"
+    key = "SYNC_BETWEEN_TESTS"
    enabled = os.environ.get(key) == "true"

+    if enabled:
+        start = time.time()
+        # we only run benches on unices, the method might not exist on windows
+        os.sync()
+        elapsed = time.time() - start
+        log.info(f"called sync before test {elapsed=}")
+
    yield

-    if not enabled:
-        # regress test, or running locally
-        return
-
-    start = time.time()
-    # we only run benches on unices, the method might not exist on windows
-    os.sync()
-    elapsed = time.time() - start
-    log.info(f"called sync after test {elapsed=}")
+    if enabled:
+        start = time.time()
+        # we only run benches on unices, the method might not exist on windows
+        os.sync()
+        elapsed = time.time() - start
+        log.info(f"called sync after test {elapsed=}")
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -20,7 +20,7 @@ from dataclasses import dataclass
 from datetime import datetime
 from enum import Enum
 from fcntl import LOCK_EX, LOCK_UN, flock
-from functools import cached_property, partial
+from functools import cached_property
 from itertools import chain, product
 from pathlib import Path
 from types import TracebackType
@@ -86,7 +86,7 @@ from fixtures.remote_storage import (
    remote_storage_to_toml_dict,
 )
 from fixtures.safekeeper.http import SafekeeperHttpClient
-from fixtures.safekeeper.utils import are_walreceivers_absent
+from fixtures.safekeeper.utils import wait_walreceivers_absent
 from fixtures.utils import (
    ATTACHMENT_NAME_REGEX,
    allure_add_grafana_links,
@@ -401,7 +401,6 @@ class NeonEnvBuilder:
        safekeeper_extra_opts: Optional[list[str]] = None,
        storage_controller_port_override: Optional[int] = None,
        pageserver_io_buffer_alignment: Optional[int] = None,
-        pageserver_virtual_file_io_mode: Optional[str] = None,
    ):
        self.repo_dir = repo_dir
        self.rust_log_override = rust_log_override
@@ -456,7 +455,6 @@ class NeonEnvBuilder:
        self.storage_controller_port_override = storage_controller_port_override

        self.pageserver_io_buffer_alignment = pageserver_io_buffer_alignment
-        self.pageserver_virtual_file_io_mode = pageserver_virtual_file_io_mode

        assert test_name.startswith(
            "test_"
@@ -952,9 +950,6 @@ class NeonEnv:

    safekeepers - An array containing objects representing the safekeepers

-    pg_bin - pg_bin.run() can be used to execute Postgres client binaries,
-        like psql or pg_dump
-
    initial_tenant - tenant ID of the initial tenant created in the repository

    neon_cli - can be used to run the 'neon' CLI tool
@@ -1030,7 +1025,6 @@ class NeonEnv:
        self.pageserver_virtual_file_io_engine = config.pageserver_virtual_file_io_engine
        self.pageserver_aux_file_policy = config.pageserver_aux_file_policy
        self.pageserver_io_buffer_alignment = config.pageserver_io_buffer_alignment
-        self.pageserver_virtual_file_io_mode = config.pageserver_virtual_file_io_mode

        # Create the neon_local's `NeonLocalInitConf`
        cfg: Dict[str, Any] = {
@@ -1094,10 +1088,7 @@ class NeonEnv:
                        for key, value in override.items():
                            ps_cfg[key] = value

-            if self.pageserver_io_buffer_alignment is not None:
-                ps_cfg["io_buffer_alignment"] = self.pageserver_io_buffer_alignment
-            if self.pageserver_virtual_file_io_mode is not None:
-                ps_cfg["virtual_file_io_mode"] = self.pageserver_virtual_file_io_mode
+            ps_cfg["io_buffer_alignment"] = self.pageserver_io_buffer_alignment

            # Create a corresponding NeonPageserver object
            self.pageservers.append(
@@ -1336,7 +1327,6 @@ def neon_simple_env(
    pageserver_aux_file_policy: Optional[AuxFileStore],
    pageserver_default_tenant_config_compaction_algorithm: Optional[Dict[str, Any]],
    pageserver_io_buffer_alignment: Optional[int],
-    pageserver_virtual_file_io_mode: Optional[str],
 ) -> Iterator[NeonEnv]:
    """
    Simple Neon environment, with no authentication and no safekeepers.
@@ -1363,7 +1353,6 @@ def neon_simple_env(
        pageserver_aux_file_policy=pageserver_aux_file_policy,
        pageserver_default_tenant_config_compaction_algorithm=pageserver_default_tenant_config_compaction_algorithm,
        pageserver_io_buffer_alignment=pageserver_io_buffer_alignment,
-        pageserver_virtual_file_io_mode=pageserver_virtual_file_io_mode,
    ) as builder:
        env = builder.init_start()

@@ -1388,7 +1377,6 @@ def neon_env_builder(
    pageserver_aux_file_policy: Optional[AuxFileStore],
    record_property: Callable[[str, object], None],
    pageserver_io_buffer_alignment: Optional[int],
-    pageserver_virtual_file_io_mode: Optional[str],
 ) -> Iterator[NeonEnvBuilder]:
    """
    Fixture to create a Neon environment for test.
@@ -1424,7 +1412,6 @@ def neon_env_builder(
        pageserver_aux_file_policy=pageserver_aux_file_policy,
        pageserver_default_tenant_config_compaction_algorithm=pageserver_default_tenant_config_compaction_algorithm,
        pageserver_io_buffer_alignment=pageserver_io_buffer_alignment,
-        pageserver_virtual_file_io_mode=pageserver_virtual_file_io_mode,
    ) as builder:
        yield builder
        # Propogate `preserve_database_files` to make it possible to use in other fixtures,
@@ -3310,6 +3297,8 @@ class PgBin:

@pytest.fixture(scope="function")
 def pg_bin(test_output_dir: Path, pg_distrib_dir: Path, pg_version: PgVersion) -> PgBin:
+    """pg_bin.run() can be used to execute Postgres client binaries, like psql or pg_dump"""
+
    return PgBin(test_output_dir, pg_distrib_dir, pg_version)


@@ -4111,12 +4100,26 @@ class Endpoint(PgProtocol, LogUtils):
        with open(remote_extensions_spec_path, "w") as file:
            json.dump(spec, file, indent=4)

-    def stop(self, mode: str = "fast") -> "Endpoint":
+    def stop(
+        self,
+        mode: str = "fast",
+        sks_wait_walreceiver_gone: Optional[tuple[List[Safekeeper], TimelineId]] = None,
+    ) -> "Endpoint":
        """
        Stop the Postgres instance if it's running.

-        Because test teardown might try and stop an endpoint concurrently with test code
-        stopping the endpoint, this method is thread safe
+        Because test teardown might try and stop an endpoint concurrently with
+        test code stopping the endpoint, this method is thread safe
+
+        If sks_wait_walreceiver_gone is not None, wait for the safekeepers in
+        this list to have no walreceivers, i.e. compute endpoint connection be
+        gone. When endpoint is stopped in immediate mode and started again this
+        avoids race of old connection delivering some data after
+        sync-safekeepers check, which makes basebackup unusable. TimelineId is
+        needed because endpoint doesn't know it.
+
+        A better solution would be bump term when sync-safekeepers is skipped on
+        start, see #9079.

        Returns self.
        """
@@ -4128,6 +4131,11 @@ class Endpoint(PgProtocol, LogUtils):
                self.endpoint_id, check_return_code=self.check_stop_result, mode=mode
            )

+        if sks_wait_walreceiver_gone is not None:
+            for sk in sks_wait_walreceiver_gone[0]:
+                cli = sk.http_client()
+                wait_walreceivers_absent(cli, self.tenant_id, sks_wait_walreceiver_gone[1])
+
        return self

    def stop_and_destroy(self, mode: str = "immediate") -> "Endpoint":
@@ -5220,7 +5228,7 @@ def flush_ep_to_pageserver(
    for sk in env.safekeepers:
        cli = sk.http_client()
        # wait until compute connections are gone
-        wait_until(30, 0.5, partial(are_walreceivers_absent, cli, tenant, timeline))
+        wait_walreceivers_absent(cli, tenant, timeline)
        commit_lsn = max(cli.get_commit_lsn(tenant, timeline), commit_lsn)

    # Note: depending on WAL filtering implementation, probably most shards
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -586,6 +586,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
        timeline_id: TimelineId,
        force_repartition=False,
        force_image_layer_creation=False,
+        force_l0_compaction=False,
        wait_until_uploaded=False,
        enhanced_gc_bottom_most_compaction=False,
    ):
@@ -595,6 +596,8 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
            query["force_repartition"] = "true"
        if force_image_layer_creation:
            query["force_image_layer_creation"] = "true"
+        if force_l0_compaction:
+            query["force_l0_compaction"] = "true"
        if wait_until_uploaded:
            query["wait_until_uploaded"] = "true"
        if enhanced_gc_bottom_most_compaction:
@@ -701,6 +704,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
        timeline_id: TimelineId,
        force_repartition=False,
        force_image_layer_creation=False,
+        force_l0_compaction=False,
        wait_until_uploaded=False,
        compact: Optional[bool] = None,
        **kwargs,
@@ -711,6 +715,8 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
            query["force_repartition"] = "true"
        if force_image_layer_creation:
            query["force_image_layer_creation"] = "true"
+        if force_l0_compaction:
+            query["force_l0_compaction"] = "true"
        if wait_until_uploaded:
            query["wait_until_uploaded"] = "true"

--- a/test_runner/fixtures/parametrize.py
+++ b/test_runner/fixtures/parametrize.py
@@ -39,11 +39,6 @@ def pageserver_io_buffer_alignment() -> Optional[int]:
    return None


-@pytest.fixture(scope="function", autouse=True)
-def pageserver_virtual_file_io_mode() -> Optional[str]:
-    return os.getenv("PAGESERVER_VIRTUAL_FILE_IO_MODE")
-
-
@pytest.fixture(scope="function", autouse=True)
 def pageserver_aux_file_policy() -> Optional[AuxFileStore]:
    return None
--- a/test_runner/fixtures/safekeeper/utils.py
+++ b/test_runner/fixtures/safekeeper/utils.py
@@ -1,11 +1,20 @@
 from fixtures.common_types import TenantId, TimelineId
 from fixtures.log_helper import log
 from fixtures.safekeeper.http import SafekeeperHttpClient
+from fixtures.utils import wait_until


-def are_walreceivers_absent(
+def wait_walreceivers_absent(
    sk_http_cli: SafekeeperHttpClient, tenant_id: TenantId, timeline_id: TimelineId
 ):
-    status = sk_http_cli.timeline_status(tenant_id, timeline_id)
-    log.info(f"waiting for walreceivers to be gone, currently {status.walreceivers}")
-    return len(status.walreceivers) == 0
+    """
+    Wait until there are no walreceiver connections from the compute(s) on the
+    safekeeper.
+    """
+
+    def walreceivers_absent():
+        status = sk_http_cli.timeline_status(tenant_id, timeline_id)
+        log.info(f"waiting for walreceivers to be gone, currently {status.walreceivers}")
+        assert len(status.walreceivers) == 0
+
+    wait_until(30, 0.5, walreceivers_absent)
--- a/test_runner/fixtures/workload.py
+++ b/test_runner/fixtures/workload.py
@@ -175,7 +175,9 @@ class Workload:
                if upload:
                    # Wait for written data to be uploaded to S3 (force a checkpoint to trigger upload)
                    ps_http.timeline_checkpoint(
-                        tenant_shard_id, self.timeline_id, wait_until_uploaded=True
+                        tenant_shard_id,
+                        self.timeline_id,
+                        wait_until_uploaded=True,
                    )
                    log.info(f"Churn: waiting for remote LSN {last_flush_lsn}")
                else:
--- a/test_runner/regress/test_branch_and_gc.py
+++ b/test_runner/regress/test_branch_and_gc.py
@@ -53,7 +53,7 @@ def test_branch_and_gc(neon_simple_env: NeonEnv, build_type: str):
    env = neon_simple_env
    pageserver_http_client = env.pageserver.http_client()

-    tenant, _ = env.neon_cli.create_tenant(
+    tenant, timeline_main = env.neon_cli.create_tenant(
        conf={
            # disable background GC
            "gc_period": "0s",
@@ -70,8 +70,7 @@ def test_branch_and_gc(neon_simple_env: NeonEnv, build_type: str):
        }
    )

-    timeline_main = env.neon_cli.create_timeline("test_main", tenant_id=tenant)
-    endpoint_main = env.endpoints.create_start("test_main", tenant_id=tenant)
+    endpoint_main = env.endpoints.create_start("main", tenant_id=tenant)

    main_cur = endpoint_main.connect().cursor()

@@ -92,7 +91,7 @@ def test_branch_and_gc(neon_simple_env: NeonEnv, build_type: str):
    pageserver_http_client.timeline_gc(tenant, timeline_main, lsn2 - lsn1 + 1024)

    env.neon_cli.create_branch(
-        "test_branch", "test_main", tenant_id=tenant, ancestor_start_lsn=lsn1
+        "test_branch", ancestor_branch_name="main", ancestor_start_lsn=lsn1, tenant_id=tenant
    )
    endpoint_branch = env.endpoints.create_start("test_branch", tenant_id=tenant)

--- a/test_runner/regress/test_compaction.py
+++ b/test_runner/regress/test_compaction.py
@@ -63,7 +63,10 @@ page_cache_size=10
            log.info(f"Running churn round {i}/{churn_rounds} ...")

        workload.churn_rows(row_count, env.pageserver.id)
-        ps_http.timeline_compact(tenant_id, timeline_id)
+        # Force L0 compaction to ensure the number of layers is within bounds; we don't want to count L0 layers
+        # in this benchmark. In other words, this smoke test ensures the number of L1 layers is bounded.
+        ps_http.timeline_compact(tenant_id, timeline_id, force_l0_compaction=True)
+        assert ps_http.perf_info(tenant_id, timeline_id)[0]["num_of_l0"] <= 1

    log.info("Validating at workload end ...")
    workload.validate(env.pageserver.id)
--- a/test_runner/regress/test_compatibility.py
+++ b/test_runner/regress/test_compatibility.py
@@ -252,7 +252,7 @@ def test_forward_compatibility(
        # not using env.pageserver.version because it was initialized before
        prev_pageserver_version_str = env.get_binary_version("pageserver")
        prev_pageserver_version_match = re.search(
-            "Neon page server git-env:(.*) failpoints: (.*), features: (.*)",
+            "Neon page server git(?:-env)?:(.*) failpoints: (.*), features: (.*)",
            prev_pageserver_version_str,
        )
        if prev_pageserver_version_match is not None:
@@ -263,12 +263,12 @@ def test_forward_compatibility(
            )

        # does not include logs from previous runs
-        assert not env.pageserver.log_contains("git-env:" + prev_pageserver_version)
+        assert not env.pageserver.log_contains(f"git(-env)?:{prev_pageserver_version}")

        env.start()

        # ensure the specified pageserver is running
-        assert env.pageserver.log_contains("git-env:" + prev_pageserver_version)
+        assert env.pageserver.log_contains(f"git(-env)?:{prev_pageserver_version}")

        check_neon_works(
            env,
--- a/test_runner/regress/test_neon_cli.py
+++ b/test_runner/regress/test_neon_cli.py
@@ -31,9 +31,7 @@ def helper_compare_timeline_list(
        )
    )

-    timelines_cli = env.neon_cli.list_timelines()
-    assert timelines_cli == env.neon_cli.list_timelines(initial_tenant)
-
+    timelines_cli = env.neon_cli.list_timelines(initial_tenant)
    cli_timeline_ids = sorted([timeline_id for (_, timeline_id) in timelines_cli])
    assert timelines_api == cli_timeline_ids

--- a/test_runner/regress/test_neon_extension.py
+++ b/test_runner/regress/test_neon_extension.py
@@ -24,7 +24,7 @@ def test_neon_extension(neon_env_builder: NeonEnvBuilder):
            # IMPORTANT:
            # If the version has changed, the test should be updated.
            # Ensure that the default version is also updated in the neon.control file
-            assert cur.fetchone() == ("1.4",)
+            assert cur.fetchone() == ("1.5",)
            cur.execute("SELECT * from neon.NEON_STAT_FILE_CACHE")
            res = cur.fetchall()
            log.info(res)
@@ -48,7 +48,7 @@ def test_neon_extension_compatibility(neon_env_builder: NeonEnvBuilder):
            # IMPORTANT:
            # If the version has changed, the test should be updated.
            # Ensure that the default version is also updated in the neon.control file
-            assert cur.fetchone() == ("1.4",)
+            assert cur.fetchone() == ("1.5",)
            cur.execute("SELECT * from neon.NEON_STAT_FILE_CACHE")
            all_versions = ["1.5", "1.4", "1.3", "1.2", "1.1", "1.0"]
            current_version = "1.5"
--- a/test_runner/regress/test_next_xid.py
+++ b/test_runner/regress/test_next_xid.py
@@ -435,7 +435,9 @@ $$;

    # Wait until pageserver has received all the data, and restart the endpoint
    wait_for_wal_insert_lsn(env, endpoint, tenant_id, timeline_id)
-    endpoint.stop(mode="immediate")  # 'immediate' to avoid writing shutdown checkpoint
+    endpoint.stop(
+        mode="immediate", sks_wait_walreceiver_gone=(env.safekeepers, timeline_id)
+    )  # 'immediate' to avoid writing shutdown checkpoint
    endpoint.start()

    # Check that the next-multixid value wrapped around correctly
--- a/test_runner/regress/test_pageserver_generations.py
+++ b/test_runner/regress/test_pageserver_generations.py
@@ -549,6 +549,14 @@ def test_multi_attach(
    tenant_id = env.initial_tenant
    timeline_id = env.initial_timeline

+    # Instruct the storage controller to not interfere with our low level configuration
+    # of the pageserver's attachment states.  Otherwise when it sees nodes go offline+return,
+    # it would send its own requests that would conflict with the test's.
+    env.storage_controller.tenant_policy_update(tenant_id, {"scheduling": "Stop"})
+    env.storage_controller.allowed_errors.extend(
+        [".*Scheduling is disabled by policy Stop.*", ".*Skipping reconcile for policy Stop.*"]
+    )
+
    # Initially, the tenant will be attached to the first pageserver (first is default in our test harness)
    wait_until(10, 0.2, lambda: assert_tenant_state(http_clients[0], tenant_id, "Active"))
    _detail = http_clients[0].timeline_detail(tenant_id, timeline_id)
--- a/test_runner/regress/test_pageserver_restart.py
+++ b/test_runner/regress/test_pageserver_restart.py
@@ -174,8 +174,7 @@ def test_pageserver_chaos(
            "checkpoint_distance": "5000000",
        }
    )
-    env.neon_cli.create_timeline("test_pageserver_chaos", tenant_id=tenant)
-    endpoint = env.endpoints.create_start("test_pageserver_chaos", tenant_id=tenant)
+    endpoint = env.endpoints.create_start("main", tenant_id=tenant)

    # Create table, and insert some rows. Make it big enough that it doesn't fit in
    # shared_buffers, otherwise the SELECT after restart will just return answer
--- a/test_runner/regress/test_replica_start.py
+++ b/test_runner/regress/test_replica_start.py
@@ -103,6 +103,7 @@ def test_replica_start_scan_clog_crashed_xids(neon_simple_env: NeonEnv):
    # Initialize the primary, a test table, and a helper function to create lots
    # of subtransactions.
    env = neon_simple_env
+    timeline_id = env.initial_timeline
    primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary")
    primary_conn = primary.connect()
    primary_cur = primary_conn.cursor()
@@ -114,7 +115,7 @@ def test_replica_start_scan_clog_crashed_xids(neon_simple_env: NeonEnv):
    # chance to write abort records for them.
    primary_cur.execute("begin")
    primary_cur.execute("select create_subxacts(100000)")
-    primary.stop(mode="immediate")
+    primary.stop(mode="immediate", sks_wait_walreceiver_gone=(env.safekeepers, timeline_id))

    # Restart the primary. Do some light work, and shut it down cleanly
    primary.start()
@@ -659,6 +660,7 @@ def test_replica_start_with_too_many_unused_xids(neon_simple_env: NeonEnv):

    # Initialize the primary and a test table
    env = neon_simple_env
+    timeline_id = env.initial_timeline
    primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary")
    with primary.cursor() as primary_cur:
        primary_cur.execute("create table t(pk serial primary key, payload integer)")
@@ -667,7 +669,7 @@ def test_replica_start_with_too_many_unused_xids(neon_simple_env: NeonEnv):
        with primary.cursor() as primary_cur:
            primary_cur.execute("insert into t (payload) values (0)")
        # restart primary
-        primary.stop("immediate")
+        primary.stop("immediate", sks_wait_walreceiver_gone=(env.safekeepers, timeline_id))
        primary.start()

    # Wait for the WAL to be flushed
--- a/Show More
+++ b/Show More