fix: don't hard-error on initial Redis failure

2026-07-03 12:10:36 +00:00 · 2025-01-30 10:26:48 +00:00
44 changed files with 825 additions and 1475 deletions
--- a/.github/ISSUE_TEMPLATE/bug-template.md
+++ b/.github/ISSUE_TEMPLATE/bug-template.md
@@ -3,7 +3,6 @@ name: Bug Template
 about: Used for describing bugs
 title: ''
 labels: t/bug
-type: Bug
 assignees: ''

 ---
--- a/.github/ISSUE_TEMPLATE/epic-template.md
+++ b/.github/ISSUE_TEMPLATE/epic-template.md
@@ -4,7 +4,6 @@ about: A set of related tasks contributing towards specific outcome, comprising
  more than 1 week of work.
 title: 'Epic: '
 labels: t/Epic
-type: Epic
 assignees: ''

 ---
--- a/.github/workflows/_build-and-test-locally.yml
+++ b/.github/workflows/_build-and-test-locally.yml
@@ -158,8 +158,6 @@ jobs:

      - name: Run cargo build
        run: |
-          PQ_LIB_DIR=$(pwd)/pg_install/v16/lib
-          export PQ_LIB_DIR
          ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests

      # Do install *before* running rust tests because they might recompile the
@@ -217,8 +215,6 @@ jobs:
        env:
          NEXTEST_RETRIES: 3
        run: |
-          PQ_LIB_DIR=$(pwd)/pg_install/v16/lib
-          export PQ_LIB_DIR
          LD_LIBRARY_PATH=$(pwd)/pg_install/v17/lib
          export LD_LIBRARY_PATH

--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -319,7 +319,7 @@ jobs:
                      { "pg_version": 16, "region_id": "azure-eastus2",          "platform": "neonvm-azure-captest-new",      "db_size": "50gb","runner": '"$runner_azure"',   "image": "neondatabase/build-tools:pinned-bookworm" },
                      { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" },
                      { "pg_version": 17, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier",       "db_size": "3gb" ,"runner": '"$runner_default"', "image": "'"$image_default"'" },
-                      { "pg_version": 17, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new",            "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" },
+                      { "pg_version": 17, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new",            "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, 
                      { "pg_version": 17, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new-many-tables","db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" },
                      { "pg_version": 17, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new",            "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }]
        }'
@@ -458,7 +458,7 @@ jobs:

        echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

-    # we want to compare Neon project OLTP throughput and latency at scale factor 10 GB
+    # we want to compare Neon project OLTP throughput and latency at scale factor 10 GB 
    # without (neonvm-captest-new)
    # and with (neonvm-captest-new-many-tables) many relations in the database
    - name: Create many relations before the run
@@ -590,20 +590,36 @@ jobs:
    steps:
    - uses: actions/checkout@v4

-    - name: Configure AWS credentials
-      uses: aws-actions/configure-aws-credentials@v4
-      with:
-        aws-region: eu-central-1
-        role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
-        role-duration-seconds: 18000 # 5 hours
+    # Until https://github.com/neondatabase/neon/issues/8275 is fixed, we temporarily install postgresql-16
+    # (plus libpq5 from the postgresql-17 pool) instead of using Neon artifacts containing pgbench.
+    - name: Install postgresql-16 where pytest expects it
+      run: |
+        # Just to make it easier to test things locally on macOS (with arm64)
+        arch=$(uname -m | sed 's/x86_64/amd64/g' | sed 's/aarch64/arm64/g')

-    - name: Download Neon artifact
-      uses: ./.github/actions/download
-      with:
-        name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
-        path: /tmp/neon/
-        prefix: latest
-        aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+        cd /home/nonroot
+        wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-17/libpq5_17.2-1.pgdg120+1_${arch}.deb"
+        wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.6-1.pgdg120+1_${arch}.deb"
+        wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.6-1.pgdg120+1_${arch}.deb"
+        dpkg -x libpq5_17.2-1.pgdg120+1_${arch}.deb pg
+        dpkg -x postgresql-16_16.6-1.pgdg120+1_${arch}.deb pg
+        dpkg -x postgresql-client-16_16.6-1.pgdg120+1_${arch}.deb pg
+
+        mkdir -p /tmp/neon/pg_install/v16/bin
+        mkdir -p /tmp/neon/pg_install/v17/bin
+        ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/pgbench /tmp/neon/pg_install/v16/bin/pgbench
+        ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/psql    /tmp/neon/pg_install/v16/bin/psql
+        ln -s /home/nonroot/pg/usr/lib/$(uname -m)-linux-gnu     /tmp/neon/pg_install/v16/lib
+        ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/pgbench /tmp/neon/pg_install/v17/bin/pgbench
+        ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/psql    /tmp/neon/pg_install/v17/bin/psql
+        ln -s /home/nonroot/pg/usr/lib/$(uname -m)-linux-gnu     /tmp/neon/pg_install/v17/lib
+
+        LD_LIBRARY_PATH="/home/nonroot/pg/usr/lib/$(uname -m)-linux-gnu:${LD_LIBRARY_PATH:-}"
+        export LD_LIBRARY_PATH
+        echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" >> ${GITHUB_ENV}
+
+        /tmp/neon/pg_install/v16/bin/pgbench --version
+        /tmp/neon/pg_install/v16/bin/psql --version

    - name: Set up Connection String
      id: set-up-connstr
@@ -626,6 +642,13 @@ jobs:

        echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

+    - name: Configure AWS credentials
+      uses: aws-actions/configure-aws-credentials@v4
+      with:
+        aws-region: eu-central-1
+        role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+        role-duration-seconds: 18000 # 5 hours
+
    - name: Benchmark pgvector hnsw indexing
      uses: ./.github/actions/run-python-test-set
      with:
@@ -741,10 +764,10 @@ jobs:
          neonvm-captest-reuse)
            case "${PG_VERSION}" in
              16)
-                CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CLICKBENCH_10M_CONNSTR }}
+                CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CLICKBENCH_10M_CONNSTR_V16 }}
                ;;
              17)
-                CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CLICKBENCH_CONNSTR_PG17 }}
+                CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR_PG17 }}
                ;;
              *)
                echo >&2 "Unsupported PG_VERSION=${PG_VERSION} for PLATFORM=${PLATFORM}"
@@ -864,7 +887,7 @@ jobs:
                CONNSTR_SECRET_NAME="BENCHMARK_CAPTEST_TPCH_S10_CONNSTR"
                ;;
              17)
-                CONNSTR_SECRET_NAME="BENCHMARK_CAPTEST_TPCH_CONNSTR_PG17"
+                CONNSTR_SECRET_NAME="BENCHMARK_CAPTEST_CONNSTR_PG17"
                ;;
              *)
                echo >&2 "Unsupported PG_VERSION=${PG_VERSION} for PLATFORM=${PLATFORM}"
@@ -883,7 +906,7 @@ jobs:
            exit 1
            ;;
        esac
-
+        
        echo "CONNSTR_SECRET_NAME=${CONNSTR_SECRET_NAME}" >> $GITHUB_ENV

    - name: Set up Connection String
@@ -984,7 +1007,7 @@ jobs:
                CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_CAPTEST_CONNSTR }}
                ;;
              17)
-                CONNSTR=${{ secrets.BENCHMARK_CAPTEST_USER_EXAMPLE_CONNSTR_PG17 }}
+                CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR_PG17 }}
                ;;
              *)
                echo >&2 "Unsupported PG_VERSION=${PG_VERSION} for PLATFORM=${PLATFORM}"
--- a/.github/workflows/build-macos.yml
+++ b/.github/workflows/build-macos.yml
@@ -235,7 +235,7 @@ jobs:
          echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV

      - name: Run cargo build (only for v17)
-        run: PQ_LIB_DIR=$(pwd)/pg_install/v17/lib cargo build --all --release -j$(sysctl -n hw.ncpu)
+        run: cargo build --all --release -j$(sysctl -n hw.ncpu)

      - name: Check that no warnings are produced (only for v17)
        run: ./run_clippy.sh
--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -114,7 +114,7 @@ jobs:
        run: make walproposer-lib -j$(nproc)

      - name: Produce the build stats
-        run: PQ_LIB_DIR=$(pwd)/pg_install/v17/lib cargo build --all --release --timings -j$(nproc)
+        run: cargo build --all --release --timings -j$(nproc)

      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -941,6 +941,18 @@ version = "1.6.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b"

+[[package]]
+name = "bb8"
+version = "0.8.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d89aabfae550a5c44b43ab941844ffcd2e993cb6900b342debf59e9ea74acdb8"
+dependencies = [
+ "async-trait",
+ "futures-util",
+ "parking_lot 0.12.1",
+ "tokio",
+]
+
 [[package]]
 name = "bcder"
 version = "0.7.4"
@@ -1300,7 +1312,7 @@ dependencies = [
 "tar",
 "thiserror 1.0.69",
 "tokio",
- "tokio-postgres",
+ "tokio-postgres 0.7.9",
 "tokio-stream",
 "tokio-util",
 "tower 0.5.2",
@@ -1409,7 +1421,7 @@ dependencies = [
 "storage_broker",
 "thiserror 1.0.69",
 "tokio",
- "tokio-postgres",
+ "tokio-postgres 0.7.9",
 "tokio-util",
 "toml",
 "toml_edit",
@@ -1785,11 +1797,24 @@ dependencies = [
 "chrono",
 "diesel_derives",
 "itoa",
- "pq-sys",
- "r2d2",
 "serde_json",
 ]

+[[package]]
+name = "diesel-async"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "51a307ac00f7c23f526a04a77761a0519b9f0eb2838ebf5b905a58580095bdcb"
+dependencies = [
+ "async-trait",
+ "bb8",
+ "diesel",
+ "futures-util",
+ "scoped-futures",
+ "tokio",
+ "tokio-postgres 0.7.12",
+]
+
 [[package]]
 name = "diesel_derives"
 version = "2.2.1"
@@ -4035,8 +4060,8 @@ dependencies = [
 "pageserver_compaction",
 "pin-project-lite",
 "postgres",
- "postgres-protocol",
- "postgres-types",
+ "postgres-protocol 0.6.6",
+ "postgres-types 0.2.6",
 "postgres_backend",
 "postgres_connection",
 "postgres_ffi",
@@ -4067,7 +4092,7 @@ dependencies = [
 "tokio",
 "tokio-epoll-uring",
 "tokio-io-timeout",
- "tokio-postgres",
+ "tokio-postgres 0.7.9",
 "tokio-stream",
 "tokio-tar",
 "tokio-util",
@@ -4125,7 +4150,7 @@ dependencies = [
 "serde",
 "thiserror 1.0.69",
 "tokio",
- "tokio-postgres",
+ "tokio-postgres 0.7.9",
 "tokio-stream",
 "tokio-util",
 "utils",
@@ -4431,7 +4456,7 @@ dependencies = [
 "futures-util",
 "log",
 "tokio",
- "tokio-postgres",
+ "tokio-postgres 0.7.9",
 ]

 [[package]]
@@ -4452,6 +4477,24 @@ dependencies = [
 "stringprep",
 ]

+[[package]]
+name = "postgres-protocol"
+version = "0.6.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "acda0ebdebc28befa84bee35e651e4c5f09073d668c7aed4cf7e23c3cda84b23"
+dependencies = [
+ "base64 0.22.1",
+ "byteorder",
+ "bytes",
+ "fallible-iterator",
+ "hmac",
+ "md-5",
+ "memchr",
+ "rand 0.8.5",
+ "sha2",
+ "stringprep",
+]
+
 [[package]]
 name = "postgres-protocol2"
 version = "0.1.0"
@@ -4476,7 +4519,18 @@ dependencies = [
 "bytes",
 "chrono",
 "fallible-iterator",
- "postgres-protocol",
+ "postgres-protocol 0.6.6",
+]
+
+[[package]]
+name = "postgres-types"
+version = "0.2.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f66ea23a2d0e5734297357705193335e0a957696f34bed2f2faefacb2fec336f"
+dependencies = [
+ "bytes",
+ "fallible-iterator",
+ "postgres-protocol 0.6.7",
 ]

 [[package]]
@@ -4501,7 +4555,7 @@ dependencies = [
 "serde",
 "thiserror 1.0.69",
 "tokio",
- "tokio-postgres",
+ "tokio-postgres 0.7.9",
 "tokio-postgres-rustls",
 "tokio-rustls 0.26.0",
 "tokio-util",
@@ -4516,7 +4570,7 @@ dependencies = [
 "itertools 0.10.5",
 "once_cell",
 "postgres",
- "tokio-postgres",
+ "tokio-postgres 0.7.9",
 "url",
 ]

@@ -4603,15 +4657,6 @@ version = "0.2.17"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de"

-[[package]]
-name = "pq-sys"
-version = "0.6.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f6cc05d7ea95200187117196eee9edd0644424911821aeb28a18ce60ea0b8793"
-dependencies = [
- "vcpkg",
-]
-
 [[package]]
 name = "pq_proto"
 version = "0.1.0"
@@ -4619,7 +4664,7 @@ dependencies = [
 "byteorder",
 "bytes",
 "itertools 0.10.5",
- "postgres-protocol",
+ "postgres-protocol 0.6.6",
 "rand 0.8.5",
 "serde",
 "thiserror 1.0.69",
@@ -4867,7 +4912,7 @@ dependencies = [
 "tikv-jemalloc-ctl",
 "tikv-jemallocator",
 "tokio",
- "tokio-postgres",
+ "tokio-postgres 0.7.9",
 "tokio-postgres2",
 "tokio-rustls 0.26.0",
 "tokio-tungstenite 0.21.0",
@@ -4924,17 +4969,6 @@ dependencies = [
 "proc-macro2",
 ]

-[[package]]
-name = "r2d2"
-version = "0.8.10"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "51de85fb3fb6524929c8a2eb85e6b6d363de4e8c48f9e2c2eac4944abc181c93"
-dependencies = [
- "log",
- "parking_lot 0.12.1",
- "scheduled-thread-pool",
-]
-
 [[package]]
 name = "rand"
 version = "0.7.3"
@@ -5666,7 +5700,7 @@ dependencies = [
 "pageserver_api",
 "parking_lot 0.12.1",
 "postgres",
- "postgres-protocol",
+ "postgres-protocol 0.6.6",
 "postgres_backend",
 "postgres_ffi",
 "pprof",
@@ -5690,7 +5724,7 @@ dependencies = [
 "tikv-jemallocator",
 "tokio",
 "tokio-io-timeout",
- "tokio-postgres",
+ "tokio-postgres 0.7.9",
 "tokio-stream",
 "tokio-tar",
 "tokio-util",
@@ -5749,12 +5783,12 @@ dependencies = [
 ]

 [[package]]
-name = "scheduled-thread-pool"
-version = "0.2.7"
+name = "scoped-futures"
+version = "0.1.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3cbc66816425a074528352f5789333ecff06ca41b36b0b0efdfbb29edc391a19"
+checksum = "1b24aae2d0636530f359e9d5ef0c04669d11c5e756699b27a6a6d845d8329091"
 dependencies = [
- "parking_lot 0.12.1",
+ "pin-project-lite",
 ]

 [[package]]
@@ -6289,6 +6323,7 @@ dependencies = [
 "clap",
 "control_plane",
 "diesel",
+ "diesel-async",
 "diesel_migrations",
 "fail",
 "futures",
@@ -6303,10 +6338,10 @@ dependencies = [
 "pageserver_api",
 "pageserver_client",
 "postgres_connection",
- "r2d2",
 "rand 0.8.5",
 "reqwest",
 "routerify",
+ "scoped-futures",
 "scopeguard",
 "serde",
 "serde_json",
@@ -6359,7 +6394,7 @@ dependencies = [
 "serde_json",
 "storage_controller_client",
 "tokio",
- "tokio-postgres",
+ "tokio-postgres 0.7.9",
 "tokio-postgres-rustls",
 "tokio-stream",
 "tokio-util",
@@ -6838,8 +6873,34 @@ dependencies = [
 "percent-encoding",
 "phf",
 "pin-project-lite",
- "postgres-protocol",
- "postgres-types",
+ "postgres-protocol 0.6.6",
+ "postgres-types 0.2.6",
+ "rand 0.8.5",
+ "socket2",
+ "tokio",
+ "tokio-util",
+ "whoami",
+]
+
+[[package]]
+name = "tokio-postgres"
+version = "0.7.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3b5d3742945bc7d7f210693b0c58ae542c6fd47b17adbbda0885f3dcb34a6bdb"
+dependencies = [
+ "async-trait",
+ "byteorder",
+ "bytes",
+ "fallible-iterator",
+ "futures-channel",
+ "futures-util",
+ "log",
+ "parking_lot 0.12.1",
+ "percent-encoding",
+ "phf",
+ "pin-project-lite",
+ "postgres-protocol 0.6.7",
+ "postgres-types 0.2.8",
 "rand 0.8.5",
 "socket2",
 "tokio",
@@ -6856,7 +6917,7 @@ dependencies = [
 "ring",
 "rustls 0.23.18",
 "tokio",
- "tokio-postgres",
+ "tokio-postgres 0.7.9",
 "tokio-rustls 0.26.0",
 "x509-certificate",
 ]
@@ -7515,12 +7576,6 @@ version = "0.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d"

-[[package]]
-name = "vcpkg"
-version = "0.2.15"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426"
-
 [[package]]
 name = "version_check"
 version = "0.9.4"
@@ -7540,7 +7595,7 @@ dependencies = [
 "serde_json",
 "sysinfo",
 "tokio",
- "tokio-postgres",
+ "tokio-postgres 0.7.9",
 "tokio-util",
 "tracing",
 "tracing-subscriber",
--- a/2
+++ b/2
@@ -45,7 +45,7 @@ COPY --chown=nonroot . .

 ARG ADDITIONAL_RUSTFLAGS
 RUN set -e \
-    && PQ_LIB_DIR=$(pwd)/pg_install/v${STABLE_PG_VERSION}/lib RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment -Cforce-frame-pointers=yes ${ADDITIONAL_RUSTFLAGS}" cargo build \
+    && RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment -Cforce-frame-pointers=yes ${ADDITIONAL_RUSTFLAGS}" cargo build \
      --bin pg_sni_router  \
      --bin pageserver  \
      --bin pagectl  \
--- a/2
+++ b/2
@@ -64,8 +64,6 @@ CARGO_BUILD_FLAGS += $(filter -j1,$(MAKEFLAGS))
 CARGO_CMD_PREFIX += $(if $(filter n,$(MAKEFLAGS)),,+)
 # Force cargo not to print progress bar
 CARGO_CMD_PREFIX += CARGO_TERM_PROGRESS_WHEN=never CI=1
-# Set PQ_LIB_DIR to make sure `storage_controller` get linked with bundled libpq (through diesel)
-CARGO_CMD_PREFIX += PQ_LIB_DIR=$(POSTGRES_INSTALL_DIR)/v16/lib

 CACHEDIR_TAG_CONTENTS := "Signature: 8a477f597d28d172789f06886806bc55"

--- a/compute/compute-node.Dockerfile
+++ b/compute/compute-node.Dockerfile
@@ -1140,8 +1140,8 @@ RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.1.0.tar.gz
 FROM rust-extensions-build AS pg-mooncake-build
 ARG PG_VERSION

-RUN wget https://github.com/Mooncake-Labs/pg_mooncake/releases/download/v0.1.1/pg_mooncake-0.1.1.tar.gz -O pg_mooncake.tar.gz && \
-    echo "a2d16eff7948dde64f072609ca5d2962d6b4d07cb89d45952add473529c55f55 pg_mooncake.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/Mooncake-Labs/pg_mooncake/releases/download/v0.1.0/pg_mooncake-0.1.0.tar.gz -O pg_mooncake.tar.gz && \
+    echo "eafd059b77f541f11525eb8affcd66a176968cbd8fe7c0d436e733f2aa4da59f pg_mooncake.tar.gz" | sha256sum --check && \
    mkdir pg_mooncake-src && cd pg_mooncake-src && tar xzf ../pg_mooncake.tar.gz --strip-components=1 -C . && \
    make release -j $(getconf _NPROCESSORS_ONLN) && \
    make install -j $(getconf _NPROCESSORS_ONLN) && \
@@ -1345,7 +1345,6 @@ FROM neon-pg-ext-build AS neon-pg-ext-test
 ARG PG_VERSION
 RUN mkdir /ext-src

-COPY --from=pg-build /postgres /postgres
 #COPY --from=postgis-build /postgis.tar.gz /ext-src/
 #COPY --from=postgis-build /sfcgal/* /usr
 COPY --from=plv8-build /plv8.tar.gz /ext-src/
--- a/compute/patches/contrib_pg16.patch
+++ b/compute/patches/contrib_pg16.patch
@@ -1,242 +0,0 @@
-diff --git a/contrib/amcheck/expected/check_heap.out b/contrib/amcheck/expected/check_heap.out
-index 979e5e8..2375b45 100644
--- a/contrib/amcheck/expected/check_heap.out
-+++ b/contrib/amcheck/expected/check_heap.out
-@@ -80,12 +80,9 @@ INSERT INTO heaptest (a, b)
- -- same transaction.  The heaptest table is smaller than the default
- -- wal_skip_threshold, so a wal_level=minimal commit reads the table into
- -- shared_buffers.  A transaction delays that and excludes any autovacuum.
-SET allow_in_place_tablespaces = true;
-CREATE TABLESPACE regress_test_stats_tblspc LOCATION '';
- SELECT sum(reads) AS stats_bulkreads_before
-   FROM pg_stat_io WHERE context = 'bulkread' \gset
- BEGIN;
-ALTER TABLE heaptest SET TABLESPACE regress_test_stats_tblspc;
- -- Check that valid options are not rejected nor corruption reported
- -- for a non-empty table
- SELECT * FROM verify_heapam(relation := 'heaptest', skip := 'none');
-@@ -118,14 +115,6 @@ SELECT pg_stat_force_next_flush();
-  
- (1 row)
- 
-SELECT sum(reads) AS stats_bulkreads_after
-  FROM pg_stat_io WHERE context = 'bulkread' \gset
-SELECT :stats_bulkreads_after > :stats_bulkreads_before;
- ?column? 
-----------
- t
-(1 row)
-
- CREATE ROLE regress_heaptest_role;
- -- verify permissions are checked (error due to function not callable)
- SET ROLE regress_heaptest_role;
-@@ -233,7 +222,6 @@ ERROR:  cannot check relation "test_foreign_table"
- DETAIL:  This operation is not supported for foreign tables.
- -- cleanup
- DROP TABLE heaptest;
-DROP TABLESPACE regress_test_stats_tblspc;
- DROP TABLE test_partition;
- DROP TABLE test_partitioned;
- DROP OWNED BY regress_heaptest_role; -- permissions
-diff --git a/contrib/amcheck/sql/check_heap.sql b/contrib/amcheck/sql/check_heap.sql
-index 1745bae..3b429c3 100644
--- a/contrib/amcheck/sql/check_heap.sql
-+++ b/contrib/amcheck/sql/check_heap.sql
-@@ -40,12 +40,9 @@ INSERT INTO heaptest (a, b)
- -- same transaction.  The heaptest table is smaller than the default
- -- wal_skip_threshold, so a wal_level=minimal commit reads the table into
- -- shared_buffers.  A transaction delays that and excludes any autovacuum.
-SET allow_in_place_tablespaces = true;
-CREATE TABLESPACE regress_test_stats_tblspc LOCATION '';
- SELECT sum(reads) AS stats_bulkreads_before
-   FROM pg_stat_io WHERE context = 'bulkread' \gset
- BEGIN;
-ALTER TABLE heaptest SET TABLESPACE regress_test_stats_tblspc;
- -- Check that valid options are not rejected nor corruption reported
- -- for a non-empty table
- SELECT * FROM verify_heapam(relation := 'heaptest', skip := 'none');
-@@ -58,9 +55,6 @@ COMMIT;
- --   ALTER TABLE ... SET TABLESPACE ...
- -- causing an additional bulkread, which should be reflected in pg_stat_io.
- SELECT pg_stat_force_next_flush();
-SELECT sum(reads) AS stats_bulkreads_after
-  FROM pg_stat_io WHERE context = 'bulkread' \gset
-SELECT :stats_bulkreads_after > :stats_bulkreads_before;
- 
- CREATE ROLE regress_heaptest_role;
- 
-@@ -140,7 +134,6 @@ SELECT * FROM verify_heapam('test_foreign_table',
- 
- -- cleanup
- DROP TABLE heaptest;
-DROP TABLESPACE regress_test_stats_tblspc;
- DROP TABLE test_partition;
- DROP TABLE test_partitioned;
- DROP OWNED BY regress_heaptest_role; -- permissions
-diff --git a/contrib/citext/expected/create_index_acl.out b/contrib/citext/expected/create_index_acl.out
-index 33be13a..70a406c 100644
--- a/contrib/citext/expected/create_index_acl.out
-+++ b/contrib/citext/expected/create_index_acl.out
-@@ -5,9 +5,6 @@
- -- owner having as few applicable privileges as possible.  (The privileges.sql
- -- regress_sro_user tests look for the opposite defect; they confirm that
- -- DefineIndex() uses the table owner userid where necessary.)
-SET allow_in_place_tablespaces = true;
-CREATE TABLESPACE regress_create_idx_tblspace LOCATION '';
-RESET allow_in_place_tablespaces;
- BEGIN;
- CREATE ROLE regress_minimal;
- CREATE SCHEMA s;
-@@ -49,11 +46,9 @@ ALTER TABLE s.x OWNER TO regress_minimal;
- -- Empty-table DefineIndex()
- CREATE UNIQUE INDEX u0rows ON s.x USING btree
-   ((s.index_this_expr(y, s.const())) COLLATE s.coll s.citext_pattern_ops)
-  TABLESPACE regress_create_idx_tblspace
-   WHERE s.index_row_if(y);
- ALTER TABLE s.x ADD CONSTRAINT e0rows EXCLUDE USING btree
-   ((s.index_this_expr(y, s.const())) COLLATE s.coll WITH s.=)
-  USING INDEX TABLESPACE regress_create_idx_tblspace
-   WHERE (s.index_row_if(y));
- -- Make the table nonempty.
- INSERT INTO s.x VALUES ('foo'), ('bar');
-@@ -66,11 +61,9 @@ RESET search_path;
- GRANT EXECUTE ON FUNCTION s.index_this_expr TO regress_minimal;
- CREATE UNIQUE INDEX u2rows ON s.x USING btree
-   ((s.index_this_expr(y, s.const())) COLLATE s.coll s.citext_pattern_ops)
-  TABLESPACE regress_create_idx_tblspace
-   WHERE s.index_row_if(y);
- ALTER TABLE s.x ADD CONSTRAINT e2rows EXCLUDE USING btree
-   ((s.index_this_expr(y, s.const())) COLLATE s.coll WITH s.=)
-  USING INDEX TABLESPACE regress_create_idx_tblspace
-   WHERE (s.index_row_if(y));
- -- Shall not find s.coll via search_path, despite the s.const->public.setter
- -- call having set search_path=s during expression planning.  Suppress the
-@@ -78,9 +71,7 @@ ALTER TABLE s.x ADD CONSTRAINT e2rows EXCLUDE USING btree
- \set VERBOSITY sqlstate
- ALTER TABLE s.x ADD CONSTRAINT underqualified EXCLUDE USING btree
-   ((s.index_this_expr(y, s.const())) COLLATE coll WITH s.=)
-  USING INDEX TABLESPACE regress_create_idx_tblspace
-   WHERE (s.index_row_if(y));
- ERROR:  42704
- \set VERBOSITY default
- ROLLBACK;
-DROP TABLESPACE regress_create_idx_tblspace;
-diff --git a/contrib/citext/sql/create_index_acl.sql b/contrib/citext/sql/create_index_acl.sql
-index 10b5225..ae442e1 100644
--- a/contrib/citext/sql/create_index_acl.sql
-+++ b/contrib/citext/sql/create_index_acl.sql
-@@ -6,10 +6,6 @@
- -- regress_sro_user tests look for the opposite defect; they confirm that
- -- DefineIndex() uses the table owner userid where necessary.)
- 
-SET allow_in_place_tablespaces = true;
-CREATE TABLESPACE regress_create_idx_tblspace LOCATION '';
-RESET allow_in_place_tablespaces;
-
- BEGIN;
- CREATE ROLE regress_minimal;
- CREATE SCHEMA s;
-@@ -51,11 +47,9 @@ ALTER TABLE s.x OWNER TO regress_minimal;
- -- Empty-table DefineIndex()
- CREATE UNIQUE INDEX u0rows ON s.x USING btree
-   ((s.index_this_expr(y, s.const())) COLLATE s.coll s.citext_pattern_ops)
-  TABLESPACE regress_create_idx_tblspace
-   WHERE s.index_row_if(y);
- ALTER TABLE s.x ADD CONSTRAINT e0rows EXCLUDE USING btree
-   ((s.index_this_expr(y, s.const())) COLLATE s.coll WITH s.=)
-  USING INDEX TABLESPACE regress_create_idx_tblspace
-   WHERE (s.index_row_if(y));
- -- Make the table nonempty.
- INSERT INTO s.x VALUES ('foo'), ('bar');
-@@ -68,11 +62,9 @@ RESET search_path;
- GRANT EXECUTE ON FUNCTION s.index_this_expr TO regress_minimal;
- CREATE UNIQUE INDEX u2rows ON s.x USING btree
-   ((s.index_this_expr(y, s.const())) COLLATE s.coll s.citext_pattern_ops)
-  TABLESPACE regress_create_idx_tblspace
-   WHERE s.index_row_if(y);
- ALTER TABLE s.x ADD CONSTRAINT e2rows EXCLUDE USING btree
-   ((s.index_this_expr(y, s.const())) COLLATE s.coll WITH s.=)
-  USING INDEX TABLESPACE regress_create_idx_tblspace
-   WHERE (s.index_row_if(y));
- -- Shall not find s.coll via search_path, despite the s.const->public.setter
- -- call having set search_path=s during expression planning.  Suppress the
-@@ -80,9 +72,7 @@ ALTER TABLE s.x ADD CONSTRAINT e2rows EXCLUDE USING btree
- \set VERBOSITY sqlstate
- ALTER TABLE s.x ADD CONSTRAINT underqualified EXCLUDE USING btree
-   ((s.index_this_expr(y, s.const())) COLLATE coll WITH s.=)
-  USING INDEX TABLESPACE regress_create_idx_tblspace
-   WHERE (s.index_row_if(y));
- \set VERBOSITY default
- ROLLBACK;
- 
-DROP TABLESPACE regress_create_idx_tblspace;
-diff --git a/contrib/file_fdw/expected/file_fdw.out b/contrib/file_fdw/expected/file_fdw.out
-index 72304e0..ebe131b 100644
--- a/contrib/file_fdw/expected/file_fdw.out
-+++ b/contrib/file_fdw/expected/file_fdw.out
-@@ -4,6 +4,7 @@
- -- directory paths are passed to us in environment variables
- \getenv abs_srcdir PG_ABS_SRCDIR
- -- Clean up in case a prior regression run failed
-+SET compute_query_id TO 'off';
- SET client_min_messages TO 'warning';
- DROP ROLE IF EXISTS regress_file_fdw_superuser, regress_file_fdw_user, regress_no_priv_user;
- RESET client_min_messages;
-diff --git a/contrib/file_fdw/sql/file_fdw.sql b/contrib/file_fdw/sql/file_fdw.sql
-index f0548e1..848a08c 100644
--- a/contrib/file_fdw/sql/file_fdw.sql
-+++ b/contrib/file_fdw/sql/file_fdw.sql
-@@ -6,6 +6,7 @@
- \getenv abs_srcdir PG_ABS_SRCDIR
- 
- -- Clean up in case a prior regression run failed
-+SET compute_query_id TO 'off';
- SET client_min_messages TO 'warning';
- DROP ROLE IF EXISTS regress_file_fdw_superuser, regress_file_fdw_user, regress_no_priv_user;
- RESET client_min_messages;
-diff --git a/contrib/pageinspect/expected/gist.out b/contrib/pageinspect/expected/gist.out
-index d1adbab..38b52ac 100644
--- a/contrib/pageinspect/expected/gist.out
-+++ b/contrib/pageinspect/expected/gist.out
-@@ -10,25 +10,6 @@ BEGIN;
- CREATE TABLE test_gist AS SELECT point(i,i) p, i::text t FROM
-     generate_series(1,1000) i;
- CREATE INDEX test_gist_idx ON test_gist USING gist (p);
--- Page 0 is the root, the rest are leaf pages
-SELECT * FROM gist_page_opaque_info(get_raw_page('test_gist_idx', 0));
- lsn | nsn | rightlink  | flags 
------+-----+------------+-------
- 0/1 | 0/0 | 4294967295 | {}
-(1 row)
-
-SELECT * FROM gist_page_opaque_info(get_raw_page('test_gist_idx', 1));
- lsn | nsn | rightlink  | flags  
------+-----+------------+--------
- 0/1 | 0/0 | 4294967295 | {leaf}
-(1 row)
-
-SELECT * FROM gist_page_opaque_info(get_raw_page('test_gist_idx', 2));
- lsn | nsn | rightlink | flags  
------+-----+-----------+--------
- 0/1 | 0/0 |         1 | {leaf}
-(1 row)
-
- COMMIT;
- SELECT * FROM gist_page_items(get_raw_page('test_gist_idx', 0), 'test_gist_idx');
-  itemoffset |   ctid    | itemlen | dead |             keys              
-diff --git a/contrib/pageinspect/sql/gist.sql b/contrib/pageinspect/sql/gist.sql
-index d263542..607992f 100644
--- a/contrib/pageinspect/sql/gist.sql
-+++ b/contrib/pageinspect/sql/gist.sql
-@@ -12,11 +12,6 @@ CREATE TABLE test_gist AS SELECT point(i,i) p, i::text t FROM
-     generate_series(1,1000) i;
- CREATE INDEX test_gist_idx ON test_gist USING gist (p);
- 
--- Page 0 is the root, the rest are leaf pages
-SELECT * FROM gist_page_opaque_info(get_raw_page('test_gist_idx', 0));
-SELECT * FROM gist_page_opaque_info(get_raw_page('test_gist_idx', 1));
-SELECT * FROM gist_page_opaque_info(get_raw_page('test_gist_idx', 2));
-
- COMMIT;
- 
- SELECT * FROM gist_page_items(get_raw_page('test_gist_idx', 0), 'test_gist_idx');
--- a/compute/patches/contrib_pg17.patch
+++ b/compute/patches/contrib_pg17.patch
@@ -1,196 +0,0 @@
-diff --git a/contrib/amcheck/expected/check_heap.out b/contrib/amcheck/expected/check_heap.out
-index 979e5e8..2375b45 100644
--- a/contrib/amcheck/expected/check_heap.out
-+++ b/contrib/amcheck/expected/check_heap.out
-@@ -80,12 +80,9 @@ INSERT INTO heaptest (a, b)
- -- same transaction.  The heaptest table is smaller than the default
- -- wal_skip_threshold, so a wal_level=minimal commit reads the table into
- -- shared_buffers.  A transaction delays that and excludes any autovacuum.
-SET allow_in_place_tablespaces = true;
-CREATE TABLESPACE regress_test_stats_tblspc LOCATION '';
- SELECT sum(reads) AS stats_bulkreads_before
-   FROM pg_stat_io WHERE context = 'bulkread' \gset
- BEGIN;
-ALTER TABLE heaptest SET TABLESPACE regress_test_stats_tblspc;
- -- Check that valid options are not rejected nor corruption reported
- -- for a non-empty table
- SELECT * FROM verify_heapam(relation := 'heaptest', skip := 'none');
-@@ -118,14 +115,6 @@ SELECT pg_stat_force_next_flush();
-  
- (1 row)
- 
-SELECT sum(reads) AS stats_bulkreads_after
-  FROM pg_stat_io WHERE context = 'bulkread' \gset
-SELECT :stats_bulkreads_after > :stats_bulkreads_before;
- ?column? 
-----------
- t
-(1 row)
-
- CREATE ROLE regress_heaptest_role;
- -- verify permissions are checked (error due to function not callable)
- SET ROLE regress_heaptest_role;
-@@ -233,7 +222,6 @@ ERROR:  cannot check relation "test_foreign_table"
- DETAIL:  This operation is not supported for foreign tables.
- -- cleanup
- DROP TABLE heaptest;
-DROP TABLESPACE regress_test_stats_tblspc;
- DROP TABLE test_partition;
- DROP TABLE test_partitioned;
- DROP OWNED BY regress_heaptest_role; -- permissions
-diff --git a/contrib/amcheck/sql/check_heap.sql b/contrib/amcheck/sql/check_heap.sql
-index 1745bae..3b429c3 100644
--- a/contrib/amcheck/sql/check_heap.sql
-+++ b/contrib/amcheck/sql/check_heap.sql
-@@ -40,12 +40,9 @@ INSERT INTO heaptest (a, b)
- -- same transaction.  The heaptest table is smaller than the default
- -- wal_skip_threshold, so a wal_level=minimal commit reads the table into
- -- shared_buffers.  A transaction delays that and excludes any autovacuum.
-SET allow_in_place_tablespaces = true;
-CREATE TABLESPACE regress_test_stats_tblspc LOCATION '';
- SELECT sum(reads) AS stats_bulkreads_before
-   FROM pg_stat_io WHERE context = 'bulkread' \gset
- BEGIN;
-ALTER TABLE heaptest SET TABLESPACE regress_test_stats_tblspc;
- -- Check that valid options are not rejected nor corruption reported
- -- for a non-empty table
- SELECT * FROM verify_heapam(relation := 'heaptest', skip := 'none');
-@@ -58,9 +55,6 @@ COMMIT;
- --   ALTER TABLE ... SET TABLESPACE ...
- -- causing an additional bulkread, which should be reflected in pg_stat_io.
- SELECT pg_stat_force_next_flush();
-SELECT sum(reads) AS stats_bulkreads_after
-  FROM pg_stat_io WHERE context = 'bulkread' \gset
-SELECT :stats_bulkreads_after > :stats_bulkreads_before;
- 
- CREATE ROLE regress_heaptest_role;
- 
-@@ -140,7 +134,6 @@ SELECT * FROM verify_heapam('test_foreign_table',
- 
- -- cleanup
- DROP TABLE heaptest;
-DROP TABLESPACE regress_test_stats_tblspc;
- DROP TABLE test_partition;
- DROP TABLE test_partitioned;
- DROP OWNED BY regress_heaptest_role; -- permissions
-diff --git a/contrib/citext/expected/create_index_acl.out b/contrib/citext/expected/create_index_acl.out
-index 33be13a..70a406c 100644
--- a/contrib/citext/expected/create_index_acl.out
-+++ b/contrib/citext/expected/create_index_acl.out
-@@ -5,9 +5,6 @@
- -- owner having as few applicable privileges as possible.  (The privileges.sql
- -- regress_sro_user tests look for the opposite defect; they confirm that
- -- DefineIndex() uses the table owner userid where necessary.)
-SET allow_in_place_tablespaces = true;
-CREATE TABLESPACE regress_create_idx_tblspace LOCATION '';
-RESET allow_in_place_tablespaces;
- BEGIN;
- CREATE ROLE regress_minimal;
- CREATE SCHEMA s;
-@@ -49,11 +46,9 @@ ALTER TABLE s.x OWNER TO regress_minimal;
- -- Empty-table DefineIndex()
- CREATE UNIQUE INDEX u0rows ON s.x USING btree
-   ((s.index_this_expr(y, s.const())) COLLATE s.coll s.citext_pattern_ops)
-  TABLESPACE regress_create_idx_tblspace
-   WHERE s.index_row_if(y);
- ALTER TABLE s.x ADD CONSTRAINT e0rows EXCLUDE USING btree
-   ((s.index_this_expr(y, s.const())) COLLATE s.coll WITH s.=)
-  USING INDEX TABLESPACE regress_create_idx_tblspace
-   WHERE (s.index_row_if(y));
- -- Make the table nonempty.
- INSERT INTO s.x VALUES ('foo'), ('bar');
-@@ -66,11 +61,9 @@ RESET search_path;
- GRANT EXECUTE ON FUNCTION s.index_this_expr TO regress_minimal;
- CREATE UNIQUE INDEX u2rows ON s.x USING btree
-   ((s.index_this_expr(y, s.const())) COLLATE s.coll s.citext_pattern_ops)
-  TABLESPACE regress_create_idx_tblspace
-   WHERE s.index_row_if(y);
- ALTER TABLE s.x ADD CONSTRAINT e2rows EXCLUDE USING btree
-   ((s.index_this_expr(y, s.const())) COLLATE s.coll WITH s.=)
-  USING INDEX TABLESPACE regress_create_idx_tblspace
-   WHERE (s.index_row_if(y));
- -- Shall not find s.coll via search_path, despite the s.const->public.setter
- -- call having set search_path=s during expression planning.  Suppress the
-@@ -78,9 +71,7 @@ ALTER TABLE s.x ADD CONSTRAINT e2rows EXCLUDE USING btree
- \set VERBOSITY sqlstate
- ALTER TABLE s.x ADD CONSTRAINT underqualified EXCLUDE USING btree
-   ((s.index_this_expr(y, s.const())) COLLATE coll WITH s.=)
-  USING INDEX TABLESPACE regress_create_idx_tblspace
-   WHERE (s.index_row_if(y));
- ERROR:  42704
- \set VERBOSITY default
- ROLLBACK;
-DROP TABLESPACE regress_create_idx_tblspace;
-diff --git a/contrib/citext/sql/create_index_acl.sql b/contrib/citext/sql/create_index_acl.sql
-index 10b5225..ae442e1 100644
--- a/contrib/citext/sql/create_index_acl.sql
-+++ b/contrib/citext/sql/create_index_acl.sql
-@@ -6,10 +6,6 @@
- -- regress_sro_user tests look for the opposite defect; they confirm that
- -- DefineIndex() uses the table owner userid where necessary.)
- 
-SET allow_in_place_tablespaces = true;
-CREATE TABLESPACE regress_create_idx_tblspace LOCATION '';
-RESET allow_in_place_tablespaces;
-
- BEGIN;
- CREATE ROLE regress_minimal;
- CREATE SCHEMA s;
-@@ -51,11 +47,9 @@ ALTER TABLE s.x OWNER TO regress_minimal;
- -- Empty-table DefineIndex()
- CREATE UNIQUE INDEX u0rows ON s.x USING btree
-   ((s.index_this_expr(y, s.const())) COLLATE s.coll s.citext_pattern_ops)
-  TABLESPACE regress_create_idx_tblspace
-   WHERE s.index_row_if(y);
- ALTER TABLE s.x ADD CONSTRAINT e0rows EXCLUDE USING btree
-   ((s.index_this_expr(y, s.const())) COLLATE s.coll WITH s.=)
-  USING INDEX TABLESPACE regress_create_idx_tblspace
-   WHERE (s.index_row_if(y));
- -- Make the table nonempty.
- INSERT INTO s.x VALUES ('foo'), ('bar');
-@@ -68,11 +62,9 @@ RESET search_path;
- GRANT EXECUTE ON FUNCTION s.index_this_expr TO regress_minimal;
- CREATE UNIQUE INDEX u2rows ON s.x USING btree
-   ((s.index_this_expr(y, s.const())) COLLATE s.coll s.citext_pattern_ops)
-  TABLESPACE regress_create_idx_tblspace
-   WHERE s.index_row_if(y);
- ALTER TABLE s.x ADD CONSTRAINT e2rows EXCLUDE USING btree
-   ((s.index_this_expr(y, s.const())) COLLATE s.coll WITH s.=)
-  USING INDEX TABLESPACE regress_create_idx_tblspace
-   WHERE (s.index_row_if(y));
- -- Shall not find s.coll via search_path, despite the s.const->public.setter
- -- call having set search_path=s during expression planning.  Suppress the
-@@ -80,9 +72,7 @@ ALTER TABLE s.x ADD CONSTRAINT e2rows EXCLUDE USING btree
- \set VERBOSITY sqlstate
- ALTER TABLE s.x ADD CONSTRAINT underqualified EXCLUDE USING btree
-   ((s.index_this_expr(y, s.const())) COLLATE coll WITH s.=)
-  USING INDEX TABLESPACE regress_create_idx_tblspace
-   WHERE (s.index_row_if(y));
- \set VERBOSITY default
- ROLLBACK;
- 
-DROP TABLESPACE regress_create_idx_tblspace;
-diff --git a/contrib/file_fdw/expected/file_fdw.out b/contrib/file_fdw/expected/file_fdw.out
-index 86c148a..81bdb2c 100644
--- a/contrib/file_fdw/expected/file_fdw.out
-+++ b/contrib/file_fdw/expected/file_fdw.out
-@@ -4,6 +4,7 @@
- -- directory paths are passed to us in environment variables
- \getenv abs_srcdir PG_ABS_SRCDIR
- -- Clean up in case a prior regression run failed
-+SET compute_query_id TO 'off';
- SET client_min_messages TO 'warning';
- DROP ROLE IF EXISTS regress_file_fdw_superuser, regress_file_fdw_user, regress_no_priv_user;
- RESET client_min_messages;
-diff --git a/contrib/file_fdw/sql/file_fdw.sql b/contrib/file_fdw/sql/file_fdw.sql
-index f0548e1..848a08c 100644
--- a/contrib/file_fdw/sql/file_fdw.sql
-+++ b/contrib/file_fdw/sql/file_fdw.sql
-@@ -6,6 +6,7 @@
- \getenv abs_srcdir PG_ABS_SRCDIR
- 
- -- Clean up in case a prior regression run failed
-+SET compute_query_id TO 'off';
- SET client_min_messages TO 'warning';
- DROP ROLE IF EXISTS regress_file_fdw_superuser, regress_file_fdw_user, regress_no_priv_user;
- RESET client_min_messages;
--- a/compute_tools/src/extension_server.rs
+++ b/compute_tools/src/extension_server.rs
@@ -258,11 +258,14 @@ pub fn create_control_files(remote_extensions: &RemoteExtSpec, pgbin: &str) {
 async fn download_extension_tar(ext_remote_storage: &str, ext_path: &str) -> Result<Bytes> {
    let uri = format!("{}/{}", ext_remote_storage, ext_path);

-    info!("Download extension {} from uri {}", ext_path, uri);
+    info!("Download extension {:?} from uri {:?}", ext_path, uri);

    match do_extension_server_request(&uri).await {
        Ok(resp) => {
-            info!("Successfully downloaded remote extension data {}", ext_path);
+            info!(
+                "Successfully downloaded remote extension data {:?}",
+                ext_path
+            );
            REMOTE_EXT_REQUESTS_TOTAL
                .with_label_values(&[&StatusCode::OK.to_string()])
                .inc();
@@ -282,10 +285,7 @@ async fn download_extension_tar(ext_remote_storage: &str, ext_path: &str) -> Res
 async fn do_extension_server_request(uri: &str) -> Result<Bytes, (String, String)> {
    let resp = reqwest::get(uri).await.map_err(|e| {
        (
-            format!(
-                "could not perform remote extensions server request: {:?}",
-                e
-            ),
+            format!("could not perform remote extensions server request: {}", e),
            UNKNOWN_HTTP_STATUS.to_string(),
        )
    })?;
@@ -295,7 +295,7 @@ async fn do_extension_server_request(uri: &str) -> Result<Bytes, (String, String
        StatusCode::OK => match resp.bytes().await {
            Ok(resp) => Ok(resp),
            Err(e) => Err((
-                format!("could not read remote extensions server response: {:?}", e),
+                format!("could not read remote extensions server response: {}", e),
                // It's fine to return and report error with status as 200 OK,
                // because we still failed to read the response.
                status.to_string(),
--- a/compute_tools/src/migration.rs
+++ b/compute_tools/src/migration.rs
@@ -125,7 +125,7 @@ impl<'m> MigrationRunner<'m> {
                    info!("Finished migration id={}", migration_id);
                }
                Err(e) => {
-                    error!("Failed to run migration id={}: {:?}", migration_id, e);
+                    error!("Failed to run migration id={}: {}", migration_id, e);
                    DB_MIGRATION_FAILED
                        .with_label_values(&[migration_id.to_string().as_str()])
                        .inc();
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -28,7 +28,7 @@ fn do_control_plane_request(
        .map_err(|e| {
            (
                true,
-                format!("could not perform spec request to control plane: {:?}", e),
+                format!("could not perform spec request to control plane: {}", e),
                UNKNOWN_HTTP_STATUS.to_string(),
            )
        })?;
@@ -39,7 +39,7 @@ fn do_control_plane_request(
            Ok(spec_resp) => Ok(spec_resp),
            Err(e) => Err((
                true,
-                format!("could not deserialize control plane response: {:?}", e),
+                format!("could not deserialize control plane response: {}", e),
                status.to_string(),
            )),
        },
--- a/control_plane/storcon_cli/src/main.rs
+++ b/control_plane/storcon_cli/src/main.rs
@@ -10,8 +10,8 @@ use pageserver_api::{
    controller_api::{
        AvailabilityZone, NodeAvailabilityWrapper, NodeDescribeResponse, NodeShardResponse,
        SafekeeperDescribeResponse, SafekeeperSchedulingPolicyRequest, ShardSchedulingPolicy,
-        ShardsPreferredAzsRequest, ShardsPreferredAzsResponse, SkSchedulingPolicy,
-        TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest,
+        ShardsPreferredAzsRequest, SkSchedulingPolicy, TenantCreateRequest, TenantDescribeResponse,
+        TenantPolicyRequest,
    },
    models::{
        EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary,
@@ -800,7 +800,7 @@ async fn main() -> anyhow::Result<()> {
                    .collect(),
            };
            storcon_client
-                .dispatch::<ShardsPreferredAzsRequest, ShardsPreferredAzsResponse>(
+                .dispatch::<ShardsPreferredAzsRequest, ()>(
                    Method::PUT,
                    "control/v1/preferred_azs".to_string(),
                    Some(req),
--- a/docker-compose/compute_wrapper/Dockerfile
+++ b/docker-compose/compute_wrapper/Dockerfile
@@ -13,6 +13,6 @@ RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \
                       jq   \
                       netcat-openbsd
 #This is required for the pg_hintplan test
-RUN mkdir -p /ext-src/pg_hint_plan-src /postgres/contrib/file_fdw && chown postgres /ext-src/pg_hint_plan-src /postgres/contrib/file_fdw
+RUN mkdir -p /ext-src/pg_hint_plan-src && chown postgres /ext-src/pg_hint_plan-src

 USER postgres
--- a/docker-compose/docker_compose_test.sh
+++ b/docker-compose/docker_compose_test.sh
@@ -61,32 +61,17 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do
        docker cp $TEST_CONTAINER_NAME:/ext-src/pg_hint_plan-src/data $TMPDIR/data
        docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/ext-src/pg_hint_plan-src/
        rm -rf $TMPDIR
-        # The following block does the same for the contrib/file_fdw test
-        TMPDIR=$(mktemp -d)
-        docker cp $TEST_CONTAINER_NAME:/postgres/contrib/file_fdw/data $TMPDIR/data
-        docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/postgres/contrib/file_fdw/data
-        rm -rf $TMPDIR
-        # Apply patches
-        cat ../compute/patches/contrib_pg${pg_version}.patch | docker exec -i $TEST_CONTAINER_NAME bash -c "(cd /postgres && patch -p1)"
        # We are running tests now
-        rm -f testout.txt testout_contrib.txt
-        docker exec -e USE_PGXS=1 -e SKIP=timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,kq_imcx-src,wal2json_2_5-src \
-        $TEST_CONTAINER_NAME /run-tests.sh /ext-src | tee testout.txt && EXT_SUCCESS=1 || EXT_SUCCESS=0
-        docker exec -e SKIP=start-scripts,postgres_fdw,ltree_plpython,jsonb_plpython,jsonb_plperl,hstore_plpython,hstore_plperl,dblink,bool_plperl \
-        $TEST_CONTAINER_NAME /run-tests.sh /postgres/contrib | tee testout_contrib.txt && CONTRIB_SUCCESS=1 || CONTRIB_SUCCESS=0
-        if [ $EXT_SUCCESS -eq 0 ] || [ $CONTRIB_SUCCESS -eq 0 ]; then
-            CONTRIB_FAILED=
-            FAILED=
-            [ $EXT_SUCCESS -eq 0 ] && FAILED=$(tail -1 testout.txt | awk '{for(i=1;i<=NF;i++){print "/ext-src/"$i;}}')
-            [ $CONTRIB_SUCCESS -eq 0 ] && CONTRIB_FAILED=$(tail -1 testout_contrib.txt | awk '{for(i=0;i<=NF;i++){print "/postgres/contrib/"$i;}}')
-            for d in $FAILED $CONTRIB_FAILED; do
-                dn="$(basename $d)"
-                rm -rf $dn
-                mkdir $dn
-                docker cp $TEST_CONTAINER_NAME:$d/regression.diffs $dn || [ $? -eq 1 ]
-                docker cp $TEST_CONTAINER_NAME:$d/regression.out $dn || [ $? -eq 1 ]
-                cat $dn/regression.out $dn/regression.diffs || true
-                rm -rf $dn
+        if ! docker exec -e SKIP=timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,kq_imcx-src,wal2json_2_5-src \
+            $TEST_CONTAINER_NAME /run-tests.sh | tee testout.txt
+        then
+            FAILED=$(tail -1 testout.txt)
+            for d in $FAILED
+            do
+                mkdir $d
+                docker cp $TEST_CONTAINER_NAME:/ext-src/$d/regression.diffs $d || true
+                docker cp $TEST_CONTAINER_NAME:/ext-src/$d/regression.out $d || true
+                cat $d/regression.out $d/regression.diffs || true
            done
        rm -rf $FAILED
        exit 1
--- a/docker-compose/run-tests.sh
+++ b/docker-compose/run-tests.sh
@@ -1,11 +1,9 @@
 #!/bin/bash
 set -x

-extdir=${1}
-
-cd "${extdir}" || exit 2
+cd /ext-src || exit 2
 FAILED=
-LIST=$( (echo -e "${SKIP//","/"\n"}"; ls) | sort | uniq -u)
+LIST=$( (echo -e "${SKIP//","/"\n"}"; ls -d -- *-src) | sort | uniq -u)
 for d in ${LIST}; do
    [ -d "${d}" ] || continue
    if ! psql -w -c "select 1" >/dev/null; then
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -1472,13 +1472,7 @@ async fn layer_download_handler(
    let downloaded = timeline
        .download_layer(&layer_name)
        .await
-        .map_err(|e| match e {
-            tenant::storage_layer::layer::DownloadError::TimelineShutdown
-            | tenant::storage_layer::layer::DownloadError::DownloadCancelled => {
-                ApiError::ShuttingDown
-            }
-            other => ApiError::InternalServerError(other.into()),
-        })?;
+        .map_err(ApiError::InternalServerError)?;

    match downloaded {
        Some(true) => json_response(StatusCode::OK, ()),
@@ -3175,16 +3169,12 @@ async fn put_tenant_timeline_import_basebackup(

    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);

-    let tenant_shard_id = TenantShardId::unsharded(tenant_id);
-
-    let span = info_span!("import_basebackup",
-        tenant_id=%tenant_id, timeline_id=%timeline_id, shard_id=%tenant_shard_id.shard_slug(),
-        base_lsn=%base_lsn, end_lsn=%end_lsn, pg_version=%pg_version);
+    let span = info_span!("import_basebackup", tenant_id=%tenant_id, timeline_id=%timeline_id, base_lsn=%base_lsn, end_lsn=%end_lsn, pg_version=%pg_version);
    async move {
        let state = get_state(&request);
        let tenant = state
            .tenant_manager
-            .get_attached_tenant_shard(tenant_shard_id)?;
+            .get_attached_tenant_shard(TenantShardId::unsharded(tenant_id))?;

        let broker_client = state.broker_client.clone();

--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -116,38 +116,11 @@ pub(crate) static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-/// Measures layers visited per read (i.e. read amplification).
-///
-/// NB: for a batch, we count all visited layers towards each read. While the cost of layer visits
-/// are amortized across the batch, and some layers may not intersect with a given key, each visited
-/// layer contributes directly to the observed latency for every read in the batch, which is what we
-/// care about.
-pub(crate) static LAYERS_PER_READ: Lazy<HistogramVec> = Lazy::new(|| {
-    register_histogram_vec!(
-        "pageserver_layers_per_read",
-        "Layers visited to serve a single read (read amplification). In a batch, all visited layers count towards every read.",
-        &["tenant_id", "shard_id", "timeline_id"],
-        // Low resolution to reduce cardinality.
-        vec![1.0, 5.0, 10.0, 25.0, 50.0, 100.0],
-    )
-    .expect("failed to define a metric")
-});
-
-pub(crate) static LAYERS_PER_READ_GLOBAL: Lazy<Histogram> = Lazy::new(|| {
+pub(crate) static VEC_READ_NUM_LAYERS_VISITED: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
-        "pageserver_layers_per_read_global",
-        "Layers visited to serve a single read (read amplification). In a batch, all visited layers count towards every read.",
-        vec![1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0],
-    )
-    .expect("failed to define a metric")
-});
-
-pub(crate) static DELTAS_PER_READ_GLOBAL: Lazy<Histogram> = Lazy::new(|| {
-    // We expect this to be low because of Postgres checkpoints. Let's see if that holds.
-    register_histogram!(
-        "pageserver_deltas_per_read_global",
-        "Number of delta pages applied to image page per read",
-        vec![0.0, 1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0],
+        "pageserver_layers_visited_per_vectored_read_global",
+        "Average number of layers visited to reconstruct one key",
+        vec![1.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0],
    )
    .expect("failed to define a metric")
 });
@@ -2659,7 +2632,6 @@ pub(crate) struct TimelineMetrics {
    pub disk_consistent_lsn_gauge: IntGauge,
    pub pitr_history_size: UIntGauge,
    pub archival_size: UIntGauge,
-    pub layers_per_read: Histogram,
    pub standby_horizon_gauge: IntGauge,
    pub resident_physical_size_gauge: UIntGauge,
    pub visible_physical_size_gauge: UIntGauge,
@@ -2757,10 +2729,6 @@ impl TimelineMetrics {
            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
            .unwrap();

-        let layers_per_read = LAYERS_PER_READ
-            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
-            .unwrap();
-
        let standby_horizon_gauge = STANDBY_HORIZON
            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
            .unwrap();
@@ -2825,7 +2793,6 @@ impl TimelineMetrics {
            disk_consistent_lsn_gauge,
            pitr_history_size,
            archival_size,
-            layers_per_read,
            standby_horizon_gauge,
            resident_physical_size_gauge,
            visible_physical_size_gauge,
@@ -2995,8 +2962,6 @@ impl TimelineMetrics {
            }
        }

-        let _ = LAYERS_PER_READ.remove_label_values(&[tenant_id, shard_id, timeline_id]);
-
        let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]);
        let _ = AUX_FILE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
        let _ = VALID_LSN_LEASE_COUNT.remove_label_values(&[tenant_id, shard_id, timeline_id]);
@@ -3947,8 +3912,7 @@ pub fn preinitialize_metrics(conf: &'static PageServerConf) {

    // histograms
    [
-        &LAYERS_PER_READ_GLOBAL,
-        &DELTAS_PER_READ_GLOBAL,
+        &VEC_READ_NUM_LAYERS_VISITED,
        &WAIT_LSN_TIME,
        &WAL_REDO_TIME,
        &WAL_REDO_RECORDS_HISTOGRAM,
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -2426,7 +2426,7 @@ impl Tenant {
        // Make sure the freeze_and_flush reaches remote storage.
        tline.remote_client.wait_completion().await.unwrap();

-        let tl = uninit_tl.finish_creation().await?;
+        let tl = uninit_tl.finish_creation()?;
        // The non-test code would call tl.activate() here.
        tl.set_state(TimelineState::Active);
        Ok(tl)
@@ -4702,7 +4702,7 @@ impl Tenant {
            )
            .await?;

-        let new_timeline = uninitialized_timeline.finish_creation().await?;
+        let new_timeline = uninitialized_timeline.finish_creation()?;

        // Root timeline gets its layers during creation and uploads them along with the metadata.
        // A branch timeline though, when created, can get no writes for some time, hence won't get any layers created.
@@ -4892,11 +4892,10 @@ impl Tenant {
        }

        // this new directory is very temporary, set to remove it immediately after bootstrap, we don't need it
-        let pgdata_path_deferred = pgdata_path.clone();
        scopeguard::defer! {
-            if let Err(e) = fs::remove_dir_all(&pgdata_path_deferred) {
+            if let Err(e) = fs::remove_dir_all(&pgdata_path) {
                // this is unlikely, but we will remove the directory on pageserver restart or another bootstrap call
-                error!("Failed to remove temporary initdb directory '{pgdata_path_deferred}': {e}");
+                error!("Failed to remove temporary initdb directory '{pgdata_path}': {e}");
            }
        }
        if let Some(existing_initdb_timeline_id) = load_existing_initdb {
@@ -4963,7 +4962,7 @@ impl Tenant {
            pgdata_lsn,
            pg_version,
        );
-        let mut raw_timeline = self
+        let raw_timeline = self
            .prepare_new_timeline(
                timeline_id,
                &new_metadata,
@@ -4974,33 +4973,42 @@ impl Tenant {
            .await?;

        let tenant_shard_id = raw_timeline.owning_tenant.tenant_shard_id;
-        raw_timeline
-            .write(|unfinished_timeline| async move {
-                import_datadir::import_timeline_from_postgres_datadir(
-                    &unfinished_timeline,
-                    &pgdata_path,
-                    pgdata_lsn,
-                    ctx,
+        let unfinished_timeline = raw_timeline.raw_timeline()?;
+
+        // Flush the new layer files to disk, before we mark the timeline as available to
+        // the outside world.
+        //
+        // Flush loop needs to be spawned in order to be able to flush.
+        unfinished_timeline.maybe_spawn_flush_loop();
+
+        import_datadir::import_timeline_from_postgres_datadir(
+            unfinished_timeline,
+            &pgdata_path,
+            pgdata_lsn,
+            ctx,
+        )
+        .await
+        .with_context(|| {
+            format!("Failed to import pgdatadir for timeline {tenant_shard_id}/{timeline_id}")
+        })?;
+
+        fail::fail_point!("before-checkpoint-new-timeline", |_| {
+            Err(CreateTimelineError::Other(anyhow::anyhow!(
+                "failpoint before-checkpoint-new-timeline"
+            )))
+        });
+
+        unfinished_timeline
+            .freeze_and_flush()
+            .await
+            .with_context(|| {
+                format!(
+                    "Failed to flush after pgdatadir import for timeline {tenant_shard_id}/{timeline_id}"
                )
-                .await
-                .with_context(|| {
-                    format!(
-                        "Failed to import pgdatadir for timeline {tenant_shard_id}/{timeline_id}"
-                    )
-                })?;
-
-                fail::fail_point!("before-checkpoint-new-timeline", |_| {
-                    Err(CreateTimelineError::Other(anyhow::anyhow!(
-                        "failpoint before-checkpoint-new-timeline"
-                    )))
-                });
-
-                Ok(())
-            })
-            .await?;
+            })?;

        // All done!
-        let timeline = raw_timeline.finish_creation().await?;
+        let timeline = raw_timeline.finish_creation()?;

        // Callers are responsible to wait for uploads to complete and for activating the timeline.

--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -673,30 +673,12 @@ impl<'a> TenantDownloader<'a> {
            HeatMapDownload::Modified(m) => m,
        };

-        // Heatmap storage location
-        let heatmap_path = self.conf.tenant_heatmap_path(tenant_shard_id);
-
-        let last_heatmap = if last_download.is_none() {
-            match load_heatmap(&heatmap_path, ctx).await {
-                Ok(htm) => htm,
-                Err(e) => {
-                    tracing::warn!("Couldn't load heatmap from {heatmap_path}: {e:?}");
-                    None
-                }
-            }
-        } else {
-            None
-        };
-
-        let last_heatmap_timelines = last_heatmap.as_ref().map(|htm| {
-            htm.timelines
-                .iter()
-                .map(|tl| (tl.timeline_id, tl))
-                .collect::<HashMap<_, _>>()
-        });
-
        let heatmap = serde_json::from_slice::<HeatMapTenant>(&heatmap_bytes)?;

+        // Save the heatmap: this will be useful on restart, allowing us to reconstruct
+        // layer metadata without having to re-download it.
+        let heatmap_path = self.conf.tenant_heatmap_path(tenant_shard_id);
+
        let temp_path = path_with_suffix_extension(&heatmap_path, TEMP_FILE_SUFFIX);
        let context_msg = format!("write tenant {tenant_shard_id} heatmap to {heatmap_path}");
        let heatmap_path_bg = heatmap_path.clone();
@@ -725,17 +707,10 @@ impl<'a> TenantDownloader<'a> {
            let timeline_state = match timeline_state {
                Some(t) => t,
                None => {
-                    let last_heatmap =
-                        last_heatmap_timelines
-                            .as_ref()
-                            .and_then(|last_heatmap_timelines| {
-                                last_heatmap_timelines.get(&timeline.timeline_id).copied()
-                            });
                    // We have no existing state: need to scan local disk for layers first.
                    let timeline_state = init_timeline_state(
                        self.conf,
                        tenant_shard_id,
-                        last_heatmap,
                        timeline,
                        &self.secondary_state.resident_size_metric,
                    )
@@ -1104,12 +1079,12 @@ impl<'a> TenantDownloader<'a> {
                }
            }

-            if on_disk.metadata.generation_file_size() != layer.metadata.generation_file_size() {
+            if on_disk.metadata.generation_file_size() != layer.metadata.generation_file_size() {
                tracing::info!(
                    "Re-downloading layer {} with changed size or generation: {:?}->{:?}",
                    layer.name,
                    on_disk.metadata.generation_file_size(),
-                    layer.metadata.generation_file_size()
+                    layer.metadata.generation_file_size()
                );
                return LayerAction::Download;
            }
@@ -1302,7 +1277,6 @@ impl<'a> TenantDownloader<'a> {
 async fn init_timeline_state(
    conf: &'static PageServerConf,
    tenant_shard_id: &TenantShardId,
-    last_heatmap: Option<&HeatMapTimeline>,
    heatmap: &HeatMapTimeline,
    resident_metric: &UIntGauge,
 ) -> SecondaryDetailTimeline {
@@ -1332,13 +1306,6 @@ async fn init_timeline_state(
    let heatmap_metadata: HashMap<&LayerName, &HeatMapLayer> =
        heatmap.layers.iter().map(|l| (&l.name, l)).collect();

-    let last_heatmap_metadata: HashMap<&LayerName, &HeatMapLayer> =
-        if let Some(last_heatmap) = last_heatmap {
-            last_heatmap.layers.iter().map(|l| (&l.name, l)).collect()
-        } else {
-            HashMap::new()
-        };
-
    while let Some(dentry) = dir
        .next_entry()
        .await
@@ -1372,32 +1339,18 @@ async fn init_timeline_state(
        match LayerName::from_str(file_name) {
            Ok(name) => {
                let remote_meta = heatmap_metadata.get(&name);
-                let last_meta = last_heatmap_metadata.get(&name);
-                let mut remove = false;
                match remote_meta {
                    Some(remote_meta) => {
-                        let last_meta_generation_file_size = last_meta
-                            .map(|m| m.metadata.generation_file_size())
-                            .unwrap_or(remote_meta.metadata.generation_file_size());
                        // TODO: checksums for layers (https://github.com/neondatabase/neon/issues/2784)
-                        if remote_meta.metadata.generation_file_size()
-                            != last_meta_generation_file_size
-                        {
-                            tracing::info!(
-                                "Removing local layer {name} as on-disk json metadata has different generation or file size from remote: {:?} -> {:?}",
-                                last_meta_generation_file_size,
-                                remote_meta.metadata.generation_file_size()
-                            );
-                            remove = true;
-                        } else if local_meta.len() != remote_meta.metadata.file_size {
-                            // This can happen in the presence of race conditions: the remote and on-disk metadata have changed, but we haven't had
-                            // the chance yet to download the new layer to disk, before the process restarted.
-                            tracing::info!(
+                        if local_meta.len() != remote_meta.metadata.file_size {
+                            // This should not happen, because we do crashsafe write-then-rename when downloading
+                            // layers, and layers in remote storage are immutable.  Remove the local file because
+                            // we cannot trust it.
+                            tracing::warn!(
                                "Removing local layer {name} with unexpected local size {} != {}",
                                local_meta.len(),
                                remote_meta.metadata.file_size
                            );
-                            remove = true;
                        } else {
                            // We expect the access time to be initialized immediately afterwards, when
                            // the latest heatmap is applied to the state.
@@ -1419,18 +1372,15 @@ async fn init_timeline_state(
                            "Removing secondary local layer {} because it's absent in heatmap",
                            name
                        );
-                        remove = true;
+                        tokio::fs::remove_file(&dentry.path())
+                            .await
+                            .or_else(fs_ext::ignore_not_found)
+                            .fatal_err(&format!(
+                                "Removing layer {}",
+                                dentry.path().to_string_lossy()
+                            ));
                    }
                }
-                if remove {
-                    tokio::fs::remove_file(&dentry.path())
-                        .await
-                        .or_else(fs_ext::ignore_not_found)
-                        .fatal_err(&format!(
-                            "Removing layer {}",
-                            dentry.path().to_string_lossy()
-                        ));
-                }
            }
            Err(_) => {
                // Ignore it.
@@ -1441,18 +1391,3 @@ async fn init_timeline_state(

    detail
 }
-
-/// Loads a json-encoded heatmap file from the provided on-disk path
-async fn load_heatmap(
-    path: &Utf8PathBuf,
-    ctx: &RequestContext,
-) -> Result<Option<HeatMapTenant>, anyhow::Error> {
-    let mut file = match VirtualFile::open(path, ctx).await {
-        Ok(file) => file,
-        Err(e) if e.kind() == std::io::ErrorKind::NotFound => return Ok(None),
-        Err(e) => Err(e)?,
-    };
-    let st = file.read_to_string(ctx).await?;
-    let htm = serde_json::from_str(&st)?;
-    Ok(Some(htm))
-}
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -80,16 +80,6 @@ pub(crate) struct ValueReconstructState {
    pub(crate) img: Option<(Lsn, Bytes)>,
 }

-impl ValueReconstructState {
-    /// Returns the number of page deltas applied to the page image.
-    pub fn num_deltas(&self) -> usize {
-        match self.img {
-            Some(_) => self.records.len(),
-            None => self.records.len() - 1, // omit will_init record
-        }
-    }
-}
-
 #[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
 pub(crate) enum ValueReconstructSituation {
    Complete,
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -340,7 +340,7 @@ impl Layer {
    /// Download the layer if evicted.
    ///
    /// Will not error when the layer is already downloaded.
-    pub(crate) async fn download(&self) -> Result<(), DownloadError> {
+    pub(crate) async fn download(&self) -> anyhow::Result<()> {
        self.0.get_or_maybe_download(true, None).await?;
        Ok(())
    }
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -51,7 +51,6 @@ use tokio::{
 };
 use tokio_util::sync::CancellationToken;
 use tracing::*;
-use utils::rate_limit::RateLimit;
 use utils::{
    fs_ext,
    guard_arc_swap::GuardArcSwap,
@@ -116,7 +115,7 @@ use pageserver_api::config::tenant_conf_defaults::DEFAULT_PITR_INTERVAL;

 use crate::config::PageServerConf;
 use crate::keyspace::{KeyPartitioning, KeySpace};
-use crate::metrics::{TimelineMetrics, DELTAS_PER_READ_GLOBAL, LAYERS_PER_READ_GLOBAL};
+use crate::metrics::TimelineMetrics;
 use crate::pgdatadir_mapping::CalculateLogicalSizeError;
 use crate::tenant::config::TenantConfOpt;
 use pageserver_api::reltag::RelTag;
@@ -341,8 +340,6 @@ pub struct Timeline {
    // Needed to ensure that we can't create a branch at a point that was already garbage collected
    pub latest_gc_cutoff_lsn: Rcu<Lsn>,

-    pub(crate) gc_compaction_layer_update_lock: tokio::sync::RwLock<()>,
-
    // List of child timelines and their branch points. This is needed to avoid
    // garbage collecting data that is still needed by the child timelines.
    pub(crate) gc_info: std::sync::RwLock<GcInfo>,
@@ -1047,7 +1044,7 @@ impl Timeline {
    }

    pub(crate) const MAX_GET_VECTORED_KEYS: u64 = 32;
-    pub(crate) const LAYERS_VISITED_WARN_THRESHOLD: u32 = 100;
+    pub(crate) const VEC_GET_LAYERS_VISITED_WARN_THRESH: f64 = 512.0;

    /// Look up multiple page versions at a given LSN
    ///
@@ -1197,7 +1194,6 @@ impl Timeline {
                            return (key, Err(err));
                        }
                    };
-                    DELTAS_PER_READ_GLOBAL.observe(converted.num_deltas() as f64);

                    // The walredo module expects the records to be descending in terms of Lsn.
                    // And we submit the IOs in that order, so, there shuold be no need to sort here.
@@ -1225,28 +1221,25 @@ impl Timeline {
        // (this is a requirement, not a bug). Skip updating the metric in these cases
        // to avoid infinite results.
        if !results.is_empty() {
-            // Record the total number of layers visited towards each key in the batch. While some
-            // layers may not intersect with a given read, and the cost of layer visits are
-            // amortized across the batch, each visited layer contributes directly to the observed
-            // latency for every read in the batch, which is what we care about.
-            if layers_visited >= Self::LAYERS_VISITED_WARN_THRESHOLD {
-                static LOG_PACER: Lazy<Mutex<RateLimit>> =
+            let avg = layers_visited as f64 / results.len() as f64;
+            if avg >= Self::VEC_GET_LAYERS_VISITED_WARN_THRESH {
+                use utils::rate_limit::RateLimit;
+                static LOGGED: Lazy<Mutex<RateLimit>> =
                    Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(60))));
-                LOG_PACER.lock().unwrap().call(|| {
-                    let num_keys = keyspace.total_raw_size();
-                    let num_pages = results.len();
+                let mut rate_limit = LOGGED.lock().unwrap();
+                rate_limit.call(|| {
                    tracing::info!(
                      shard_id = %self.tenant_shard_id.shard_slug(),
                      lsn = %lsn,
-                      "Vectored read for {keyspace} visited {layers_visited} layers. Returned {num_pages}/{num_keys} pages.",
-                    );
+                      "Vectored read for {} visited {} layers on average per key and {} in total. {}/{} pages were returned",
+                      keyspace, avg, layers_visited, results.len(), keyspace.total_raw_size());
                });
            }

-            for _ in &results {
-                self.metrics.layers_per_read.observe(layers_visited as f64);
-                LAYERS_PER_READ_GLOBAL.observe(layers_visited as f64);
-            }
+            // Note that this is an approximation. Tracking the exact number of layers visited
+            // per key requires virtually unbounded memory usage and is inefficient
+            // (i.e. segment tree tracking each range queried from a layer)
+            crate::metrics::VEC_READ_NUM_LAYERS_VISITED.observe(avg);
        }

        Ok(results)
@@ -2028,16 +2021,8 @@ impl Timeline {
    pub(crate) async fn download_layer(
        &self,
        layer_file_name: &LayerName,
-    ) -> Result<Option<bool>, super::storage_layer::layer::DownloadError> {
-        let Some(layer) = self
-            .find_layer(layer_file_name)
-            .await
-            .map_err(|e| match e {
-                layer_manager::Shutdown => {
-                    super::storage_layer::layer::DownloadError::TimelineShutdown
-                }
-            })?
-        else {
+    ) -> anyhow::Result<Option<bool>> {
+        let Some(layer) = self.find_layer(layer_file_name).await? else {
            return Ok(None);
        };

@@ -2447,7 +2432,6 @@ impl Timeline {
                shard_identity,
                pg_version,
                layers: Default::default(),
-                gc_compaction_layer_update_lock: tokio::sync::RwLock::new(()),

                walredo_mgr,
                walreceiver: Mutex::new(None),
@@ -3491,9 +3475,6 @@ impl Timeline {
        // image layer).
        let _gc_cutoff_holder = timeline.get_latest_gc_cutoff_lsn();

-        // See `compaction::compact_with_gc` for why we need this.
-        let _guard = timeline.gc_compaction_layer_update_lock.read().await;
-
        loop {
            if cancel.is_cancelled() {
                return Err(GetVectoredError::Cancelled);
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -1122,13 +1122,7 @@ impl Timeline {
        // Under normal circumstances, we will accumulate up to compaction_upper_limit L0s of size
        // checkpoint_distance each.  To avoid edge cases using extra system resources, bound our
        // work in this function to only operate on this much delta data at once.
-        //
-        // In general, compaction_threshold should be <= compaction_upper_limit, but in case that
-        // the constraint is not respected, we use the larger of the two.
-        let delta_size_limit = std::cmp::max(
-            self.get_compaction_upper_limit(),
-            self.get_compaction_threshold(),
-        ) as u64
+        let delta_size_limit = self.get_compaction_upper_limit() as u64
            * std::cmp::max(self.get_checkpoint_distance(), DEFAULT_CHECKPOINT_DISTANCE);

        let mut fully_compacted = true;
@@ -2919,45 +2913,10 @@ impl Timeline {
        // Between the sanity check and this compaction update, there could be new layers being flushed, but it should be fine because we only
        // operate on L1 layers.
        {
-            // Gc-compaction will rewrite the history of a key. This could happen in two ways:
-            //
-            // 1. We create an image layer to replace all the deltas below the compact LSN. In this case, assume
-            // we have 2 delta layers A and B, both below the compact LSN. We create an image layer I to replace
-            // A and B at the compact LSN. If the read path finishes reading A, yields, and now we update the layer
-            // map, the read path then cannot find any keys below A, reporting a missing key error, while the key
-            // now gets stored in I at the compact LSN.
-            //
-            // ---------------                                       ---------------
-            //   delta1@LSN20                                         image1@LSN20
-            // ---------------  (read path collects delta@LSN20,  => ---------------  (read path cannot find anything
-            //   delta1@LSN10    yields)                                               below LSN 20)
-            // ---------------
-            //
-            // 2. We create a delta layer to replace all the deltas below the compact LSN, and in the delta layers,
-            // we combines the history of a key into a single image. For example, we have deltas at LSN 1, 2, 3, 4,
-            // Assume one delta layer contains LSN 1, 2, 3 and the other contains LSN 4.
-            //
-            // We let gc-compaction combine delta 2, 3, 4 into an image at LSN 4, which produces a delta layer that
-            // contains the delta at LSN 1, the image at LSN 4. If the read path finishes reading the original delta
-            // layer containing 4, yields, and we update the layer map to put the delta layer.
-            //
-            // ---------------                                      ---------------
-            //   delta1@LSN4                                          image1@LSN4
-            // ---------------  (read path collects delta@LSN4,  => ---------------  (read path collects LSN4 and LSN1,
-            //  delta1@LSN1-3    yields)                              delta1@LSN1     which is an invalid history)
-            // ---------------                                      ---------------
-            //
-            // Therefore, the gc-compaction layer update operation should wait for all ongoing reads, block all pending reads,
-            // and only allow reads to continue after the update is finished.
-
-            let update_guard = self.gc_compaction_layer_update_lock.write().await;
-            // Acquiring the update guard ensures current read operations end and new read operations are blocked.
-            // TODO: can we use `latest_gc_cutoff` Rcu to achieve the same effect?
            let mut guard = self.layers.write().await;
            guard
                .open_mut()?
-                .finish_gc_compaction(&layer_selection, &compact_to, &self.metrics);
-            drop(update_guard); // Allow new reads to start ONLY after we finished updating the layer map.
+                .finish_gc_compaction(&layer_selection, &compact_to, &self.metrics)
        };

        // Schedule an index-only upload to update the `latest_gc_cutoff` in the index_part.json.
--- a/pageserver/src/tenant/timeline/uninit.rs
+++ b/pageserver/src/tenant/timeline/uninit.rs
@@ -1,4 +1,4 @@
-use std::{collections::hash_map::Entry, fs, future::Future, sync::Arc};
+use std::{collections::hash_map::Entry, fs, sync::Arc};

 use anyhow::Context;
 use camino::Utf8PathBuf;
@@ -8,8 +8,7 @@ use utils::{fs_ext, id::TimelineId, lsn::Lsn, sync::gate::GateGuard};
 use crate::{
    context::RequestContext,
    import_datadir,
-    span::debug_assert_current_span_has_tenant_and_timeline_id,
-    tenant::{CreateTimelineError, CreateTimelineIdempotency, Tenant, TimelineOrOffloaded},
+    tenant::{CreateTimelineIdempotency, Tenant, TimelineOrOffloaded},
 };

 use super::Timeline;
@@ -25,9 +24,6 @@ pub struct UninitializedTimeline<'t> {
    pub(crate) owning_tenant: &'t Tenant,
    timeline_id: TimelineId,
    raw_timeline: Option<(Arc<Timeline>, TimelineCreateGuard)>,
-    /// Whether we spawned the inner Timeline's tasks such that we must later shut it down
-    /// if aborting the timeline creation
-    needs_shutdown: bool,
 }

 impl<'t> UninitializedTimeline<'t> {
@@ -40,50 +36,6 @@ impl<'t> UninitializedTimeline<'t> {
            owning_tenant,
            timeline_id,
            raw_timeline,
-            needs_shutdown: false,
-        }
-    }
-
-    /// When writing data to this timeline during creation, use this wrapper: it will take care of
-    /// setup of Timeline tasks required for I/O (flush loop) and making sure they are torn down
-    /// later.
-    pub(crate) async fn write<F, Fut>(&mut self, f: F) -> anyhow::Result<()>
-    where
-        F: FnOnce(Arc<Timeline>) -> Fut,
-        Fut: Future<Output = Result<(), CreateTimelineError>>,
-    {
-        debug_assert_current_span_has_tenant_and_timeline_id();
-
-        // Remember that we did I/O (spawned the flush loop), so that we can check we shut it down on drop
-        self.needs_shutdown = true;
-
-        let timeline = self.raw_timeline()?;
-
-        // Spawn flush loop so that the Timeline is ready to accept writes
-        timeline.maybe_spawn_flush_loop();
-
-        // Invoke the provided function, which will write some data into the new timeline
-        if let Err(e) = f(timeline.clone()).await {
-            self.abort().await;
-            return Err(e.into());
-        }
-
-        // Flush the underlying timeline's ephemeral layers to disk
-        if let Err(e) = timeline
-            .freeze_and_flush()
-            .await
-            .context("Failed to flush after timeline creation writes")
-        {
-            self.abort().await;
-            return Err(e);
-        }
-
-        Ok(())
-    }
-
-    pub(crate) async fn abort(&self) {
-        if let Some((raw_timeline, _)) = self.raw_timeline.as_ref() {
-            raw_timeline.shutdown(super::ShutdownMode::Hard).await;
        }
    }

@@ -92,13 +44,11 @@ impl<'t> UninitializedTimeline<'t> {
    /// This function launches the flush loop if not already done.
    ///
    /// The caller is responsible for activating the timeline (function `.activate()`).
-    pub(crate) async fn finish_creation(mut self) -> anyhow::Result<Arc<Timeline>> {
+    pub(crate) fn finish_creation(mut self) -> anyhow::Result<Arc<Timeline>> {
        let timeline_id = self.timeline_id;
        let tenant_shard_id = self.owning_tenant.tenant_shard_id;

        if self.raw_timeline.is_none() {
-            self.abort().await;
-
            return Err(anyhow::anyhow!(
                "No timeline for initialization found for {tenant_shard_id}/{timeline_id}"
            ));
@@ -112,25 +62,16 @@ impl<'t> UninitializedTimeline<'t> {
            .0
            .get_disk_consistent_lsn();

-        if !new_disk_consistent_lsn.is_valid() {
-            self.abort().await;
-
-            return Err(anyhow::anyhow!(
-                "new timeline {tenant_shard_id}/{timeline_id} has invalid disk_consistent_lsn"
-            ));
-        }
+        anyhow::ensure!(
+            new_disk_consistent_lsn.is_valid(),
+            "new timeline {tenant_shard_id}/{timeline_id} has invalid disk_consistent_lsn"
+        );

        let mut timelines = self.owning_tenant.timelines.lock().unwrap();
        match timelines.entry(timeline_id) {
-            Entry::Occupied(_) => {
-                // Unexpected, bug in the caller.  Tenant is responsible for preventing concurrent creation of the same timeline.
-                //
-                // We do not call Self::abort here.  Because we don't cleanly shut down our Timeline, [`Self::drop`] should
-                // skip trying to delete the timeline directory too.
-                anyhow::bail!(
+            Entry::Occupied(_) => anyhow::bail!(
                "Found freshly initialized timeline {tenant_shard_id}/{timeline_id} in the tenant map"
-                )
-            }
+            ),
            Entry::Vacant(v) => {
                // after taking here should be no fallible operations, because the drop guard will not
                // cleanup after and would block for example the tenant deletion
@@ -152,31 +93,36 @@ impl<'t> UninitializedTimeline<'t> {

    /// Prepares timeline data by loading it from the basebackup archive.
    pub(crate) async fn import_basebackup_from_tar(
-        mut self,
+        self,
        tenant: Arc<Tenant>,
        copyin_read: &mut (impl tokio::io::AsyncRead + Send + Sync + Unpin),
        base_lsn: Lsn,
        broker_client: storage_broker::BrokerClientChannel,
        ctx: &RequestContext,
    ) -> anyhow::Result<Arc<Timeline>> {
-        self.write(|raw_timeline| async move {
-            import_datadir::import_basebackup_from_tar(&raw_timeline, copyin_read, base_lsn, ctx)
-                .await
-                .context("Failed to import basebackup")
-                .map_err(CreateTimelineError::Other)?;
+        let raw_timeline = self.raw_timeline()?;

-            fail::fail_point!("before-checkpoint-new-timeline", |_| {
-                Err(CreateTimelineError::Other(anyhow::anyhow!(
-                    "failpoint before-checkpoint-new-timeline"
-                )))
-            });
+        import_datadir::import_basebackup_from_tar(raw_timeline, copyin_read, base_lsn, ctx)
+            .await
+            .context("Failed to import basebackup")?;

-            Ok(())
-        })
-        .await?;
+        // Flush the new layer files to disk, before we make the timeline as available to
+        // the outside world.
+        //
+        // Flush loop needs to be spawned in order to be able to flush.
+        raw_timeline.maybe_spawn_flush_loop();
+
+        fail::fail_point!("before-checkpoint-new-timeline", |_| {
+            anyhow::bail!("failpoint before-checkpoint-new-timeline");
+        });
+
+        raw_timeline
+            .freeze_and_flush()
+            .await
+            .context("Failed to flush after basebackup import")?;

        // All the data has been imported. Insert the Timeline into the tenant's timelines map
-        let tl = self.finish_creation().await?;
+        let tl = self.finish_creation()?;
        tl.activate(tenant, broker_client, None, ctx);
        Ok(tl)
    }
@@ -197,19 +143,12 @@ impl<'t> UninitializedTimeline<'t> {

 impl Drop for UninitializedTimeline<'_> {
    fn drop(&mut self) {
-        if let Some((timeline, create_guard)) = self.raw_timeline.take() {
+        if let Some((_, create_guard)) = self.raw_timeline.take() {
            let _entered = info_span!("drop_uninitialized_timeline", tenant_id = %self.owning_tenant.tenant_shard_id.tenant_id, shard_id = %self.owning_tenant.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id).entered();
-            if self.needs_shutdown && !timeline.gate.close_complete() {
-                // This should not happen: caller should call [`Self::abort`] on failures
-                tracing::warn!(
-                    "Timeline not shut down after initialization failure, cannot clean up files"
-                );
-            } else {
-                // This is unusual, but can happen harmlessly if the pageserver is stopped while
-                // creating a timeline.
-                info!("Timeline got dropped without initializing, cleaning its files");
-                cleanup_timeline_directory(create_guard);
-            }
+            // This is unusual, but can happen harmlessly if the pageserver is stopped while
+            // creating a timeline.
+            info!("Timeline got dropped without initializing, cleaning its files");
+            cleanup_timeline_directory(create_guard);
        }
    }
 }
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -234,19 +234,6 @@ impl VirtualFile {
    ) -> (FullSlice<Buf>, Result<usize, Error>) {
        self.inner.write_all(buf, ctx).await
    }
-
-    async fn read_to_end(&mut self, buf: &mut Vec<u8>, ctx: &RequestContext) -> Result<(), Error> {
-        self.inner.read_to_end(buf, ctx).await
-    }
-
-    pub(crate) async fn read_to_string(
-        &mut self,
-        ctx: &RequestContext,
-    ) -> Result<String, anyhow::Error> {
-        let mut buf = Vec::new();
-        self.read_to_end(&mut buf, ctx).await?;
-        Ok(String::from_utf8(buf)?)
-    }
 }

 /// Indicates whether to enable fsync, fdatasync, or O_SYNC/O_DSYNC when writing
@@ -1006,24 +993,6 @@ impl VirtualFileInner {
            (buf, result)
        })
    }
-
-    async fn read_to_end(&mut self, buf: &mut Vec<u8>, ctx: &RequestContext) -> Result<(), Error> {
-        let mut tmp = vec![0; 128];
-        loop {
-            let slice = tmp.slice(..128);
-            let (slice, res) = self.read_at(slice, self.pos, ctx).await;
-            match res {
-                Ok(0) => return Ok(()),
-                Ok(n) => {
-                    self.pos += n as u64;
-                    buf.extend_from_slice(&slice[..n]);
-                }
-                Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {}
-                Err(e) => return Err(e),
-            }
-            tmp = slice.into_inner();
-        }
-    }
 }

 // Adapted from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#117-135
@@ -1268,6 +1237,10 @@ impl VirtualFile {
    ) -> Result<crate::tenant::block_io::BlockLease<'_>, std::io::Error> {
        self.inner.read_blk(blknum, ctx).await
    }
+
+    async fn read_to_end(&mut self, buf: &mut Vec<u8>, ctx: &RequestContext) -> Result<(), Error> {
+        self.inner.read_to_end(buf, ctx).await
+    }
 }

 #[cfg(test)]
@@ -1287,6 +1260,24 @@ impl VirtualFileInner {
            slice.into_inner(),
        ))
    }
+
+    async fn read_to_end(&mut self, buf: &mut Vec<u8>, ctx: &RequestContext) -> Result<(), Error> {
+        let mut tmp = vec![0; 128];
+        loop {
+            let slice = tmp.slice(..128);
+            let (slice, res) = self.read_at(slice, self.pos, ctx).await;
+            match res {
+                Ok(0) => return Ok(()),
+                Ok(n) => {
+                    self.pos += n as u64;
+                    buf.extend_from_slice(&slice[..n]);
+                }
+                Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {}
+                Err(e) => return Err(e),
+            }
+            tmp = slice.into_inner();
+        }
+    }
 }

 impl Drop for VirtualFileInner {
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -505,13 +505,6 @@ async fn main() -> anyhow::Result<()> {
                }
            }

-            if let Some(mut redis_kv_client) = redis_kv_client {
-                maintenance_tasks.spawn(async move {
-                    redis_kv_client.try_connect().await?;
-                    handle_cancel_messages(&mut redis_kv_client, rx_cancel).await
-                });
-            }
-
            if let Some(regional_redis_client) = regional_redis_client {
                let cache = api.caches.endpoints_cache.clone();
                let con = regional_redis_client;
@@ -524,6 +517,15 @@ async fn main() -> anyhow::Result<()> {
        }
    }

+    if let Some(mut redis_kv_client) = redis_kv_client {
+        maintenance_tasks.spawn(async move {
+            if let Err(err) = redis_kv_client.try_connect().await {
+                tracing::error!(?err, "could not connect to redis")
+            }
+            handle_cancel_messages(&mut redis_kv_client, rx_cancel).await
+        });
+    }
+
    let maintenance = loop {
        // get one complete task
        match futures::future::select(
--- a/proxy/src/redis/cancellation_publisher.rs
+++ b/proxy/src/redis/cancellation_publisher.rs
@@ -1,114 +0,0 @@
-use core::net::IpAddr;
-use std::sync::Arc;
-
-use pq_proto::CancelKeyData;
-use tokio::sync::Mutex;
-use uuid::Uuid;
-
-use super::connection_with_credentials_provider::ConnectionWithCredentialsProvider;
-use crate::rate_limiter::{GlobalRateLimiter, RateBucketInfo};
-
-pub trait CancellationPublisherMut: Send + Sync + 'static {
-    #[allow(async_fn_in_trait)]
-    async fn try_publish(
-        &mut self,
-        cancel_key_data: CancelKeyData,
-        session_id: Uuid,
-        peer_addr: IpAddr,
-    ) -> anyhow::Result<()>;
-}
-
-pub trait CancellationPublisher: Send + Sync + 'static {
-    #[allow(async_fn_in_trait)]
-    async fn try_publish(
-        &self,
-        cancel_key_data: CancelKeyData,
-        session_id: Uuid,
-        peer_addr: IpAddr,
-    ) -> anyhow::Result<()>;
-}
-
-impl CancellationPublisher for () {
-    async fn try_publish(
-        &self,
-        _cancel_key_data: CancelKeyData,
-        _session_id: Uuid,
-        _peer_addr: IpAddr,
-    ) -> anyhow::Result<()> {
-        Ok(())
-    }
-}
-
-impl<P: CancellationPublisher> CancellationPublisherMut for P {
-    async fn try_publish(
-        &mut self,
-        cancel_key_data: CancelKeyData,
-        session_id: Uuid,
-        peer_addr: IpAddr,
-    ) -> anyhow::Result<()> {
-        <P as CancellationPublisher>::try_publish(self, cancel_key_data, session_id, peer_addr)
-            .await
-    }
-}
-
-impl<P: CancellationPublisher> CancellationPublisher for Option<P> {
-    async fn try_publish(
-        &self,
-        cancel_key_data: CancelKeyData,
-        session_id: Uuid,
-        peer_addr: IpAddr,
-    ) -> anyhow::Result<()> {
-        if let Some(p) = self {
-            p.try_publish(cancel_key_data, session_id, peer_addr).await
-        } else {
-            Ok(())
-        }
-    }
-}
-
-impl<P: CancellationPublisherMut> CancellationPublisher for Arc<Mutex<P>> {
-    async fn try_publish(
-        &self,
-        cancel_key_data: CancelKeyData,
-        session_id: Uuid,
-        peer_addr: IpAddr,
-    ) -> anyhow::Result<()> {
-        self.lock()
-            .await
-            .try_publish(cancel_key_data, session_id, peer_addr)
-            .await
-    }
-}
-
-pub struct RedisPublisherClient {
-    #[allow(dead_code)]
-    client: ConnectionWithCredentialsProvider,
-    _region_id: String,
-    _limiter: GlobalRateLimiter,
-}
-
-impl RedisPublisherClient {
-    pub fn new(
-        client: ConnectionWithCredentialsProvider,
-        region_id: String,
-        info: &'static [RateBucketInfo],
-    ) -> anyhow::Result<Self> {
-        Ok(Self {
-            client,
-            _region_id: region_id,
-            _limiter: GlobalRateLimiter::new(info.into()),
-        })
-    }
-
-    #[allow(dead_code)]
-    pub(crate) async fn try_connect(&mut self) -> anyhow::Result<()> {
-        match self.client.connect().await {
-            Ok(()) => {}
-            Err(e) => {
-                tracing::error!("failed to connect to redis: {e}");
-                return Err(e);
-            }
-        }
-        Ok(())
-    }
-}
--- a/proxy/src/redis/mod.rs
+++ b/proxy/src/redis/mod.rs
@@ -1,4 +1,3 @@
-pub mod cancellation_publisher;
 pub mod connection_with_credentials_provider;
 pub mod elasticache;
 pub mod keys;
--- a/storage_controller/Cargo.toml
+++ b/storage_controller/Cargo.toml
@@ -45,12 +45,11 @@ strum_macros.workspace = true

 diesel = { version = "2.2.6", features = [
    "serde_json",
-    "postgres",
-    "r2d2",
    "chrono",
 ] }
+diesel-async = { version = "0.5.2", features = ["postgres", "bb8", "async-connection-wrapper"] }
 diesel_migrations = { version = "2.2.0" }
-r2d2 = { version = "0.8.10" }
+scoped-futures = "0.1.4"

 utils = { path = "../libs/utils/" }
 metrics = { path = "../libs/metrics/" }
--- a/storage_controller/src/main.rs
+++ b/storage_controller/src/main.rs
@@ -308,7 +308,7 @@ async fn async_main() -> anyhow::Result<()> {
    // Validate that we can connect to the database
    Persistence::await_connection(&secrets.database_url, args.db_connect_timeout.into()).await?;

-    let persistence = Arc::new(Persistence::new(secrets.database_url));
+    let persistence = Arc::new(Persistence::new(secrets.database_url).await);

    let service = Service::spawn(config, persistence.clone()).await?;

--- a/storage_controller/src/persistence.rs
+++ b/storage_controller/src/persistence.rs
--- a/storage_controller/src/service/chaos_injector.rs
+++ b/storage_controller/src/service/chaos_injector.rs
@@ -96,38 +96,29 @@ impl ChaosInjector {
        let batch_size = 128;
        let mut inner = self.service.inner.write().unwrap();
        let (nodes, tenants, scheduler) = inner.parts_mut();
+        let tenant_ids = tenants.keys().cloned().collect::<Vec<_>>();

        // Prefer to migrate tenants that are currently outside their home AZ.  This avoids the chaos injector
        // continuously pushing tenants outside their home AZ: instead, we'll tend to cycle between picking some
        // random tenants to move, and then on next chaos iteration moving them back, then picking some new
        // random tenants on the next iteration.
-        let (out_of_home_az, in_home_az): (Vec<_>, Vec<_>) = tenants
-            .values()
-            .map(|shard| {
-                (
-                    shard.tenant_shard_id,
-                    shard.is_attached_outside_preferred_az(nodes),
-                )
-            })
-            .partition(|(_id, is_outside)| *is_outside);
-
-        let mut out_of_home_az: Vec<_> = out_of_home_az.into_iter().map(|(id, _)| id).collect();
-        let mut in_home_az: Vec<_> = in_home_az.into_iter().map(|(id, _)| id).collect();
-
        let mut victims = Vec::with_capacity(batch_size);
-        if out_of_home_az.len() >= batch_size {
-            tracing::info!("Injecting chaos: found {batch_size} shards to migrate back to home AZ (total {} out of home AZ)", out_of_home_az.len());
+        for shard in tenants.values() {
+            if shard.is_attached_outside_preferred_az(nodes) {
+                victims.push(shard.tenant_shard_id);
+            }

-            out_of_home_az.shuffle(&mut thread_rng());
-            victims.extend(out_of_home_az.into_iter().take(batch_size));
-        } else {
-            tracing::info!("Injecting chaos: found {} shards to migrate back to home AZ, picking {} random shards to migrate", out_of_home_az.len(), std::cmp::min(batch_size - out_of_home_az.len(), in_home_az.len()));
-
-            victims.extend(out_of_home_az);
-            in_home_az.shuffle(&mut thread_rng());
-            victims.extend(in_home_az.into_iter().take(batch_size - victims.len()));
+            if victims.len() >= batch_size {
+                break;
+            }
        }

+        let choose_random = batch_size.saturating_sub(victims.len());
+        tracing::info!("Injecting chaos: found {} shards to migrate back to home AZ, picking {choose_random} random shards to migrate", victims.len());
+
+        let random_victims = tenant_ids.choose_multiple(&mut thread_rng(), choose_random);
+        victims.extend(random_victims);
+
        for victim in victims {
            self.maybe_migrate_to_secondary(victim, nodes, tenants, scheduler);
        }
--- a/storage_controller/src/tenant_shard.rs
+++ b/storage_controller/src/tenant_shard.rs
@@ -1806,7 +1806,7 @@ impl TenantShard {
                        .get(&node_id)
                        .expect("referenced node exists")
                        .get_availability_zone_id(),
-                ) != self.intent.preferred_az_id.as_ref()
+                ) == self.intent.preferred_az_id.as_ref()
            })
            .unwrap_or(false)
    }
--- a/test_runner/fixtures/metrics.py
+++ b/test_runner/fixtures/metrics.py
@@ -158,9 +158,6 @@ PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] = (
    "pageserver_pitr_history_size",
    "pageserver_layer_bytes",
    "pageserver_layer_count",
-    "pageserver_layers_per_read_bucket",
-    "pageserver_layers_per_read_count",
-    "pageserver_layers_per_read_sum",
    "pageserver_visible_physical_size",
    "pageserver_storage_operations_seconds_count_total",
    "pageserver_storage_operations_seconds_sum_total",
--- a/test_runner/regress/test_compaction.py
+++ b/test_runner/regress/test_compaction.py
@@ -86,9 +86,9 @@ page_cache_size=10
    log.info("Checking layer access metrics ...")

    layer_access_metric_names = [
-        "pageserver_layers_per_read_global_sum",
-        "pageserver_layers_per_read_global_count",
-        "pageserver_layers_per_read_global_bucket",
+        "pageserver_layers_visited_per_vectored_read_global_sum",
+        "pageserver_layers_visited_per_vectored_read_global_count",
+        "pageserver_layers_visited_per_vectored_read_global_bucket",
    ]

    metrics = env.pageserver.http_client().get_metrics()
@@ -96,8 +96,8 @@ page_cache_size=10
        layer_access_metrics = metrics.query_all(name)
        log.info(f"Got metrics: {layer_access_metrics}")

-    vectored_sum = metrics.query_one("pageserver_layers_per_read_global_sum")
-    vectored_count = metrics.query_one("pageserver_layers_per_read_global_count")
+    vectored_sum = metrics.query_one("pageserver_layers_visited_per_vectored_read_global_sum")
+    vectored_count = metrics.query_one("pageserver_layers_visited_per_vectored_read_global_count")
    if vectored_count.value > 0:
        assert vectored_sum.value > 0
        vectored_average = vectored_sum.value / vectored_count.value
--- a/test_runner/regress/test_import_pgdata.py
+++ b/test_runner/regress/test_import_pgdata.py
@@ -59,9 +59,6 @@ def test_pgdata_import_smoke(
    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
    env = neon_env_builder.init_start()

-    # The test needs LocalFs support, which is only built in testing mode.
-    env.pageserver.is_testing_enabled_or_skip()
-
    env.pageserver.patch_config_toml_nonrecursive(
        {
            "import_pgdata_upcall_api": f"http://{cplane_mgmt_api_server.host}:{cplane_mgmt_api_server.port}/path/to/mgmt/api"
@@ -70,12 +67,6 @@ def test_pgdata_import_smoke(
    env.pageserver.stop()
    env.pageserver.start()

-    # By default our tests run with a tiny shared_buffers=1MB setting. That
-    # doesn't allow any prefetching on v17 and above, where the new streaming
-    # read machinery keeps buffers pinned while prefetching them.  Use a higher
-    # setting to enable prefetching and speed up the tests
-    ep_config = ["shared_buffers=64MB"]
-
    #
    # Put data in vanilla pg
    #
@@ -252,11 +243,7 @@ def test_pgdata_import_smoke(
    #

    ro_endpoint = env.endpoints.create_start(
-        branch_name=import_branch_name,
-        endpoint_id="ro",
-        tenant_id=tenant_id,
-        lsn=last_record_lsn,
-        config_lines=ep_config,
+        branch_name=import_branch_name, endpoint_id="ro", tenant_id=tenant_id, lsn=last_record_lsn
    )

    validate_vanilla_equivalence(ro_endpoint)
@@ -286,10 +273,7 @@ def test_pgdata_import_smoke(
    # validate that we can write
    #
    rw_endpoint = env.endpoints.create_start(
-        branch_name=import_branch_name,
-        endpoint_id="rw",
-        tenant_id=tenant_id,
-        config_lines=ep_config,
+        branch_name=import_branch_name, endpoint_id="rw", tenant_id=tenant_id
    )
    rw_endpoint.safe_psql("create table othertable(values text)")
    rw_lsn = Lsn(rw_endpoint.safe_psql_scalar("select pg_current_wal_flush_lsn()"))
@@ -309,7 +293,7 @@ def test_pgdata_import_smoke(
        ancestor_start_lsn=rw_lsn,
    )
    br_tip_endpoint = env.endpoints.create_start(
-        branch_name="br-tip", endpoint_id="br-tip-ro", tenant_id=tenant_id, config_lines=ep_config
+        branch_name="br-tip", endpoint_id="br-tip-ro", tenant_id=tenant_id
    )
    validate_vanilla_equivalence(br_tip_endpoint)
    br_tip_endpoint.safe_psql("select * from othertable")
@@ -322,10 +306,7 @@ def test_pgdata_import_smoke(
        ancestor_start_lsn=initdb_lsn,
    )
    br_initdb_endpoint = env.endpoints.create_start(
-        branch_name="br-initdb",
-        endpoint_id="br-initdb-ro",
-        tenant_id=tenant_id,
-        config_lines=ep_config,
+        branch_name="br-initdb", endpoint_id="br-initdb-ro", tenant_id=tenant_id
    )
    validate_vanilla_equivalence(br_initdb_endpoint)
    with pytest.raises(psycopg2.errors.UndefinedTable):
--- a/test_runner/regress/test_ondemand_download.py
+++ b/test_runner/regress/test_ondemand_download.py
@@ -27,7 +27,6 @@ from fixtures.pageserver.utils import (
 )
 from fixtures.remote_storage import RemoteStorageKind, S3Storage, s3_storage
 from fixtures.utils import query_scalar, wait_until
-from urllib3 import Retry

 if TYPE_CHECKING:
    from typing import Any
@@ -677,14 +676,16 @@ def test_layer_download_cancelled_by_config_location(neon_env_builder: NeonEnvBu
            "compaction_period": "0s",
        }
    )
-
-    # Disable retries, because we'll hit code paths that can give us
-    # 503 and want to see that directly
-    client = env.pageserver.http_client(retries=Retry(status=0))
-
+    client = env.pageserver.http_client()
    failpoint = "before-downloading-layer-stream-pausable"
    client.configure_failpoints((failpoint, "pause"))

+    env.pageserver.allowed_errors.extend(
+        [
+            ".*downloading failed, possibly for shutdown.*",
+        ]
+    )
+
    info = client.layer_map_info(env.initial_tenant, env.initial_timeline)
    assert len(info.delta_layers()) == 1

@@ -719,9 +720,13 @@ def test_layer_download_cancelled_by_config_location(neon_env_builder: NeonEnvBu

        client.configure_failpoints((failpoint, "off"))

-        with pytest.raises(PageserverApiException, match="Shutting down"):
+        with pytest.raises(
+            PageserverApiException, match="downloading failed, possibly for shutdown"
+        ):
            download.result()

+        env.pageserver.assert_log_contains(".*downloading failed, possibly for shutdown.*")
+
        detach.result()

        client.configure_failpoints((failpoint, "pause"))
--- a/test_runner/regress/test_timeline_archive.py
+++ b/test_runner/regress/test_timeline_archive.py
@@ -582,12 +582,12 @@ def test_timeline_archival_chaos(neon_env_builder: NeonEnvBuilder):
                # This is expected: we are injecting chaos, API calls will sometimes fail.
                # TODO: can we narrow this to assert we are getting friendly 503s?
                log.info(f"Iteration error, will retry: {e}")
-                shutdown.wait(random.random() * 0.5)
+                shutdown.wait(random.random())
            except requests.exceptions.RetryError as e:
                # Retryable error repeated more times than `requests` is configured to tolerate, this
                # is expected when a pageserver remains unavailable for a couple seconds
                log.info(f"Iteration error, will retry: {e}")
-                shutdown.wait(random.random() * 0.5)
+                shutdown.wait(random.random())
            except Exception as e:
                log.warning(
                    f"Unexpected worker exception (current timeline {state.timeline_id}): {e}"
@@ -632,7 +632,7 @@ def test_timeline_archival_chaos(neon_env_builder: NeonEnvBuilder):

                # Make sure we're up for as long as we spent restarting, to ensure operations can make progress
                log.info(f"Staying alive for {restart_duration}s")
-                time.sleep(restart_duration * 2)
+                time.sleep(restart_duration)
            else:
                # Migrate our tenant between pageservers
                origin_ps = env.get_tenant_pageserver(tenant_shard_id)
@@ -651,7 +651,7 @@ def test_timeline_archival_chaos(neon_env_builder: NeonEnvBuilder):

    # Sanity check that during our run we did exercise some full timeline lifecycles, in case
    # one of our workers got stuck
-    assert len(timelines_deleted) > 5
+    assert len(timelines_deleted) > 10

    # That no invariant-violations were reported by workers
    assert violations == []
--- a/vendor/postgres-v17
+++ b/vendor/postgres-v17
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,7 +1,7 @@
 {
  "v17": [
    "17.2",
-    "f0ffc8279dbcbbc439981a4fd001a9687e5d665d"
+    "b654fa88b6fd2ad24a03a14a7cd417ec66e518f9"
  ],
  "v16": [
    "16.6",