hopefully unbroken wip

pass aws creds via cli
Merge branch 'alek_targz' of github.com:neondatabase/neon into alek_targz_default_on
2026-05-18 13:40:37 +00:00 · 2023-07-18 08:45:39 -04:00 · 2023-07-17 08:31:12 -04:00 · 2023-07-17 07:59:30 -04:00 · 2023-07-14 13:55:14 -04:00 · 2023-07-14 10:54:16 -04:00
168 changed files with 2283 additions and 2529 deletions
--- a/.cargo/config.toml
+++ b/.cargo/config.toml
@@ -12,11 +12,6 @@ opt-level = 3
 # Turn on a small amount of optimization in Development mode.
 opt-level = 1

-[build]
-# This is only present for local builds, as it will be overridden
-# by the RUSTDOCFLAGS env var in CI.
-rustdocflags = ["-Arustdoc::private_intra_doc_links"]
-
 [alias]
 build_testing = ["build", "--features", "testing"]
 neon = ["run", "--bin", "neon_local"]
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -150,14 +150,6 @@ runs:
          EXTRA_PARAMS="--flaky-tests-json $TEST_OUTPUT/flaky.json $EXTRA_PARAMS"
        fi

-        # We use pytest-split plugin to run benchmarks in parallel on different CI runners
-        if [ "${TEST_SELECTION}" = "test_runner/performance" ] && [ "${{ inputs.build_type }}" != "remote" ]; then
-          mkdir -p $TEST_OUTPUT
-          poetry run ./scripts/benchmark_durations.py "${TEST_RESULT_CONNSTR}" --days 10 --output "$TEST_OUTPUT/benchmark_durations.json"
-
-          EXTRA_PARAMS="--durations-path $TEST_OUTPUT/benchmark_durations.json $EXTRA_PARAMS"
-        fi
-
        if [[ "${{ inputs.build_type }}" == "debug" ]]; then
          cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
        elif [[ "${{ inputs.build_type }}" == "release" ]]; then
--- a/.github/workflows/approved-for-ci-run.yml
+++ b/.github/workflows/approved-for-ci-run.yml
@@ -1,55 +0,0 @@
-name: Handle `approved-for-ci-run` label
-# This workflow helps to run CI pipeline for PRs made by external contributors (from forks).
-
-on:
-  pull_request:
-    types:
-      # Default types that triggers a workflow ([1]):
-      # - [1] https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#pull_request
-      - opened
-      - synchronize
-      - reopened
-      # Types that we wand to handle in addition to keep labels tidy:
-      - closed
-      # Actual magic happens here:
-      - labeled
-
-env:
-  GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-  PR_NUMBER: ${{ github.event.pull_request.number }}
-
-jobs:
-  remove-label:
-    # Remove `approved-for-ci-run` label if the workflow is triggered by changes in a PR.
-    # The PR should be reviewed and labelled manually again.
-
-    runs-on: [ ubuntu-latest ]
-
-    if: |
-      contains(fromJSON('["opened", "synchronize", "reopened", "closed"]'), github.event.action) &&
-      contains(github.event.pull_request.labels.*.name, 'approved-for-ci-run')
-
-    steps:
-      - run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run"
-
-  create-branch:
-    # Create a local branch for an `approved-for-ci-run` labelled PR to run CI pipeline in it.
-
-    runs-on: [ ubuntu-latest ]
-
-    if: |
-      github.event.action == 'labeled' &&
-      contains(github.event.pull_request.labels.*.name, 'approved-for-ci-run')
-
-    steps:
-      - run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run"
-
-      - uses: actions/checkout@v3
-        with:
-          ref: main
-
-      - run: gh pr checkout "${PR_NUMBER}"
-
-      - run: git checkout -b "ci-run/pr-${PR_NUMBER}"
-
-      - run: git push --force origin "ci-run/pr-${PR_NUMBER}"
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -5,7 +5,6 @@ on:
    branches:
      - main
      - release
-      - ci-run/pr-*
  pull_request:

 defaults:
@@ -128,11 +127,6 @@ jobs:
      - name: Run cargo clippy (release)
        run: cargo hack --feature-powerset clippy --release $CLIPPY_COMMON_ARGS

-      - name: Check documentation generation
-        run: cargo doc --workspace --no-deps --document-private-items
-        env:
-            RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links"
-
      # Use `${{ !cancelled() }}` to run quck tests after the longer clippy run
      - name: Check formatting
        if: ${{ !cancelled() }}
@@ -396,11 +390,13 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        pytest_split_group: [ 1, 2, 3, 4 ]
        build_type: [ release ]
    steps:
      - name: Checkout
        uses: actions/checkout@v3
+        with:
+          submodules: true
+          fetch-depth: 1

      - name: Pytest benchmarks
        uses: ./.github/actions/run-python-test-set
@@ -409,11 +405,9 @@ jobs:
          test_selection: performance
          run_in_parallel: false
          save_perf_report: ${{ github.ref_name == 'main' }}
-          extra_params: --splits ${{ strategy.job-total }} --group ${{ matrix.pytest_split_group }}
        env:
          VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
          PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
-          TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}"
      # XXX: no coverage data handling here, since benchmarks are run on release builds,
      # while coverage is currently collected for the debug ones

@@ -794,7 +788,7 @@ jobs:
      run:
        shell: sh -eu {0}
    env:
-      VM_BUILDER_VERSION: v0.13.1
+      VM_BUILDER_VERSION: v0.12.1

    steps:
      - name: Checkout
@@ -1007,8 +1001,6 @@ jobs:
          done

      - name: Upload postgres-extensions to S3
-        # TODO: Reenable step after switching to the new extensions format (tar-gzipped + index.json)
-        if: false
        run: |
          for BUCKET in $(echo ${S3_BUCKETS}); do
            aws s3 cp --recursive --only-show-errors ./extensions-to-upload s3://${BUCKET}/${{ needs.tag.outputs.build-tag }}/${{ matrix.version }}
--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -3,8 +3,7 @@ name: Check neon with extra platform builds
 on:
  push:
    branches:
-      - main
-      - ci-run/pr-*
+    - main
  pull_request:

 defaults:
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -907,12 +907,14 @@ dependencies = [
 "opentelemetry",
 "postgres",
 "regex",
+ "remote_storage",
 "reqwest",
 "serde",
 "serde_json",
 "tar",
 "tokio",
 "tokio-postgres",
+ "toml_edit",
 "tracing",
 "tracing-opentelemetry",
 "tracing-subscriber",
@@ -980,6 +982,7 @@ dependencies = [
 "tar",
 "thiserror",
 "toml",
+ "tracing",
 "url",
 "utils",
 "workspace_hack",
@@ -2379,9 +2382,9 @@ dependencies = [

 [[package]]
 name = "opentelemetry"
-version = "0.19.0"
+version = "0.18.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5f4b8347cc26099d3aeee044065ecc3ae11469796b4d65d065a23a584ed92a6f"
+checksum = "69d6c3d7288a106c0a363e4b0e8d308058d56902adefb16f4936f417ffef086e"
 dependencies = [
 "opentelemetry_api",
 "opentelemetry_sdk",
@@ -2389,9 +2392,9 @@ dependencies = [

 [[package]]
 name = "opentelemetry-http"
-version = "0.8.0"
+version = "0.7.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a819b71d6530c4297b49b3cae2939ab3a8cc1b9f382826a1bc29dd0ca3864906"
+checksum = "1edc79add46364183ece1a4542592ca593e6421c60807232f5b8f7a31703825d"
 dependencies = [
 "async-trait",
 "bytes",
@@ -2402,9 +2405,9 @@ dependencies = [

 [[package]]
 name = "opentelemetry-otlp"
-version = "0.12.0"
+version = "0.11.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8af72d59a4484654ea8eb183fea5ae4eb6a41d7ac3e3bae5f4d2a282a3a7d3ca"
+checksum = "d1c928609d087790fc936a1067bdc310ae702bdf3b090c3f281b713622c8bbde"
 dependencies = [
 "async-trait",
 "futures",
@@ -2420,47 +2423,48 @@ dependencies = [

 [[package]]
 name = "opentelemetry-proto"
-version = "0.2.0"
+version = "0.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "045f8eea8c0fa19f7d48e7bc3128a39c2e5c533d5c61298c548dfefc1064474c"
+checksum = "d61a2f56df5574508dd86aaca016c917489e589ece4141df1b5e349af8d66c28"
 dependencies = [
 "futures",
 "futures-util",
 "opentelemetry",
 "prost",
 "tonic 0.8.3",
+ "tonic-build 0.8.4",
 ]

 [[package]]
 name = "opentelemetry-semantic-conventions"
-version = "0.11.0"
+version = "0.10.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "24e33428e6bf08c6f7fcea4ddb8e358fab0fe48ab877a87c70c6ebe20f673ce5"
+checksum = "9b02e0230abb0ab6636d18e2ba8fa02903ea63772281340ccac18e0af3ec9eeb"
 dependencies = [
 "opentelemetry",
 ]

 [[package]]
 name = "opentelemetry_api"
-version = "0.19.0"
+version = "0.18.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ed41783a5bf567688eb38372f2b7a8530f5a607a4b49d38dd7573236c23ca7e2"
+checksum = "c24f96e21e7acc813c7a8394ee94978929db2bcc46cf6b5014fc612bf7760c22"
 dependencies = [
 "fnv",
 "futures-channel",
 "futures-util",
 "indexmap",
+ "js-sys",
 "once_cell",
 "pin-project-lite",
 "thiserror",
- "urlencoding",
 ]

 [[package]]
 name = "opentelemetry_sdk"
-version = "0.19.0"
+version = "0.18.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8b3a2a91fdbfdd4d212c0dcc2ab540de2c2bcbbd90be17de7a7daf8822d010c1"
+checksum = "1ca41c4933371b61c2a2f214bf16931499af4ec90543604ec828f7a625c09113"
 dependencies = [
 "async-trait",
 "crossbeam-channel",
@@ -2936,9 +2940,9 @@ checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068"

 [[package]]
 name = "proc-macro2"
-version = "1.0.64"
+version = "1.0.58"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "78803b62cbf1f46fde80d7c0e803111524b9877184cfe7c3033659490ac7a7da"
+checksum = "fa1fb82fc0c281dd9671101b66b771ebbe1eaf967b96ac8740dcba4b70005ca8"
 dependencies = [
 "unicode-ident",
 ]
@@ -3327,9 +3331,9 @@ dependencies = [

 [[package]]
 name = "reqwest-tracing"
-version = "0.4.5"
+version = "0.4.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1b97ad83c2fc18113346b7158d79732242002427c30f620fa817c1f32901e0a8"
+checksum = "783e8130d2427ddd7897dd3f814d4a3aea31b05deb42a4fdf8c18258fe5aefd1"
 dependencies = [
 "anyhow",
 "async-trait",
@@ -3854,8 +3858,7 @@ dependencies = [
 [[package]]
 name = "sharded-slab"
 version = "0.1.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "900fba806f70c630b0a382d0d825e17a0f19fcd059a2ade1ff237bcddf446b31"
+source = "git+https://github.com/neondatabase/sharded-slab.git?rev=98d16753ab01c61f0a028de44167307a00efea00#98d16753ab01c61f0a028de44167307a00efea00"
 dependencies = [
 "lazy_static",
 ]
@@ -3998,7 +4001,7 @@ dependencies = [
 "tokio",
 "tokio-stream",
 "tonic 0.9.2",
- "tonic-build",
+ "tonic-build 0.9.2",
 "tracing",
 "utils",
 "workspace_hack",
@@ -4099,7 +4102,7 @@ checksum = "4b55807c0344e1e6c04d7c965f5289c39a8d94ae23ed5c0b57aabac549f871c6"
 dependencies = [
 "filetime",
 "libc",
- "xattr 0.2.3",
+ "xattr",
 ]

 [[package]]
@@ -4380,17 +4383,16 @@ dependencies = [

 [[package]]
 name = "tokio-tar"
-version = "0.3.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9d5714c010ca3e5c27114c1cdeb9d14641ace49874aa5626d7149e47aedace75"
+version = "0.3.0"
+source = "git+https://github.com/neondatabase/tokio-tar.git?rev=404df61437de0feef49ba2ccdbdd94eb8ad6e142#404df61437de0feef49ba2ccdbdd94eb8ad6e142"
 dependencies = [
 "filetime",
 "futures-core",
 "libc",
- "redox_syscall 0.3.5",
+ "redox_syscall 0.2.16",
 "tokio",
 "tokio-stream",
- "xattr 1.0.0",
+ "xattr",
 ]

 [[package]]
@@ -4517,6 +4519,19 @@ dependencies = [
 "tracing",
 ]

+[[package]]
+name = "tonic-build"
+version = "0.8.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5bf5e9b9c0f7e0a7c027dcfaba7b2c60816c7049171f679d99ee2ff65d0de8c4"
+dependencies = [
+ "prettyplease 0.1.25",
+ "proc-macro2",
+ "prost-build",
+ "quote",
+ "syn 1.0.109",
+]
+
 [[package]]
 name = "tonic-build"
 version = "0.9.2"
@@ -4640,9 +4655,9 @@ dependencies = [

 [[package]]
 name = "tracing-opentelemetry"
-version = "0.19.0"
+version = "0.18.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "00a39dcf9bfc1742fa4d6215253b33a6e474be78275884c216fc2a06267b3600"
+checksum = "21ebb87a95ea13271332df069020513ab70bdb5637ca42d6e492dc3bbbad48de"
 dependencies = [
 "once_cell",
 "opentelemetry",
@@ -5364,15 +5379,6 @@ dependencies = [
 "libc",
 ]

-[[package]]
-name = "xattr"
-version = "1.0.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ea263437ca03c1522846a4ddafbca2542d0ad5ed9b784909d4b27b76f62bc34a"
-dependencies = [
- "libc",
-]
-
 [[package]]
 name = "xmlparser"
 version = "0.13.5"
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -84,9 +84,9 @@ notify = "5.0.0"
 num_cpus = "1.15"
 num-traits = "0.2.15"
 once_cell = "1.13"
-opentelemetry = "0.19.0"
-opentelemetry-otlp = { version = "0.12.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
-opentelemetry-semantic-conventions = "0.11.0"
+opentelemetry = "0.18.0"
+opentelemetry-otlp = { version = "0.11.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
+opentelemetry-semantic-conventions = "0.10.0"
 parking_lot = "0.12"
 pbkdf2 = "0.12.1"
 pin-project-lite = "0.2"
@@ -95,7 +95,7 @@ prost = "0.11"
 rand = "0.8"
 regex = "1.4"
 reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
-reqwest-tracing = { version = "0.4.0", features = ["opentelemetry_0_19"] }
+reqwest-tracing = { version = "0.4.0", features = ["opentelemetry_0_18"] }
 reqwest-middleware = "0.2.0"
 reqwest-retry = "0.2.2"
 routerify = "3"
@@ -124,14 +124,13 @@ tokio-io-timeout = "1.2.0"
 tokio-postgres-rustls = "0.9.0"
 tokio-rustls = "0.23"
 tokio-stream = "0.1"
-tokio-tar = "0.3"
 tokio-util = { version = "0.7", features = ["io"] }
 toml = "0.7"
 toml_edit = "0.19"
 tonic = {version = "0.9", features = ["tls", "tls-roots"]}
 tracing = "0.1"
 tracing-error = "0.2.0"
-tracing-opentelemetry = "0.19.0"
+tracing-opentelemetry = "0.18.0"
 tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter"] }
 url = "2.2"
 uuid = { version = "1.2", features = ["v4", "serde"] }
@@ -149,6 +148,7 @@ postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git
 postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
 postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
 tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
+tokio-tar = { git = "https://github.com/neondatabase/tokio-tar.git", rev="404df61437de0feef49ba2ccdbdd94eb8ad6e142" }

 ## Other git libraries
 heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending
@@ -185,6 +185,11 @@ tonic-build = "0.9"
 # TODO: we should probably fork `tokio-postgres-rustls` instead.
 tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }

+# Changes the MAX_THREADS limit from 4096 to 32768.
+# This is a temporary workaround for using tracing from many threads in safekeepers code,
+# until async safekeepers patch is merged to the main.
+sharded-slab = { git = "https://github.com/neondatabase/sharded-slab.git", rev="98d16753ab01c61f0a028de44167307a00efea00" }
+
 ################# Binary contents sections

 [profile.release]
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -535,10 +535,10 @@ FROM build-deps AS pg-embedding-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 ENV PATH "/usr/local/pgsql/bin/:$PATH"
-# eeb3ba7c3a60c95b2604dd543c64b2f1bb4a3703 made on 15/07/2023
+# 2465f831ea1f8d49c1d74f8959adb7fc277d70cd made on 05/07/2023
 # There is no release tag yet
-RUN wget https://github.com/neondatabase/pg_embedding/archive/eeb3ba7c3a60c95b2604dd543c64b2f1bb4a3703.tar.gz -O pg_embedding.tar.gz && \
-    echo "030846df723652f99a8689ce63b66fa0c23477a7fd723533ab8a6b28ab70730f pg_embedding.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/neondatabase/pg_embedding/archive/2465f831ea1f8d49c1d74f8959adb7fc277d70cd.tar.gz -O pg_embedding.tar.gz && \
+    echo "047af2b1f664a1e6e37867bd4eeaf5934fa27d6ba3d6c4461efa388ddf7cd1d5 pg_embedding.tar.gz" | sha256sum --check && \
    mkdir pg_embedding-src && cd pg_embedding-src && tar xvzf ../pg_embedding.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -32,3 +32,5 @@ url.workspace = true
 compute_api.workspace = true
 utils.workspace = true
 workspace_hack.workspace = true
+toml_edit.workspace = true
+remote_storage = { version = "0.1", path = "../libs/remote_storage/" }
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -5,6 +5,8 @@
 //! - `compute_ctl` accepts cluster (compute node) specification as a JSON file.
 //! - Every start is a fresh start, so the data directory is removed and
 //!   initialized again on each run.
+//! - If `--remote-ext-config` is provided, it is used to fetch the list of extensions
+//!   and to download `shared_preload_libraries` from the remote storage.
 //! - Next it will put configuration files into the `PGDATA` directory.
 //! - Sync safekeepers and get commit LSN.
 //! - Get `basebackup` from pageserver using the returned on the previous step LSN.
@@ -27,7 +29,8 @@
 //! compute_ctl -D /var/db/postgres/compute \
 //!             -C 'postgresql://cloud_admin@localhost/postgres' \
 //!             -S /var/db/postgres/specs/current.json \
-//!             -b /usr/local/bin/postgres
+//!             -b /usr/local/bin/postgres \
+//!             -r '{"bucket": "my-bucket", "region": "eu-central-1", "endpoint": "http://localhost:9000"}'
 //! ```
 //!
 use std::collections::HashMap;
@@ -35,7 +38,7 @@ use std::fs::File;
 use std::panic;
 use std::path::Path;
 use std::process::exit;
-use std::sync::{mpsc, Arc, Condvar, Mutex};
+use std::sync::{mpsc, Arc, Condvar, Mutex, OnceLock};
 use std::{thread, time::Duration};

 use anyhow::{Context, Result};
@@ -48,6 +51,8 @@ use compute_api::responses::ComputeStatus;

 use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec};
 use compute_tools::configurator::launch_configurator;
+use compute_tools::extension_server::launch_download_extensions;
+use compute_tools::extension_server::{get_pg_version, init_remote_storage};
 use compute_tools::http::api::launch_http_server;
 use compute_tools::logger::*;
 use compute_tools::monitor::launch_monitor;
@@ -55,15 +60,42 @@ use compute_tools::params::*;
 use compute_tools::spec::*;

 const BUILD_TAG_DEFAULT: &str = "local";
+const DEFAULT_REMOTE_EXT_CONFIG: &str = r#"{"bucket": "neon-dev-extensions", "region": "eu-central-1", "endpoint": null, "prefix": "5555"}"#;

 fn main() -> Result<()> {
    init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;

    let build_tag = option_env!("BUILD_TAG").unwrap_or(BUILD_TAG_DEFAULT);
-
    info!("build_tag: {build_tag}");

    let matches = cli().get_matches();
+    let pgbin_default = String::from("postgres");
+    let pgbin = matches.get_one::<String>("pgbin").unwrap_or(&pgbin_default);
+
+    let remote_ext_config = matches
+        .get_one::<String>("remote-ext-config")
+        .map(|x| x.to_string());
+    // let remote_ext_config =
+    //     Some(remote_ext_config.unwrap_or(DEFAULT_REMOTE_EXT_CONFIG.to_string()));
+
+    let ext_remote_storage = remote_ext_config.map(|x| {
+        init_remote_storage(&x, build_tag)
+            .expect("cannot initialize remote extension storage from config")
+    });
+    // creds used to connect to remote extensions bucket
+    // let aws_creds = matches.get_one::<String>("awscreds");
+    // if let Some(aws_creds) = aws_creds {
+    //     // not sure if this is a bad idea?
+    //     let aws_creds_dict: serde_json::Value = serde_json::from_str(aws_creds)?;
+    //     std::env::set_var(
+    //         "AWS_ACCESS_KEY_ID",
+    //         aws_creds_dict["ID"].as_str().expect("config parse error"),
+    //     );
+    //     std::env::set_var(
+    //         "AWS_SECRET_ACCESS_KEY",
+    //         aws_creds_dict["key"].as_str().expect("config parse error"),
+    //     );
+    // }

    let http_port = *matches
        .get_one::<u16>("http-port")
@@ -128,9 +160,6 @@ fn main() -> Result<()> {
    let compute_id = matches.get_one::<String>("compute-id");
    let control_plane_uri = matches.get_one::<String>("control-plane-uri");

-    // Try to use just 'postgres' if no path is provided
-    let pgbin = matches.get_one::<String>("pgbin").unwrap();
-
    let spec;
    let mut live_config_allowed = false;
    match spec_json {
@@ -168,6 +197,7 @@ fn main() -> Result<()> {

    let mut new_state = ComputeState::new();
    let spec_set;
+
    if let Some(spec) = spec {
        let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow::anyhow!(msg))?;
        new_state.pspec = Some(pspec);
@@ -179,9 +209,12 @@ fn main() -> Result<()> {
        connstr: Url::parse(connstr).context("cannot parse connstr as a URL")?,
        pgdata: pgdata.to_string(),
        pgbin: pgbin.to_string(),
+        pgversion: get_pg_version(pgbin),
        live_config_allowed,
        state: Mutex::new(new_state),
        state_changed: Condvar::new(),
+        ext_remote_storage,
+        available_extensions: OnceLock::new(),
    };
    let compute = Arc::new(compute_node);

@@ -190,6 +223,8 @@ fn main() -> Result<()> {
    let _http_handle =
        launch_http_server(http_port, &compute).expect("cannot launch http endpoint thread");

+    let extension_server_port: u16 = http_port;
+
    if !spec_set {
        // No spec provided, hang waiting for it.
        info!("no compute spec provided, waiting");
@@ -222,10 +257,18 @@ fn main() -> Result<()> {
    compute.state_changed.notify_all();
    drop(state);

+    // Launch remaining service threads
+    let _monitor_handle = launch_monitor(&compute).expect("cannot launch compute monitor thread");
+    let _configurator_handle =
+        launch_configurator(&compute).expect("cannot launch configurator thread");
+
+    let _download_extensions_handle =
+        launch_download_extensions(&compute).expect("cannot launch download extensions thread");
+
    // Start Postgres
    let mut delay_exit = false;
    let mut exit_code = None;
-    let pg = match compute.start_compute() {
+    let pg = match compute.start_compute(extension_server_port) {
        Ok(pg) => Some(pg),
        Err(err) => {
            error!("could not start the compute node: {:?}", err);
@@ -238,14 +281,6 @@ fn main() -> Result<()> {
        }
    };

-    // Launch remaining service threads
-    //
-    // NOTE we do this after starting postgres so that these two extra threads
-    //      don't blow the cpu budget and throttle the startup process.
-    let _monitor_handle = launch_monitor(&compute).expect("cannot launch compute monitor thread");
-    let _configurator_handle =
-        launch_configurator(&compute).expect("cannot launch configurator thread");
-
    // Wait for the child Postgres process forever. In this state Ctrl+C will
    // propagate to Postgres and it will be shut down as well.
    if let Some(mut pg) = pg {
@@ -362,6 +397,18 @@ fn cli() -> clap::Command {
                .long("control-plane-uri")
                .value_name("CONTROL_PLANE_API_BASE_URI"),
        )
+        .arg(
+            Arg::new("remote-ext-config")
+                .short('r')
+                .long("remote-ext-config")
+                .value_name("REMOTE_EXT_CONFIG"),
+        )
+        .arg(
+            Arg::new("awscreds")
+                .short('k')
+                .long("awscreds")
+                .value_name("AWS_CREDENTIALS"),
+        )
 }

 #[test]
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -1,14 +1,17 @@
+use std::collections::HashSet;
 use std::fs;
 use std::io::BufRead;
 use std::os::unix::fs::PermissionsExt;
 use std::path::Path;
 use std::process::{Command, Stdio};
 use std::str::FromStr;
-use std::sync::{Condvar, Mutex};
+use std::sync::{Condvar, Mutex, OnceLock};

 use anyhow::{Context, Result};
 use chrono::{DateTime, Utc};
+use futures::future::join_all;
 use postgres::{Client, NoTls};
+use tokio;
 use tokio_postgres;
 use tracing::{info, instrument, warn};
 use utils::id::{TenantId, TimelineId};
@@ -18,9 +21,11 @@ use compute_api::responses::{ComputeMetrics, ComputeStatus};
 use compute_api::spec::{ComputeMode, ComputeSpec};
 use utils::measured_stream::MeasuredReader;

-use crate::config;
+use remote_storage::GenericRemoteStorage;
+
 use crate::pg_helpers::*;
 use crate::spec::*;
+use crate::{config, extension_server};

 /// Compute node info shared across several `compute_ctl` threads.
 pub struct ComputeNode {
@@ -28,6 +33,7 @@ pub struct ComputeNode {
    pub connstr: url::Url,
    pub pgdata: String,
    pub pgbin: String,
+    pub pgversion: String,
    /// We should only allow live re- / configuration of the compute node if
    /// it uses 'pull model', i.e. it can go to control-plane and fetch
    /// the latest configuration. Otherwise, there could be a case:
@@ -47,6 +53,10 @@ pub struct ComputeNode {
    pub state: Mutex<ComputeState>,
    /// `Condvar` to allow notifying waiters about state changes.
    pub state_changed: Condvar,
+    /// The remote storage (S3 bucket) that we search for extensions in.
+    pub ext_remote_storage: Option<GenericRemoteStorage>,
+    /// Cached lists of available extensions and libraries.
+    pub available_extensions: OnceLock<HashSet<String>>,
 }

 #[derive(Clone, Debug)]
@@ -357,14 +367,22 @@ impl ComputeNode {
    /// Do all the preparations like PGDATA directory creation, configuration,
    /// safekeepers sync, basebackup, etc.
    #[instrument(skip_all)]
-    pub fn prepare_pgdata(&self, compute_state: &ComputeState) -> Result<()> {
+    pub fn prepare_pgdata(
+        &self,
+        compute_state: &ComputeState,
+        extension_server_port: u16,
+    ) -> Result<()> {
        let pspec = compute_state.pspec.as_ref().expect("spec must be set");
        let spec = &pspec.spec;
        let pgdata_path = Path::new(&self.pgdata);

        // Remove/create an empty pgdata directory and put configuration there.
        self.create_pgdata()?;
-        config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &pspec.spec)?;
+        config::write_postgres_conf(
+            &pgdata_path.join("postgresql.conf"),
+            &pspec.spec,
+            Some(extension_server_port),
+        )?;

        // Syncing safekeepers is only safe with primary nodes: if a primary
        // is already connected it will be kicked out, so a secondary (standby)
@@ -506,7 +524,7 @@ impl ComputeNode {

        // Write new config
        let pgdata_path = Path::new(&self.pgdata);
-        config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &spec)?;
+        config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &spec, None)?;

        let mut client = Client::connect(self.connstr.as_str(), NoTls)?;
        self.pg_reload_conf(&mut client)?;
@@ -536,7 +554,7 @@ impl ComputeNode {
    }

    #[instrument(skip_all)]
-    pub fn start_compute(&self) -> Result<std::process::Child> {
+    pub fn start_compute(&self, extension_server_port: u16) -> Result<std::process::Child> {
        let compute_state = self.state.lock().unwrap().clone();
        let pspec = compute_state.pspec.as_ref().expect("spec must be set");
        info!(
@@ -547,7 +565,26 @@ impl ComputeNode {
            pspec.timeline_id,
        );

-        self.prepare_pgdata(&compute_state)?;
+        // This part is synchronous because any remote shared_preload_libraries
+        // must be downloaded before postgres starts.
+        {
+            let library_load_start_time = Utc::now();
+            self.prepare_preload_libraries(&compute_state)?;
+
+            let library_load_time = Utc::now()
+                .signed_duration_since(library_load_start_time)
+                .to_std()
+                .unwrap()
+                .as_millis() as u64;
+            let mut state = self.state.lock().unwrap();
+            state.metrics.load_libraries_ms = library_load_time;
+            info!(
+                "Loading shared_preload_libraries took {:?}ms",
+                library_load_time
+            );
+        }
+
+        self.prepare_pgdata(&compute_state, extension_server_port)?;

        let start_time = Utc::now();
        let pg = self.start_postgres(pspec.storage_auth_token.clone())?;
@@ -695,4 +732,92 @@ LIMIT 100",
            "{{\"pg_stat_statements\": []}}".to_string()
        }
    }
+
+    // If remote extension storage is configured,
+    // download extension control files
+    #[tokio::main]
+    pub async fn prepare_external_extensions(&self, compute_state: &ComputeState) -> Result<()> {
+        if let Some(ref ext_remote_storage) = self.ext_remote_storage {
+            let pspec = compute_state.pspec.as_ref().expect("spec must be set");
+            let spec = &pspec.spec;
+            let custom_ext_prefixes = spec.custom_extensions.clone().unwrap_or(Vec::new());
+            info!("custom_ext_prefixes: {:?}", &custom_ext_prefixes);
+            let available_extensions = extension_server::get_available_extensions(
+                ext_remote_storage,
+                &self.pgbin,
+                &self.pgversion,
+                &custom_ext_prefixes,
+            )
+            .await?;
+            self.available_extensions
+                .set(available_extensions)
+                .expect("available_extensions.set error");
+        }
+        Ok(())
+    }
+
+    pub async fn download_extension(&self, ext_name: &str) -> Result<()> {
+        match &self.ext_remote_storage {
+            None => anyhow::bail!("No remote extension storage"),
+            Some(remote_storage) => {
+                extension_server::download_extension(
+                    ext_name,
+                    remote_storage,
+                    &self.pgbin,
+                    &self.pgversion,
+                )
+                .await
+            }
+        }
+    }
+
+    #[tokio::main]
+    pub async fn prepare_preload_libraries(&self, compute_state: &ComputeState) -> Result<()> {
+        if self.ext_remote_storage.is_none() {
+            return Ok(());
+        }
+        let pspec = compute_state.pspec.as_ref().expect("spec must be set");
+        let spec = &pspec.spec;
+
+        info!("parse shared_preload_libraries from spec.cluster.settings");
+        let mut libs_vec = Vec::new();
+        if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") {
+            libs_vec = libs
+                .split(&[',', '\'', ' '])
+                .filter(|s| *s != "neon" && !s.is_empty())
+                .map(str::to_string)
+                .collect();
+        }
+        info!("parse shared_preload_libraries from provided postgresql.conf");
+        // that is used in neon_local and python tests
+        if let Some(conf) = &spec.cluster.postgresql_conf {
+            let conf_lines = conf.split('\n').collect::<Vec<&str>>();
+            let mut shared_preload_libraries_line = "";
+            for line in conf_lines {
+                if line.starts_with("shared_preload_libraries") {
+                    shared_preload_libraries_line = line;
+                }
+            }
+            let mut preload_libs_vec = Vec::new();
+            if let Some(libs) = shared_preload_libraries_line.split("='").nth(1) {
+                preload_libs_vec = libs
+                    .split(&[',', '\'', ' '])
+                    .filter(|s| *s != "neon" && !s.is_empty())
+                    .map(str::to_string)
+                    .collect();
+            }
+            libs_vec.extend(preload_libs_vec);
+        }
+
+        info!("Downloading to shared preload libraries: {:?}", &libs_vec);
+        let mut download_tasks = Vec::new();
+        for library in &libs_vec {
+            download_tasks.push(self.download_extension(library));
+        }
+        let results = join_all(download_tasks).await;
+        for result in results {
+            result?; // propagate any errors
+        }
+        Ok(())
+    }
 }
--- a/compute_tools/src/config.rs
+++ b/compute_tools/src/config.rs
@@ -33,7 +33,11 @@ pub fn line_in_file(path: &Path, line: &str) -> Result<bool> {
 }

 /// Create or completely rewrite configuration file specified by `path`
-pub fn write_postgres_conf(path: &Path, spec: &ComputeSpec) -> Result<()> {
+pub fn write_postgres_conf(
+    path: &Path,
+    spec: &ComputeSpec,
+    extension_server_port: Option<u16>,
+) -> Result<()> {
    // File::create() destroys the file content if it exists.
    let mut file = File::create(path)?;

@@ -87,5 +91,9 @@ pub fn write_postgres_conf(path: &Path, spec: &ComputeSpec) -> Result<()> {
        writeln!(file, "# Managed by compute_ctl: end")?;
    }

+    if let Some(port) = extension_server_port {
+        writeln!(file, "neon.extension_server_port={}", port)?;
+    }
+
    Ok(())
 }
--- a/compute_tools/src/configurator.rs
+++ b/compute_tools/src/configurator.rs
@@ -42,13 +42,15 @@ fn configurator_main_loop(compute: &Arc<ComputeNode>) {
    }
 }

-pub fn launch_configurator(compute: &Arc<ComputeNode>) -> Result<thread::JoinHandle<()>> {
+pub fn launch_configurator(
+    compute: &Arc<ComputeNode>,
+) -> Result<thread::JoinHandle<()>, std::io::Error> {
    let compute = Arc::clone(compute);

-    Ok(thread::Builder::new()
+    thread::Builder::new()
        .name("compute-configurator".into())
        .spawn(move || {
            configurator_main_loop(&compute);
            info!("configurator thread is exited");
-        })?)
+        })
 }
--- a/compute_tools/src/extension_server.rs
+++ b/compute_tools/src/extension_server.rs
@@ -0,0 +1,237 @@
+// Download extension files from the extension store
+// and put them in the right place in the postgres directory
+/*
+The layout of the S3 bucket is as follows:
+
+v14/ext_index.json
+    -- this contains information necessary to create control files
+v14/extensions/test_ext1.tar.gz
+    -- this contains the library files and sql files necessary to create this extension
+v14/extensions/custom_ext1.tar.gz
+
+The difference between private and public extensions is determined by who can
+load the extension. This is specified in ext_index.json.
+
+Specifically, ext_index.json has a list of public extensions, and a list of
+extensions enabled for specific tenant-ids.
+*/
+use crate::compute::ComputeNode;
+use anyhow::Context;
+use anyhow::{self, Result};
+use flate2::read::GzDecoder;
+use remote_storage::*;
+use serde_json::{self, Value};
+use std::collections::HashSet;
+use std::num::{NonZeroU32, NonZeroUsize};
+use std::path::Path;
+use std::str;
+use std::sync::Arc;
+use std::thread;
+use tar::Archive;
+use tokio::io::AsyncReadExt;
+use tracing::info;
+
+/// Runs `pg_config <argument>` and returns its trimmed standard output.
+///
+/// `argument` is a `pg_config` flag such as `--version` or `--sharedir`.
+/// The `pg_config` binary is looked up in the directory obtained by stripping
+/// the trailing `postgres` from `pgbin`.
+///
+/// # Panics
+///
+/// Panics if `pgbin` does not end in `postgres`, if `pg_config` cannot be
+/// executed, or if its output is not valid UTF-8.
+fn get_pg_config(argument: &str, pgbin: &str) -> String {
+    let bindir = pgbin.strip_suffix("postgres").expect("bad pgbin");
+    // Join as a path instead of appending "/pg_config": a bare `postgres`
+    // must become a relative `pg_config` rather than `/pg_config` in the
+    // filesystem root, and a trailing separator in `bindir` is not doubled.
+    let pgconfig = Path::new(bindir).join("pg_config");
+    let config_output = std::process::Command::new(pgconfig)
+        .arg(argument)
+        .output()
+        .expect("pg_config error");
+    std::str::from_utf8(&config_output.stdout)
+        .expect("pg_config error")
+        .trim()
+        .to_string()
+}
+
+/// Returns the short major-version label (`"v14"` or `"v15"`) reported by the
+/// `pg_config` that sits next to `pgbin`.
+///
+/// # Panics
+///
+/// Panics if the reported major version is neither 14 nor 15, or if running
+/// `pg_config` itself panics.
+pub fn get_pg_version(pgbin: &str) -> String {
+    // pg_config --version returns a (platform specific) human readable string
+    // such as "PostgreSQL 15.4". Only the leading digits of the version token
+    // are the major version; searching the whole string for "15" would
+    // misclassify e.g. "PostgreSQL 14.15" as v15.
+    let human_version = get_pg_config("--version", pgbin);
+    let major_version = human_version
+        .split_whitespace()
+        .find(|token| token.starts_with(|c: char| c.is_ascii_digit()))
+        .and_then(|token| token.split(|c: char| !c.is_ascii_digit()).next());
+    match major_version {
+        Some("15") => "v15".to_string(),
+        Some("14") => "v14".to_string(),
+        _ => panic!("Unsuported postgres version {human_version}"),
+    }
+}
+
+/// Downloads `<pg_version>/ext_index.json` from remote storage and writes a
+/// `.control` file into the local `<sharedir>/extension` directory for every
+/// extension enabled for this compute.
+///
+/// Extensions listed under the `public` key of `enabled_extensions` are always
+/// included; those listed under any of `custom_ext_prefixes` are added on top.
+///
+/// Returns the set of enabled extension names.
+///
+/// # Errors
+///
+/// Fails if the index cannot be downloaded, is not valid UTF-8 or JSON, lacks
+/// the `control_data` / `enabled_extensions` objects, enables an extension
+/// that has no control data, or if a control file cannot be written.
+pub async fn get_available_extensions(
+    remote_storage: &GenericRemoteStorage,
+    pgbin: &str,
+    pg_version: &str,
+    custom_ext_prefixes: &[String],
+) -> Result<HashSet<String>> {
+    let local_sharedir = Path::new(&get_pg_config("--sharedir", pgbin)).join("extension");
+    let index_path = pg_version.to_owned() + "/ext_index.json";
+    let index_path = RemotePath::new(Path::new(&index_path)).context("error forming path")?;
+    info!("download ext_index.json: {:?}", &index_path);
+
+    // TODO: potential optimization: cache ext_index.json
+    let mut download = remote_storage.download(&index_path).await?;
+    let mut write_data_buffer = Vec::new();
+    download
+        .download_stream
+        .read_to_end(&mut write_data_buffer)
+        .await?;
+    // A corrupt index is reported to the caller instead of panicking.
+    let ext_index_str =
+        str::from_utf8(&write_data_buffer).context("ext_index.json is not valid UTF-8")?;
+
+    let ext_index_full: Value = serde_json::from_str(ext_index_str)?;
+    let ext_index_full = ext_index_full.as_object().context("error parsing json")?;
+    // Look keys up with `get`: indexing a JSON map by a missing key panics.
+    let control_data = ext_index_full
+        .get("control_data")
+        .and_then(Value::as_object)
+        .context("json parse error")?;
+    let enabled_extensions = ext_index_full
+        .get("enabled_extensions")
+        .and_then(Value::as_object)
+        .context("json parse error")?;
+    info!("{:?}", control_data);
+    info!("{:?}", enabled_extensions);
+
+    let mut prefixes = vec!["public".to_string()];
+    prefixes.extend(custom_ext_prefixes.iter().cloned());
+    info!("{:?}", &prefixes);
+    let mut all_extensions = HashSet::new();
+    for prefix in prefixes {
+        let prefix_extensions = match enabled_extensions.get(&prefix) {
+            Some(Value::Array(ext_names)) => ext_names,
+            _ => {
+                info!("prefix {} has no extensions", prefix);
+                continue;
+            }
+        };
+        info!("{:?}", prefix_extensions);
+        for ext_name in prefix_extensions {
+            all_extensions.insert(ext_name.as_str().context("json parse error")?.to_string());
+        }
+    }
+
+    for ext_name in &all_extensions {
+        let control_contents = control_data
+            .get(ext_name)
+            .and_then(Value::as_str)
+            .with_context(|| format!("no control data for extension {ext_name:?}"))?;
+        let control_path = local_sharedir.join(format!("{ext_name}.control"));
+
+        info!("WRITING FILE {:?}{:?}", control_path, control_contents);
+        std::fs::write(control_path, control_contents)?;
+    }
+
+    Ok(all_extensions)
+}
+
+/// Moves every entry of `src_dir` into `dst_dir`, keeping the file names.
+fn move_dir_contents(src_dir: &Path, dst_dir: &Path) -> Result<()> {
+    info!("mv {src_dir:?}/* {dst_dir:?}");
+    for entry in std::fs::read_dir(src_dir)? {
+        let old_file = entry?.path();
+        let new_file = dst_dir.join(old_file.file_name().context("error parsing file")?);
+        std::fs::rename(old_file, new_file)?;
+    }
+    Ok(())
+}
+
+/// Downloads `<pg_version>/extensions/<ext_name>.tar.gz` from remote storage,
+/// unpacks it and moves the extension's files into the postgres installation.
+///
+/// `ext_name` may carry a trailing `.so` (a library file name), which is
+/// stripped. The archive is unpacked under `pgbin` minus `/bin/postgres`;
+/// the contents of its `extensions/<ext_name>/share/extension` and
+/// `extensions/<ext_name>/lib` directories are then moved to
+/// `<sharedir>/extension` and `<libdir>/postgresql` respectively.
+///
+/// # Errors
+///
+/// Fails if the download, the unpacking, or any of the file moves fails.
+///
+/// # Panics
+///
+/// Panics if `pgbin` does not end in `/bin/postgres`.
+pub async fn download_extension(
+    ext_name: &str,
+    remote_storage: &GenericRemoteStorage,
+    pgbin: &str,
+    pg_version: &str,
+) -> Result<()> {
+    // TODO: potential optimization: only download the extension if it doesn't exist
+    // problem: how would we tell if it exists?
+    // Strip only a trailing ".so"; `replace` would also delete ".so" found in
+    // the middle of a name.
+    let ext_name = ext_name.strip_suffix(".so").unwrap_or(ext_name);
+    let ext_name_targz = format!("{ext_name}.tar.gz");
+    // NOTE(review): this looks the tarball up relative to the current working
+    // directory, but nothing in this function writes it to disk - confirm
+    // whether this check can ever succeed.
+    if Path::new(&ext_name_targz).exists() {
+        info!("extension {:?} already exists", ext_name_targz);
+        return Ok(());
+    }
+    let ext_path = RemotePath::new(
+        &Path::new(pg_version)
+            .join("extensions")
+            .join(&ext_name_targz),
+    )?;
+    info!(
+        "Start downloading extension {:?} from {:?}",
+        ext_name, ext_path
+    );
+    let mut download = remote_storage.download(&ext_path).await?;
+    let mut write_data_buffer = Vec::new();
+    download
+        .download_stream
+        .read_to_end(&mut write_data_buffer)
+        .await?;
+    let unzip_dest = pgbin.strip_suffix("/bin/postgres").expect("bad pgbin");
+    let tar = GzDecoder::new(write_data_buffer.as_slice());
+    let mut archive = Archive::new(tar);
+    archive.unpack(unzip_dest)?;
+    info!("Download + unzip {:?} completed successfully", &ext_path);
+
+    // SQL scripts / control data go to <sharedir>/extension ...
+    let local_sharedir = Path::new(&get_pg_config("--sharedir", pgbin)).join("extension");
+    let zip_sharedir = format!("{unzip_dest}/extensions/{ext_name}/share/extension");
+    move_dir_contents(Path::new(&zip_sharedir), &local_sharedir)?;
+
+    // ... and libraries go to <libdir>/postgresql.
+    let local_libdir = Path::new(&get_pg_config("--libdir", pgbin)).join("postgresql");
+    let zip_libdir = format!("{unzip_dest}/extensions/{ext_name}/lib");
+    move_dir_contents(Path::new(&zip_libdir), &local_libdir)?;
+    Ok(())
+}
+
+/// Builds the S3-backed [`GenericRemoteStorage`] used for the extension store
+/// (this only initializes the necessary structs, so it should be fairly cheap).
+///
+/// `remote_ext_config` is a JSON document. Its `bucket` and `region` string
+/// fields are mandatory; `endpoint` is optional, and `prefix` falls back to
+/// `default_prefix` when absent.
+///
+/// # Errors
+///
+/// Fails if the JSON cannot be parsed, if `bucket` or `region` is missing or
+/// not a string, or if the storage cannot be created from the configuration.
+pub fn init_remote_storage(
+    remote_ext_config: &str,
+    default_prefix: &str,
+) -> anyhow::Result<GenericRemoteStorage> {
+    let parsed: serde_json::Value = serde_json::from_str(remote_ext_config)?;
+
+    // Mandatory string fields all fail with the same message.
+    let required = |key: &str| -> anyhow::Result<String> {
+        let value = parsed[key].as_str().context("config parse error")?;
+        Ok(String::from(value))
+    };
+    let bucket_name = required("bucket")?;
+    let bucket_region = required("region")?;
+    let endpoint = parsed["endpoint"].as_str().map(String::from);
+    let prefix_in_bucket = match parsed["prefix"].as_str() {
+        Some(prefix) => String::from(prefix),
+        None => String::from(default_prefix),
+    };
+
+    // TODO: potentially allow modification of other parameters
+    // however, default values should be fine for now
+    let s3 = S3Config {
+        bucket_name,
+        bucket_region,
+        prefix_in_bucket: Some(prefix_in_bucket),
+        endpoint,
+        concurrency_limit: NonZeroUsize::new(100).expect("100 != 0"),
+        max_keys_per_list_response: None,
+    };
+    GenericRemoteStorage::from_config(&RemoteStorageConfig {
+        max_concurrent_syncs: NonZeroUsize::new(100).expect("100 != 0"),
+        max_sync_errors: NonZeroU32::new(100).expect("100 != 0"),
+        storage: RemoteStorageKind::AwsS3(s3),
+    })
+}
+
+/// Spawns the `download-extensions` thread, which locks `compute.state` and
+/// calls `prepare_external_extensions` with it.
+///
+/// Returns the thread's `JoinHandle`, or the I/O error raised while spawning.
+/// The spawned thread panics if the state lock cannot be taken or if
+/// preparing the extensions fails.
+pub fn launch_download_extensions(
+    compute: &Arc<ComputeNode>,
+) -> Result<thread::JoinHandle<()>, std::io::Error> {
+    let node = Arc::clone(compute);
+    let download_all = move || {
+        info!("start download_extension_files");
+        // The guard lives until the end of the closure.
+        let state_guard = node.state.lock().expect("error unlocking compute.state");
+        node.prepare_external_extensions(&state_guard)
+            .expect("error preparing extensions");
+        info!("download_extension_files done, exiting thread");
+    };
+
+    let builder = thread::Builder::new().name("download-extensions".into());
+    builder.spawn(download_all)
+}
--- a/compute_tools/src/http/api.rs
+++ b/compute_tools/src/http/api.rs
@@ -121,6 +121,27 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
            }
        }

+        // download extension files from S3 on demand
+        (&Method::POST, route) if route.starts_with("/extension_server/") => {
+            info!("serving {:?} POST request", route);
+            info!("req.uri {:?}", req.uri());
+            let filename = route.split('/').last().unwrap().to_string();
+            info!(
+                "serving /extension_server POST request, filename: {:?}",
+                &filename
+            );
+
+            match compute.download_extension(&filename).await {
+                Ok(_) => Response::new(Body::from("OK")),
+                Err(e) => {
+                    error!("extension download failed: {}", e);
+                    let mut resp = Response::new(Body::from(e.to_string()));
+                    *resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
+                    resp
+                }
+            }
+        }
+
        // Return the `404 Not Found` for any other routes.
        _ => {
            let mut not_found = Response::new(Body::from("404 Not Found"));
--- a/compute_tools/src/http/openapi_spec.yaml
+++ b/compute_tools/src/http/openapi_spec.yaml
@@ -139,6 +139,34 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/GenericError"
+  /extension_server:
+    post:
+      tags:
+      - Extension
+      summary: Download extension from S3 to local folder.
+      description: ""
+      operationId: downloadExtension
+      responses:
+        200:
+          description: Extension downloaded
+          content:
+            text/plain:
+              schema:
+                type: string
+                description: Error text or 'OK' if download succeeded.
+                example: "OK"
+        # description/content must be nested under their status code; as
+        # siblings they leave the response empty and duplicate keys.
+        400:
+          description: Request is invalid.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/GenericError"
+        500:
+          description: Extension download request failed.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/GenericError"

 components:
  securitySchemes:
--- a/compute_tools/src/lib.rs
+++ b/compute_tools/src/lib.rs
@@ -9,6 +9,7 @@ pub mod http;
 #[macro_use]
 pub mod logger;
 pub mod compute;
+pub mod extension_server;
 pub mod monitor;
 pub mod params;
 pub mod pg_helpers;
--- a/compute_tools/src/monitor.rs
+++ b/compute_tools/src/monitor.rs
@@ -105,10 +105,10 @@ fn watch_compute_activity(compute: &ComputeNode) {
 }

 /// Launch a separate compute monitor thread and return its `JoinHandle`.
-pub fn launch_monitor(state: &Arc<ComputeNode>) -> Result<thread::JoinHandle<()>> {
+pub fn launch_monitor(state: &Arc<ComputeNode>) -> Result<thread::JoinHandle<()>, std::io::Error> {
    let state = Arc::clone(state);

-    Ok(thread::Builder::new()
+    thread::Builder::new()
        .name("compute-monitor".into())
-        .spawn(move || watch_compute_activity(&state))?)
+        .spawn(move || watch_compute_activity(&state))
 }
--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -19,7 +19,7 @@ const POSTGRES_WAIT_TIMEOUT: Duration = Duration::from_millis(60 * 1000); // mil
 /// Escape a string for including it in a SQL literal. Wrapping the result
 /// with `E'{}'` or `'{}'` is not required, as it returns a ready-to-use
 /// SQL string literal, e.g. `'db'''` or `E'db\\'`.
-/// See <https://github.com/postgres/postgres/blob/da98d005cdbcd45af563d0c4ac86d0e9772cd15f/src/backend/utils/adt/quote.c#L47>
+/// See https://github.com/postgres/postgres/blob/da98d005cdbcd45af563d0c4ac86d0e9772cd15f/src/backend/utils/adt/quote.c#L47
 /// for the original implementation.
 pub fn escape_literal(s: &str) -> String {
    let res = s.replace('\'', "''").replace('\\', "\\\\");
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -124,7 +124,7 @@ pub fn get_spec_from_control_plane(
 pub fn handle_configuration(spec: &ComputeSpec, pgdata_path: &Path) -> Result<()> {
    // File `postgresql.conf` is no longer included into `basebackup`, so just
    // always write all config into it creating new file.
-    config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec)?;
+    config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec, None)?;

    update_pg_hba(pgdata_path)?;

--- a/control_plane/Cargo.toml
+++ b/control_plane/Cargo.toml
@@ -32,3 +32,4 @@ utils.workspace = true

 compute_api.workspace = true
 workspace_hack.workspace = true
+tracing.workspace = true
--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -10,7 +10,7 @@
 //! (non-Neon binaries don't necessarily follow our pidfile conventions).
 //! The pid stored in the file is later used to stop the service.
 //!
-//! See the [`lock_file`](utils::lock_file) module for more info.
+//! See [`lock_file`] module for more info.

 use std::ffi::OsStr;
 use std::io::Write;
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -658,6 +658,8 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                .get_one::<String>("endpoint_id")
                .ok_or_else(|| anyhow!("No endpoint ID was provided to start"))?;

+            let remote_ext_config = sub_args.get_one::<String>("remote-ext-config");
+
            // If --safekeepers argument is given, use only the listed safekeeper nodes.
            let safekeepers =
                if let Some(safekeepers_str) = sub_args.get_one::<String>("safekeepers") {
@@ -699,7 +701,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                    _ => {}
                }
                println!("Starting existing endpoint {endpoint_id}...");
-                endpoint.start(&auth_token, safekeepers)?;
+                endpoint.start(&auth_token, safekeepers, remote_ext_config)?;
            } else {
                let branch_name = sub_args
                    .get_one::<String>("branch-name")
@@ -743,7 +745,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                    pg_version,
                    mode,
                )?;
-                ep.start(&auth_token, safekeepers)?;
+                ep.start(&auth_token, safekeepers, remote_ext_config)?;
            }
        }
        "stop" => {
@@ -1003,6 +1005,12 @@ fn cli() -> Command {
        .help("Additional pageserver's configuration options or overrides, refer to pageserver's 'config-override' CLI parameter docs for more")
        .required(false);

+    let remote_ext_config_args = Arg::new("remote-ext-config")
+        .long("remote-ext-config")
+        .num_args(1)
+        .help("Configure the S3 bucket that we search for extensions in.")
+        .required(false);
+
    let lsn_arg = Arg::new("lsn")
        .long("lsn")
        .help("Specify Lsn on the timeline to start from. By default, end of the timeline would be used.")
@@ -1161,6 +1169,7 @@ fn cli() -> Command {
                    .arg(pg_version_arg)
                    .arg(hot_standby_arg)
                    .arg(safekeepers_arg)
+                    .arg(remote_ext_config_args)
                )
                .subcommand(
                    Command::new("stop")
--- a/control_plane/src/broker.rs
+++ b/control_plane/src/broker.rs
@@ -2,9 +2,8 @@
 //!
 //! In the local test environment, the data for each safekeeper is stored in
 //!
-//! ```text
 //!   .neon/safekeepers/<safekeeper id>
-//! ```
+//!
 use anyhow::Context;

 use std::path::PathBuf;
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -2,9 +2,7 @@
 //!
 //! In the local test environment, the data for each endpoint is stored in
 //!
-//! ```text
 //!   .neon/endpoints/<endpoint id>
-//! ```
 //!
 //! Some basic information about the endpoint, like the tenant and timeline IDs,
 //! are stored in the `endpoint.json` file. The `endpoint.json` file is created
@@ -24,7 +22,7 @@
 //!
 //! Directory contents:
 //!
-//! ```text
+//! ```ignore
 //! .neon/endpoints/main/
 //!     compute.log               - log output of `compute_ctl` and `postgres`
 //!     endpoint.json             - serialized `EndpointConf` struct
@@ -289,7 +287,7 @@ impl Endpoint {
                        .env
                        .safekeepers
                        .iter()
-                        .map(|sk| format!("localhost:{}", sk.get_compute_port()))
+                        .map(|sk| format!("localhost:{}", sk.pg_port))
                        .collect::<Vec<String>>()
                        .join(",");
                    conf.append("neon.safekeepers", &safekeepers);
@@ -313,12 +311,12 @@ impl Endpoint {

                // TODO: use future host field from safekeeper spec
                // Pass the list of safekeepers to the replica so that it can connect to any of them,
-                // whichever is availiable.
+                // whichever is available.
                let sk_ports = self
                    .env
                    .safekeepers
                    .iter()
-                    .map(|x| x.get_compute_port().to_string())
+                    .map(|x| x.pg_port.to_string())
                    .collect::<Vec<_>>()
                    .join(",");
                let sk_hosts = vec!["localhost"; self.env.safekeepers.len()].join(",");
@@ -420,7 +418,12 @@ impl Endpoint {
        Ok(())
    }

-    pub fn start(&self, auth_token: &Option<String>, safekeepers: Vec<NodeId>) -> Result<()> {
+    pub fn start(
+        &self,
+        auth_token: &Option<String>,
+        safekeepers: Vec<NodeId>,
+        remote_ext_config: Option<&String>,
+    ) -> Result<()> {
        if self.status() == "running" {
            anyhow::bail!("The endpoint is already running");
        }
@@ -463,7 +466,7 @@ impl Endpoint {
                    .iter()
                    .find(|node| node.id == sk_id)
                    .ok_or_else(|| anyhow!("safekeeper {sk_id} does not exist"))?;
-                safekeeper_connstrings.push(format!("127.0.0.1:{}", sk.get_compute_port()));
+                safekeeper_connstrings.push(format!("127.0.0.1:{}", sk.pg_port));
            }
        }

@@ -488,6 +491,15 @@ impl Endpoint {
            pageserver_connstring: Some(pageserver_connstring),
            safekeeper_connstrings,
            storage_auth_token: auth_token.clone(),
+            // TODO FIXME: This is a hack to test custom extensions locally.
+            // In test_download_extensions, we assume that the custom extension
+            // prefix is the tenant ID. So we set it here.
+            //
+            // The proper way to implement this is to pass the custom extension
+            // in spec, but we don't have a way to do that yet in the python tests.
+            // NEW HACK: we enable the anon custom extension for everyone! this is of course just for testing
+            // how will we do it for real?
+            custom_extensions: Some(vec!["123454321".to_string(), self.tenant_id.to_string()]),
        };
        let spec_path = self.endpoint_path().join("spec.json");
        std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
@@ -519,6 +531,11 @@ impl Endpoint {
            .stdin(std::process::Stdio::null())
            .stderr(logfile.try_clone()?)
            .stdout(logfile);
+
+        if let Some(remote_ext_config) = remote_ext_config {
+            cmd.args(["--remote-ext-config", remote_ext_config]);
+        }
+
        let child = cmd.spawn()?;

        // Write down the pid so we can wait for it when we want to stop
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -137,7 +137,6 @@ impl Default for PageServerConf {
 pub struct SafekeeperConf {
    pub id: NodeId,
    pub pg_port: u16,
-    pub pg_tenant_only_port: Option<u16>,
    pub http_port: u16,
    pub sync: bool,
    pub remote_storage: Option<String>,
@@ -150,7 +149,6 @@ impl Default for SafekeeperConf {
        Self {
            id: NodeId(0),
            pg_port: 0,
-            pg_tenant_only_port: None,
            http_port: 0,
            sync: true,
            remote_storage: None,
@@ -160,14 +158,6 @@ impl Default for SafekeeperConf {
    }
 }

-impl SafekeeperConf {
-    /// Compute is served by port on which only tenant scoped tokens allowed, if
-    /// it is configured.
-    pub fn get_compute_port(&self) -> u16 {
-        self.pg_tenant_only_port.unwrap_or(self.pg_port)
-    }
-}
-
 impl LocalEnv {
    pub fn pg_distrib_dir_raw(&self) -> PathBuf {
        self.pg_distrib_dir.clone()
--- a/control_plane/src/safekeeper.rs
+++ b/control_plane/src/safekeeper.rs
@@ -2,9 +2,8 @@
 //!
 //! In the local test environment, the data for each safekeeper is stored in
 //!
-//! ```text
 //!   .neon/safekeepers/<safekeeper id>
-//! ```
+//!
 use std::io::Write;
 use std::path::PathBuf;
 use std::process::Child;
@@ -120,55 +119,45 @@ impl SafekeeperNode {
        let availability_zone = format!("sk-{}", id_string);

        let mut args = vec![
-            "-D".to_owned(),
-            datadir
-                .to_str()
-                .with_context(|| {
-                    format!("Datadir path {datadir:?} cannot be represented as a unicode string")
-                })?
-                .to_owned(),
-            "--id".to_owned(),
-            id_string,
-            "--listen-pg".to_owned(),
-            listen_pg,
-            "--listen-http".to_owned(),
-            listen_http,
-            "--availability-zone".to_owned(),
-            availability_zone,
+            "-D",
+            datadir.to_str().with_context(|| {
+                format!("Datadir path {datadir:?} cannot be represented as a unicode string")
+            })?,
+            "--id",
+            &id_string,
+            "--listen-pg",
+            &listen_pg,
+            "--listen-http",
+            &listen_http,
+            "--availability-zone",
+            &availability_zone,
        ];
-        if let Some(pg_tenant_only_port) = self.conf.pg_tenant_only_port {
-            let listen_pg_tenant_only = format!("127.0.0.1:{}", pg_tenant_only_port);
-            args.extend(["--listen-pg-tenant-only".to_owned(), listen_pg_tenant_only]);
-        }
        if !self.conf.sync {
-            args.push("--no-sync".to_owned());
+            args.push("--no-sync");
        }

        let broker_endpoint = format!("{}", self.env.broker.client_url());
-        args.extend(["--broker-endpoint".to_owned(), broker_endpoint]);
+        args.extend(["--broker-endpoint", &broker_endpoint]);

        let mut backup_threads = String::new();
        if let Some(threads) = self.conf.backup_threads {
            backup_threads = threads.to_string();
-            args.extend(["--backup-threads".to_owned(), backup_threads]);
+            args.extend(["--backup-threads", &backup_threads]);
        } else {
            drop(backup_threads);
        }

        if let Some(ref remote_storage) = self.conf.remote_storage {
-            args.extend(["--remote-storage".to_owned(), remote_storage.clone()]);
+            args.extend(["--remote-storage", remote_storage]);
        }

        let key_path = self.env.base_data_dir.join("auth_public_key.pem");
        if self.conf.auth_enabled {
            args.extend([
-                "--auth-validation-public-key-path".to_owned(),
-                key_path
-                    .to_str()
-                    .with_context(|| {
-                        format!("Key path {key_path:?} cannot be represented as a unicode string")
-                    })?
-                    .to_owned(),
+                "--auth-validation-public-key-path",
+                key_path.to_str().with_context(|| {
+                    format!("Key path {key_path:?} cannot be represented as a unicode string")
+                })?,
            ]);
        }

--- a/docs/pageserver-thread-mgmt.md
+++ b/docs/pageserver-thread-mgmt.md
@@ -30,8 +30,8 @@ or similar, to wake up on shutdown.

 In async Rust, futures can be "cancelled" at any await point, by
 dropping the Future. For example, `tokio::select!` returns as soon as
-one of the Futures returns, and drops the others. `tokio::time::timeout`
-is another example. In the Rust ecosystem, some functions are
+one of the Futures returns, and drops the others. `tokio::timeout!` is
+another example. In the Rust ecosystem, some functions are
 cancellation-safe, meaning they can be safely dropped without
 side-effects, while others are not. See documentation of
 `tokio::select!` for examples.
@@ -42,9 +42,9 @@ function that you call cannot be assumed to be async
 cancellation-safe, and must be polled to completion.

 The downside of non-cancellation safe code is that you have to be very
-careful when using `tokio::select!`, `tokio::time::timeout`, and other
-such functions that can cause a Future to be dropped. They can only be
-used with functions that are explicitly documented to be cancellation-safe,
+careful when using `tokio::select!`, `tokio::timeout!`, and other such
+functions that can cause a Future to be dropped. They can only be used
+with functions that are explicitly documented to be cancellation-safe,
 or you need to spawn a separate task to shield from the cancellation.

 At the entry points to the code, we also take care to poll futures to
--- a/docs/rfcs/024-extension-loading.md
+++ b/docs/rfcs/024-extension-loading.md
@@ -0,0 +1,187 @@
+# Supporting custom user Extensions (Dynamic Extension Loading)
+Created 2023-05-03
+
+## Motivation
+
+There are many extensions in the PostgreSQL ecosystem, and not all extensions
+are of a quality that we can confidently support them. Additionally, our
+current extension inclusion mechanism has several problems because we build all
+extensions into the primary Compute image: We build the extensions every time
+we build the compute image regardless of whether we actually need to rebuild
+the image, and the inclusion of these extensions in the image adds a hard
+dependency on all supported extensions - thus increasing the image size, and
+with it the time it takes to download that image - increasing first start
+latency.
+
+This RFC proposes a dynamic loading mechanism that solves most of these
+problems.
+
+## Summary
+
+`compute_ctl` is made responsible for loading extensions on-demand into
+the container's file system for dynamically loaded extensions, and will also
+make sure that the extensions in `shared_preload_libraries` are downloaded
+before the compute node starts.
+
+## Components
+
+compute_ctl, PostgreSQL, neon (extension), Compute Host Node, Extension Store
+
+## Requirements
+
+Compute nodes with no extra extensions should not be negatively impacted by
+the existence of support for many extensions.
+
+Installing an extension into PostgreSQL should be easy.
+
+Non-preloaded extensions shouldn't impact startup latency.
+
+Uninstalled extensions shouldn't impact query latency.
+
+A small latency penalty for dynamically loaded extensions is acceptable in
+the first seconds of compute startup, but not in steady-state operations.
+
+## Proposed implementation
+
+### On-demand, JIT-loading of extensions
+
+Before postgres starts we download 
+- control files for all extensions available to that compute node;
+- all `shared_preload_libraries`;
+
+After postgres is running, `compute_ctl` listens for requests to load files.
+When PostgreSQL requests a file, `compute_ctl` downloads it.
+
+PostgreSQL requests files in the following cases:
+- When loading a preload library set in `local_preload_libraries`
+- When explicitly loading a library with `LOAD`
+- When creating an extension with `CREATE EXTENSION` (download sql scripts, (optional) extension data files and (optional) library files)
+
+
+#### Summary
+
+Pros:
+ - Startup is only as slow as it takes to load all (shared_)preload_libraries
+ - Supports BYO Extension
+
+Cons:
+ - O(sizeof(extensions)) IO requirement for loading all extensions.
+
+### Alternative solutions
+
+1. Allow users to add their extensions to the base image
+   
+   Pros:
+    - Easy to deploy
+
+   Cons:
+    - Doesn't scale - first start size is dependent on image size;
+    - All extensions are shared across all users: It doesn't allow users to
+      bring their own restrictive-licensed extensions
+
+2. Bring Your Own compute image
+   
+   Pros:
+    - Still easy to deploy
+    - User can bring own patched version of PostgreSQL
+
+   Cons:
+    - First start latency is O(sizeof(extensions image))
+    - Warm instance pool for skipping pod schedule latency is not feasible with
+      O(n) custom images
+    - Support channels are difficult to manage
+
+3. Download all user extensions in bulk on compute start
+   
+   Pros:
+    - Easy to deploy
+    - No startup latency issues for "clean" users.
+    - Warm instance pool for skipping pod schedule latency is possible
+
+   Cons:
+    - Downloading all extensions in advance takes a lot of time, thus startup
+      latency issues
+
+4. Store user's extensions in persistent storage
+   
+   Pros:
+    - Easy to deploy
+    - No startup latency issues
+    - Warm instance pool for skipping pod schedule latency is possible
+
+   Cons:
+    - EC2 instances have only limited number of attachments shared between EBS
+      volumes, direct-attached NVMe drives, and ENIs.
+    - Compute instance migration isn't trivially solved for EBS mounts (e.g.
+      the device is unavailable whilst moving the mount between instances).
+    - EBS can only mount on one instance at a time (except the expensive IO2
+      device type).
+
+5. Store user's extensions in network drive
+   
+   Pros:
+    - Easy to deploy
+    - Few startup latency issues
+    - Warm instance pool for skipping pod schedule latency is possible
+
+   Cons:
+    - We'd need networked drives, and a lot of them, which would store many
+      duplicate extensions.
+    - **UNCHECKED:** Compute instance migration may not work nicely with
+      networked IOs
+
+
+### Idea extensions
+
+The extension store does not have to be S3 directly, but could be a Node-local
+caching service on top of S3. This would reduce the load on the network for
+popular extensions.
+
+## Extension Storage implementation
+
+The layout of the S3 bucket is as follows:
+```
+v14/ext_index.json
+    -- this contains information necessary to create control files
+v14/extensions/test_ext1.tar.gz
+    -- this contains the library files and sql files necessary to create this extension
+v14/extensions/custom_ext1.tar.gz
+```
+The difference between private and public extensions is determined by who can
+load the extension. This is specified in `ext_index.json`.
+Specifically, `ext_index.json` has a list of public extensions, and a list of
+extensions enabled for specific tenant-ids. Here is an example `ext_index.json`:
+```
+{
+  "enabled_extensions": {
+    "123454321": [
+      "anon"
+    ],
+    "public": [
+      "embedding"
+    ]
+  },
+  "control_data": {
+    "embedding": "comment = 'hnsw index' \ndefault_version = '0.1.0' \nmodule_pathname = '$libdir/embedding' \nrelocatable = true \ntrusted = true",
+    "anon": "# PostgreSQL Anonymizer (anon) extension \ncomment = 'Data anonymization tools' \ndefault_version = '1.1.0' \ndirectory='extension/anon' \nrelocatable = false \nrequires = 'pgcrypto' \nsuperuser = false \nmodule_pathname = '$libdir/anon' \ntrusted = true \n"
+  }
+}
+```
+
+### How to add new extension to the Extension Storage?
+
+Simply upload build artifacts to the S3 bucket.
+Implement a CI step for that, split from the compute-node-image build.
+
+### How do we deal with extension versions and updates?
+
+Currently, we rebuild extensions on every compute-node-image build and store them in the <build-version> prefix.
+This is needed to ensure that `/share` and `/lib` files are in sync.
+
+For extension updates, we rely on the PostgreSQL extension versioning mechanism (sql update scripts) and extension authors to not break backwards compatibility within one major version of PostgreSQL.
+
+### Alternatives
+
+For extensions written in trusted languages we can also adopt
+`dbdev` PostgreSQL Package Manager based on `pg_tle` by Supabase.
+This will increase the number of supported extensions and decrease the amount of work required to support them.
--- a/docs/tools.md
+++ b/docs/tools.md
@@ -1,22 +0,0 @@
-# Useful development tools
-
-This readme contains some hints on how to set up some optional development tools.
-
-## ccls
-
-[ccls](https://github.com/MaskRay/ccls) is a c/c++ language server. It requires some setup
-to work well. There are different ways to do it but here's what works for me:
-1. Make a common parent directory for all your common neon projects. (for example, `~/src/neondatabase/`)
-2. Go to `vendor/postgres-v15`
-3. Run `make clean && ./configure`
-4. Install [bear](https://github.com/rizsotto/Bear), and run `bear -- make -j4`
-5. Copy the generated `compile_commands.json` to `~/src/neondatabase` (or equivalent)
-6. Run `touch ~/src/neondatabase/.ccls-root` this will make the `compile_commands.json` file discoverable in all subdirectories
-
-With this setup you will get decent lsp mileage inside the postgres repo, and also any postgres extensions that you put in `~/src/neondatabase/`, like `pg_embedding`, or inside `~/src/neondatabase/neon/pgxn` as well.
-
-Some additional tips for various IDEs:
-
-### Emacs
-
-To improve performance: `(setq lsp-lens-enable nil)`
--- a/libs/compute_api/src/responses.rs
+++ b/libs/compute_api/src/responses.rs
@@ -75,6 +75,7 @@ pub struct ComputeMetrics {
    pub start_postgres_ms: u64,
    pub config_ms: u64,
    pub total_startup_ms: u64,
+    pub load_libraries_ms: u64,
 }

 /// Response of the `/computes/{compute_id}/spec` control-plane API.
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -60,6 +60,9 @@ pub struct ComputeSpec {
    /// If set, 'storage_auth_token' is used as the password to authenticate to
    /// the pageserver and safekeepers.
    pub storage_auth_token: Option<String>,
+
+    // list of prefixes to search for custom extensions in remote extension storage
+    pub custom_extensions: Option<Vec<String>>,
 }

 #[serde_as]
--- a/libs/metrics/src/metric_vec_duration.rs
+++ b/libs/metrics/src/metric_vec_duration.rs
@@ -1,4 +1,4 @@
-//! Helpers for observing duration on `HistogramVec` / `CounterVec` / `GaugeVec` / `MetricVec<T>`.
+//! Helpers for observing duration on HistogramVec / CounterVec / GaugeVec / MetricVec<T>.

 use std::{future::Future, time::Instant};

--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -9,7 +9,6 @@ use serde::{Deserialize, Serialize};
 use serde_with::{serde_as, DisplayFromStr};
 use strum_macros;
 use utils::{
-    completion,
    history_buffer::HistoryBufferWithDropCounter,
    id::{NodeId, TenantId, TimelineId},
    lsn::Lsn,
@@ -77,12 +76,7 @@ pub enum TenantState {
    /// system is being shut down.
    ///
    /// Transitions out of this state are possible through `set_broken()`.
-    Stopping {
-        // Because of https://github.com/serde-rs/serde/issues/2105 this has to be a named field,
-        // otherwise it will not be skipped during deserialization
-        #[serde(skip)]
-        progress: completion::Barrier,
-    },
+    Stopping,
    /// The tenant is recognized by the pageserver, but can no longer be used for
    /// any operations.
    ///
@@ -124,7 +118,7 @@ impl TenantState {
            // Why is Stopping a Maybe case? Because, during pageserver shutdown,
            // we set the Stopping state irrespective of whether the tenant
            // has finished attaching or not.
-            Self::Stopping { .. } => Maybe,
+            Self::Stopping => Maybe,
        }
    }

@@ -417,16 +411,12 @@ pub struct LayerResidenceEvent {
    pub reason: LayerResidenceEventReason,
 }

-/// The reason for recording a given [`LayerResidenceEvent`].
+/// The reason for recording a given [`LayerResidenceEvent`].
 #[derive(Debug, Clone, Copy, Serialize, Deserialize)]
 pub enum LayerResidenceEventReason {
    /// The layer map is being populated, e.g. during timeline load or attach.
    /// This includes [`RemoteLayer`] objects created in [`reconcile_with_remote`].
    /// We need to record such events because there is no persistent storage for the events.
-    ///
-    // https://github.com/rust-lang/rust/issues/74481
-    /// [`RemoteLayer`]: ../../tenant/storage_layer/struct.RemoteLayer.html
-    /// [`reconcile_with_remote`]: ../../tenant/struct.Timeline.html#method.reconcile_with_remote
    LayerLoad,
    /// We just created the layer (e.g., freeze_and_flush or compaction).
    /// Such layers are always [`LayerResidenceStatus::Resident`].
@@ -934,13 +924,7 @@ mod tests {
                "Activating",
            ),
            (line!(), TenantState::Active, "Active"),
-            (
-                line!(),
-                TenantState::Stopping {
-                    progress: utils::completion::Barrier::default(),
-                },
-                "Stopping",
-            ),
+            (line!(), TenantState::Stopping, "Stopping"),
            (
                line!(),
                TenantState::Broken {
--- a/libs/pageserver_api/src/reltag.rs
+++ b/libs/pageserver_api/src/reltag.rs
@@ -60,9 +60,8 @@ impl Ord for RelTag {

 /// Display RelTag in the same format that's used in most PostgreSQL debug messages:
 ///
-/// ```text
 /// <spcnode>/<dbnode>/<relnode>[_fsm|_vm|_init]
-/// ```
+///
 impl fmt::Display for RelTag {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        if let Some(forkname) = forknumber_to_name(self.forknum) {
--- a/libs/postgres_ffi/src/relfile_utils.rs
+++ b/libs/postgres_ffi/src/relfile_utils.rs
@@ -49,16 +49,14 @@ pub fn forknumber_to_name(forknum: u8) -> Option<&'static str> {
    }
 }

+///
 /// Parse a filename of a relation file. Returns (relfilenode, forknum, segno) tuple.
 ///
 /// Formats:
-///
-/// ```text
 /// <oid>
 /// <oid>_<fork name>
 /// <oid>.<segment number>
 /// <oid>_<fork name>.<segment number>
-/// ```
 ///
 /// See functions relpath() and _mdfd_segpath() in PostgreSQL sources.
 ///
--- a/libs/pq_proto/src/framed.rs
+++ b/libs/pq_proto/src/framed.rs
@@ -5,11 +5,11 @@
 //! It is similar to what tokio_util::codec::Framed with appropriate codec
 //! provides, but `FramedReader` and `FramedWriter` read/write parts can be used
 //! separately without using split from futures::stream::StreamExt (which
-//! allocates a [Box] in polling internally). tokio::io::split is used for splitting
+//! allocates box[1] in polling internally). tokio::io::split is used for splitting
 //! instead. Plus we customize error messages more than a single type for all io
 //! calls.
 //!
-//! [Box]: https://docs.rs/futures-util/0.3.26/src/futures_util/lock/bilock.rs.html#107
+//! [1] https://docs.rs/futures-util/0.3.26/src/futures_util/lock/bilock.rs.html#107
 use bytes::{Buf, BytesMut};
 use std::{
    future::Future,
@@ -117,7 +117,7 @@ impl<S: AsyncWrite + Unpin> Framed<S> {
 impl<S: AsyncRead + AsyncWrite + Unpin> Framed<S> {
    /// Split into owned read and write parts. Beware of potential issues with
    /// using halves in different tasks on TLS stream:
-    /// <https://github.com/tokio-rs/tls/issues/40>
+    /// https://github.com/tokio-rs/tls/issues/40
    pub fn split(self) -> (FramedReader<S>, FramedWriter<S>) {
        let (read_half, write_half) = tokio::io::split(self.stream);
        let reader = FramedReader {
--- a/libs/pq_proto/src/lib.rs
+++ b/libs/pq_proto/src/lib.rs
@@ -934,15 +934,6 @@ impl<'a> BeMessage<'a> {
    }
 }

-fn terminate_code(code: &[u8; 5]) -> [u8; 6] {
-    let mut terminated = [0; 6];
-    for (i, &elem) in code.iter().enumerate() {
-        terminated[i] = elem;
-    }
-
-    terminated
-}
-
 #[cfg(test)]
 mod tests {
    use super::*;
@@ -974,3 +965,12 @@ mod tests {
        assert_eq!(split_options(&params), ["foo bar", " \\", "baz ", "lol"]);
    }
 }
+
+fn terminate_code(code: &[u8; 5]) -> [u8; 6] {
+    let mut terminated = [0; 6];
+    for (i, &elem) in code.iter().enumerate() {
+        terminated[i] = elem;
+    }
+
+    terminated
+}
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -34,12 +34,12 @@ pub const DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS: usize = 50;
 pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10;
 /// Currently, sync happens with AWS S3, that has two limits on requests per second:
 /// ~200 RPS for IAM services
-/// <https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/UsingWithRDS.IAMDBAuth.html>
+/// https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/UsingWithRDS.IAMDBAuth.html
 /// ~3500 PUT/COPY/POST/DELETE or 5500 GET/HEAD S3 requests
-/// <https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/>
+/// https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/
 pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100;
 /// No limits on the client side, which currenltly means 1000 for AWS S3.
-/// <https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax>
+/// https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax
 pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option<i32> = None;

 const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/';
@@ -50,12 +50,6 @@ const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/';
 #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
 pub struct RemotePath(PathBuf);

-impl std::fmt::Display for RemotePath {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "{}", self.0.display())
-    }
-}
-
 impl RemotePath {
    pub fn new(relative_path: &Path) -> anyhow::Result<Self> {
        anyhow::ensure!(
@@ -190,6 +184,20 @@ pub enum GenericRemoteStorage {
 }

 impl GenericRemoteStorage {
+    // A function for listing all the files in a "directory"
+    // Example:
+    // list_files("foo/bar") = ["foo/bar/a.txt", "foo/bar/b.txt"]
+    pub async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
+        match self {
+            Self::LocalFs(s) => s.list_files(folder).await,
+            Self::AwsS3(s) => s.list_files(folder).await,
+            Self::Unreliable(s) => s.list_files(folder).await,
+        }
+    }
+
+    // lists common *prefixes*, if any of files
+    // Example:
+    // list_prefixes("foo123","foo567","bar123","bar432") = ["foo", "bar"]
    pub async fn list_prefixes(
        &self,
        prefix: Option<&RemotePath>,
@@ -201,14 +209,6 @@ impl GenericRemoteStorage {
        }
    }

-    pub async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
-        match self {
-            Self::LocalFs(s) => s.list_files(folder).await,
-            Self::AwsS3(s) => s.list_files(folder).await,
-            Self::Unreliable(s) => s.list_files(folder).await,
-        }
-    }
-
    pub async fn upload(
        &self,
        from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -151,7 +151,10 @@ impl RemoteStorage for LocalFs {
        let mut files = vec![];
        let mut directory_queue = vec![full_path.clone()];

-        while let Some(cur_folder) = directory_queue.pop() {
+        while !directory_queue.is_empty() {
+            let cur_folder = directory_queue
+                .pop()
+                .expect("queue cannot be empty: we just checked");
            let mut entries = fs::read_dir(cur_folder.clone()).await?;
            while let Some(entry) = entries.next_entry().await? {
                let file_name: PathBuf = entry.file_name().into();
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -349,10 +349,17 @@ impl RemoteStorage for S3Bucket {

    /// See the doc for `RemoteStorage::list_files`
    async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
-        let folder_name = folder
+        let mut folder_name = folder
            .map(|p| self.relative_path_to_s3_object(p))
            .or_else(|| self.prefix_in_bucket.clone());

+        // remove leading "/" if one exists
+        if let Some(folder_name_slash) = folder_name.clone() {
+            if folder_name_slash.starts_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
+                folder_name = Some(folder_name_slash[1..].to_string());
+            }
+        }
+
        // AWS may need to break the response into several parts
        let mut continuation_token = None;
        let mut all_files = vec![];
--- a/libs/tenant_size_model/src/calculation.rs
+++ b/libs/tenant_size_model/src/calculation.rs
@@ -21,7 +21,7 @@ use crate::{SegmentMethod, SegmentSizeResult, SizeResult, StorageModel};
 // 2. D+C+a+b
 // 3. D+A+B

-/// `Segment` which has had its size calculated.
+/// [`Segment`] which has had its size calculated.
 #[derive(Clone, Debug)]
 struct SegmentSize {
    method: SegmentMethod,
--- a/libs/tracing-utils/src/http.rs
+++ b/libs/tracing-utils/src/http.rs
@@ -33,7 +33,7 @@ pub enum OtelName<'a> {
 /// directly into HTTP servers. However, I couldn't find one for Hyper,
 /// so I had to write our own. OpenTelemetry website has a registry of
 /// instrumentation libraries at:
-/// <https://opentelemetry.io/registry/?language=rust&component=instrumentation>
+/// https://opentelemetry.io/registry/?language=rust&component=instrumentation
 /// If a Hyper crate appears, consider switching to that.
 pub async fn tracing_handler<F, R>(
    req: Request<Body>,
--- a/libs/utils/src/auth.rs
+++ b/libs/utils/src/auth.rs
@@ -16,7 +16,7 @@ use crate::id::TenantId;
 /// Algorithm to use. We require EdDSA.
 const STORAGE_TOKEN_ALGORITHM: Algorithm = Algorithm::EdDSA;

-#[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq)]
+#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)]
 #[serde(rename_all = "lowercase")]
 pub enum Scope {
    // Provides access to all data for a specific tenant (specified in `struct Claims` below)
--- a/libs/utils/src/completion.rs
+++ b/libs/utils/src/completion.rs
@@ -12,13 +12,6 @@ pub struct Completion(mpsc::Sender<()>);
 #[derive(Clone)]
 pub struct Barrier(Arc<Mutex<mpsc::Receiver<()>>>);

-impl Default for Barrier {
-    fn default() -> Self {
-        let (_, rx) = channel();
-        rx
-    }
-}
-
 impl Barrier {
    pub async fn wait(self) {
        self.0.lock().await.recv().await;
@@ -31,15 +24,6 @@ impl Barrier {
    }
 }

-impl PartialEq for Barrier {
-    fn eq(&self, other: &Self) -> bool {
-        // we don't use dyn so this is good
-        Arc::ptr_eq(&self.0, &other.0)
-    }
-}
-
-impl Eq for Barrier {}
-
 /// Create new Guard and Barrier pair.
 pub fn channel() -> (Completion, Barrier) {
    let (tx, rx) = mpsc::channel::<()>(1);
--- a/libs/utils/src/error.rs
+++ b/libs/utils/src/error.rs
@@ -1,111 +0,0 @@
-/// Create a reporter for an error that outputs similar to [`anyhow::Error`] with Display with alternative setting.
-///
-/// It can be used with `anyhow::Error` as well.
-///
-/// Why would one use this instead of converting to `anyhow::Error` on the spot? Because
-/// anyhow::Error would also capture a stacktrace on the spot, which you would later discard after
-/// formatting.
-///
-/// ## Usage
-///
-/// ```rust
-/// #[derive(Debug, thiserror::Error)]
-/// enum MyCoolError {
-///   #[error("should never happen")]
-///   Bad(#[source] std::io::Error),
-/// }
-///
-/// # fn failing_call() -> Result<(), MyCoolError> { Err(MyCoolError::Bad(std::io::ErrorKind::PermissionDenied.into())) }
-///
-/// # fn main() {
-/// use utils::error::report_compact_sources;
-///
-/// if let Err(e) = failing_call() {
-///     let e = report_compact_sources(&e);
-///     assert_eq!(format!("{e}"), "should never happen: permission denied");
-/// }
-/// # }
-/// ```
-///
-/// ## TODO
-///
-/// When we are able to describe return position impl trait in traits, this should of course be an
-/// extension trait. Until then avoid boxing with this more ackward interface.
-pub fn report_compact_sources<E: std::error::Error>(e: &E) -> impl std::fmt::Display + '_ {
-    struct AnyhowDisplayAlternateAlike<'a, E>(&'a E);
-
-    impl<E: std::error::Error> std::fmt::Display for AnyhowDisplayAlternateAlike<'_, E> {
-        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-            write!(f, "{}", self.0)?;
-
-            // why is E a generic parameter here? hope that rustc will see through a default
-            // Error::source implementation and leave the following out if there cannot be any
-            // sources:
-            Sources(self.0.source()).try_for_each(|src| write!(f, ": {}", src))
-        }
-    }
-
-    struct Sources<'a>(Option<&'a (dyn std::error::Error + 'static)>);
-
-    impl<'a> Iterator for Sources<'a> {
-        type Item = &'a (dyn std::error::Error + 'static);
-
-        fn next(&mut self) -> Option<Self::Item> {
-            let rem = self.0;
-
-            let next = self.0.and_then(|x| x.source());
-            self.0 = next;
-            rem
-        }
-    }
-
-    AnyhowDisplayAlternateAlike(e)
-}
-
-#[cfg(test)]
-mod tests {
-    use super::report_compact_sources;
-
-    #[test]
-    fn report_compact_sources_examples() {
-        use std::fmt::Write;
-
-        #[derive(Debug, thiserror::Error)]
-        enum EvictionError {
-            #[error("cannot evict a remote layer")]
-            CannotEvictRemoteLayer,
-            #[error("stat failed")]
-            StatFailed(#[source] std::io::Error),
-            #[error("layer was no longer part of LayerMap")]
-            LayerNotFound(#[source] anyhow::Error),
-        }
-
-        let examples = [
-            (
-                line!(),
-                EvictionError::CannotEvictRemoteLayer,
-                "cannot evict a remote layer",
-            ),
-            (
-                line!(),
-                EvictionError::StatFailed(std::io::ErrorKind::PermissionDenied.into()),
-                "stat failed: permission denied",
-            ),
-            (
-                line!(),
-                EvictionError::LayerNotFound(anyhow::anyhow!("foobar")),
-                "layer was no longer part of LayerMap: foobar",
-            ),
-        ];
-
-        let mut s = String::new();
-
-        for (line, example, expected) in examples {
-            s.clear();
-
-            write!(s, "{}", report_compact_sources(&example)).expect("string grows");
-
-            assert_eq!(s, expected, "example on line {line}");
-        }
-    }
-}
--- a/libs/utils/src/http/json.rs
+++ b/libs/utils/src/http/json.rs
@@ -14,7 +14,7 @@ pub async fn json_request<T: for<'de> Deserialize<'de>>(
        .map_err(ApiError::BadRequest)
 }

-/// Will be removed as part of <https://github.com/neondatabase/neon/issues/4282>
+/// Will be removed as part of https://github.com/neondatabase/neon/issues/4282
 pub async fn json_request_or_empty_body<T: for<'de> Deserialize<'de>>(
    request: &mut Request<Body>,
 ) -> Result<Option<T>, ApiError> {
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -63,9 +63,6 @@ pub mod rate_limit;
 /// Simple once-barrier and a guard which keeps barrier awaiting.
 pub mod completion;

-/// Reporting utilities
-pub mod error;
-
 mod failpoint_macro_helpers {

    /// use with fail::cfg("$name", "return(2000)")
@@ -133,8 +130,8 @@ pub use failpoint_macro_helpers::failpoint_sleep_helper;
 /// Note that with git_version prefix is `git:` and in case of git version from env its `git-env:`.
 ///
 /// #############################################################################################
-/// TODO this macro is not the way the library is intended to be used, see <https://github.com/neondatabase/neon/issues/1565> for details.
-/// We use `cachepot` to reduce our current CI build times: <https://github.com/neondatabase/cloud/pull/1033#issuecomment-1100935036>
+/// TODO this macro is not the way the library is intended to be used, see https://github.com/neondatabase/neon/issues/1565 for details.
+/// We use `cachepot` to reduce our current CI build times: https://github.com/neondatabase/cloud/pull/1033#issuecomment-1100935036
 /// Yet, it seems to ignore the GIT_VERSION env variable, passed to Docker build, even with build.rs that contains
 /// `println!("cargo:rerun-if-env-changed=GIT_VERSION");` code for cachepot cache invalidation.
 /// The problem needs further investigation and regular `const` declaration instead of a macro.
--- a/libs/utils/src/lock_file.rs
+++ b/libs/utils/src/lock_file.rs
@@ -1,10 +1,9 @@
 //! A module to create and read lock files.
 //!
 //! File locking is done using [`fcntl::flock`] exclusive locks.
-//! The only consumer of this module is currently
-//! [`pid_file`](crate::pid_file). See the module-level comment
-//! there for potential pitfalls with lock files that are used
-//! to store PIDs (pidfiles).
+//! The only consumer of this module is currently [`pid_file`].
+//! See the module-level comment there for potential pitfalls
+//! with lock files that are used to store PIDs (pidfiles).

 use std::{
    fs,
@@ -82,7 +81,7 @@ pub fn create_exclusive(lock_file_path: &Path) -> anyhow::Result<UnwrittenLockFi
 }

 /// Returned by [`read_and_hold_lock_file`].
-/// Check out the [`pid_file`](crate::pid_file) module for what the variants mean
+/// Check out the [`pid_file`] module for what the variants mean
 /// and potential caveats if the lock files that are used to store PIDs.
 pub enum LockFileRead {
    /// No file exists at the given path.
--- a/libs/utils/src/logging.rs
+++ b/libs/utils/src/logging.rs
@@ -112,7 +112,7 @@ pub fn init(
 ///
 /// When the return value is dropped, the hook is reverted to std default hook (prints to stderr).
 /// If the assumptions about the initialization order are not held, use
-/// [`TracingPanicHookGuard::forget`] but keep in mind, if tracing is stopped, then panics will be
+/// [`TracingPanicHookGuard::disarm`] but keep in mind, if tracing is stopped, then panics will be
 /// lost.
 #[must_use]
 pub fn replace_panic_hook_with_tracing_panic_hook() -> TracingPanicHookGuard {
--- a/libs/utils/src/seqwait.rs
+++ b/libs/utils/src/seqwait.rs
@@ -23,9 +23,9 @@ pub enum SeqWaitError {

 /// Monotonically increasing value
 ///
-/// It is handy to store some other fields under the same mutex in `SeqWait<S>`
+/// It is handy to store some other fields under the same mutex in SeqWait<S>
 /// (e.g. store prev_record_lsn). So we allow SeqWait to be parametrized with
-/// any type that can expose counter. `V` is the type of exposed counter.
+/// any type that can expose counter. <V> is the type of exposed counter.
 pub trait MonotonicCounter<V> {
    /// Bump counter value and check that it goes forward
    /// N.B.: new_val is an actual new value, not a difference.
@@ -90,7 +90,7 @@ impl<T: Ord> Eq for Waiter<T> {}
 /// [`wait_for`]: SeqWait::wait_for
 /// [`advance`]: SeqWait::advance
 ///
-/// `S` means Storage, `V` is type of counter that this storage exposes.
+/// <S> means Storage, <V> is type of counter that this storage exposes.
 ///
 pub struct SeqWait<S, V>
 where
--- a/libs/utils/src/tracing_span_assert.rs
+++ b/libs/utils/src/tracing_span_assert.rs
@@ -1,15 +1,8 @@
 //! Assert that the current [`tracing::Span`] has a given set of fields.
 //!
-//! Can only produce meaningful positive results when tracing has been configured as in example.
-//! Absence of `tracing_error::ErrorLayer` is not detected yet.
-//!
-//! `#[cfg(test)]` code will get a pass when using the `check_fields_present` macro in case tracing
-//! is completly unconfigured.
-//!
 //! # Usage
 //!
-//! ```rust
-//! # fn main() {
+//! ```
 //! use tracing_subscriber::prelude::*;
 //! let registry = tracing_subscriber::registry()
 //!    .with(tracing_error::ErrorLayer::default());
@@ -27,18 +20,23 @@
 //!
 //! use utils::tracing_span_assert::{check_fields_present, MultiNameExtractor};
 //! let extractor = MultiNameExtractor::new("TestExtractor", ["test", "test_id"]);
-//! if let Err(missing) = check_fields_present!([&extractor]) {
-//!    // if you copypaste this to a custom assert method, remember to add #[track_caller]
-//!    // to get the "user" code location for the panic.
-//!    panic!("Missing fields: {missing:?}");
+//! match check_fields_present([&extractor]) {
+//!    Ok(()) => {},
+//!    Err(missing) => {
+//!        panic!("Missing fields: {:?}", missing.into_iter().map(|f| f.name() ).collect::<Vec<_>>());
+//!    }
 //! }
-//! # }
 //! ```
 //!
-//! Recommended reading: <https://docs.rs/tracing-subscriber/0.3.16/tracing_subscriber/layer/index.html#per-layer-filtering>
+//! Recommended reading: https://docs.rs/tracing-subscriber/0.3.16/tracing_subscriber/layer/index.html#per-layer-filtering
 //!

-#[derive(Debug)]
+use std::{
+    collections::HashSet,
+    fmt::{self},
+    hash::{Hash, Hasher},
+};
+
 pub enum ExtractionResult {
    Present,
    Absent,
@@ -73,101 +71,49 @@ impl<const L: usize> Extractor for MultiNameExtractor<L> {
    }
 }

-/// Checks that the given extractors are satisfied with the current span hierarchy.
-///
-/// This should not be called directly, but used through [`check_fields_present`] which allows
-/// `Summary::Unconfigured` only when the calling crate is being `#[cfg(test)]` as a conservative default.
-#[doc(hidden)]
-pub fn check_fields_present0<const L: usize>(
-    must_be_present: [&dyn Extractor; L],
-) -> Result<Summary, Vec<&dyn Extractor>> {
-    let mut missing = must_be_present.into_iter().collect::<Vec<_>>();
-    let trace = tracing_error::SpanTrace::capture();
-    trace.with_spans(|md, _formatted_fields| {
-        // when trying to understand the inner workings of how does the matching work, note that
-        // this closure might be called zero times if the span is disabled. normally it is called
-        // once per span hierarchy level.
-        missing.retain(|extractor| match extractor.extract(md.fields()) {
-            ExtractionResult::Present => false,
-            ExtractionResult::Absent => true,
-        });
+struct MemoryIdentity<'a>(&'a dyn Extractor);

-        // continue walking up until we've found all missing
-        !missing.is_empty()
-    });
-    if missing.is_empty() {
-        Ok(Summary::FoundEverything)
-    } else if !tracing_subscriber_configured() {
-        Ok(Summary::Unconfigured)
-    } else {
-        // we can still hit here if a tracing subscriber has been configured but the ErrorLayer is
-        // missing, which can be annoying. for this case, we could probably use
-        // SpanTrace::status().
-        //
-        // another way to end up here is with RUST_LOG=pageserver=off while configuring the
-        // logging, though I guess in that case the SpanTrace::status() == EMPTY would be valid.
-        // this case is covered by test `not_found_if_tracing_error_subscriber_has_wrong_filter`.
-        Err(missing)
+impl<'a> MemoryIdentity<'a> {
+    fn as_ptr(&self) -> *const () {
+        self.0 as *const _ as *const ()
+    }
+}
+impl<'a> PartialEq for MemoryIdentity<'a> {
+    fn eq(&self, other: &Self) -> bool {
+        self.as_ptr() == other.as_ptr()
+    }
+}
+impl<'a> Eq for MemoryIdentity<'a> {}
+impl<'a> Hash for MemoryIdentity<'a> {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        self.as_ptr().hash(state);
+    }
+}
+impl<'a> fmt::Debug for MemoryIdentity<'a> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{:p}: {}", self.as_ptr(), self.0.name())
    }
 }

-/// Checks that the given extractors are satisfied with the current span hierarchy.
-///
-/// The macro is the preferred way of checking if fields exist while passing checks if a test does
-/// not have tracing configured.
-///
-/// Why mangled name? Because #[macro_export] will expose it at utils::__check_fields_present.
-/// However we can game a module namespaced macro for `use` purposes by re-exporting the
-/// #[macro_export] exported name with an alias (below).
-#[doc(hidden)]
-#[macro_export]
-macro_rules! __check_fields_present {
-    ($extractors:expr) => {{
-        {
-            use $crate::tracing_span_assert::{check_fields_present0, Summary::*, Extractor};
-
-            match check_fields_present0($extractors) {
-                Ok(FoundEverything) => Ok(()),
-                Ok(Unconfigured) if cfg!(test) => {
-                    // allow unconfigured in tests
-                    Ok(())
-                },
-                Ok(Unconfigured) => {
-                    panic!("utils::tracing_span_assert: outside of #[cfg(test)] expected tracing to be configured with tracing_error::ErrorLayer")
-                },
-                Err(missing) => Err(missing)
-            }
-        }
-    }}
-}
-
-pub use crate::__check_fields_present as check_fields_present;
-
-/// Explanation for why the check was deemed ok.
-///
-/// Mainly useful for testing, or configuring per-crate behaviour as in with
-/// [`check_fields_present`].
-#[derive(Debug)]
-pub enum Summary {
-    /// All extractors were found.
-    ///
-    /// Should only happen when tracing is properly configured.
-    FoundEverything,
-
-    /// Tracing has not been configured at all. This is ok for tests running without tracing set
-    /// up.
-    Unconfigured,
-}
-
-fn tracing_subscriber_configured() -> bool {
-    let mut noop_configured = false;
-    tracing::dispatcher::get_default(|d| {
-        // it is possible that this closure will not be invoked, but the current implementation
-        // always invokes it
-        noop_configured = d.is::<tracing::subscriber::NoSubscriber>();
+/// Checks that the given extractors are satisfied with the current span hierarchy.
+pub fn check_fields_present<const L: usize>(
+    must_be_present: [&dyn Extractor; L],
+) -> Result<(), Vec<&dyn Extractor>> {
+    let mut missing: HashSet<MemoryIdentity> =
+        HashSet::from_iter(must_be_present.into_iter().map(|r| MemoryIdentity(r)));
+    let trace = tracing_error::SpanTrace::capture();
+    trace.with_spans(|md, _formatted_fields| {
+        missing.retain(|extractor| match extractor.0.extract(md.fields()) {
+            ExtractionResult::Present => false,
+            ExtractionResult::Absent => true,
+        });
+        !missing.is_empty() // continue walking up until we've found all missing
    });
-
-    !noop_configured
+    if missing.is_empty() {
+        Ok(())
+    } else {
+        Err(missing.into_iter().map(|mi| mi.0).collect())
+    }
 }

 #[cfg(test)]
@@ -177,36 +123,6 @@ mod tests {

    use super::*;

-    use std::{
-        collections::HashSet,
-        fmt::{self},
-        hash::{Hash, Hasher},
-    };
-
-    struct MemoryIdentity<'a>(&'a dyn Extractor);
-
-    impl<'a> MemoryIdentity<'a> {
-        fn as_ptr(&self) -> *const () {
-            self.0 as *const _ as *const ()
-        }
-    }
-    impl<'a> PartialEq for MemoryIdentity<'a> {
-        fn eq(&self, other: &Self) -> bool {
-            self.as_ptr() == other.as_ptr()
-        }
-    }
-    impl<'a> Eq for MemoryIdentity<'a> {}
-    impl<'a> Hash for MemoryIdentity<'a> {
-        fn hash<H: Hasher>(&self, state: &mut H) {
-            self.as_ptr().hash(state);
-        }
-    }
-    impl<'a> fmt::Debug for MemoryIdentity<'a> {
-        fn fmt(&self, f: &mut fmt::Formatter<'_>) -> std::fmt::Result {
-            write!(f, "{:p}: {}", self.as_ptr(), self.0.name())
-        }
-    }
-
    struct Setup {
        _current_thread_subscriber_guard: tracing::subscriber::DefaultGuard,
        tenant_extractor: MultiNameExtractor<2>,
@@ -243,8 +159,7 @@ mod tests {
        let setup = setup_current_thread();
        let span = tracing::info_span!("root", tenant_id = "tenant-1", timeline_id = "timeline-1");
        let _guard = span.enter();
-        let res = check_fields_present0([&setup.tenant_extractor, &setup.timeline_extractor]);
-        assert!(matches!(res, Ok(Summary::FoundEverything)), "{res:?}");
+        check_fields_present([&setup.tenant_extractor, &setup.timeline_extractor]).unwrap();
    }

    #[test]
@@ -252,8 +167,8 @@ mod tests {
        let setup = setup_current_thread();
        let span = tracing::info_span!("root", timeline_id = "timeline-1");
        let _guard = span.enter();
-        let missing = check_fields_present0([&setup.tenant_extractor, &setup.timeline_extractor])
-            .unwrap_err();
+        let missing =
+            check_fields_present([&setup.tenant_extractor, &setup.timeline_extractor]).unwrap_err();
        assert_missing(missing, vec![&setup.tenant_extractor]);
    }

@@ -270,8 +185,7 @@ mod tests {
        let span = tracing::info_span!("grandchild", timeline_id = "timeline-1");
        let _guard = span.enter();

-        let res = check_fields_present0([&setup.tenant_extractor, &setup.timeline_extractor]);
-        assert!(matches!(res, Ok(Summary::FoundEverything)), "{res:?}");
+        check_fields_present([&setup.tenant_extractor, &setup.timeline_extractor]).unwrap();
    }

    #[test]
@@ -284,7 +198,7 @@ mod tests {
        let span = tracing::info_span!("child", timeline_id = "timeline-1");
        let _guard = span.enter();

-        let missing = check_fields_present0([&setup.tenant_extractor]).unwrap_err();
+        let missing = check_fields_present([&setup.tenant_extractor]).unwrap_err();
        assert_missing(missing, vec![&setup.tenant_extractor]);
    }

@@ -293,8 +207,7 @@ mod tests {
        let setup = setup_current_thread();
        let span = tracing::info_span!("root", tenant_id = "tenant-1", timeline_id = "timeline-1");
        let _guard = span.enter();
-        let res = check_fields_present0([&setup.tenant_extractor]);
-        assert!(matches!(res, Ok(Summary::FoundEverything)), "{res:?}");
+        check_fields_present([&setup.tenant_extractor]).unwrap();
    }

    #[test]
@@ -310,8 +223,7 @@ mod tests {
        let span = tracing::info_span!("grandchild", timeline_id = "timeline-1");
        let _guard = span.enter();

-        let res = check_fields_present0([&setup.tenant_extractor]);
-        assert!(matches!(res, Ok(Summary::FoundEverything)), "{res:?}");
+        check_fields_present([&setup.tenant_extractor]).unwrap();
    }

    #[test]
@@ -319,7 +231,7 @@ mod tests {
        let setup = setup_current_thread();
        let span = tracing::info_span!("root", timeline_id = "timeline-1");
        let _guard = span.enter();
-        let missing = check_fields_present0([&setup.tenant_extractor]).unwrap_err();
+        let missing = check_fields_present([&setup.tenant_extractor]).unwrap_err();
        assert_missing(missing, vec![&setup.tenant_extractor]);
    }

@@ -333,107 +245,43 @@ mod tests {
        let span = tracing::info_span!("child", timeline_id = "timeline-1");
        let _guard = span.enter();

-        let missing = check_fields_present0([&setup.tenant_extractor]).unwrap_err();
+        let missing = check_fields_present([&setup.tenant_extractor]).unwrap_err();
        assert_missing(missing, vec![&setup.tenant_extractor]);
    }

    #[test]
-    fn tracing_error_subscriber_not_set_up_straight_line() {
+    fn tracing_error_subscriber_not_set_up() {
        // no setup
+
        let span = tracing::info_span!("foo", e = "some value");
        let _guard = span.enter();

        let extractor = MultiNameExtractor::new("E", ["e"]);
-        let res = check_fields_present0([&extractor]);
-        assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}");
-
-        // similarly for a not found key
-        let extractor = MultiNameExtractor::new("F", ["foobar"]);
-        let res = check_fields_present0([&extractor]);
-        assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}");
+        let missing = check_fields_present([&extractor]).unwrap_err();
+        assert_missing(missing, vec![&extractor]);
    }

    #[test]
-    fn tracing_error_subscriber_not_set_up_with_instrument() {
-        // no setup
-
-        // demo a case where span entering is used to establish a parent child connection, but
-        // when we re-enter the subspan SpanTrace::with_spans iterates over nothing.
-        let span = tracing::info_span!("foo", e = "some value");
-        let _guard = span.enter();
-
-        let subspan = tracing::info_span!("bar", f = "foobar");
-        drop(_guard);
-
-        // normally this would work, but without any tracing-subscriber configured, both
-        // check_field_present find nothing
-        let _guard = subspan.enter();
-        let extractors: [&dyn Extractor; 2] = [
-            &MultiNameExtractor::new("E", ["e"]),
-            &MultiNameExtractor::new("F", ["f"]),
-        ];
-
-        let res = check_fields_present0(extractors);
-        assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}");
-
-        // similarly for a not found key
-        let extractor = MultiNameExtractor::new("G", ["g"]);
-        let res = check_fields_present0([&extractor]);
-        assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}");
-    }
-
-    #[test]
-    fn tracing_subscriber_configured() {
-        // this will fail if any utils::logging::init callers appear, but let's hope they do not
-        // appear.
-        assert!(!super::tracing_subscriber_configured());
-
-        let _g = setup_current_thread();
-
-        assert!(super::tracing_subscriber_configured());
-    }
-
-    #[test]
-    fn not_found_when_disabled_by_filter() {
+    #[should_panic]
+    fn panics_if_tracing_error_subscriber_has_wrong_filter() {
        let r = tracing_subscriber::registry().with({
-            tracing_error::ErrorLayer::default().with_filter(tracing_subscriber::filter::filter_fn(
-                |md| !(md.is_span() && *md.level() == tracing::Level::INFO),
-            ))
+            tracing_error::ErrorLayer::default().with_filter(
+                tracing_subscriber::filter::dynamic_filter_fn(|md, _| {
+                    if md.is_span() && *md.level() == tracing::Level::INFO {
+                        return false;
+                    }
+                    true
+                }),
+            )
        });

        let _guard = tracing::subscriber::set_default(r);

-        // this test is a rather tricky one, it has a number of possible outcomes depending on the
-        // execution order when executed with other tests even if no test sets the global default
-        // subscriber.
-
        let span = tracing::info_span!("foo", e = "some value");
        let _guard = span.enter();

-        let extractors: [&dyn Extractor; 1] = [&MultiNameExtractor::new("E", ["e"])];
-
-        if span.is_disabled() {
-            // the tests are running single threaded, or we got lucky and no other tests subscriber
-            // was got to register their per-CALLSITE::META interest between `set_default` and
-            // creation of the span, thus the filter got to apply and registered interest of Never,
-            // so the span was never created.
-            //
-            // as the span is disabled, no keys were recorded to it, leading check_fields_present0
-            // to find an error.
-
-            let missing = check_fields_present0(extractors).unwrap_err();
-            assert_missing(missing, vec![extractors[0]]);
-        } else {
-            // when the span is enabled, it is because some other test is running at the same time,
-            // and that tests registry has filters which are interested in our above span.
-            //
-            // because the span is now enabled, all keys will be found for it. the
-            // tracing_error::SpanTrace does not consider layer filters during the span hierarchy
-            // walk (SpanTrace::with_spans), nor is the SpanTrace::status a reliable indicator in
-            // this test-induced issue.
-
-            let res = check_fields_present0(extractors);
-            assert!(matches!(res, Ok(Summary::FoundEverything)), "{res:?}");
-        }
+        let extractor = MultiNameExtractor::new("E", ["e"]);
+        let missing = check_fields_present([&extractor]).unwrap_err();
+        assert_missing(missing, vec![&extractor]);
    }
 }
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -82,7 +82,6 @@ strum_macros.workspace = true
 criterion.workspace = true
 hex-literal.workspace = true
 tempfile.workspace = true
-tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time", "test-util"] }

 [[bench]]
 name = "bench_layer_map"
--- a/pageserver/ctl/src/draw_timeline_dir.rs
+++ b/pageserver/ctl/src/draw_timeline_dir.rs
@@ -7,10 +7,10 @@
 //! - The y axis represents LSN, growing upwards.
 //!
 //! Coordinates in both axis are compressed for better readability.
-//! (see <https://medium.com/algorithms-digest/coordinate-compression-2fff95326fb>)
+//! (see https://medium.com/algorithms-digest/coordinate-compression-2fff95326fb)
 //!
 //! Example use:
-//! ```bash
+//! ```bash
 //! $ ls test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE | \
 //! $   grep "__" | cargo run --release --bin pagectl draw-timeline-dir > out.svg
 //! $ firefox out.svg
@@ -20,7 +20,7 @@
 //! or from pageserver log files.
 //!
 //! TODO Consider shipping this as a grafana panel plugin:
-//!      <https://grafana.com/tutorials/build-a-panel-plugin/>
+//!      https://grafana.com/tutorials/build-a-panel-plugin/
 use anyhow::Result;
 use pageserver::repository::Key;
 use std::cmp::Ordering;
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -19,6 +19,12 @@ use tokio::io;
 use tokio::io::AsyncWrite;
 use tracing::*;

+/// NB: This relies on a modified version of tokio_tar that does *not* write the
+/// end-of-archive marker (1024 zero bytes), when the Builder struct is dropped
+/// without explicitly calling 'finish' or 'into_inner'!
+///
+/// See <https://github.com/neondatabase/tokio-tar/pull/1>
+///
 use tokio_tar::{Builder, EntryType, Header};

 use crate::context::RequestContext;
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -396,8 +396,8 @@ fn start_pageserver(

            let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial logical sizes completed"));

-            let init_sizes_done = match tokio::time::timeout(timeout, &mut init_sizes_done).await {
-                Ok(_) => {
+            let init_sizes_done = tokio::select! {
+                _ = &mut init_sizes_done => {
                    let now = std::time::Instant::now();
                    tracing::info!(
                        from_init_done_millis = (now - init_done).as_millis(),
@@ -406,7 +406,7 @@ fn start_pageserver(
                    );
                    None
                }
-                Err(_) => {
+                _ = tokio::time::sleep(timeout) => {
                    tracing::info!(
                        timeout_millis = timeout.as_millis(),
                        "Initial logical size timeout elapsed; starting background jobs"
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -171,13 +171,11 @@ pub struct PageServerConf {

    pub log_format: LogFormat,

-    /// Number of concurrent [`Tenant::gather_size_inputs`](crate::tenant::Tenant::gather_size_inputs) allowed.
+    /// Number of concurrent [`Tenant::gather_size_inputs`] allowed.
    pub concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore,
    /// Limit of concurrent [`Tenant::gather_size_inputs`] issued by module `eviction_task`.
    /// The number of permits is the same as `concurrent_tenant_size_logical_size_queries`.
    /// See the comment in `eviction_task` for details.
-    ///
-    /// [`Tenant::gather_size_inputs`]: crate::tenant::Tenant::gather_size_inputs
    pub eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore,

    // How often to collect metrics and send them to the metrics endpoint.
@@ -995,8 +993,6 @@ impl ConfigurableSemaphore {
    /// Require a non-zero initial permits, because using permits == 0 is a crude way to disable a
    /// feature such as [`Tenant::gather_size_inputs`]. Otherwise any semaphore using future will
    /// behave like [`futures::future::pending`], just waiting until new permits are added.
-    ///
-    /// [`Tenant::gather_size_inputs`]: crate::tenant::Tenant::gather_size_inputs
    pub fn new(initial_permits: NonZeroUsize) -> Self {
        ConfigurableSemaphore {
            initial_permits,
--- a/pageserver/src/context.rs
+++ b/pageserver/src/context.rs
@@ -179,9 +179,6 @@ impl RequestContext {
    /// a context and you are unwilling to change all callers to provide one.
    ///
    /// Before we add cancellation, we should get rid of this method.
-    ///
-    /// [`attached_child`]: Self::attached_child
-    /// [`detached_child`]: Self::detached_child
    pub fn todo_child(task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self {
        Self::new(task_kind, download_behavior)
    }
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -60,7 +60,7 @@ use utils::serde_percent::Percent;
 use crate::{
    config::PageServerConf,
    task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
-    tenant::{self, storage_layer::PersistentLayer, timeline::EvictionError, Timeline},
+    tenant::{self, storage_layer::PersistentLayer, Timeline},
 };

 #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
@@ -166,11 +166,11 @@ async fn disk_usage_eviction_task(
        .await;

        let sleep_until = start + task_config.period;
-        if tokio::time::timeout_at(sleep_until, cancel.cancelled())
-            .await
-            .is_ok()
-        {
-            break;
+        tokio::select! {
+            _ = tokio::time::sleep_until(sleep_until) => {},
+            _ = cancel.cancelled() => {
+                break
+            }
        }
    }
 }
@@ -390,22 +390,13 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
                    assert_eq!(results.len(), batch.len());
                    for (result, layer) in results.into_iter().zip(batch.iter()) {
                        match result {
-                            Some(Ok(())) => {
+                            Some(Ok(true)) => {
                                usage_assumed.add_available_bytes(layer.file_size());
                            }
-                            Some(Err(EvictionError::CannotEvictRemoteLayer)) => {
-                                unreachable!("get_local_layers_for_disk_usage_eviction finds only local layers")
-                            }
-                            Some(Err(EvictionError::FileNotFound)) => {
-                                evictions_failed.file_sizes += layer.file_size();
-                                evictions_failed.count += 1;
-                            }
-                            Some(Err(
-                                e @ EvictionError::LayerNotFound(_)
-                                | e @ EvictionError::StatFailed(_),
-                            )) => {
-                                let e = utils::error::report_compact_sources(&e);
-                                warn!(%layer, "failed to evict layer: {e}");
+                            Some(Ok(false)) => {
+                                // this is:
+                                // - Replacement::{NotFound, Unexpected}
+                                // - it cannot be is_remote_layer, filtered already
                                evictions_failed.file_sizes += layer.file_size();
                                evictions_failed.count += 1;
                            }
@@ -413,6 +404,10 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
                                assert!(cancel.is_cancelled());
                                return;
                            }
+                            Some(Err(e)) => {
+                                // we really shouldn't be getting this, precondition failure
+                                error!("failed to evict layer: {:#}", e);
+                            }
                        }
                    }
                }
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -1143,7 +1143,7 @@ async fn disk_usage_eviction_run(
    let Some(storage) = state.remote_storage.clone() else {
        return Err(ApiError::InternalServerError(anyhow::anyhow!(
            "remote storage not configured, cannot run eviction iteration"
-        )));
+        )))
    };

    let state = state.disk_usage_eviction_state.clone();
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -385,7 +385,7 @@ pub static UNEXPECTED_ONDEMAND_DOWNLOADS: Lazy<IntCounter> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-/// Each `Timeline`'s  [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric.
+/// Each [`Timeline`]'s  [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric.
 #[derive(Debug)]
 pub struct EvictionsWithLowResidenceDuration {
    data_source: &'static str,
@@ -541,17 +541,6 @@ pub static SMGR_QUERY_TIME: Lazy<HistogramVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-// keep in sync with control plane Go code so that we can validate
-// compute's basebackup_ms metric with our perspective in the context of SLI/SLO.
-static COMPUTE_STARTUP_BUCKETS: Lazy<[f64; 28]> = Lazy::new(|| {
-    // Go code uses milliseconds. Variable is called `computeStartupBuckets`
-    [
-        5, 10, 20, 30, 50, 70, 100, 120, 150, 200, 250, 300, 350, 400, 450, 500, 600, 800, 1000,
-        1500, 2000, 2500, 3000, 5000, 10000, 20000, 40000, 60000,
-    ]
-    .map(|ms| (ms as f64) / 1000.0)
-});
-
 pub struct BasebackupQueryTime(HistogramVec);
 pub static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|| {
    BasebackupQueryTime({
@@ -559,7 +548,7 @@ pub static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|| {
            "pageserver_basebackup_query_seconds",
            "Histogram of basebackup queries durations, by result type",
            &["result"],
-            COMPUTE_STARTUP_BUCKETS.to_vec(),
+            CRITICAL_OP_BUCKETS.into(),
        )
        .expect("failed to define a metric")
    })
@@ -829,7 +818,7 @@ pub static WAL_REDO_RECORD_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
    .unwrap()
 });

-/// Similar to `prometheus::HistogramTimer` but does not record on drop.
+/// Similar to [`prometheus::HistogramTimer`] but does not record on drop.
 pub struct StorageTimeMetricsTimer {
    metrics: StorageTimeMetrics,
    start: Instant,
@@ -887,7 +876,7 @@ impl StorageTimeMetrics {

    /// Starts timing a new operation.
    ///
-    /// Note: unlike `prometheus::HistogramTimer` the returned timer does not record on drop.
+    /// Note: unlike [`prometheus::HistogramTimer`] the returned timer does not record on drop.
    pub fn start_timer(&self) -> StorageTimeMetricsTimer {
        StorageTimeMetricsTimer::new(self.clone())
    }
@@ -1267,7 +1256,7 @@ impl RemoteTimelineClientMetrics {
    /// Update the metrics that change when a call to the remote timeline client instance starts.
    ///
    /// Drop the returned guard object once the operation is finished to updates corresponding metrics that track completions.
-    /// Or, use [`RemoteTimelineClientCallMetricGuard::will_decrement_manually`] and [`call_end`](Self::call_end) if that
+    /// Or, use [`RemoteTimelineClientCallMetricGuard::will_decrement_manually`] and [`call_end`] if that
    /// is more suitable.
    /// Never do both.
    pub(crate) fn call_begin(
@@ -1300,7 +1289,7 @@ impl RemoteTimelineClientMetrics {

    /// Manually udpate the metrics that track completions, instead of using the guard object.
    /// Using the guard object is generally preferable.
-    /// See [`call_begin`](Self::call_begin) for more context.
+    /// See [`call_begin`] for more context.
    pub(crate) fn call_end(
        &self,
        file_kind: &RemoteOpFileKind,
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -1131,7 +1131,7 @@ impl<'a> DatadirModification<'a> {
    /// context, breaking the atomicity is OK. If the import is interrupted, the
    /// whole import fails and the timeline will be deleted anyway.
    /// (Or to be precise, it will be left behind for debugging purposes and
-    /// ignored, see <https://github.com/neondatabase/neon/pull/1809>)
+    /// ignored, see https://github.com/neondatabase/neon/pull/1809)
    ///
    /// Note: A consequence of flushing the pending operations is that they
    /// won't be visible to subsequent operations until `commit`. The function
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -205,7 +205,7 @@ pub enum TaskKind {
    ///
    /// Walreceiver uses its own abstraction called `TaskHandle` to represent the activity of establishing and handling a connection.
    /// That abstraction doesn't use `task_mgr`.
-    /// The `WalReceiverManager` task ensures that this `TaskHandle` task does not outlive the `WalReceiverManager` task.
+    /// The [`WalReceiverManager`] task ensures that this `TaskHandle` task does not outlive the [`WalReceiverManager`] task.
    /// For the `RequestContext` that we hand to the TaskHandle, we use the [`WalReceiverConnectionHandler`] task kind.
    ///
    /// Once the connection is established, the `TaskHandle` task creates a
@@ -213,21 +213,16 @@ pub enum TaskKind {
    /// the `Connection` object.
    /// A `CancellationToken` created by the `TaskHandle` task ensures
    /// that the [`WalReceiverConnectionPoller`] task will cancel soon after as the `TaskHandle` is dropped.
-    ///
-    /// [`WalReceiverConnectionHandler`]: Self::WalReceiverConnectionHandler
-    /// [`WalReceiverConnectionPoller`]: Self::WalReceiverConnectionPoller
    WalReceiverManager,

-    /// The `TaskHandle` task that executes `handle_walreceiver_connection`.
+    /// The `TaskHandle` task that executes [`walreceiver_connection::handle_walreceiver_connection`].
    /// Not a `task_mgr` task, but we use this `TaskKind` for its `RequestContext`.
    /// See the comment on [`WalReceiverManager`].
-    ///
-    /// [`WalReceiverManager`]: Self::WalReceiverManager
    WalReceiverConnectionHandler,

    /// The task that polls the `tokio-postgres::Connection` object.
-    /// Spawned by task [`WalReceiverConnectionHandler`](Self::WalReceiverConnectionHandler).
-    /// See the comment on [`WalReceiverManager`](Self::WalReceiverManager).
+    /// Spawned by task [`WalReceiverConnectionHandler`].
+    /// See the comment on [`WalReceiverManager`].
    WalReceiverConnectionPoller,

    // Garbage collection worker. One per tenant
@@ -511,13 +506,17 @@ pub async fn shutdown_tasks(
                    warn!(name = task.name, tenant_id = ?tenant_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over");
                }
            }
-            if tokio::time::timeout(std::time::Duration::from_secs(1), &mut join_handle)
-                .await
-                .is_err()
-            {
-                // allow some time to elapse before logging to cut down the number of log
-                // lines.
-                info!("waiting for {} to shut down", task.name);
+            let join_handle = tokio::select! {
+                biased;
+                _ = &mut join_handle => { None },
+                _ = tokio::time::sleep(std::time::Duration::from_secs(1)) => {
+                    // allow some time to elapse before logging to cut down the number of log
+                    // lines.
+                    info!("waiting for {} to shut down", task.name);
+                    Some(join_handle)
+                }
+            };
+            if let Some(join_handle) = join_handle {
                // we never handled this return value, but:
                // - we don't deschedule which would lead to is_cancelled
                // - panics are already logged (is_panicked)
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -121,7 +121,7 @@ pub mod mgr;
 pub mod tasks;
 pub mod upload_queue;

-pub(crate) mod timeline;
+mod timeline;

 pub mod size;

@@ -133,7 +133,7 @@ pub use timeline::{
 // re-export this function so that page_cache.rs can use it.
 pub use crate::tenant::ephemeral_file::writeback as writeback_ephemeral_file;

-// re-export for use in remote_timeline_client.rs
+// re-export for use in storage_sync.rs
 pub use crate::tenant::metadata::save_metadata;

 // re-export for use in walreceiver
@@ -281,7 +281,7 @@ pub enum DeleteTimelineError {
 }

 pub enum SetStoppingError {
-    AlreadyStopping(completion::Barrier),
+    AlreadyStopping,
    Broken,
 }

@@ -318,6 +318,10 @@ impl std::fmt::Display for WaitToBecomeActiveError {
    }
 }

+pub(crate) enum ShutdownError {
+    AlreadyStopping,
+}
+
 struct DeletionGuard(OwnedMutexGuard<bool>);

 impl DeletionGuard {
@@ -1168,7 +1172,7 @@ impl Tenant {
        )
    }

-    /// Helper for unit tests to create an empty timeline.
+    /// Helper for unit tests to create an empty timeline.
    ///
    /// The timeline is has state value `Active` but its background loops are not running.
    // This makes the various functions which anyhow::ensure! for Active state work in tests.
@@ -1455,7 +1459,7 @@ impl Tenant {
            let layer_removal_guard = timeline.layer_removal_cs.lock().await;
            info!("got layer_removal_cs.lock(), deleting layer files");

-            // NB: remote_timeline_client upload tasks that reference these layers have been cancelled
+            // NB: storage_sync upload tasks that reference these layers have been cancelled
            //     by the caller.

            let local_timeline_directory = self
@@ -1717,7 +1721,7 @@ impl Tenant {
        self.state.send_modify(|current_state| {
            use pageserver_api::models::ActivatingFrom;
            match &*current_state {
-                TenantState::Activating(_) | TenantState::Active | TenantState::Broken { .. } | TenantState::Stopping { .. } => {
+                TenantState::Activating(_) | TenantState::Active | TenantState::Broken { .. } | TenantState::Stopping => {
                    panic!("caller is responsible for calling activate() only on Loading / Attaching tenants, got {state:?}", state = current_state);
                }
                TenantState::Loading => {
@@ -1781,16 +1785,7 @@ impl Tenant {
    /// - detach + ignore (freeze_and_flush == false)
    ///
    /// This will attempt to shutdown even if tenant is broken.
-    ///
-    /// `shutdown_progress` is a [`completion::Barrier`] for the shutdown initiated by this call.
-    /// If the tenant is already shutting down, we return a clone of the first shutdown call's
-    /// `Barrier` as an `Err`. This not-first caller can use the returned barrier to join with
-    /// the ongoing shutdown.
-    async fn shutdown(
-        &self,
-        shutdown_progress: completion::Barrier,
-        freeze_and_flush: bool,
-    ) -> Result<(), completion::Barrier> {
+    pub(crate) async fn shutdown(&self, freeze_and_flush: bool) -> Result<(), ShutdownError> {
        span::debug_assert_current_span_has_tenant_id();
        // Set tenant (and its timlines) to Stoppping state.
        //
@@ -1809,16 +1804,12 @@ impl Tenant {
        // But the tenant background loops are joined-on in our caller.
        // It's mesed up.
        // we just ignore the failure to stop
-
-        match self.set_stopping(shutdown_progress).await {
+        match self.set_stopping().await {
            Ok(()) => {}
            Err(SetStoppingError::Broken) => {
                // assume that this is acceptable
            }
-            Err(SetStoppingError::AlreadyStopping(other)) => {
-                // give caller the option to wait for this this shutdown
-                return Err(other);
-            }
+            Err(SetStoppingError::AlreadyStopping) => return Err(ShutdownError::AlreadyStopping),
        };

        if freeze_and_flush {
@@ -1850,7 +1841,7 @@ impl Tenant {
    /// This function waits for the tenant to become active if it isn't already, before transitioning it into Stopping state.
    ///
    /// This function is not cancel-safe!
-    async fn set_stopping(&self, progress: completion::Barrier) -> Result<(), SetStoppingError> {
+    async fn set_stopping(&self) -> Result<(), SetStoppingError> {
        let mut rx = self.state.subscribe();

        // cannot stop before we're done activating, so wait out until we're done activating
@@ -1862,7 +1853,7 @@ impl Tenant {
                );
                false
            }
-            TenantState::Active | TenantState::Broken { .. } | TenantState::Stopping { .. } => true,
+            TenantState::Active | TenantState::Broken { .. } | TenantState::Stopping {} => true,
        })
        .await
        .expect("cannot drop self.state while on a &self method");
@@ -1877,7 +1868,7 @@ impl Tenant {
                // FIXME: due to time-of-check vs time-of-use issues, it can happen that new timelines
                // are created after the transition to Stopping. That's harmless, as the Timelines
                // won't be accessible to anyone afterwards, because the Tenant is in Stopping state.
-                *current_state = TenantState::Stopping { progress };
+                *current_state = TenantState::Stopping;
                // Continue stopping outside the closure. We need to grab timelines.lock()
                // and we plan to turn it into a tokio::sync::Mutex in a future patch.
                true
@@ -1889,9 +1880,9 @@ impl Tenant {
                err = Some(SetStoppingError::Broken);
                false
            }
-            TenantState::Stopping { progress } => {
+            TenantState::Stopping => {
                info!("Tenant is already in Stopping state");
-                err = Some(SetStoppingError::AlreadyStopping(progress.clone()));
+                err = Some(SetStoppingError::AlreadyStopping);
                false
            }
        });
@@ -1935,7 +1926,7 @@ impl Tenant {
                );
                false
            }
-            TenantState::Active | TenantState::Broken { .. } | TenantState::Stopping { .. } => true,
+            TenantState::Active | TenantState::Broken { .. } | TenantState::Stopping {} => true,
        })
        .await
        .expect("cannot drop self.state while on a &self method");
@@ -1958,7 +1949,7 @@ impl Tenant {
                    warn!("Tenant is already in Broken state");
                }
                // This is the only "expected" path, any other path is a bug.
-                TenantState::Stopping { .. } => {
+                TenantState::Stopping => {
                    warn!(
                        "Marking Stopping tenant as Broken state, reason: {}",
                        reason
@@ -1991,7 +1982,7 @@ impl Tenant {
                TenantState::Active { .. } => {
                    return Ok(());
                }
-                TenantState::Broken { .. } | TenantState::Stopping { .. } => {
+                TenantState::Broken { .. } | TenantState::Stopping => {
                    // There's no chance the tenant can transition back into ::Active
                    return Err(WaitToBecomeActiveError::WillNotBecomeActive {
                        tenant_id: self.tenant_id,
@@ -3359,18 +3350,14 @@ pub mod harness {
        pub async fn load(&self) -> (Arc<Tenant>, RequestContext) {
            let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
            (
-                self.try_load(&ctx, None)
+                self.try_load(&ctx)
                    .await
                    .expect("failed to load test tenant"),
                ctx,
            )
        }

-        pub async fn try_load(
-            &self,
-            ctx: &RequestContext,
-            remote_storage: Option<remote_storage::GenericRemoteStorage>,
-        ) -> anyhow::Result<Arc<Tenant>> {
+        pub async fn try_load(&self, ctx: &RequestContext) -> anyhow::Result<Arc<Tenant>> {
            let walredo_mgr = Arc::new(TestRedoManager);

            let tenant = Arc::new(Tenant::new(
@@ -3379,7 +3366,7 @@ pub mod harness {
                TenantConfOpt::from(self.tenant_conf),
                walredo_mgr,
                self.tenant_id,
-                remote_storage,
+                None,
            ));
            tenant
                .load(None, ctx)
@@ -3917,11 +3904,7 @@ mod tests {
        metadata_bytes[8] ^= 1;
        std::fs::write(metadata_path, metadata_bytes)?;

-        let err = harness
-            .try_load(&ctx, None)
-            .await
-            .err()
-            .expect("should fail");
+        let err = harness.try_load(&ctx).await.err().expect("should fail");
        // get all the stack with all .context, not tonly the last one
        let message = format!("{err:#}");
        let expected = "Failed to parse metadata bytes from path";
@@ -4352,13 +4335,13 @@ mod tests {
        // assert freeze_and_flush exercised the initdb optimization
        {
            let state = tline.flush_loop_state.lock().unwrap();
-            let timeline::FlushLoopState::Running {
-                expect_initdb_optimization,
-                initdb_optimization_count,
-            } = *state
-            else {
-                panic!("unexpected state: {:?}", *state);
-            };
+            let
+                timeline::FlushLoopState::Running {
+                    expect_initdb_optimization,
+                    initdb_optimization_count,
+                } = *state else {
+                    panic!("unexpected state: {:?}", *state);
+                };
            assert!(expect_initdb_optimization);
            assert!(initdb_optimization_count > 0);
        }
--- a/pageserver/src/tenant/disk_btree.rs
+++ b/pageserver/src/tenant/disk_btree.rs
@@ -442,7 +442,7 @@ where
    writer: W,

    ///
-    /// `stack[0]` is the current root page, `stack.last()` is the leaf.
+    /// stack[0] is the current root page, stack.last() is the leaf.
    ///
    /// We maintain the length of the stack to be always greater than zero.
    /// Two exceptions are:
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -16,7 +16,7 @@
 //! Other read methods are less critical but still impact performance of background tasks.
 //!
 //! This data structure relies on a persistent/immutable binary search tree. See the
-//! following lecture for an introduction <https://www.youtube.com/watch?v=WqCWghETNDc&t=581s>
+//! following lecture for an introduction https://www.youtube.com/watch?v=WqCWghETNDc&t=581s
 //! Summary: A persistent/immutable BST (and persistent data structures in general) allows
 //! you to modify the tree in such a way that each modification creates a new "version"
 //! of the tree. When you modify it, you get a new version, but all previous versions are
@@ -40,7 +40,7 @@
 //! afterwards. We can add layers as long as they have larger LSNs than any previous layer in
 //! the map, but if we need to remove a layer, or insert anything with an older LSN, we need
 //! to throw away most of the persistent BST and build a new one, starting from the oldest
-//! LSN. See [`LayerMap::flush_updates()`].
+//! LSN. See `LayerMap::flush_updates()`.
 //!

 mod historic_layer_coverage;
--- a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs
+++ b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs
@@ -122,7 +122,8 @@ impl<Value: Clone> HistoricLayerCoverage<Value> {
        self.head = self
            .historic
            .iter()
-            .next_back()
+            .rev()
+            .next()
            .map(|(_, v)| v.clone())
            .unwrap_or_default();
    }
@@ -411,7 +412,7 @@ fn test_persistent_overlapping() {
 /// still be more critical.
 ///
 /// See this for more on persistent and retroactive techniques:
-/// <https://www.youtube.com/watch?v=WqCWghETNDc&t=581s>
+/// https://www.youtube.com/watch?v=WqCWghETNDc&t=581s
 pub struct BufferedHistoricLayerCoverage<Value> {
    /// A persistent layer map that we rebuild when we need to retroactively update
    historic_coverage: HistoricLayerCoverage<Value>,
--- a/pageserver/src/tenant/layer_map/layer_coverage.rs
+++ b/pageserver/src/tenant/layer_map/layer_coverage.rs
@@ -2,7 +2,7 @@ use std::ops::Range;

 // NOTE the `im` crate has 20x more downloads and also has
 // persistent/immutable BTree. But it's bugged so rpds is a
-// better choice <https://github.com/neondatabase/neon/issues/3395>
+// better choice https://github.com/neondatabase/neon/issues/3395
 use rpds::RedBlackTreeMapSync;

 /// Data structure that can efficiently:
@@ -11,7 +11,7 @@ use rpds::RedBlackTreeMapSync;
 /// - insert layers in non-decreasing lsn.start order
 ///
 /// For a detailed explanation and justification of this approach, see:
-/// <https://neon.tech/blog/persistent-structures-in-neons-wal-indexing>
+/// https://neon.tech/blog/persistent-structures-in-neons-wal-indexing
 ///
 /// NOTE The struct is parameterized over Value for easier
 ///      testing, but in practice it's some sort of layer.
@@ -113,7 +113,8 @@ impl<Value: Clone> LayerCoverage<Value> {
    pub fn query(&self, key: i128) -> Option<Value> {
        self.nodes
            .range(..=key)
-            .next_back()?
+            .rev()
+            .next()?
            .1
            .as_ref()
            .map(|(_, v)| v.clone())
--- a/pageserver/src/tenant/manifest.rs
+++ b/pageserver/src/tenant/manifest.rs
@@ -24,7 +24,7 @@
 //! Currently, this is not used in the system. Future refactors will ensure
 //! the storage state will be recorded in this file, and the system can be
 //! recovered from this file. This is tracked in
-//! <https://github.com/neondatabase/neon/issues/4418>
+//! https://github.com/neondatabase/neon/issues/4418

 use std::io::{self, Read, Write};

--- a/pageserver/src/tenant/metadata.rs
+++ b/pageserver/src/tenant/metadata.rs
@@ -1,12 +1,10 @@
 //! Every image of a certain timeline from [`crate::tenant::Tenant`]
 //! has a metadata that needs to be stored persistently.
 //!
-//! Later, the file gets used in [`remote_timeline_client`] as a part of
+//! Later, the file is used in [`crate::remote_storage::storage_sync`] as a part of
 //! external storage import and export operations.
 //!
 //! The module contains all structs and related helper methods related to timeline metadata.
-//!
-//! [`remote_timeline_client`]: super::remote_timeline_client

 use std::fs::{File, OpenOptions};
 use std::io::Write;
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -233,17 +233,11 @@ pub fn schedule_local_tenant_processing(
 /// That could be easily misinterpreted by control plane, the consumer of the
 /// management API. For example, it could attach the tenant on a different pageserver.
 /// We would then be in split-brain once this pageserver restarts.
-#[instrument(skip_all)]
+#[instrument]
 pub async fn shutdown_all_tenants() {
-    shutdown_all_tenants0(&TENANTS).await
-}
-
-async fn shutdown_all_tenants0(tenants: &tokio::sync::RwLock<TenantsMap>) {
-    use utils::completion;
-
    // Prevent new tenants from being created.
    let tenants_to_shut_down = {
-        let mut m = tenants.write().await;
+        let mut m = TENANTS.write().await;
        match &mut *m {
            TenantsMap::Initializing => {
                *m = TenantsMap::ShuttingDown(HashMap::default());
@@ -268,41 +262,14 @@ async fn shutdown_all_tenants0(tenants: &tokio::sync::RwLock<TenantsMap>) {
    for (tenant_id, tenant) in tenants_to_shut_down {
        join_set.spawn(
            async move {
-                // ordering shouldn't matter for this, either we store true right away or never
-                let ordering = std::sync::atomic::Ordering::Relaxed;
-                let joined_other = std::sync::atomic::AtomicBool::new(false);
+                let freeze_and_flush = true;

-                let mut shutdown = std::pin::pin!(async {
-                    let freeze_and_flush = true;
-
-                    let res = {
-                        let (_guard, shutdown_progress) = completion::channel();
-                        tenant.shutdown(shutdown_progress, freeze_and_flush).await
-                    };
-
-                    if let Err(other_progress) = res {
-                        // join the another shutdown in progress
-                        joined_other.store(true, ordering);
-                        other_progress.wait().await;
+                match tenant.shutdown(freeze_and_flush).await {
+                    Ok(()) => debug!("tenant successfully stopped"),
+                    Err(super::ShutdownError::AlreadyStopping) => {
+                        warn!("tenant was already shutting down")
                    }
-                });
-
-                // in practice we might not have a lot time to go, since systemd is going to
-                // SIGKILL us at 10s, but we can try. delete tenant might take a while, so put out
-                // a warning.
-                let warning = std::time::Duration::from_secs(5);
-                let mut warning = std::pin::pin!(tokio::time::sleep(warning));
-
-                tokio::select! {
-                    _ = &mut shutdown => {},
-                    _ = &mut warning => {
-                        let joined_other = joined_other.load(ordering);
-                        warn!(%joined_other, "waiting for the shutdown to complete");
-                        shutdown.await;
-                    }
-                };
-
-                debug!("tenant successfully stopped");
+                }
            }
            .instrument(info_span!("shutdown", %tenant_id)),
        );
@@ -446,15 +413,6 @@ pub async fn detach_tenant(
    conf: &'static PageServerConf,
    tenant_id: TenantId,
    detach_ignored: bool,
-) -> Result<(), TenantStateError> {
-    detach_tenant0(conf, &TENANTS, tenant_id, detach_ignored).await
-}
-
-async fn detach_tenant0(
-    conf: &'static PageServerConf,
-    tenants: &tokio::sync::RwLock<TenantsMap>,
-    tenant_id: TenantId,
-    detach_ignored: bool,
 ) -> Result<(), TenantStateError> {
    let local_files_cleanup_operation = |tenant_id_to_clean| async move {
        let local_tenant_directory = conf.tenant_path(&tenant_id_to_clean);
@@ -467,8 +425,7 @@ async fn detach_tenant0(
    };

    let removal_result =
-        remove_tenant_from_memory(tenants, tenant_id, local_files_cleanup_operation(tenant_id))
-            .await;
+        remove_tenant_from_memory(tenant_id, local_files_cleanup_operation(tenant_id)).await;

    // Ignored tenants are not present in memory and will bail the removal from memory operation.
    // Before returning the error, check for ignored tenant removal case — we only need to clean its local files then.
@@ -515,15 +472,7 @@ pub async fn ignore_tenant(
    conf: &'static PageServerConf,
    tenant_id: TenantId,
 ) -> Result<(), TenantStateError> {
-    ignore_tenant0(conf, &TENANTS, tenant_id).await
-}
-
-async fn ignore_tenant0(
-    conf: &'static PageServerConf,
-    tenants: &tokio::sync::RwLock<TenantsMap>,
-    tenant_id: TenantId,
-) -> Result<(), TenantStateError> {
-    remove_tenant_from_memory(tenants, tenant_id, async {
+    remove_tenant_from_memory(tenant_id, async {
        let ignore_mark_file = conf.tenant_ignore_mark_file_path(&tenant_id);
        fs::File::create(&ignore_mark_file)
            .await
@@ -648,21 +597,18 @@ where
 /// If the cleanup fails, tenant will stay in memory in [`TenantState::Broken`] state, and another removal
 /// operation would be needed to remove it.
 async fn remove_tenant_from_memory<V, F>(
-    tenants: &tokio::sync::RwLock<TenantsMap>,
    tenant_id: TenantId,
    tenant_cleanup: F,
 ) -> Result<V, TenantStateError>
 where
    F: std::future::Future<Output = anyhow::Result<V>>,
 {
-    use utils::completion;
-
    // It's important to keep the tenant in memory after the final cleanup, to avoid cleanup races.
    // The exclusive lock here ensures we don't miss the tenant state updates before trying another removal.
    // tenant-wde cleanup operations may take some time (removing the entire tenant directory), we want to
    // avoid holding the lock for the entire process.
    let tenant = {
-        tenants
+        TENANTS
            .write()
            .await
            .get(&tenant_id)
@@ -670,20 +616,14 @@ where
            .ok_or(TenantStateError::NotFound(tenant_id))?
    };

-    // allow pageserver shutdown to await for our completion
-    let (_guard, progress) = completion::channel();
-
-    // whenever we remove a tenant from memory, we don't want to flush and wait for upload
    let freeze_and_flush = false;

    // shutdown is sure to transition tenant to stopping, and wait for all tasks to complete, so
    // that we can continue safely to cleanup.
-    match tenant.shutdown(progress, freeze_and_flush).await {
+    match tenant.shutdown(freeze_and_flush).await {
        Ok(()) => {}
-        Err(_other) => {
-            // if pageserver shutdown or other detach/ignore is already ongoing, we don't want to
-            // wait for it but return an error right away because these are distinct requests.
-            return Err(TenantStateError::IsStopping(tenant_id));
+        Err(super::ShutdownError::AlreadyStopping) => {
+            return Err(TenantStateError::IsStopping(tenant_id))
        }
    }

@@ -692,14 +632,14 @@ where
        .with_context(|| format!("Failed to run cleanup for tenant {tenant_id}"))
    {
        Ok(hook_value) => {
-            let mut tenants_accessor = tenants.write().await;
+            let mut tenants_accessor = TENANTS.write().await;
            if tenants_accessor.remove(&tenant_id).is_none() {
                warn!("Tenant {tenant_id} got removed from memory before operation finished");
            }
            Ok(hook_value)
        }
        Err(e) => {
-            let tenants_accessor = tenants.read().await;
+            let tenants_accessor = TENANTS.read().await;
            match tenants_accessor.get(&tenant_id) {
                Some(tenant) => {
                    tenant.set_broken(e.to_string()).await;
@@ -816,109 +756,3 @@ pub async fn immediate_compact(

    Ok(wait_task_done)
 }
-
-#[cfg(test)]
-mod tests {
-    use std::collections::HashMap;
-    use std::sync::Arc;
-    use tracing::{info_span, Instrument};
-
-    use super::{super::harness::TenantHarness, TenantsMap};
-
-    #[tokio::test(start_paused = true)]
-    async fn shutdown_joins_remove_tenant_from_memory() {
-        // the test is a bit ugly with the lockstep together with spawned tasks. the aim is to make
-        // sure `shutdown_all_tenants0` per-tenant processing joins in any active
-        // remove_tenant_from_memory calls, which is enforced by making the operation last until
-        // we've ran `shutdown_all_tenants0` for a long time.
-
-        let (t, _ctx) = TenantHarness::create("shutdown_joins_detach")
-            .unwrap()
-            .load()
-            .await;
-
-        // harness loads it to active, which is forced and nothing is running on the tenant
-
-        let id = t.tenant_id();
-
-        // tenant harness configures the logging and we cannot escape it
-        let _e = info_span!("testing", tenant_id = %id).entered();
-
-        let tenants = HashMap::from([(id, t.clone())]);
-        let tenants = Arc::new(tokio::sync::RwLock::new(TenantsMap::Open(tenants)));
-
-        let (until_cleanup_completed, can_complete_cleanup) = utils::completion::channel();
-        let (until_cleanup_started, cleanup_started) = utils::completion::channel();
-
-        // start a "detaching operation", which will take a while, until can_complete_cleanup
-        let cleanup_task = {
-            let jh = tokio::spawn({
-                let tenants = tenants.clone();
-                async move {
-                    let cleanup = async move {
-                        drop(until_cleanup_started);
-                        can_complete_cleanup.wait().await;
-                        anyhow::Ok(())
-                    };
-                    super::remove_tenant_from_memory(&tenants, id, cleanup).await
-                }
-                .instrument(info_span!("foobar", tenant_id = %id))
-            });
-
-            // now the long cleanup should be in place, with the stopping state
-            cleanup_started.wait().await;
-            jh
-        };
-
-        let mut cleanup_progress = std::pin::pin!(t
-            .shutdown(utils::completion::Barrier::default(), false)
-            .await
-            .unwrap_err()
-            .wait());
-
-        let mut shutdown_task = {
-            let (until_shutdown_started, shutdown_started) = utils::completion::channel();
-
-            let shutdown_task = tokio::spawn(async move {
-                drop(until_shutdown_started);
-                super::shutdown_all_tenants0(&tenants).await;
-            });
-
-            shutdown_started.wait().await;
-            shutdown_task
-        };
-
-        // if the joining in is removed from shutdown_all_tenants0, the shutdown_task should always
-        // get to complete within timeout and fail the test. it is expected to continue awaiting
-        // until completion or SIGKILL during normal shutdown.
-        //
-        // the timeout is long to cover anything that shutdown_task could be doing, but it is
-        // handled instantly because we use tokio's time pausing in this test. 100s is much more than
-        // what we get from systemd on shutdown (10s).
-        let long_time = std::time::Duration::from_secs(100);
-        tokio::select! {
-            _ = &mut shutdown_task => unreachable!("shutdown must continue, until_cleanup_completed is not dropped"),
-            _ = &mut cleanup_progress => unreachable!("cleanup progress must continue, until_cleanup_completed is not dropped"),
-            _ = tokio::time::sleep(long_time) => {},
-        }
-
-        // allow the remove_tenant_from_memory and thus eventually the shutdown to continue
-        drop(until_cleanup_completed);
-
-        let (je, ()) = tokio::join!(shutdown_task, cleanup_progress);
-        je.expect("Tenant::shutdown shutdown not have panicked");
-        cleanup_task
-            .await
-            .expect("no panicking")
-            .expect("remove_tenant_from_memory failed");
-
-        futures::future::poll_immediate(
-            t.shutdown(utils::completion::Barrier::default(), false)
-                .await
-                .unwrap_err()
-                .wait(),
-        )
-        .await
-        .expect("the stopping progress must still be complete");
-    }
-}
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -135,7 +135,7 @@
 //! - Initiate upload queue with that [`IndexPart`].
 //! - Reschedule all lost operations by comparing the local filesystem state
 //!   and remote state as per [`IndexPart`]. This is done in
-//!   [`Tenant::timeline_init_and_sync`] and [`Timeline::reconcile_with_remote`].
+//!   [`Timeline::timeline_init_and_sync`] and [`Timeline::reconcile_with_remote`].
 //!
 //! Note that if we crash during file deletion between the index update
 //! that removes the file from the list of files, and deleting the remote file,
@@ -163,8 +163,8 @@
 //!   - download their remote [`IndexPart`]s
 //!   - create `Timeline` struct and a `RemoteTimelineClient`
 //!   - initialize the client's upload queue with its `IndexPart`
-//!   - create [`RemoteLayer`](super::storage_layer::RemoteLayer) instances
-//!     for layers that are referenced by `IndexPart` but not present locally
+//!   - create [`RemoteLayer`] instances for layers that are referenced by `IndexPart`
+//!     but not present locally
 //!   - schedule uploads for layers that are only present locally.
 //!   - if the remote `IndexPart`'s metadata was newer than the metadata in
 //!     the local filesystem, write the remote metadata to the local filesystem
@@ -198,8 +198,6 @@
 //! in remote storage.
 //! But note that we don't test any of this right now.
 //!
-//! [`Tenant::timeline_init_and_sync`]: super::Tenant::timeline_init_and_sync
-//! [`Timeline::reconcile_with_remote`]: super::Timeline::reconcile_with_remote

 mod delete;
 mod download;
@@ -842,16 +840,14 @@ impl RemoteTimelineClient {
        let remaining: Vec<RemotePath> = remaining
            .into_iter()
            .filter(|p| p.object_name() != Some(IndexPart::FILE_NAME))
-            .inspect(|path| {
-                if let Some(name) = path.object_name() {
-                    info!(%name, "deleting a file not referenced from index_part.json");
-                } else {
-                    warn!(%path, "deleting a nameless or non-utf8 object not referenced from index_part.json");
-                }
-            })
            .collect();

        if !remaining.is_empty() {
+            warn!(
+                "Found {} files not bound to index_file.json, proceeding with their deletion",
+                remaining.len()
+            );
+            warn!("About to remove {} files", remaining.len());
            self.storage_impl.delete_objects(&remaining).await?;
        }

@@ -860,7 +856,7 @@ impl RemoteTimelineClient {
        debug!("deleting index part");
        self.storage_impl.delete(&index_file_path).await?;

-        info!(prefix=%timeline_storage_path, referenced=deletions_queued, not_referenced=%remaining.len(), "done deleting in timeline prefix, including index_part.json");
+        info!(deletions_queued, "done deleting, including index_part.json");

        Ok(())
    }
--- a/pageserver/src/tenant/remote_timeline_client/upload.rs
+++ b/pageserver/src/tenant/remote_timeline_client/upload.rs
@@ -62,11 +62,12 @@ pub(super) async fn upload_timeline_layer<'a>(
    let source_file = match source_file_res {
        Ok(source_file) => source_file,
        Err(e) if e.kind() == ErrorKind::NotFound => {
-            // If we encounter this arm, it wasn't intended, but it's also not
-            // a big problem, if it's because the file was deleted before an
-            // upload. However, a nonexistent file can also be indicative of
-            // something worse, like when a file is scheduled for upload before
-            // it has been written to disk yet.
+            // In some situations we might run into the underlying file being deleted by
+            // e.g. compaction before the uploader gets to it. In that instance, we don't
+            // want to retry the error: a deleted file won't come back. In theory, the
+            // file might not have been written in the first place, which also indicates
+            // a bug. Still log the situation so that we can keep an eye on it.
+            // See https://github.com/neondatabase/neon/issues/4526
            info!(path = %source_path.display(), "File to upload doesn't exist. Likely the file has been deleted and an upload is not required any more.");
            return Ok(());
        }
--- a/pageserver/src/tenant/size.rs
+++ b/pageserver/src/tenant/size.rs
@@ -110,11 +110,11 @@ pub struct TimelineInputs {
 ///
 /// Tenant size does not consider the latest state, but only the state until next_gc_cutoff, which
 /// is updated on-demand, during the start of this calculation and separate from the
-/// [`TimelineInputs::latest_gc_cutoff`].
+/// [`Timeline::latest_gc_cutoff`].
 ///
 /// For timelines in general:
 ///
-/// ```text
+/// ```ignore
 /// 0-----|---------|----|------------| · · · · · |·> lsn
 ///   initdb_lsn  branchpoints*  next_gc_cutoff  latest
 /// ```
--- a/pageserver/src/tenant/span.rs
+++ b/pageserver/src/tenant/span.rs
@@ -11,7 +11,10 @@ pub(crate) static TENANT_ID_EXTRACTOR: once_cell::sync::Lazy<MultiNameExtractor<
 #[cfg(debug_assertions)]
 #[track_caller]
 pub(crate) fn debug_assert_current_span_has_tenant_id() {
-    if let Err(missing) = check_fields_present!([&*TENANT_ID_EXTRACTOR]) {
-        panic!("missing extractors: {missing:?}")
+    if let Err(missing) = check_fields_present([&*TENANT_ID_EXTRACTOR]) {
+        panic!(
+            "missing extractors: {:?}",
+            missing.into_iter().map(|e| e.name()).collect::<Vec<_>>()
+        )
    }
 }
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -162,9 +162,6 @@ impl LayerAccessStats {
    /// The caller is responsible for recording a residence event
    /// using [`record_residence_event`] before calling `latest_activity`.
    /// If they don't, [`latest_activity`] will return `None`.
-    ///
-    /// [`record_residence_event`]: Self::record_residence_event
-    /// [`latest_activity`]: Self::latest_activity
    pub(crate) fn empty_will_record_residence_event_later() -> Self {
        LayerAccessStats(Mutex::default())
    }
@@ -172,9 +169,6 @@ impl LayerAccessStats {
    /// Create an empty stats object and record a [`LayerLoad`] event with the given residence status.
    ///
    /// See [`record_residence_event`] for why you need to do this while holding the layer map lock.
-    ///
-    /// [`LayerLoad`]: LayerResidenceEventReason::LayerLoad
-    /// [`record_residence_event`]: Self::record_residence_event
    pub(crate) fn for_loading_layer(
        layer_map_lock_held_witness: &LayerManager,
        status: LayerResidenceStatus,
@@ -193,8 +187,6 @@ impl LayerAccessStats {
    /// The `new_status` is not recorded in `self`.
    ///
    /// See [`record_residence_event`] for why you need to do this while holding the layer map lock.
-    ///
-    /// [`record_residence_event`]: Self::record_residence_event
    pub(crate) fn clone_for_residence_change(
        &self,
        layer_map_lock_held_witness: &LayerManager,
@@ -302,13 +294,11 @@ impl LayerAccessStats {
    /// implementation error. This function logs a rate-limited warning in that case.
    ///
    /// TODO: use type system to avoid the need for `fallback`.
-    /// The approach in <https://github.com/neondatabase/neon/pull/3775>
+    /// The approach in https://github.com/neondatabase/neon/pull/3775
    /// could be used to enforce that a residence event is recorded
    /// before a layer is added to the layer map. We could also have
    /// a layer wrapper type that holds the LayerAccessStats, and ensure
    /// that that type can only be produced by inserting into the layer map.
-    ///
-    /// [`record_residence_event`]: Self::record_residence_event
    pub(crate) fn latest_activity(&self) -> Option<SystemTime> {
        let locked = self.0.lock().unwrap();
        let inner = &locked.for_eviction_policy;
@@ -333,7 +323,7 @@ impl LayerAccessStats {
 }

 /// Supertrait of the [`Layer`] trait that captures the bare minimum interface
-/// required by [`LayerMap`](super::layer_map::LayerMap).
+/// required by [`LayerMap`].
 ///
 /// All layers should implement a minimal `std::fmt::Debug` without tenant or
 /// timeline names, because those are known in the context of which the layers
@@ -380,10 +370,10 @@ pub trait Layer: std::fmt::Debug + std::fmt::Display + Send + Sync {
    fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()>;
 }

-/// Returned by [`PersistentLayer::iter`]
+/// Returned by [`Layer::iter`]
 pub type LayerIter<'i> = Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>> + 'i + Send>;

-/// Returned by [`PersistentLayer::key_iter`]
+/// Returned by [`Layer::key_iter`]
 pub type LayerKeyIter<'i> = Box<dyn Iterator<Item = (Key, Lsn, u64)> + 'i + Send>;

 /// Get a layer descriptor from a layer.
@@ -442,10 +432,6 @@ pub trait PersistentLayer: Layer + AsLayerDesc {
        None
    }

-    fn downcast_delta_layer(self: Arc<Self>) -> Option<std::sync::Arc<DeltaLayer>> {
-        None
-    }
-
    fn is_remote_layer(&self) -> bool {
        false
    }
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -7,18 +7,14 @@
 //! must be page images or WAL records with the 'will_init' flag set, so that
 //! they can be replayed without referring to an older page version.
 //!
-//! The delta files are stored in `timelines/<timeline_id>` directory.  Currently,
+//! The delta files are stored in timelines/<timeline_id> directory.  Currently,
 //! there are no subdirectories, and each delta file is named like this:
 //!
-//! ```text
 //!    <key start>-<key end>__<start LSN>-<end LSN>
-//! ```
 //!
 //! For example:
 //!
-//! ```text
 //!    000000067F000032BE0000400000000020B6-000000067F000032BE0000400000000030B6__000000578C6B29-0000000057A50051
-//! ```
 //!
 //! Every delta file consists of three parts: "summary", "index", and
 //! "values". The summary is a fixed size header at the beginning of the file,
@@ -51,7 +47,6 @@ use std::io::{Seek, SeekFrom};
 use std::ops::Range;
 use std::os::unix::fs::FileExt;
 use std::path::{Path, PathBuf};
-use std::sync::Arc;
 use tracing::*;

 use utils::{
@@ -415,10 +410,6 @@ impl AsLayerDesc for DeltaLayer {
 }

 impl PersistentLayer for DeltaLayer {
-    fn downcast_delta_layer(self: Arc<Self>) -> Option<std::sync::Arc<DeltaLayer>> {
-        Some(self)
-    }
-
    fn local_path(&self) -> Option<PathBuf> {
        Some(self.path())
    }
@@ -809,7 +800,7 @@ impl DeltaLayerWriterInner {
 ///
 /// # Note
 ///
-/// As described in <https://github.com/neondatabase/neon/issues/2650>, it's
+/// As described in https://github.com/neondatabase/neon/issues/2650, it's
 /// possible for the writer to drop before `finish` is actually called. So this
 /// could lead to odd temporary files in the directory, exhausting file system.
 /// This structure wraps `DeltaLayerWriterInner` and also contains `Drop`
--- a/pageserver/src/tenant/storage_layer/filename.rs
+++ b/pageserver/src/tenant/storage_layer/filename.rs
@@ -57,9 +57,8 @@ impl Ord for DeltaFileName {

 /// Represents the filename of a DeltaLayer
 ///
-/// ```text
 ///    <key start>-<key end>__<LSN start>-<LSN end>
-/// ```
+///
 impl DeltaFileName {
    ///
    /// Parse a string as a delta file name. Returns None if the filename does not
@@ -163,9 +162,7 @@ impl ImageFileName {
 ///
 /// Represents the filename of an ImageLayer
 ///
-/// ```text
 ///    <key start>-<key end>__<LSN>
-/// ```
 impl ImageFileName {
    ///
    /// Parse a string as an image file name. Returns None if the filename does not
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -7,15 +7,11 @@
 //! timelines/<timeline_id> directory.  Currently, there are no
 //! subdirectories, and each image layer file is named like this:
 //!
-//! ```text
 //!    <key start>-<key end>__<LSN>
-//! ```
 //!
 //! For example:
 //!
-//! ```text
 //!    000000067F000032BE0000400000000070B6-000000067F000032BE0000400000000080B6__00000000346BC568
-//! ```
 //!
 //! Every image layer file consists of three parts: "summary",
 //! "index", and "values".  The summary is a fixed size header at the
@@ -664,7 +660,7 @@ impl ImageLayerWriterInner {
 ///
 /// # Note
 ///
-/// As described in <https://github.com/neondatabase/neon/issues/2650>, it's
+/// As described in https://github.com/neondatabase/neon/issues/2650, it's
 /// possible for the writer to drop before `finish` is actually called. So this
 /// could lead to odd temporary files in the directory, exhausting file system.
 /// This structure wraps `ImageLayerWriterInner` and also contains `Drop`
--- a/pageserver/src/tenant/storage_layer/remote_layer.rs
+++ b/pageserver/src/tenant/storage_layer/remote_layer.rs
@@ -25,7 +25,7 @@ use super::{
 };

 /// RemoteLayer is a not yet downloaded [`ImageLayer`] or
-/// [`DeltaLayer`](super::DeltaLayer).
+/// [`crate::storage_layer::DeltaLayer`].
 ///
 /// RemoteLayer might be downloaded on-demand during operations which are
 /// allowed download remote layers and during which, it gets replaced with a
@@ -50,8 +50,6 @@ pub struct RemoteLayer {
    /// It is very unlikely to accumulate these in the Timeline's LayerMap, but having this avoids
    /// a possible fast loop between `Timeline::get_reconstruct_data` and
    /// `Timeline::download_remote_layer`, which also logs.
-    ///
-    /// [`ongoing_download`]: Self::ongoing_download
    pub(crate) download_replacement_failure: std::sync::atomic::AtomicBool,
 }

--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -122,12 +122,12 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
            warn_when_period_overrun(started_at.elapsed(), period, "compaction");

            // Sleep
-            if tokio::time::timeout(sleep_duration, cancel.cancelled())
-                .await
-                .is_ok()
-            {
-                info!("received cancellation request during idling");
-                break;
+            tokio::select! {
+                _ = cancel.cancelled() => {
+                    info!("received cancellation request during idling");
+                    break;
+                },
+                _ = tokio::time::sleep(sleep_duration) => {},
            }
        }
    }
@@ -196,12 +196,12 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
            warn_when_period_overrun(started_at.elapsed(), period, "gc");

            // Sleep
-            if tokio::time::timeout(sleep_duration, cancel.cancelled())
-                .await
-                .is_ok()
-            {
-                info!("received cancellation request during idling");
-                break;
+            tokio::select! {
+                _ = cancel.cancelled() => {
+                    info!("received cancellation request during idling");
+                    break;
+                },
+                _ = tokio::time::sleep(sleep_duration) => {},
            }
        }
    }
@@ -263,9 +263,9 @@ pub(crate) async fn random_init_delay(
        rng.gen_range(Duration::ZERO..=period)
    };

-    match tokio::time::timeout(d, cancel.cancelled()).await {
-        Ok(_) => Err(Cancelled),
-        Err(_) => Ok(()),
+    tokio::select! {
+        _ = cancel.cancelled() => Err(Cancelled),
+        _ = tokio::time::sleep(d) => Ok(()),
    }
 }

--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -24,7 +24,7 @@ use tracing::*;
 use utils::id::TenantTimelineId;

 use std::cmp::{max, min, Ordering};
-use std::collections::{BinaryHeap, HashMap, HashSet};
+use std::collections::{BinaryHeap, HashMap};
 use std::fs;
 use std::ops::{Deref, Range};
 use std::path::{Path, PathBuf};
@@ -183,7 +183,7 @@ pub struct Timeline {
    walredo_mgr: Arc<dyn WalRedoManager + Sync + Send>,

    /// Remote storage client.
-    /// See [`remote_timeline_client`](super::remote_timeline_client) module comment for details.
+    /// See [`storage_sync`] module comment for details.
    pub remote_client: Option<Arc<RemoteTimelineClient>>,

    // What page versions do we hold in the repository? If we get a
@@ -240,8 +240,6 @@ pub struct Timeline {
    /// This lock is acquired in [`Timeline::gc`], [`Timeline::compact`],
    /// and [`Tenant::delete_timeline`]. This is an `Arc<Mutex>` lock because we need an owned
    /// lock guard in functions that will be spawned to tokio I/O pool (which requires `'static`).
-    ///
-    /// [`Tenant::delete_timeline`]: super::Tenant::delete_timeline
    pub(super) layer_removal_cs: Arc<tokio::sync::Mutex<()>>,

    // Needed to ensure that we can't create a branch at a point that was already garbage collected
@@ -981,12 +979,8 @@ impl Timeline {

    #[instrument(skip_all, fields(tenant_id = %self.tenant_id, timeline_id = %self.timeline_id))]
    pub async fn download_layer(&self, layer_file_name: &str) -> anyhow::Result<Option<bool>> {
-        let Some(layer) = self.find_layer(layer_file_name).await else {
-            return Ok(None);
-        };
-        let Some(remote_layer) = layer.downcast_remote_layer() else {
-            return Ok(Some(false));
-        };
+        let Some(layer) = self.find_layer(layer_file_name).await else { return Ok(None) };
+        let Some(remote_layer) = layer.downcast_remote_layer() else { return  Ok(Some(false)) };
        if self.remote_client.is_none() {
            return Ok(Some(false));
        }
@@ -995,12 +989,10 @@ impl Timeline {
        Ok(Some(true))
    }

-    /// Like [`evict_layer_batch`](Self::evict_layer_batch), but for just one layer.
+    /// Like [`evict_layer_batch`], but for just one layer.
    /// Additional case `Ok(None)` covers the case where the layer could not be found by its `layer_file_name`.
    pub async fn evict_layer(&self, layer_file_name: &str) -> anyhow::Result<Option<bool>> {
-        let Some(local_layer) = self.find_layer(layer_file_name).await else {
-            return Ok(None);
-        };
+        let Some(local_layer) = self.find_layer(layer_file_name).await else { return Ok(None) };
        let remote_client = self
            .remote_client
            .as_ref()
@@ -1011,25 +1003,25 @@ impl Timeline {
            .evict_layer_batch(remote_client, &[local_layer], cancel)
            .await?;
        assert_eq!(results.len(), 1);
-        let result: Option<Result<(), EvictionError>> = results.into_iter().next().unwrap();
+        let result: Option<anyhow::Result<bool>> = results.into_iter().next().unwrap();
        match result {
            None => anyhow::bail!("task_mgr shutdown requested"),
-            Some(Ok(())) => Ok(Some(true)),
-            Some(Err(e)) => Err(anyhow::Error::new(e)),
+            Some(Ok(b)) => Ok(Some(b)),
+            Some(Err(e)) => Err(e),
        }
    }

    /// Evict a batch of layers.
    ///
-    /// GenericRemoteStorage reference is required as a (witness)[witness_article] for "remote storage is configured."
+    /// GenericRemoteStorage reference is required as a witness[^witness_article] for "remote storage is configured."
    ///
-    /// [witness_article]: https://willcrichton.net/rust-api-type-patterns/witnesses.html
-    pub(crate) async fn evict_layers(
+    /// [^witness_article]: https://willcrichton.net/rust-api-type-patterns/witnesses.html
+    pub async fn evict_layers(
        &self,
        _: &GenericRemoteStorage,
        layers_to_evict: &[Arc<dyn PersistentLayer>],
        cancel: CancellationToken,
-    ) -> anyhow::Result<Vec<Option<Result<(), EvictionError>>>> {
+    ) -> anyhow::Result<Vec<Option<anyhow::Result<bool>>>> {
        let remote_client = self.remote_client.clone().expect(
            "GenericRemoteStorage is configured, so timeline must have RemoteTimelineClient",
        );
@@ -1064,7 +1056,7 @@ impl Timeline {
        remote_client: &Arc<RemoteTimelineClient>,
        layers_to_evict: &[Arc<dyn PersistentLayer>],
        cancel: CancellationToken,
-    ) -> anyhow::Result<Vec<Option<Result<(), EvictionError>>>> {
+    ) -> anyhow::Result<Vec<Option<anyhow::Result<bool>>>> {
        // ensure that the layers have finished uploading
        // (don't hold the layer_removal_cs while we do it, we're not removing anything yet)
        remote_client
@@ -1110,9 +1102,11 @@ impl Timeline {
        _layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>,
        local_layer: &Arc<dyn PersistentLayer>,
        layer_mgr: &mut LayerManager,
-    ) -> Result<(), EvictionError> {
+    ) -> anyhow::Result<bool> {
        if local_layer.is_remote_layer() {
-            return Err(EvictionError::CannotEvictRemoteLayer);
+            // TODO(issue #3851): consider returning an err here instead of false,
+            // which is the same value the `replace_and_verify` match below yields on failure
+            return Ok(false);
        }

        let layer_file_size = local_layer.file_size();
@@ -1121,22 +1115,13 @@ impl Timeline {
            .local_path()
            .expect("local layer should have a local path")
            .metadata()
-            // when the eviction fails because we have already deleted the layer in compaction for
-            // example, a NotFound error bubbles up from here.
-            .map_err(|e| {
-                if e.kind() == std::io::ErrorKind::NotFound {
-                    EvictionError::FileNotFound
-                } else {
-                    EvictionError::StatFailed(e)
-                }
-            })?
+            .context("get local layer file stat")?
            .modified()
-            .map_err(EvictionError::StatFailed)?;
-
+            .context("get mtime of layer file")?;
        let local_layer_residence_duration =
            match SystemTime::now().duration_since(local_layer_mtime) {
                Err(e) => {
-                    warn!(layer = %local_layer, "layer mtime is in the future: {}", e);
+                    warn!("layer mtime is in the future: {}", e);
                    None
                }
                Ok(delta) => Some(delta),
@@ -1167,65 +1152,54 @@ impl Timeline {

        assert_eq!(local_layer.layer_desc(), new_remote_layer.layer_desc());

-        layer_mgr
-            .replace_and_verify(local_layer.clone(), new_remote_layer)
-            .map_err(EvictionError::LayerNotFound)?;
+        let succeed = match layer_mgr.replace_and_verify(local_layer.clone(), new_remote_layer) {
+            Ok(()) => {
+                if let Err(e) = local_layer.delete_resident_layer_file() {
+                    error!("failed to remove layer file on evict after replacement: {e:#?}");
+                }
+                // Always decrement the physical size gauge, even if we failed to delete the file.
+                // Rationale: we already replaced the layer with a remote layer in the layer map,
+                // and any subsequent download_remote_layer will
+                // 1. overwrite the file on disk and
+                // 2. add the downloaded size to the resident size gauge.
+                //
+                // If there is no re-download, and we restart the pageserver, then load_layer_map
+                // will treat the file as a local layer again, count it towards resident size,
+                // and it'll be like the layer removal never happened.
+                // The bump in resident size is perhaps unexpected but overall a robust behavior.
+                self.metrics
+                    .resident_physical_size_gauge
+                    .sub(layer_file_size);

-        if let Err(e) = local_layer.delete_resident_layer_file() {
-            // this should never happen, because of layer_removal_cs usage and above stat
-            // access for mtime
-            error!("failed to remove layer file on evict after replacement: {e:#?}");
-        }
-        // Always decrement the physical size gauge, even if we failed to delete the file.
-        // Rationale: we already replaced the layer with a remote layer in the layer map,
-        // and any subsequent download_remote_layer will
-        // 1. overwrite the file on disk and
-        // 2. add the downloaded size to the resident size gauge.
-        //
-        // If there is no re-download, and we restart the pageserver, then load_layer_map
-        // will treat the file as a local layer again, count it towards resident size,
-        // and it'll be like the layer removal never happened.
-        // The bump in resident size is perhaps unexpected but overall a robust behavior.
-        self.metrics
-            .resident_physical_size_gauge
-            .sub(layer_file_size);
+                self.metrics.evictions.inc();

-        self.metrics.evictions.inc();
+                if let Some(delta) = local_layer_residence_duration {
+                    self.metrics
+                        .evictions_with_low_residence_duration
+                        .read()
+                        .unwrap()
+                        .observe(delta);
+                    info!(layer=%local_layer, residence_millis=delta.as_millis(), "evicted layer after known residence period");
+                } else {
+                    info!(layer=%local_layer, "evicted layer after unknown residence period");
+                }

-        if let Some(delta) = local_layer_residence_duration {
-            self.metrics
-                .evictions_with_low_residence_duration
-                .read()
-                .unwrap()
-                .observe(delta);
-            info!(layer=%local_layer, residence_millis=delta.as_millis(), "evicted layer after known residence period");
-        } else {
-            info!(layer=%local_layer, "evicted layer after unknown residence period");
-        }
+                true
+            }
+            Err(err) => {
+                if cfg!(debug_assertions) {
+                    panic!("failed to replace: {err}, evicted: {local_layer:?}");
+                } else {
+                    error!(evicted=?local_layer, "failed to replace: {err}");
+                }
+                false
+            }
+        };

-        Ok(())
+        Ok(succeed)
    }
 }

-#[derive(Debug, thiserror::Error)]
-pub(crate) enum EvictionError {
-    #[error("cannot evict a remote layer")]
-    CannotEvictRemoteLayer,
-    /// Most likely the to-be evicted layer has been deleted by compaction or gc which use the same
-    /// locks, so they got to execute before the eviction.
-    #[error("file backing the layer has been removed already")]
-    FileNotFound,
-    #[error("stat failed")]
-    StatFailed(#[source] std::io::Error),
-    /// In practice, this can be a number of things, but lets assume it means only this.
-    ///
-    /// This case includes situations such as the Layer was evicted and redownloaded in between,
-    /// because the file existed before an replacement attempt was made but now the Layers are
-    /// different objects in memory.
-    #[error("layer was no longer part of LayerMap")]
-    LayerNotFound(#[source] anyhow::Error),
-}
-
 /// Number of times we will compute partition within a checkpoint distance.
 const REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE: u64 = 10;

@@ -1795,7 +1769,7 @@ impl Timeline {
    /// 3. Schedule upload of local-only layer files (which will then also update the remote
    ///    IndexPart to include the new layer files).
    ///
-    /// Refer to the [`remote_timeline_client`] module comment for more context.
+    /// Refer to the `storage_sync` module comment for more context.
    ///
    /// # TODO
    /// May be a bit cleaner to do things based on populated remote client,
@@ -2628,9 +2602,7 @@ impl Timeline {
                    guard.layer_map().frozen_layers.front().cloned()
                    // drop 'layers' lock to allow concurrent reads and writes
                };
-                let Some(layer_to_flush) = layer_to_flush else {
-                    break Ok(());
-                };
+                let Some(layer_to_flush) = layer_to_flush else { break Ok(()) };
                if let Err(err) = self.flush_frozen_layer(layer_to_flush, ctx).await {
                    error!("could not flush frozen layer: {err:?}");
                    break Err(err);
@@ -2703,7 +2675,7 @@ impl Timeline {
        // files instead. This is possible as long as *all* the data imported into the
        // repository have the same LSN.
        let lsn_range = frozen_layer.get_lsn_range();
-        let (layer_paths_to_upload, delta_layer_to_add) =
+        let layer_paths_to_upload =
            if lsn_range.start == self.initdb_lsn && lsn_range.end == Lsn(self.initdb_lsn.0 + 1) {
                #[cfg(test)]
                match &mut *self.flush_loop_state.lock().unwrap() {
@@ -2722,12 +2694,8 @@ impl Timeline {
                let (partitioning, _lsn) = self
                    .repartition(self.initdb_lsn, self.get_compaction_target_size(), ctx)
                    .await?;
-                // For image layers, we add them immediately into the layer map.
-                (
-                    self.create_image_layers(&partitioning, self.initdb_lsn, true, ctx)
-                        .await?,
-                    None,
-                )
+                self.create_image_layers(&partitioning, self.initdb_lsn, true, ctx)
+                    .await?
            } else {
                #[cfg(test)]
                match &mut *self.flush_loop_state.lock().unwrap() {
@@ -2741,50 +2709,29 @@ impl Timeline {
                        assert!(!*expect_initdb_optimization, "expected initdb optimization");
                    }
                }
-                // Normal case, write out a L0 delta layer file.
-                // `create_delta_layer` will not modify the layer map.
-                // We will remove frozen layer and add delta layer in one atomic operation later.
-                let layer = self.create_delta_layer(&frozen_layer).await?;
-                (
-                    HashMap::from([(layer.filename(), LayerFileMetadata::new(layer.file_size()))]),
-                    Some(layer),
-                )
+                // normal case, write out a L0 delta layer file.
+                let (delta_path, metadata) = self.create_delta_layer(&frozen_layer).await?;
+                HashMap::from([(delta_path, metadata)])
            };

+        pausable_failpoint!("flush-frozen-before-sync");
+
        // The new on-disk layers are now in the layer map. We can remove the
        // in-memory layer from the map now. The flushed layer is stored in
        // the mapping in `create_delta_layer`.
        {
            let mut guard = self.layers.write().await;
+            let l = guard.layer_map_mut().frozen_layers.pop_front();

-            if let Some(ref l) = delta_layer_to_add {
-                // TODO: move access stats, metrics update, etc. into layer manager.
-                l.access_stats().record_residence_event(
-                    &guard,
-                    LayerResidenceStatus::Resident,
-                    LayerResidenceEventReason::LayerCreate,
-                );
+            // Only one thread may call this function at a time (for this
+            // timeline). If two threads tried to flush the same frozen
+            // layer to disk at the same time, that would not work.
+            assert!(compare_arced_layers(&l.unwrap(), &frozen_layer));

-                // update metrics
-                let sz = l.file_size();
-                self.metrics.resident_physical_size_gauge.add(sz);
-                self.metrics.num_persistent_files_created.inc_by(1);
-                self.metrics.persistent_bytes_written.inc_by(sz);
-            }
-
-            guard.finish_flush_l0_layer(delta_layer_to_add, &frozen_layer);
            // release lock on 'layers'
        }

-        // FIXME: between create_delta_layer and the scheduling of the upload in `update_metadata_file`,
-        // a compaction can delete the file and then it won't be available for uploads any more.
-        // We still schedule the upload, resulting in an error, but ideally we'd somehow avoid this
-        // race situation.
-        // See https://github.com/neondatabase/neon/issues/4526
-        pausable_failpoint!("flush-frozen-pausable");
-
-        // This failpoint is used by another test case `test_pageserver_recovery`.
-        fail_point!("flush-frozen-exit");
+        fail_point!("checkpoint-after-sync");

        // Update the metadata file, with new 'disk_consistent_lsn'
        //
@@ -2866,12 +2813,11 @@ impl Timeline {
        Ok(())
    }

-    // Write out the given frozen in-memory layer as a new L0 delta file. This L0 file will not be tracked
-    // in layer map immediately. The caller is responsible to put it into the layer map.
+    // Write out the given frozen in-memory layer as a new L0 delta file
    async fn create_delta_layer(
        self: &Arc<Self>,
        frozen_layer: &Arc<InMemoryLayer>,
-    ) -> anyhow::Result<DeltaLayer> {
+    ) -> anyhow::Result<(LayerFileName, LayerFileMetadata)> {
        let span = tracing::info_span!("blocking");
        let new_delta: DeltaLayer = tokio::task::spawn_blocking({
            let _g = span.entered();
@@ -2908,8 +2854,25 @@ impl Timeline {
        })
        .await
        .context("spawn_blocking")??;
+        let new_delta_name = new_delta.filename();
+        let sz = new_delta.desc.file_size;

-        Ok(new_delta)
+        // Add it to the layer map
+        let l = Arc::new(new_delta);
+        let mut guard = self.layers.write().await;
+        l.access_stats().record_residence_event(
+            &guard,
+            LayerResidenceStatus::Resident,
+            LayerResidenceEventReason::LayerCreate,
+        );
+        guard.track_new_l0_delta_layer(l);
+
+        // update metrics
+        self.metrics.resident_physical_size_gauge.add(sz);
+        self.metrics.num_persistent_files_created.inc_by(1);
+        self.metrics.persistent_bytes_written.inc_by(sz);
+
+        Ok((new_delta_name, LayerFileMetadata::new(sz)))
    }

    async fn repartition(
@@ -3161,7 +3124,7 @@ impl Timeline {

 #[derive(Default)]
 struct CompactLevel0Phase1Result {
-    new_layers: Vec<Arc<DeltaLayer>>,
+    new_layers: Vec<DeltaLayer>,
    deltas_to_compact: Vec<Arc<PersistentLayerDesc>>,
 }

@@ -3310,8 +3273,6 @@ impl Timeline {
    /// This method takes the `_layer_removal_cs` guard to highlight it required downloads are
    /// returned as an error. If the `layer_removal_cs` boundary is changed not to be taken in the
    /// start of level0 files compaction, the on-demand download should be revisited as well.
-    ///
-    /// [`compact_inner`]: Self::compact_inner
    fn compact_level0_phase1(
        self: Arc<Self>,
        _layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
@@ -3339,37 +3300,6 @@ impl Timeline {
            return Ok(CompactLevel0Phase1Result::default());
        }

-        // This failpoint is used together with `test_duplicate_layers` integration test.
-        // It returns the compaction result exactly the same layers as input to compaction.
-        // We want to ensure that this will not cause any problem when updating the layer map
-        // after the compaction is finished.
-        //
-        // Currently, there are two rare edge cases that will cause duplicated layers being
-        // inserted.
-        // 1. The compaction job is inturrupted / did not finish successfully. Assume we have file 1, 2, 3, 4, which
-        //    is compacted to 5, but the page server is shut down, next time we start page server we will get a layer
-        //    map containing 1, 2, 3, 4, and 5, whereas 5 has the same content as 4. If we trigger L0 compation at this
-        //    point again, it is likely that we will get a file 6 which has the same content and the key range as 5,
-        //    and this causes an overwrite. This is acceptable because the content is the same, and we should do a
-        //    layer replace instead of the normal remove / upload process.
-        // 2. The input workload pattern creates exactly n files that are sorted, non-overlapping and is of target file
-        //    size length. Compaction will likely create the same set of n files afterwards.
-        //
-        // This failpoint is a superset of both of the cases.
-        fail_point!("compact-level0-phase1-return-same", |_| {
-            println!("compact-level0-phase1-return-same"); // so that we can check if we hit the failpoint
-            Ok(CompactLevel0Phase1Result {
-                new_layers: level0_deltas
-                    .iter()
-                    .map(|x| x.clone().downcast_delta_layer().unwrap())
-                    .collect(),
-                deltas_to_compact: level0_deltas
-                    .iter()
-                    .map(|x| x.layer_desc().clone().into())
-                    .collect(),
-            })
-        });
-
        // Gather the files to compact in this iteration.
        //
        // Start with the oldest Level 0 delta file, and collect any other
@@ -3628,9 +3558,7 @@ impl Timeline {
                        || contains_hole
                    {
                        // ... if so, flush previous layer and prepare to write new one
-                        new_layers.push(Arc::new(
-                            writer.take().unwrap().finish(prev_key.unwrap().next())?,
-                        ));
+                        new_layers.push(writer.take().unwrap().finish(prev_key.unwrap().next())?);
                        writer = None;

                        if contains_hole {
@@ -3668,7 +3596,7 @@ impl Timeline {
            prev_key = Some(key);
        }
        if let Some(writer) = writer {
-            new_layers.push(Arc::new(writer.finish(prev_key.unwrap().next())?));
+            new_layers.push(writer.finish(prev_key.unwrap().next())?);
        }

        // Sync layers
@@ -3764,7 +3692,7 @@ impl Timeline {
        }

        // Before deleting any layers, we need to wait for their upload ops to finish.
-        // See remote_timeline_client module level comment on consistency.
+        // See storage_sync module level comment on consistency.
        // Do it here because we don't want to hold self.layers.write() while waiting.
        if let Some(remote_client) = &self.remote_client {
            debug!("waiting for upload ops to complete");
@@ -3777,11 +3705,6 @@ impl Timeline {
        let mut guard = self.layers.write().await;
        let mut new_layer_paths = HashMap::with_capacity(new_layers.len());

-        // In some rare cases, we may generate a file with exactly the same key range / LSN as before the compaction.
-        // We should move to numbering the layer files instead of naming them using key range / LSN some day. But for
-        // now, we just skip the file to avoid unintentional modification to files on the disk and in the layer map.
-        let mut duplicated_layers = HashSet::new();
-
        let mut insert_layers = Vec::new();
        let mut remove_layers = Vec::new();

@@ -3808,33 +3731,21 @@ impl Timeline {
                .add(metadata.len());

            new_layer_paths.insert(new_delta_path, LayerFileMetadata::new(metadata.len()));
-            l.access_stats().record_residence_event(
+            let x: Arc<dyn PersistentLayer + 'static> = Arc::new(l);
+            x.access_stats().record_residence_event(
                &guard,
                LayerResidenceStatus::Resident,
                LayerResidenceEventReason::LayerCreate,
            );
-            let l = l as Arc<dyn PersistentLayer>;
-            if guard.contains(&l) {
-                duplicated_layers.insert(l.layer_desc().key());
-            } else {
-                if LayerMap::is_l0(l.layer_desc()) {
-                    return Err(CompactionError::Other(anyhow!("compaction generates a L0 layer file as output, which will cause infinite compaction.")));
-                }
-                insert_layers.push(l);
-            }
+            insert_layers.push(x);
        }

        // Now that we have reshuffled the data to set of new delta layers, we can
        // delete the old ones
        let mut layer_names_to_delete = Vec::with_capacity(deltas_to_compact.len());
-        for ldesc in deltas_to_compact {
-            if duplicated_layers.contains(&ldesc.key()) {
-                // skip duplicated layers, they will not be removed; we have already overwritten them
-                // with new layers in the compaction phase 1.
-                continue;
-            }
-            layer_names_to_delete.push(ldesc.filename());
-            remove_layers.push(guard.get_from_desc(&ldesc));
+        for l in deltas_to_compact {
+            layer_names_to_delete.push(l.filename());
+            remove_layers.push(guard.get_from_desc(&l));
        }

        guard.finish_compact_l0(
@@ -4593,7 +4504,6 @@ impl LocalLayerInfoForDiskUsageEviction {
 }

 impl Timeline {
-    /// Returns non-remote layers for eviction.
    pub(crate) async fn get_local_layers_for_disk_usage_eviction(&self) -> DiskUsageEvictionInfo {
        let guard = self.layers.read().await;
        let layers = guard.layer_map();
@@ -4763,179 +4673,3 @@ pub fn compare_arced_layers<L: ?Sized>(left: &Arc<L>, right: &Arc<L>) -> bool {

    left == right
 }
-
-#[cfg(test)]
-mod tests {
-    use std::sync::Arc;
-
-    use utils::{id::TimelineId, lsn::Lsn};
-
-    use crate::tenant::{harness::TenantHarness, storage_layer::PersistentLayer};
-
-    use super::{EvictionError, Timeline};
-
-    #[tokio::test]
-    async fn two_layer_eviction_attempts_at_the_same_time() {
-        let harness =
-            TenantHarness::create("two_layer_eviction_attempts_at_the_same_time").unwrap();
-
-        let remote_storage = {
-            // this is never used for anything, because of how the create_test_timeline works, but
-            // it is with us in spirit and a Some.
-            use remote_storage::{GenericRemoteStorage, RemoteStorageConfig, RemoteStorageKind};
-            let path = harness.conf.workdir.join("localfs");
-            std::fs::create_dir_all(&path).unwrap();
-            let config = RemoteStorageConfig {
-                max_concurrent_syncs: std::num::NonZeroUsize::new(2_000_000).unwrap(),
-                max_sync_errors: std::num::NonZeroU32::new(3_000_000).unwrap(),
-                storage: RemoteStorageKind::LocalFs(path),
-            };
-            GenericRemoteStorage::from_config(&config).unwrap()
-        };
-
-        let ctx = any_context();
-        let tenant = harness.try_load(&ctx, Some(remote_storage)).await.unwrap();
-        let timeline = tenant
-            .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
-            .await
-            .unwrap();
-
-        let rc = timeline
-            .remote_client
-            .clone()
-            .expect("just configured this");
-
-        let layer = find_some_layer(&timeline).await;
-
-        let cancel = tokio_util::sync::CancellationToken::new();
-        let batch = [layer];
-
-        let first = {
-            let cancel = cancel.clone();
-            async {
-                timeline
-                    .evict_layer_batch(&rc, &batch, cancel)
-                    .await
-                    .unwrap()
-            }
-        };
-        let second = async {
-            timeline
-                .evict_layer_batch(&rc, &batch, cancel)
-                .await
-                .unwrap()
-        };
-
-        let (first, second) = tokio::join!(first, second);
-
-        let (first, second) = (only_one(first), only_one(second));
-
-        match (first, second) {
-            (Ok(()), Err(EvictionError::FileNotFound))
-            | (Err(EvictionError::FileNotFound), Ok(())) => {
-                // one of the evictions gets to do it,
-                // other one gets FileNotFound. all is good.
-            }
-            other => unreachable!("unexpected {:?}", other),
-        }
-    }
-
-    #[tokio::test]
-    async fn layer_eviction_aba_fails() {
-        let harness = TenantHarness::create("layer_eviction_aba_fails").unwrap();
-
-        let remote_storage = {
-            // this is never used for anything, because of how the create_test_timeline works, but
-            // it is with us in spirit and a Some.
-            use remote_storage::{GenericRemoteStorage, RemoteStorageConfig, RemoteStorageKind};
-            let path = harness.conf.workdir.join("localfs");
-            std::fs::create_dir_all(&path).unwrap();
-            let config = RemoteStorageConfig {
-                max_concurrent_syncs: std::num::NonZeroUsize::new(2_000_000).unwrap(),
-                max_sync_errors: std::num::NonZeroU32::new(3_000_000).unwrap(),
-                storage: RemoteStorageKind::LocalFs(path),
-            };
-            GenericRemoteStorage::from_config(&config).unwrap()
-        };
-
-        let ctx = any_context();
-        let tenant = harness.try_load(&ctx, Some(remote_storage)).await.unwrap();
-        let timeline = tenant
-            .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
-            .await
-            .unwrap();
-
-        let _e = tracing::info_span!("foobar", tenant_id = %tenant.tenant_id, timeline_id = %timeline.timeline_id).entered();
-
-        let rc = timeline.remote_client.clone().unwrap();
-
-        // TenantHarness allows uploads to happen given GenericRemoteStorage is configured
-        let layer = find_some_layer(&timeline).await;
-
-        let cancel = tokio_util::sync::CancellationToken::new();
-        let batch = [layer];
-
-        let first = {
-            let cancel = cancel.clone();
-            async {
-                timeline
-                    .evict_layer_batch(&rc, &batch, cancel)
-                    .await
-                    .unwrap()
-            }
-        };
-
-        // lets imagine this is stuck somehow, still referencing the original `Arc<dyn PersistentLayer>`
-        let second = {
-            let cancel = cancel.clone();
-            async {
-                timeline
-                    .evict_layer_batch(&rc, &batch, cancel)
-                    .await
-                    .unwrap()
-            }
-        };
-
-        // while it's stuck, we evict and end up redownloading it
-        only_one(first.await).expect("eviction succeeded");
-
-        let layer = find_some_layer(&timeline).await;
-        let layer = layer.downcast_remote_layer().unwrap();
-        timeline.download_remote_layer(layer).await.unwrap();
-
-        let res = only_one(second.await);
-
-        assert!(
-            matches!(res, Err(EvictionError::LayerNotFound(_))),
-            "{res:?}"
-        );
-
-        // no more specific asserting, outside of preconds this is the only valid replacement
-        // failure
-    }
-
-    fn any_context() -> crate::context::RequestContext {
-        use crate::context::*;
-        use crate::task_mgr::*;
-        RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error)
-    }
-
-    fn only_one<T>(mut input: Vec<Option<T>>) -> T {
-        assert_eq!(1, input.len());
-        input
-            .pop()
-            .expect("length just checked")
-            .expect("no cancellation")
-    }
-
-    async fn find_some_layer(timeline: &Timeline) -> Arc<dyn PersistentLayer> {
-        let layers = timeline.layers.read().await;
-        let desc = layers
-            .layer_map()
-            .iter_historic_layers()
-            .next()
-            .expect("must find one layer to evict");
-
-        layers.get_from_desc(&desc)
-    }
-}
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -30,7 +30,6 @@ use crate::{
    tenant::{
        config::{EvictionPolicy, EvictionPolicyLayerAccessThreshold},
        storage_layer::PersistentLayer,
-        timeline::EvictionError,
        LogicalSizeCalculationCause, Tenant,
    },
 };
@@ -101,11 +100,11 @@ impl Timeline {
            match cf {
                ControlFlow::Break(()) => break,
                ControlFlow::Continue(sleep_until) => {
-                    if tokio::time::timeout_at(sleep_until, cancel.cancelled())
-                        .await
-                        .is_ok()
-                    {
-                        break;
+                    tokio::select! {
+                        _ = cancel.cancelled() => {
+                            break;
+                        }
+                        _ = tokio::time::sleep_until(sleep_until) => { }
                    }
                }
            }
@@ -271,22 +270,20 @@ impl Timeline {
                None => {
                    stats.skipped_for_shutdown += 1;
                }
-                Some(Ok(())) => {
+                Some(Ok(true)) => {
+                    debug!("evicted layer {l:?}");
                    stats.evicted += 1;
                }
-                Some(Err(EvictionError::CannotEvictRemoteLayer)) => {
+                Some(Ok(false)) => {
+                    debug!("layer is not evictable: {l:?}");
                    stats.not_evictable += 1;
                }
-                Some(Err(EvictionError::FileNotFound)) => {
-                    // compaction/gc removed the file while we were waiting on layer_removal_cs
-                    stats.not_evictable += 1;
-                }
-                Some(Err(
-                    e @ EvictionError::LayerNotFound(_) | e @ EvictionError::StatFailed(_),
-                )) => {
-                    let e = utils::error::report_compact_sources(&e);
-                    warn!(layer = %l, "failed to evict layer: {e}");
-                    stats.not_evictable += 1;
+                Some(Err(e)) => {
+                    // This variant is the case where an unexpected error happened during eviction.
+                    // Expected errors that result in non-eviction are `Some(Ok(false))`.
+                    // So, dump Debug here to gather as much info as possible in this rare case.
+                    warn!("failed to evict layer {l:?}: {e:?}");
+                    stats.errors += 1;
                }
            }
        }
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -194,23 +194,10 @@ impl LayerManager {
        updates.flush();
    }

-    /// Flush a frozen layer and add the written delta layer to the layer map.
-    pub fn finish_flush_l0_layer(
-        &mut self,
-        delta_layer: Option<DeltaLayer>,
-        frozen_layer_for_check: &Arc<InMemoryLayer>,
-    ) {
-        let l = self.layer_map.frozen_layers.pop_front();
+    /// Insert into the layer map when a new delta layer is created, called from `create_delta_layer`.
+    pub fn track_new_l0_delta_layer(&mut self, delta_layer: Arc<DeltaLayer>) {
        let mut updates = self.layer_map.batch_update();
-
-        // Only one thread may call this function at a time (for this
-        // timeline). If two threads tried to flush the same frozen
-        // layer to disk at the same time, that would not work.
-        assert!(compare_arced_layers(&l.unwrap(), frozen_layer_for_check));
-
-        if let Some(delta_layer) = delta_layer {
-            Self::insert_historic_layer(Arc::new(delta_layer), &mut updates, &mut self.layer_fmgr);
-        }
+        Self::insert_historic_layer(delta_layer, &mut updates, &mut self.layer_fmgr);
        updates.flush();
    }

@@ -308,10 +295,6 @@ impl LayerManager {

        Ok(())
    }
-
-    pub(crate) fn contains(&self, layer: &Arc<dyn PersistentLayer>) -> bool {
-        self.layer_fmgr.contains(layer)
-    }
 }

 pub struct LayerFileManager<T: AsLayerDesc + ?Sized = dyn PersistentLayer>(
@@ -336,10 +319,6 @@ impl<T: AsLayerDesc + ?Sized> LayerFileManager<T> {
        }
    }

-    pub(crate) fn contains(&self, layer: &Arc<T>) -> bool {
-        self.0.contains_key(&layer.layer_desc().key())
-    }
-
    pub(crate) fn new() -> Self {
        Self(HashMap::new())
    }
--- a/pageserver/src/tenant/timeline/span.rs
+++ b/pageserver/src/tenant/timeline/span.rs
@@ -14,7 +14,10 @@ pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() {
        &*crate::tenant::span::TENANT_ID_EXTRACTOR,
        &*TIMELINE_ID_EXTRACTOR,
    ];
-    if let Err(missing) = check_fields_present!(fields) {
-        panic!("missing extractors: {missing:?}")
+    if let Err(missing) = check_fields_present(fields) {
+        panic!(
+            "missing extractors: {:?}",
+            missing.into_iter().map(|e| e.name()).collect::<Vec<_>>()
+        )
    }
 }
--- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
@@ -6,7 +6,7 @@
 //! Current connection state is tracked too, to ensure it's not getting stale.
 //!
 //! After every connection or storage broker update fetched, the state gets updated correspondingly and rechecked for the new conneciton leader,
-//! then a (re)connection happens, if necessary.
+//! then a [re]connection happens, if necessary.
 //! Only WAL streaming task expects to be finished, other loops (storage broker, connection management) never exit unless cancelled explicitly via the dedicated channel.

 use std::{collections::HashMap, num::NonZeroU64, ops::ControlFlow, sync::Arc, time::Duration};
--- a/pgxn/hnsw/hnsw.control
+++ b/pgxn/hnsw/hnsw.control
@@ -1,4 +1,4 @@
-comment = '** Deprecated ** Please use pg_embedding instead'
+comment = 'hnsw index'
 default_version = '0.1.0'
 module_pathname = '$libdir/hnsw'
 relocatable = true
--- a/pgxn/neon/Makefile
+++ b/pgxn/neon/Makefile
@@ -4,6 +4,7 @@
 MODULE_big = neon
 OBJS = \
 	$(WIN32RES) \
+	extension_server.o \
 	file_cache.o \
 	libpagestore.o \
 	libpqwalproposer.o \
--- a/pgxn/neon/extension_server.c
+++ b/pgxn/neon/extension_server.c
@@ -0,0 +1,104 @@
+
+/*-------------------------------------------------------------------------
+ *
+ * extension_server.c
+ *	  Request compute_ctl to download extension files.
+ *
+ * IDENTIFICATION
+ *	 pgxn/neon/extension_server.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+#include "tcop/pquery.h"
+#include "tcop/utility.h"
+#include "access/xact.h"
+#include "utils/hsearch.h"
+#include "utils/memutils.h"
+#include "commands/defrem.h"
+#include "miscadmin.h"
+#include "utils/acl.h"
+#include "fmgr.h"
+#include "utils/guc.h"
+#include "port.h"
+#include "fmgr.h"
+
+#include <curl/curl.h>
+
+static int extension_server_port = 0;
+
+static download_extension_file_hook_type prev_download_extension_file_hook = NULL;
+
+/*
+ * Ask compute_ctl (over HTTP on localhost) to download extension files.
+ *
+ * To download all SQL (and data) files for an extension:
+ *   curl -X POST http://localhost:8080/extension_server/postgis
+ * This covers two possible extension file layouts:
+ *   1. extension_name--version--platform.sql
+ *   2. extension_name/extension_name--version.sql
+ *      extension_name/extra_files.csv
+ *
+ * To download a specific library file:
+ *   curl -X POST http://localhost:8080/extension_server/postgis-3.so?is_library=true
+ *
+ * Returns true if curl completed the request, else logs a WARNING and returns false.
+ *
+ * NOTE(review): CURLOPT_FAILONERROR is not set, so an HTTP error status from
+ * compute_ctl presumably still counts as success here -- confirm intended.
+ */
+static bool
+neon_download_extension_file_http(const char *filename, bool is_library)
+{
+    CURL *curl;
+    CURLcode res;
+    char *compute_ctl_url;
+    bool ret = false;
+
+    /* elog(ERROR) does not return, so curl is non-NULL past this point */
+    if ((curl = curl_easy_init()) == NULL)
+        elog(ERROR, "Failed to initialize curl handle");
+
+    compute_ctl_url = psprintf("http://localhost:%d/extension_server/%s%s",
+                               extension_server_port, filename, is_library ? "?is_library=true" : "");
+
+    elog(LOG, "Sending request to compute_ctl: %s", compute_ctl_url);
+
+    curl_easy_setopt(curl, CURLOPT_CUSTOMREQUEST, "POST");
+    curl_easy_setopt(curl, CURLOPT_URL, compute_ctl_url);
+    /* NOTE: 15L may be insufficient time for large extensions like postgis */
+    curl_easy_setopt(curl, CURLOPT_TIMEOUT, 15L /* seconds */);
+
+    /* Perform the request, res will get the return code */
+    res = curl_easy_perform(curl);
+    if (res == CURLE_OK)
+        ret = true;
+    else
+    {
+        /* Don't error here because postgres will try to find the file
+         * and will fail with some proper error message if it's not found. */
+        elog(WARNING, "neon_download_extension_file_http failed: %s", curl_easy_strerror(res));
+    }
+    /* always cleanup */
+    curl_easy_cleanup(curl);
+
+    return ret;
+}
+
+/* Register the neon.extension_server_port GUC and install the download hook. */
+void pg_init_extension_server(void)
+{
+    /* Port to connect to compute_ctl on localhost to request extension files. */
+    DefineCustomIntVariable("neon.extension_server_port",
+                            "port of compute_ctl on localhost to request extension files from",
+                            NULL,
+                            &extension_server_port,
+                            0, 0, INT_MAX,
+                            PGC_POSTMASTER,
+                            0, /* no flags required */
+                            NULL, NULL, NULL);
+
+    /* Remember the previous hook (currently not chained to), then install ours. */
+    prev_download_extension_file_hook = download_extension_file_hook;
+    download_extension_file_hook = neon_download_extension_file_http;
+}
--- a/pgxn/neon/extension_server.h
+++ b/pgxn/neon/extension_server.h
@@ -0,0 +1 @@
+
--- a/pgxn/neon/neon.c
+++ b/pgxn/neon/neon.c
@@ -35,8 +35,11 @@ _PG_init(void)
 {
 	pg_init_libpagestore();
 	pg_init_walproposer();
+
 	InitControlPlaneConnector();

+	pg_init_extension_server();
+
        // Important: This must happen after other parts of the extension
        // are loaded, otherwise any settings to GUCs that were set before
        // the extension was loaded will be removed.
--- a/pgxn/neon/neon.h
+++ b/pgxn/neon/neon.h
@@ -21,6 +21,8 @@ extern char *neon_tenant;
 extern void pg_init_libpagestore(void);
 extern void pg_init_walproposer(void);

+extern void pg_init_extension_server(void);
+
 /*
 * Returns true if we shouldn't do REDO on that block in record indicated by
 * block_id; false otherwise.
--- a/poetry.lock
+++ b/poetry.lock
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -5,6 +5,7 @@ use proxy::http;
 use proxy::metrics;

 use anyhow::bail;
+use clap::{self, Arg};
 use proxy::config::{self, ProxyConfig};
 use std::pin::pin;
 use std::{borrow::Cow, net::SocketAddr};
@@ -17,70 +18,6 @@ use utils::{project_git_version, sentry_init::init_sentry};

 project_git_version!(GIT_VERSION);

-use clap::{Parser, ValueEnum};
-
-#[derive(Clone, Debug, ValueEnum)]
-enum AuthBackend {
-    Console,
-    Postgres,
-    Link,
-}
-
-/// Neon proxy/router
-#[derive(Parser)]
-#[command(version = GIT_VERSION, about)]
-struct ProxyCliArgs {
-    /// listen for incoming client connections on ip:port
-    #[clap(short, long, default_value = "127.0.0.1:4432")]
-    proxy: String,
-    #[clap(value_enum, long, default_value_t = AuthBackend::Link)]
-    auth_backend: AuthBackend,
-    /// listen for management callback connection on ip:port
-    #[clap(short, long, default_value = "127.0.0.1:7000")]
-    mgmt: String,
-    /// listen for incoming http connections (metrics, etc) on ip:port
-    #[clap(long, default_value = "127.0.0.1:7001")]
-    http: String,
-    /// listen for incoming wss connections on ip:port
-    #[clap(long)]
-    wss: Option<String>,
-    /// redirect unauthenticated users to the given uri in case of link auth
-    #[clap(short, long, default_value = "http://localhost:3000/psql_session/")]
-    uri: String,
-    /// cloud API endpoint for authenticating users
-    #[clap(
-        short,
-        long,
-        default_value = "http://localhost:3000/authenticate_proxy_request/"
-    )]
-    auth_endpoint: String,
-    /// path to TLS key for client postgres connections
-    ///
-    /// tls-key and tls-cert are for backwards compatibility, we can put all certs in one dir
-    #[clap(short = 'k', long, alias = "ssl-key")]
-    tls_key: Option<String>,
-    /// path to TLS cert for client postgres connections
-    ///
-    /// tls-key and tls-cert are for backwards compatibility, we can put all certs in one dir
-    #[clap(short = 'c', long, alias = "ssl-cert")]
-    tls_cert: Option<String>,
-    /// path to directory with TLS certificates for client postgres connections
-    #[clap(long)]
-    certs_dir: Option<String>,
-    /// http endpoint to receive periodic metric updates
-    #[clap(long)]
-    metric_collection_endpoint: Option<String>,
-    /// how often metrics should be sent to a collection endpoint
-    #[clap(long)]
-    metric_collection_interval: Option<String>,
-    /// cache for `wake_compute` api method (use `size=0` to disable)
-    #[clap(long, default_value = config::CacheOptions::DEFAULT_OPTIONS_NODE_INFO)]
-    wake_compute_cache: String,
-    /// Allow self-signed certificates for compute nodes (for testing)
-    #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
-    allow_self_signed_compute: bool,
-}
-
 #[tokio::main]
 async fn main() -> anyhow::Result<()> {
    let _logging_guard = proxy::logging::init().await?;
@@ -90,21 +27,21 @@ async fn main() -> anyhow::Result<()> {
    info!("Version: {GIT_VERSION}");
    ::metrics::set_build_info_metric(GIT_VERSION);

-    let args = ProxyCliArgs::parse();
+    let args = cli().get_matches();
    let config = build_config(&args)?;

    info!("Authentication backend: {}", config.auth_backend);

    // Check that we can bind to address before further initialization
-    let http_address: SocketAddr = args.http.parse()?;
+    let http_address: SocketAddr = args.get_one::<String>("http").unwrap().parse()?;
    info!("Starting http on {http_address}");
    let http_listener = TcpListener::bind(http_address).await?.into_std()?;

-    let mgmt_address: SocketAddr = args.mgmt.parse()?;
+    let mgmt_address: SocketAddr = args.get_one::<String>("mgmt").unwrap().parse()?;
    info!("Starting mgmt on {mgmt_address}");
    let mgmt_listener = TcpListener::bind(mgmt_address).await?;

-    let proxy_address: SocketAddr = args.proxy.parse()?;
+    let proxy_address: SocketAddr = args.get_one::<String>("proxy").unwrap().parse()?;
    info!("Starting proxy on {proxy_address}");
    let proxy_listener = TcpListener::bind(proxy_address).await?;
    let cancellation_token = CancellationToken::new();
@@ -118,7 +55,7 @@ async fn main() -> anyhow::Result<()> {
        cancellation_token.clone(),
    ));

-    if let Some(wss_address) = args.wss {
+    if let Some(wss_address) = args.get_one::<String>("wss") {
        let wss_address: SocketAddr = wss_address.parse()?;
        info!("Starting wss on {wss_address}");
        let wss_listener = TcpListener::bind(wss_address).await?;
@@ -165,24 +102,31 @@ async fn main() -> anyhow::Result<()> {
 }

 /// ProxyConfig is created at proxy startup, and lives forever.
-fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
-    let tls_config = match (&args.tls_key, &args.tls_cert) {
+fn build_config(args: &clap::ArgMatches) -> anyhow::Result<&'static ProxyConfig> {
+    let tls_config = match (
+        args.get_one::<String>("tls-key"),
+        args.get_one::<String>("tls-cert"),
+    ) {
        (Some(key_path), Some(cert_path)) => Some(config::configure_tls(
            key_path,
            cert_path,
-            args.certs_dir.as_ref(),
+            args.get_one::<String>("certs-dir"),
        )?),
        (None, None) => None,
        _ => bail!("either both or neither tls-key and tls-cert must be specified"),
    };

-    if args.allow_self_signed_compute {
+    let allow_self_signed_compute: bool = args
+        .get_one::<String>("allow-self-signed-compute")
+        .unwrap()
+        .parse()?;
+    if allow_self_signed_compute {
        warn!("allowing self-signed compute certificates");
    }

    let metric_collection = match (
-        &args.metric_collection_endpoint,
-        &args.metric_collection_interval,
+        args.get_one::<String>("metric-collection-endpoint"),
+        args.get_one::<String>("metric-collection-interval"),
    ) {
        (Some(endpoint), Some(interval)) => Some(config::MetricCollectionConfig {
            endpoint: endpoint.parse()?,
@@ -195,38 +139,145 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
        ),
    };

-    let auth_backend = match &args.auth_backend {
-        AuthBackend::Console => {
-            let config::CacheOptions { size, ttl } = args.wake_compute_cache.parse()?;
+    let auth_backend = match args.get_one::<String>("auth-backend").unwrap().as_str() {
+        "console" => {
+            let config::CacheOptions { size, ttl } = args
+                .get_one::<String>("wake-compute-cache")
+                .unwrap()
+                .parse()?;

            info!("Using NodeInfoCache (wake_compute) with size={size} ttl={ttl:?}");
            let caches = Box::leak(Box::new(console::caches::ApiCaches {
                node_info: console::caches::NodeInfoCache::new("node_info_cache", size, ttl),
            }));

-            let url = args.auth_endpoint.parse()?;
+            let url = args.get_one::<String>("auth-endpoint").unwrap().parse()?;
            let endpoint = http::Endpoint::new(url, http::new_client());

            let api = console::provider::neon::Api::new(endpoint, caches);
            auth::BackendType::Console(Cow::Owned(api), ())
        }
-        AuthBackend::Postgres => {
-            let url = args.auth_endpoint.parse()?;
+        "postgres" => {
+            let url = args.get_one::<String>("auth-endpoint").unwrap().parse()?;
            let api = console::provider::mock::Api::new(url);
            auth::BackendType::Postgres(Cow::Owned(api), ())
        }
-        AuthBackend::Link => {
-            let url = args.uri.parse()?;
+        "link" => {
+            let url = args.get_one::<String>("uri").unwrap().parse()?;
            auth::BackendType::Link(Cow::Owned(url))
        }
+        other => bail!("unsupported auth backend: {other}"),
    };

    let config = Box::leak(Box::new(ProxyConfig {
        tls_config,
        auth_backend,
        metric_collection,
-        allow_self_signed_compute: args.allow_self_signed_compute,
+        allow_self_signed_compute,
    }));

    Ok(config)
 }
+
+fn cli() -> clap::Command {
+    clap::Command::new("Neon proxy/router")
+        .disable_help_flag(true)
+        .version(GIT_VERSION)
+        .arg(
+            Arg::new("proxy")
+                .short('p')
+                .long("proxy")
+                .help("listen for incoming client connections on ip:port")
+                .default_value("127.0.0.1:4432"),
+        )
+        .arg(
+            Arg::new("auth-backend")
+                .long("auth-backend")
+                .value_parser(["console", "postgres", "link"])
+                .default_value("link"),
+        )
+        .arg(
+            Arg::new("mgmt")
+                .short('m')
+                .long("mgmt")
+                .help("listen for management callback connection on ip:port")
+                .default_value("127.0.0.1:7000"),
+        )
+        .arg(
+            Arg::new("http")
+                .long("http")
+                .help("listen for incoming http connections (metrics, etc) on ip:port")
+                .default_value("127.0.0.1:7001"),
+        )
+        .arg(
+            Arg::new("wss")
+                .long("wss")
+                .help("listen for incoming wss connections on ip:port"),
+        )
+        .arg(
+            Arg::new("uri")
+                .short('u')
+                .long("uri")
+                .help("redirect unauthenticated users to the given uri in case of link auth")
+                .default_value("http://localhost:3000/psql_session/"),
+        )
+        .arg(
+            Arg::new("auth-endpoint")
+                .short('a')
+                .long("auth-endpoint")
+                .help("cloud API endpoint for authenticating users")
+                .default_value("http://localhost:3000/authenticate_proxy_request/"),
+        )
+        .arg(
+            Arg::new("tls-key")
+                .short('k')
+                .long("tls-key")
+                .alias("ssl-key") // backwards compatibility
+                .help("path to TLS key for client postgres connections"),
+        )
+        .arg(
+            Arg::new("tls-cert")
+                .short('c')
+                .long("tls-cert")
+                .alias("ssl-cert") // backwards compatibility
+                .help("path to TLS cert for client postgres connections"),
+        )
+        // tls-key and tls-cert are for backwards compatibility, we can put all certs in one dir
+        .arg(
+            Arg::new("certs-dir")
+                .long("certs-dir")
+                .help("path to directory with TLS certificates for client postgres connections"),
+        )
+        .arg(
+            Arg::new("metric-collection-endpoint")
+                .long("metric-collection-endpoint")
+                .help("http endpoint to receive periodic metric updates"),
+        )
+        .arg(
+            Arg::new("metric-collection-interval")
+                .long("metric-collection-interval")
+                .help("how often metrics should be sent to a collection endpoint"),
+        )
+        .arg(
+            Arg::new("wake-compute-cache")
+                .long("wake-compute-cache")
+                .help("cache for `wake_compute` api method (use `size=0` to disable)")
+                .default_value(config::CacheOptions::DEFAULT_OPTIONS_NODE_INFO),
+        )
+        .arg(
+            Arg::new("allow-self-signed-compute")
+                .long("allow-self-signed-compute")
+                .help("Allow self-signed certificates for compute nodes (for testing)")
+                .default_value("false"),
+        )
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn verify_cli() {
+        cli().debug_assert();
+    }
+}
--- a/proxy/src/cache.rs
+++ b/proxy/src/cache.rs
@@ -262,21 +262,24 @@ pub mod timed_lru {
        token: Option<(C, C::LookupInfo<C::Key>)>,

        /// The value itself.
-        value: C::Value,
+        pub value: C::Value,
    }

    impl<C: Cache> Cached<C> {
        /// Place any entry into this wrapper; invalidation will be a no-op.
-        pub fn new_uncached(value: C::Value) -> Self {
-            Self { token: None, value }
+        /// Unfortunately, rust doesn't let us implement [`From`] or [`Into`].
+        pub fn new_uncached(value: impl Into<C::Value>) -> Self {
+            Self {
+                token: None,
+                value: value.into(),
+            }
        }

        /// Drop this entry from a cache if it's still there.
-        pub fn invalidate(self) -> C::Value {
+        pub fn invalidate(&self) {
            if let Some((cache, info)) = &self.token {
                cache.invalidate(info);
            }
-            self.value
        }

        /// Tell if this entry is actually cached.
--- a/proxy/src/cancellation.rs
+++ b/proxy/src/cancellation.rs
@@ -110,7 +110,7 @@ impl<'a> Session<'a> {

 impl Session<'_> {
    /// Store the cancel token for the given session.
-    /// This enables query cancellation in `crate::proxy::prepare_client_connection`.
+    /// This enables query cancellation in [`crate::proxy::handshake`].
    pub fn enable_query_cancellation(self, cancel_closure: CancelClosure) -> CancelKeyData {
        info!("enabling query cancellation for this session");
        self.cancel_map
--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -1,9 +1,4 @@
-use crate::{
-    auth::parse_endpoint_param,
-    cancellation::CancelClosure,
-    console::errors::WakeComputeError,
-    error::{io_error, UserFacingError},
-};
+use crate::{auth::parse_endpoint_param, cancellation::CancelClosure, error::UserFacingError};
 use futures::{FutureExt, TryFutureExt};
 use itertools::Itertools;
 use pq_proto::StartupMessageParams;
@@ -18,7 +13,7 @@ const COULD_NOT_CONNECT: &str = "Couldn't connect to compute node";
 #[derive(Debug, Error)]
 pub enum ConnectionError {
    /// This error doesn't seem to reveal any secrets; for instance,
-    /// `tokio_postgres::error::Kind` doesn't contain ip addresses and such.
+    /// [`tokio_postgres::error::Kind`] doesn't contain ip addresses and such.
    #[error("{COULD_NOT_CONNECT}: {0}")]
    Postgres(#[from] tokio_postgres::Error),

@@ -29,12 +24,6 @@ pub enum ConnectionError {
    TlsError(#[from] native_tls::Error),
 }

-impl From<WakeComputeError> for ConnectionError {
-    fn from(value: WakeComputeError) -> Self {
-        io_error(value).into()
-    }
-}
-
 impl UserFacingError for ConnectionError {
    fn to_string_client(&self) -> String {
        use ConnectionError::*;
--- a/proxy/src/config.rs
+++ b/proxy/src/config.rs
@@ -211,7 +211,7 @@ pub struct CacheOptions {
 }

 impl CacheOptions {
-    /// Default options for [`crate::console::provider::NodeInfoCache`].
+    /// Default options for [`crate::auth::caches::NodeInfoCache`].
    pub const DEFAULT_OPTIONS_NODE_INFO: &str = "size=4000,ttl=4m";

    /// Parse cache options passed via cmdline.
--- a/proxy/src/console/provider.rs
+++ b/proxy/src/console/provider.rs
@@ -186,18 +186,18 @@ pub trait Api {
    async fn get_auth_info(
        &self,
        extra: &ConsoleReqExtra<'_>,
-        creds: &ClientCredentials,
+        creds: &ClientCredentials<'_>,
    ) -> Result<Option<AuthInfo>, errors::GetAuthInfoError>;

    /// Wake up the compute node and return the corresponding connection info.
    async fn wake_compute(
        &self,
        extra: &ConsoleReqExtra<'_>,
-        creds: &ClientCredentials,
+        creds: &ClientCredentials<'_>,
    ) -> Result<CachedNodeInfo, errors::WakeComputeError>;
 }

-/// Various caches for [`console`](super).
+/// Various caches for [`console`].
 pub struct ApiCaches {
    /// Cache for the `wake_compute` API method.
    pub node_info: NodeInfoCache,
--- a/proxy/src/console/provider/mock.rs
+++ b/proxy/src/console/provider/mock.rs
@@ -106,7 +106,7 @@ impl super::Api for Api {
    async fn get_auth_info(
        &self,
        _extra: &ConsoleReqExtra<'_>,
-        creds: &ClientCredentials,
+        creds: &ClientCredentials<'_>,
    ) -> Result<Option<AuthInfo>, GetAuthInfoError> {
        self.do_get_auth_info(creds).await
    }
@@ -115,7 +115,7 @@ impl super::Api for Api {
    async fn wake_compute(
        &self,
        _extra: &ConsoleReqExtra<'_>,
-        _creds: &ClientCredentials,
+        _creds: &ClientCredentials<'_>,
    ) -> Result<CachedNodeInfo, WakeComputeError> {
        self.do_wake_compute()
            .map_ok(CachedNodeInfo::new_uncached)
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Alek Westover	4af6a4d5e8	hopefully unbroken wip	2023-07-18 08:45:39 -04:00
Alek Westover	b27fa34c00	pass aws creds via cli	2023-07-17 08:31:12 -04:00
Alek Westover	ca22453627	Merge branch 'alek_targz' of github.com:neondatabase/neon into alek_targz_default_on	2023-07-17 07:59:30 -04:00
Alek Westover	0a00869615	this should pass github tests, but will fail with my local cloud repo	2023-07-14 13:55:14 -04:00
Alek Westover	87eead5220	Update rfc	2023-07-14 10:54:16 -04:00
Alek Westover	3cf83014d4	patch rfc	2023-07-14 09:21:46 -04:00
Alek Westover	353a735acb	@arpad-m suggested using as_slice instead of creating a cursor	2023-07-14 07:58:05 -04:00
Alek Westover	107ebd3d21	turn remote extensions on by default	2023-07-13 17:05:52 -04:00
Alek Westover	89c93457f3	Add support for remote extensions. When requested, downloads a tar.gz file for the extension and then organizes the contained files. For instance, placing .so files in sharelib.	2023-07-13 16:15:18 -04:00