From a8ac895b83cd7339398d153b8ce73db959c21686 Mon Sep 17 00:00:00 2001
From: Peter Bendel <peterbendel@neon.tech>
Date: Tue, 19 Nov 2024 18:22:51 +0100
Subject: [PATCH 01/24] re-acquire S3 OIDC token after long running tests for
 report upload to S3 (#9799)

## Problem

If a benchmark or test case runs longer than the AWS OIDC token lifetime,
subsequent uploads of test reports to S3 fail - example:


https://github.com/neondatabase/neon/actions/runs/11905529176/job/33176168174#step:9:243

## Summary of changes

In actions that require access to S3 and that are invoked after a
long-running Python test case, we re-acquire the OIDC token explicitly.
Note that we need to pass down the aws_oicd_role_arn from the workflow
to the action because actions have no access to GitHub vars for security
reasons.

Sample run
https://github.com/neondatabase/neon/actions/runs/11912328276/job/33195676867
---
 .../actions/allure-report-generate/action.yml | 12 ++++++++++
 .../actions/allure-report-store/action.yml    | 12 ++++++++++
 .github/workflows/benchmarking.yml            | 24 +++++++++++++++++++
 3 files changed, 48 insertions(+)

diff --git a/.github/actions/allure-report-generate/action.yml b/.github/actions/allure-report-generate/action.yml
index 16b6e71498..d1d09223db 100644
--- a/.github/actions/allure-report-generate/action.yml
+++ b/.github/actions/allure-report-generate/action.yml
@@ -7,6 +7,10 @@ inputs:
     type: boolean
     required: false
     default: false
+  aws_oicd_role_arn:
+    description: 'the OIDC role arn to (re-)acquire for allure report upload - if not set call must acquire OIDC role'
+    required: false
+    default: ''
 
 outputs:
   base-url:
@@ -79,6 +83,14 @@ runs:
         ALLURE_VERSION: 2.27.0
         ALLURE_ZIP_SHA256: b071858fb2fa542c65d8f152c5c40d26267b2dfb74df1f1608a589ecca38e777
 
+    - name: (Re-)configure AWS credentials # necessary to upload reports to S3 after a long-running test
+      if: ${{ !cancelled() && (inputs.aws_oicd_role_arn != '') }}
+      uses: aws-actions/configure-aws-credentials@v4
+      with:
+        aws-region: eu-central-1
+        role-to-assume: ${{ inputs.aws_oicd_role_arn }}
+        role-duration-seconds: 3600 # 1 hour should be more than enough to upload report
+
     # Potentially we could have several running build for the same key (for example, for the main branch), so we use improvised lock for this
     - name: Acquire lock
       shell: bash -euxo pipefail {0}
diff --git a/.github/actions/allure-report-store/action.yml b/.github/actions/allure-report-store/action.yml
index df4a6712ac..9c376f420a 100644
--- a/.github/actions/allure-report-store/action.yml
+++ b/.github/actions/allure-report-store/action.yml
@@ -8,6 +8,10 @@ inputs:
   unique-key:
     description: 'string to distinguish different results in the same run'
     required: true
+  aws_oicd_role_arn:
+    description: 'the OIDC role arn to (re-)acquire for allure report upload - if not set call must acquire OIDC role'
+    required: false
+    default: ''
 
 runs:
   using: "composite"
@@ -31,6 +35,14 @@ runs:
       env:
         REPORT_DIR: ${{ inputs.report-dir }}
 
+    - name: (Re-)configure AWS credentials # necessary to upload reports to S3 after a long-running test
+      if: ${{ !cancelled() && (inputs.aws_oicd_role_arn != '') }}
+      uses: aws-actions/configure-aws-credentials@v4
+      with:
+        aws-region: eu-central-1
+        role-to-assume: ${{ inputs.aws_oicd_role_arn }}
+        role-duration-seconds: 3600 # 1 hour should be more than enough to upload report
+
     - name: Upload test results
       shell: bash -euxo pipefail {0}
       run: |
diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml
index 0e3c31ec57..0289f552f9 100644
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -122,6 +122,7 @@ jobs:
         run_in_parallel: false
         save_perf_report: ${{ env.SAVE_PERF_REPORT }}
         pg_version: ${{ env.DEFAULT_PG_VERSION }}
+        aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
         # Set --sparse-ordering option of pytest-order plugin
         # to ensure tests are running in order of appears in the file.
         # It's important for test_perf_pgbench.py::test_pgbench_remote_* tests
@@ -150,6 +151,8 @@ jobs:
       id: create-allure-report
       if: ${{ !cancelled() }}
       uses: ./.github/actions/allure-report-generate
+      with:
+        aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
 
     - name: Post to a Slack channel
       if: ${{ github.event.schedule && failure() }}
@@ -211,6 +214,7 @@ jobs:
         save_perf_report: ${{ env.SAVE_PERF_REPORT }}
         extra_params: -m remote_cluster --timeout 5400
         pg_version: ${{ env.DEFAULT_PG_VERSION }}
+        aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
       env:
         VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
         PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
@@ -227,6 +231,7 @@ jobs:
         save_perf_report: ${{ env.SAVE_PERF_REPORT }}
         extra_params: -m remote_cluster --timeout 5400
         pg_version: ${{ env.DEFAULT_PG_VERSION }}
+        aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
       env:
         VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
         PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
@@ -238,6 +243,7 @@ jobs:
       uses: ./.github/actions/allure-report-generate
       with:
         store-test-results-into-db: true
+        aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
       env:
         REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
 
@@ -446,6 +452,7 @@ jobs:
         save_perf_report: ${{ env.SAVE_PERF_REPORT }}
         extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_init
         pg_version: ${{ env.DEFAULT_PG_VERSION }}
+        aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
       env:
         BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
         VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
@@ -460,6 +467,7 @@ jobs:
         save_perf_report: ${{ env.SAVE_PERF_REPORT }}
         extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_simple_update
         pg_version: ${{ env.DEFAULT_PG_VERSION }}
+        aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
       env:
         BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
         VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
@@ -474,6 +482,7 @@ jobs:
         save_perf_report: ${{ env.SAVE_PERF_REPORT }}
         extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_select_only
         pg_version: ${{ env.DEFAULT_PG_VERSION }}
+        aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
       env:
         BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
         VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
@@ -490,6 +499,8 @@ jobs:
       id: create-allure-report
       if: ${{ !cancelled() }}
       uses: ./.github/actions/allure-report-generate
+      with:
+        aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
 
     - name: Post to a Slack channel
       if: ${{ github.event.schedule && failure() }}
@@ -600,6 +611,7 @@ jobs:
         save_perf_report: ${{ env.SAVE_PERF_REPORT }}
         extra_params: -m remote_cluster --timeout 21600 -k test_pgvector_indexing
         pg_version: ${{ env.DEFAULT_PG_VERSION }}
+        aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
       env:
         VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
         PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
@@ -614,6 +626,7 @@ jobs:
         save_perf_report: ${{ env.SAVE_PERF_REPORT }}
         extra_params: -m remote_cluster --timeout 21600
         pg_version: ${{ env.DEFAULT_PG_VERSION }}
+        aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
       env:
         BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
         VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
@@ -623,6 +636,8 @@ jobs:
       id: create-allure-report
       if: ${{ !cancelled() }}
       uses: ./.github/actions/allure-report-generate
+      with:
+        aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
 
     - name: Post to a Slack channel
       if: ${{ github.event.schedule && failure() }}
@@ -724,6 +739,7 @@ jobs:
         save_perf_report: ${{ env.SAVE_PERF_REPORT }}
         extra_params: -m remote_cluster --timeout 43200 -k test_clickbench
         pg_version: ${{ env.DEFAULT_PG_VERSION }}
+        aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
       env:
         VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
         PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
@@ -736,6 +752,8 @@ jobs:
       id: create-allure-report
       if: ${{ !cancelled() }}
       uses: ./.github/actions/allure-report-generate
+      with:
+        aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
 
     - name: Post to a Slack channel
       if: ${{ github.event.schedule && failure() }}
@@ -838,6 +856,7 @@ jobs:
         save_perf_report: ${{ env.SAVE_PERF_REPORT }}
         extra_params: -m remote_cluster --timeout 21600 -k test_tpch
         pg_version: ${{ env.DEFAULT_PG_VERSION }}
+        aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
       env:
         VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
         PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
@@ -848,6 +867,8 @@ jobs:
       id: create-allure-report
       if: ${{ !cancelled() }}
       uses: ./.github/actions/allure-report-generate
+      with:
+        aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
 
     - name: Post to a Slack channel
       if: ${{ github.event.schedule && failure() }}
@@ -936,6 +957,7 @@ jobs:
         save_perf_report: ${{ env.SAVE_PERF_REPORT }}
         extra_params: -m remote_cluster --timeout 21600 -k test_user_examples
         pg_version: ${{ env.DEFAULT_PG_VERSION }}
+        aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
       env:
         VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
         PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
@@ -945,6 +967,8 @@ jobs:
       id: create-allure-report
       if: ${{ !cancelled() }}
       uses: ./.github/actions/allure-report-generate
+      with:
+        aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
 
     - name: Post to a Slack channel
       if: ${{ github.event.schedule && failure() }}

From 15468cd23c8398ad37cc568e2140fd5413c4653d Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Tue, 19 Nov 2024 19:08:00 +0000
Subject: [PATCH 02/24] build(deps): bump aiohttp from 3.10.2 to 3.10.11
 (#9794)

---
 poetry.lock    | 459 +++++++++++++++++++++++++++++++------------------
 pyproject.toml |   2 +-
 2 files changed, 288 insertions(+), 173 deletions(-)

diff --git a/poetry.lock b/poetry.lock
index d869761e8e..6171f92391 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -13,97 +13,112 @@ files = [
 
 [[package]]
 name = "aiohttp"
-version = "3.10.2"
+version = "3.10.11"
 description = "Async http client/server framework (asyncio)"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "aiohttp-3.10.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:95213b3d79c7e387144e9cb7b9d2809092d6ff2c044cb59033aedc612f38fb6d"},
-    {file = "aiohttp-3.10.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1aa005f060aff7124cfadaa2493f00a4e28ed41b232add5869e129a2e395935a"},
-    {file = "aiohttp-3.10.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:eabe6bf4c199687592f5de4ccd383945f485779c7ffb62a9b9f1f8a3f9756df8"},
-    {file = "aiohttp-3.10.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:96e010736fc16d21125c7e2dc5c350cd43c528b85085c04bf73a77be328fe944"},
-    {file = "aiohttp-3.10.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:99f81f9c1529fd8e03be4a7bd7df32d14b4f856e90ef6e9cbad3415dbfa9166c"},
-    {file = "aiohttp-3.10.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d611d1a01c25277bcdea06879afbc11472e33ce842322496b211319aa95441bb"},
-    {file = "aiohttp-3.10.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e00191d38156e09e8c81ef3d75c0d70d4f209b8381e71622165f22ef7da6f101"},
-    {file = "aiohttp-3.10.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:74c091a5ded6cb81785de2d7a8ab703731f26de910dbe0f3934eabef4ae417cc"},
-    {file = "aiohttp-3.10.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:18186a80ec5a701816adbf1d779926e1069392cf18504528d6e52e14b5920525"},
-    {file = "aiohttp-3.10.2-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:5a7ceb2a0d2280f23a02c64cd0afdc922079bb950400c3dd13a1ab2988428aac"},
-    {file = "aiohttp-3.10.2-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:8bd7be6ff6c162a60cb8fce65ee879a684fbb63d5466aba3fa5b9288eb04aefa"},
-    {file = "aiohttp-3.10.2-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:fae962b62944eaebff4f4fddcf1a69de919e7b967136a318533d82d93c3c6bd1"},
-    {file = "aiohttp-3.10.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:a0fde16d284efcacbe15fb0c1013f0967b6c3e379649239d783868230bf1db42"},
-    {file = "aiohttp-3.10.2-cp310-cp310-win32.whl", hash = "sha256:f81cd85a0e76ec7b8e2b6636fe02952d35befda4196b8c88f3cec5b4fb512839"},
-    {file = "aiohttp-3.10.2-cp310-cp310-win_amd64.whl", hash = "sha256:54ba10eb5a3481c28282eb6afb5f709aedf53cf9c3a31875ffbdc9fc719ffd67"},
-    {file = "aiohttp-3.10.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:87fab7f948e407444c2f57088286e00e2ed0003ceaf3d8f8cc0f60544ba61d91"},
-    {file = "aiohttp-3.10.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ec6ad66ed660d46503243cbec7b2b3d8ddfa020f984209b3b8ef7d98ce69c3f2"},
-    {file = "aiohttp-3.10.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a4be88807283bd96ae7b8e401abde4ca0bab597ba73b5e9a2d98f36d451e9aac"},
-    {file = "aiohttp-3.10.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:01c98041f90927c2cbd72c22a164bb816fa3010a047d264969cf82e1d4bcf8d1"},
-    {file = "aiohttp-3.10.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:54e36c67e1a9273ecafab18d6693da0fb5ac48fd48417e4548ac24a918c20998"},
-    {file = "aiohttp-3.10.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7de3ddb6f424af54535424082a1b5d1ae8caf8256ebd445be68c31c662354720"},
-    {file = "aiohttp-3.10.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7dd9c7db94b4692b827ce51dcee597d61a0e4f4661162424faf65106775b40e7"},
-    {file = "aiohttp-3.10.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e57e21e1167705f8482ca29cc5d02702208d8bf4aff58f766d94bcd6ead838cd"},
-    {file = "aiohttp-3.10.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:a1a50e59b720060c29e2951fd9f13c01e1ea9492e5a527b92cfe04dd64453c16"},
-    {file = "aiohttp-3.10.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:686c87782481fda5ee6ba572d912a5c26d9f98cc5c243ebd03f95222af3f1b0f"},
-    {file = "aiohttp-3.10.2-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:dafb4abb257c0ed56dc36f4e928a7341b34b1379bd87e5a15ce5d883c2c90574"},
-    {file = "aiohttp-3.10.2-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:494a6f77560e02bd7d1ab579fdf8192390567fc96a603f21370f6e63690b7f3d"},
-    {file = "aiohttp-3.10.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:6fe8503b1b917508cc68bf44dae28823ac05e9f091021e0c41f806ebbb23f92f"},
-    {file = "aiohttp-3.10.2-cp311-cp311-win32.whl", hash = "sha256:4ddb43d06ce786221c0dfd3c91b4892c318eaa36b903f7c4278e7e2fa0dd5102"},
-    {file = "aiohttp-3.10.2-cp311-cp311-win_amd64.whl", hash = "sha256:ca2f5abcb0a9a47e56bac173c01e9f6c6e7f27534d91451c5f22e6a35a5a2093"},
-    {file = "aiohttp-3.10.2-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:14eb6b17f6246959fb0b035d4f4ae52caa870c4edfb6170aad14c0de5bfbf478"},
-    {file = "aiohttp-3.10.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:465e445ec348d4e4bd349edd8b22db75f025da9d7b6dc1369c48e7935b85581e"},
-    {file = "aiohttp-3.10.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:341f8ece0276a828d95b70cd265d20e257f5132b46bf77d759d7f4e0443f2906"},
-    {file = "aiohttp-3.10.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c01fbb87b5426381cd9418b3ddcf4fc107e296fa2d3446c18ce6c76642f340a3"},
-    {file = "aiohttp-3.10.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2c474af073e1a6763e1c5522bbb2d85ff8318197e4c6c919b8d7886e16213345"},
-    {file = "aiohttp-3.10.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d9076810a5621236e29b2204e67a68e1fe317c8727ee4c9abbfbb1083b442c38"},
-    {file = "aiohttp-3.10.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e8f515d6859e673940e08de3922b9c4a2249653b0ac181169313bd6e4b1978ac"},
-    {file = "aiohttp-3.10.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:655e583afc639bef06f3b2446972c1726007a21003cd0ef57116a123e44601bc"},
-    {file = "aiohttp-3.10.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8da9449a575133828cc99985536552ea2dcd690e848f9d41b48d8853a149a959"},
-    {file = "aiohttp-3.10.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:19073d57d0feb1865d12361e2a1f5a49cb764bf81a4024a3b608ab521568093a"},
-    {file = "aiohttp-3.10.2-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:c8e98e1845805f184d91fda6f9ab93d7c7b0dddf1c07e0255924bfdb151a8d05"},
-    {file = "aiohttp-3.10.2-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:377220a5efde6f9497c5b74649b8c261d3cce8a84cb661be2ed8099a2196400a"},
-    {file = "aiohttp-3.10.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:92f7f4a4dc9cdb5980973a74d43cdbb16286dacf8d1896b6c3023b8ba8436f8e"},
-    {file = "aiohttp-3.10.2-cp312-cp312-win32.whl", hash = "sha256:9bb2834a6f11d65374ce97d366d6311a9155ef92c4f0cee543b2155d06dc921f"},
-    {file = "aiohttp-3.10.2-cp312-cp312-win_amd64.whl", hash = "sha256:518dc3cb37365255708283d1c1c54485bbacccd84f0a0fb87ed8917ba45eda5b"},
-    {file = "aiohttp-3.10.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:7f98e70bbbf693086efe4b86d381efad8edac040b8ad02821453083d15ec315f"},
-    {file = "aiohttp-3.10.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9f6f0b252a009e98fe84028a4ec48396a948e7a65b8be06ccfc6ef68cf1f614d"},
-    {file = "aiohttp-3.10.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:9360e3ffc7b23565600e729e8c639c3c50d5520e05fdf94aa2bd859eef12c407"},
-    {file = "aiohttp-3.10.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3988044d1635c7821dd44f0edfbe47e9875427464e59d548aece447f8c22800a"},
-    {file = "aiohttp-3.10.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:30a9d59da1543a6f1478c3436fd49ec59be3868bca561a33778b4391005e499d"},
-    {file = "aiohttp-3.10.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f9f49bdb94809ac56e09a310a62f33e5f22973d6fd351aac72a39cd551e98194"},
-    {file = "aiohttp-3.10.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ddfd2dca3f11c365d6857a07e7d12985afc59798458a2fdb2ffa4a0332a3fd43"},
-    {file = "aiohttp-3.10.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:685c1508ec97b2cd3e120bfe309a4ff8e852e8a7460f1ef1de00c2c0ed01e33c"},
-    {file = "aiohttp-3.10.2-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:49904f38667c44c041a0b44c474b3ae36948d16a0398a8f8cd84e2bb3c42a069"},
-    {file = "aiohttp-3.10.2-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:352f3a4e5f11f3241a49b6a48bc5b935fabc35d1165fa0d87f3ca99c1fcca98b"},
-    {file = "aiohttp-3.10.2-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:fc61f39b534c5d5903490478a0dd349df397d2284a939aa3cbaa2fb7a19b8397"},
-    {file = "aiohttp-3.10.2-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:ad2274e707be37420d0b6c3d26a8115295fe9d8e6e530fa6a42487a8ca3ad052"},
-    {file = "aiohttp-3.10.2-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:c836bf3c7512100219fe1123743fd8dd9a2b50dd7cfb0c3bb10d041309acab4b"},
-    {file = "aiohttp-3.10.2-cp38-cp38-win32.whl", hash = "sha256:53e8898adda402be03ff164b0878abe2d884e3ea03a4701e6ad55399d84b92dc"},
-    {file = "aiohttp-3.10.2-cp38-cp38-win_amd64.whl", hash = "sha256:7cc8f65f5b22304693de05a245b6736b14cb5bc9c8a03da6e2ae9ef15f8b458f"},
-    {file = "aiohttp-3.10.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:9dfc906d656e14004c5bc672399c1cccc10db38df2b62a13fb2b6e165a81c316"},
-    {file = "aiohttp-3.10.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:91b10208b222ddf655c3a3d5b727879d7163db12b634492df41a9182a76edaae"},
-    {file = "aiohttp-3.10.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9fd16b5e1a7bdd14668cd6bde60a2a29b49147a535c74f50d8177d11b38433a7"},
-    {file = "aiohttp-3.10.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b2bfdda4971bd79201f59adbad24ec2728875237e1c83bba5221284dbbf57bda"},
-    {file = "aiohttp-3.10.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:69d73f869cf29e8a373127fc378014e2b17bcfbe8d89134bc6fb06a2f67f3cb3"},
-    {file = "aiohttp-3.10.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:df59f8486507c421c0620a2c3dce81fbf1d54018dc20ff4fecdb2c106d6e6abc"},
-    {file = "aiohttp-3.10.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0df930015db36b460aa9badbf35eccbc383f00d52d4b6f3de2ccb57d064a6ade"},
-    {file = "aiohttp-3.10.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:562b1153ab7f766ee6b8b357ec777a302770ad017cf18505d34f1c088fccc448"},
-    {file = "aiohttp-3.10.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:d984db6d855de58e0fde1ef908d48fe9a634cadb3cf715962722b4da1c40619d"},
-    {file = "aiohttp-3.10.2-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:14dc3fcb0d877911d775d511eb617a486a8c48afca0a887276e63db04d3ee920"},
-    {file = "aiohttp-3.10.2-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:b52a27a5c97275e254704e1049f4b96a81e67d6205f52fa37a4777d55b0e98ef"},
-    {file = "aiohttp-3.10.2-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:cd33d9de8cfd006a0d0fe85f49b4183c57e91d18ffb7e9004ce855e81928f704"},
-    {file = "aiohttp-3.10.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:1238fc979160bc03a92fff9ad021375ff1c8799c6aacb0d8ea1b357ea40932bb"},
-    {file = "aiohttp-3.10.2-cp39-cp39-win32.whl", hash = "sha256:e2f43d238eae4f0b04f58d4c0df4615697d4ca3e9f9b1963d49555a94f0f5a04"},
-    {file = "aiohttp-3.10.2-cp39-cp39-win_amd64.whl", hash = "sha256:947847f07a8f81d7b39b2d0202fd73e61962ebe17ac2d8566f260679e467da7b"},
-    {file = "aiohttp-3.10.2.tar.gz", hash = "sha256:4d1f694b5d6e459352e5e925a42e05bac66655bfde44d81c59992463d2897014"},
+    {file = "aiohttp-3.10.11-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:5077b1a5f40ffa3ba1f40d537d3bec4383988ee51fbba6b74aa8fb1bc466599e"},
+    {file = "aiohttp-3.10.11-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:8d6a14a4d93b5b3c2891fca94fa9d41b2322a68194422bef0dd5ec1e57d7d298"},
+    {file = "aiohttp-3.10.11-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ffbfde2443696345e23a3c597049b1dd43049bb65337837574205e7368472177"},
+    {file = "aiohttp-3.10.11-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:20b3d9e416774d41813bc02fdc0663379c01817b0874b932b81c7f777f67b217"},
+    {file = "aiohttp-3.10.11-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2b943011b45ee6bf74b22245c6faab736363678e910504dd7531a58c76c9015a"},
+    {file = "aiohttp-3.10.11-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:48bc1d924490f0d0b3658fe5c4b081a4d56ebb58af80a6729d4bd13ea569797a"},
+    {file = "aiohttp-3.10.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e12eb3f4b1f72aaaf6acd27d045753b18101524f72ae071ae1c91c1cd44ef115"},
+    {file = "aiohttp-3.10.11-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f14ebc419a568c2eff3c1ed35f634435c24ead2fe19c07426af41e7adb68713a"},
+    {file = "aiohttp-3.10.11-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:72b191cdf35a518bfc7ca87d770d30941decc5aaf897ec8b484eb5cc8c7706f3"},
+    {file = "aiohttp-3.10.11-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:5ab2328a61fdc86424ee540d0aeb8b73bbcad7351fb7cf7a6546fc0bcffa0038"},
+    {file = "aiohttp-3.10.11-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:aa93063d4af05c49276cf14e419550a3f45258b6b9d1f16403e777f1addf4519"},
+    {file = "aiohttp-3.10.11-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:30283f9d0ce420363c24c5c2421e71a738a2155f10adbb1a11a4d4d6d2715cfc"},
+    {file = "aiohttp-3.10.11-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:e5358addc8044ee49143c546d2182c15b4ac3a60be01c3209374ace05af5733d"},
+    {file = "aiohttp-3.10.11-cp310-cp310-win32.whl", hash = "sha256:e1ffa713d3ea7cdcd4aea9cddccab41edf6882fa9552940344c44e59652e1120"},
+    {file = "aiohttp-3.10.11-cp310-cp310-win_amd64.whl", hash = "sha256:778cbd01f18ff78b5dd23c77eb82987ee4ba23408cbed233009fd570dda7e674"},
+    {file = "aiohttp-3.10.11-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:80ff08556c7f59a7972b1e8919f62e9c069c33566a6d28586771711e0eea4f07"},
+    {file = "aiohttp-3.10.11-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:2c8f96e9ee19f04c4914e4e7a42a60861066d3e1abf05c726f38d9d0a466e695"},
+    {file = "aiohttp-3.10.11-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:fb8601394d537da9221947b5d6e62b064c9a43e88a1ecd7414d21a1a6fba9c24"},
+    {file = "aiohttp-3.10.11-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2ea224cf7bc2d8856d6971cea73b1d50c9c51d36971faf1abc169a0d5f85a382"},
+    {file = "aiohttp-3.10.11-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:db9503f79e12d5d80b3efd4d01312853565c05367493379df76d2674af881caa"},
+    {file = "aiohttp-3.10.11-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0f449a50cc33f0384f633894d8d3cd020e3ccef81879c6e6245c3c375c448625"},
+    {file = "aiohttp-3.10.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:82052be3e6d9e0c123499127782a01a2b224b8af8c62ab46b3f6197035ad94e9"},
+    {file = "aiohttp-3.10.11-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:20063c7acf1eec550c8eb098deb5ed9e1bb0521613b03bb93644b810986027ac"},
+    {file = "aiohttp-3.10.11-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:489cced07a4c11488f47aab1f00d0c572506883f877af100a38f1fedaa884c3a"},
+    {file = "aiohttp-3.10.11-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:ea9b3bab329aeaa603ed3bf605f1e2a6f36496ad7e0e1aa42025f368ee2dc07b"},
+    {file = "aiohttp-3.10.11-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:ca117819d8ad113413016cb29774b3f6d99ad23c220069789fc050267b786c16"},
+    {file = "aiohttp-3.10.11-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:2dfb612dcbe70fb7cdcf3499e8d483079b89749c857a8f6e80263b021745c730"},
+    {file = "aiohttp-3.10.11-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:f9b615d3da0d60e7d53c62e22b4fd1c70f4ae5993a44687b011ea3a2e49051b8"},
+    {file = "aiohttp-3.10.11-cp311-cp311-win32.whl", hash = "sha256:29103f9099b6068bbdf44d6a3d090e0a0b2be6d3c9f16a070dd9d0d910ec08f9"},
+    {file = "aiohttp-3.10.11-cp311-cp311-win_amd64.whl", hash = "sha256:236b28ceb79532da85d59aa9b9bf873b364e27a0acb2ceaba475dc61cffb6f3f"},
+    {file = "aiohttp-3.10.11-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:7480519f70e32bfb101d71fb9a1f330fbd291655a4c1c922232a48c458c52710"},
+    {file = "aiohttp-3.10.11-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:f65267266c9aeb2287a6622ee2bb39490292552f9fbf851baabc04c9f84e048d"},
+    {file = "aiohttp-3.10.11-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:7400a93d629a0608dc1d6c55f1e3d6e07f7375745aaa8bd7f085571e4d1cee97"},
+    {file = "aiohttp-3.10.11-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f34b97e4b11b8d4eb2c3a4f975be626cc8af99ff479da7de49ac2c6d02d35725"},
+    {file = "aiohttp-3.10.11-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1e7b825da878464a252ccff2958838f9caa82f32a8dbc334eb9b34a026e2c636"},
+    {file = "aiohttp-3.10.11-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f9f92a344c50b9667827da308473005f34767b6a2a60d9acff56ae94f895f385"},
+    {file = "aiohttp-3.10.11-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc6f1ab987a27b83c5268a17218463c2ec08dbb754195113867a27b166cd6087"},
+    {file = "aiohttp-3.10.11-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1dc0f4ca54842173d03322793ebcf2c8cc2d34ae91cc762478e295d8e361e03f"},
+    {file = "aiohttp-3.10.11-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:7ce6a51469bfaacff146e59e7fb61c9c23006495d11cc24c514a455032bcfa03"},
+    {file = "aiohttp-3.10.11-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:aad3cd91d484d065ede16f3cf15408254e2469e3f613b241a1db552c5eb7ab7d"},
+    {file = "aiohttp-3.10.11-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:f4df4b8ca97f658c880fb4b90b1d1ec528315d4030af1ec763247ebfd33d8b9a"},
+    {file = "aiohttp-3.10.11-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:2e4e18a0a2d03531edbc06c366954e40a3f8d2a88d2b936bbe78a0c75a3aab3e"},
+    {file = "aiohttp-3.10.11-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:6ce66780fa1a20e45bc753cda2a149daa6dbf1561fc1289fa0c308391c7bc0a4"},
+    {file = "aiohttp-3.10.11-cp312-cp312-win32.whl", hash = "sha256:a919c8957695ea4c0e7a3e8d16494e3477b86f33067478f43106921c2fef15bb"},
+    {file = "aiohttp-3.10.11-cp312-cp312-win_amd64.whl", hash = "sha256:b5e29706e6389a2283a91611c91bf24f218962717c8f3b4e528ef529d112ee27"},
+    {file = "aiohttp-3.10.11-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:703938e22434d7d14ec22f9f310559331f455018389222eed132808cd8f44127"},
+    {file = "aiohttp-3.10.11-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:9bc50b63648840854e00084c2b43035a62e033cb9b06d8c22b409d56eb098413"},
+    {file = "aiohttp-3.10.11-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:5f0463bf8b0754bc744e1feb61590706823795041e63edf30118a6f0bf577461"},
+    {file = "aiohttp-3.10.11-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f6c6dec398ac5a87cb3a407b068e1106b20ef001c344e34154616183fe684288"},
+    {file = "aiohttp-3.10.11-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bcaf2d79104d53d4dcf934f7ce76d3d155302d07dae24dff6c9fffd217568067"},
+    {file = "aiohttp-3.10.11-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:25fd5470922091b5a9aeeb7e75be609e16b4fba81cdeaf12981393fb240dd10e"},
+    {file = "aiohttp-3.10.11-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bbde2ca67230923a42161b1f408c3992ae6e0be782dca0c44cb3206bf330dee1"},
+    {file = "aiohttp-3.10.11-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:249c8ff8d26a8b41a0f12f9df804e7c685ca35a207e2410adbd3e924217b9006"},
+    {file = "aiohttp-3.10.11-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:878ca6a931ee8c486a8f7b432b65431d095c522cbeb34892bee5be97b3481d0f"},
+    {file = "aiohttp-3.10.11-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:8663f7777ce775f0413324be0d96d9730959b2ca73d9b7e2c2c90539139cbdd6"},
+    {file = "aiohttp-3.10.11-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:6cd3f10b01f0c31481fba8d302b61603a2acb37b9d30e1d14e0f5a58b7b18a31"},
+    {file = "aiohttp-3.10.11-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:4e8d8aad9402d3aa02fdc5ca2fe68bcb9fdfe1f77b40b10410a94c7f408b664d"},
+    {file = "aiohttp-3.10.11-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:38e3c4f80196b4f6c3a85d134a534a56f52da9cb8d8e7af1b79a32eefee73a00"},
+    {file = "aiohttp-3.10.11-cp313-cp313-win32.whl", hash = "sha256:fc31820cfc3b2863c6e95e14fcf815dc7afe52480b4dc03393c4873bb5599f71"},
+    {file = "aiohttp-3.10.11-cp313-cp313-win_amd64.whl", hash = "sha256:4996ff1345704ffdd6d75fb06ed175938c133425af616142e7187f28dc75f14e"},
+    {file = "aiohttp-3.10.11-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:74baf1a7d948b3d640badeac333af581a367ab916b37e44cf90a0334157cdfd2"},
+    {file = "aiohttp-3.10.11-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:473aebc3b871646e1940c05268d451f2543a1d209f47035b594b9d4e91ce8339"},
+    {file = "aiohttp-3.10.11-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:c2f746a6968c54ab2186574e15c3f14f3e7f67aef12b761e043b33b89c5b5f95"},
+    {file = "aiohttp-3.10.11-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d110cabad8360ffa0dec8f6ec60e43286e9d251e77db4763a87dcfe55b4adb92"},
+    {file = "aiohttp-3.10.11-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e0099c7d5d7afff4202a0c670e5b723f7718810000b4abcbc96b064129e64bc7"},
+    {file = "aiohttp-3.10.11-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0316e624b754dbbf8c872b62fe6dcb395ef20c70e59890dfa0de9eafccd2849d"},
+    {file = "aiohttp-3.10.11-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5a5f7ab8baf13314e6b2485965cbacb94afff1e93466ac4d06a47a81c50f9cca"},
+    {file = "aiohttp-3.10.11-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c891011e76041e6508cbfc469dd1a8ea09bc24e87e4c204e05f150c4c455a5fa"},
+    {file = "aiohttp-3.10.11-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:9208299251370ee815473270c52cd3f7069ee9ed348d941d574d1457d2c73e8b"},
+    {file = "aiohttp-3.10.11-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:459f0f32c8356e8125f45eeff0ecf2b1cb6db1551304972702f34cd9e6c44658"},
+    {file = "aiohttp-3.10.11-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:14cdc8c1810bbd4b4b9f142eeee23cda528ae4e57ea0923551a9af4820980e39"},
+    {file = "aiohttp-3.10.11-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:971aa438a29701d4b34e4943e91b5e984c3ae6ccbf80dd9efaffb01bd0b243a9"},
+    {file = "aiohttp-3.10.11-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:9a309c5de392dfe0f32ee57fa43ed8fc6ddf9985425e84bd51ed66bb16bce3a7"},
+    {file = "aiohttp-3.10.11-cp38-cp38-win32.whl", hash = "sha256:9ec1628180241d906a0840b38f162a3215114b14541f1a8711c368a8739a9be4"},
+    {file = "aiohttp-3.10.11-cp38-cp38-win_amd64.whl", hash = "sha256:9c6e0ffd52c929f985c7258f83185d17c76d4275ad22e90aa29f38e211aacbec"},
+    {file = "aiohttp-3.10.11-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:cdc493a2e5d8dc79b2df5bec9558425bcd39aff59fc949810cbd0832e294b106"},
+    {file = "aiohttp-3.10.11-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b3e70f24e7d0405be2348da9d5a7836936bf3a9b4fd210f8c37e8d48bc32eca6"},
+    {file = "aiohttp-3.10.11-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:968b8fb2a5eee2770eda9c7b5581587ef9b96fbdf8dcabc6b446d35ccc69df01"},
+    {file = "aiohttp-3.10.11-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:deef4362af9493d1382ef86732ee2e4cbc0d7c005947bd54ad1a9a16dd59298e"},
+    {file = "aiohttp-3.10.11-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:686b03196976e327412a1b094f4120778c7c4b9cff9bce8d2fdfeca386b89829"},
+    {file = "aiohttp-3.10.11-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3bf6d027d9d1d34e1c2e1645f18a6498c98d634f8e373395221121f1c258ace8"},
+    {file = "aiohttp-3.10.11-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:099fd126bf960f96d34a760e747a629c27fb3634da5d05c7ef4d35ef4ea519fc"},
+    {file = "aiohttp-3.10.11-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c73c4d3dae0b4644bc21e3de546530531d6cdc88659cdeb6579cd627d3c206aa"},
+    {file = "aiohttp-3.10.11-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:0c5580f3c51eea91559db3facd45d72e7ec970b04528b4709b1f9c2555bd6d0b"},
+    {file = "aiohttp-3.10.11-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:fdf6429f0caabfd8a30c4e2eaecb547b3c340e4730ebfe25139779b9815ba138"},
+    {file = "aiohttp-3.10.11-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:d97187de3c276263db3564bb9d9fad9e15b51ea10a371ffa5947a5ba93ad6777"},
+    {file = "aiohttp-3.10.11-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:0acafb350cfb2eba70eb5d271f55e08bd4502ec35e964e18ad3e7d34d71f7261"},
+    {file = "aiohttp-3.10.11-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:c13ed0c779911c7998a58e7848954bd4d63df3e3575f591e321b19a2aec8df9f"},
+    {file = "aiohttp-3.10.11-cp39-cp39-win32.whl", hash = "sha256:22b7c540c55909140f63ab4f54ec2c20d2635c0289cdd8006da46f3327f971b9"},
+    {file = "aiohttp-3.10.11-cp39-cp39-win_amd64.whl", hash = "sha256:7b26b1551e481012575dab8e3727b16fe7dd27eb2711d2e63ced7368756268fb"},
+    {file = "aiohttp-3.10.11.tar.gz", hash = "sha256:9dc2b8f3dcab2e39e0fa309c8da50c3b55e6f34ab25f1a71d3288f24924d33a7"},
 ]
 
 [package.dependencies]
 aiohappyeyeballs = ">=2.3.0"
 aiosignal = ">=1.1.2"
-async-timeout = {version = ">=4.0,<5.0", markers = "python_version < \"3.11\""}
+async-timeout = {version = ">=4.0,<6.0", markers = "python_version < \"3.11\""}
 attrs = ">=17.3.0"
 frozenlist = ">=1.1.1"
 multidict = ">=4.5,<7.0"
-yarl = ">=1.0,<2.0"
+yarl = ">=1.12.0,<2.0"
 
 [package.extras]
 speedups = ["Brotli", "aiodns (>=3.2.0)", "brotlicffi"]
@@ -2078,6 +2093,113 @@ files = [
 [package.extras]
 twisted = ["twisted"]
 
+[[package]]
+name = "propcache"
+version = "0.2.0"
+description = "Accelerated property cache"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "propcache-0.2.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:c5869b8fd70b81835a6f187c5fdbe67917a04d7e52b6e7cc4e5fe39d55c39d58"},
+    {file = "propcache-0.2.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:952e0d9d07609d9c5be361f33b0d6d650cd2bae393aabb11d9b719364521984b"},
+    {file = "propcache-0.2.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:33ac8f098df0585c0b53009f039dfd913b38c1d2edafed0cedcc0c32a05aa110"},
+    {file = "propcache-0.2.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:97e48e8875e6c13909c800fa344cd54cc4b2b0db1d5f911f840458a500fde2c2"},
+    {file = "propcache-0.2.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:388f3217649d6d59292b722d940d4d2e1e6a7003259eb835724092a1cca0203a"},
+    {file = "propcache-0.2.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f571aea50ba5623c308aa146eb650eebf7dbe0fd8c5d946e28343cb3b5aad577"},
+    {file = "propcache-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3dfafb44f7bb35c0c06eda6b2ab4bfd58f02729e7c4045e179f9a861b07c9850"},
+    {file = "propcache-0.2.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a3ebe9a75be7ab0b7da2464a77bb27febcb4fab46a34f9288f39d74833db7f61"},
+    {file = "propcache-0.2.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:d2f0d0f976985f85dfb5f3d685697ef769faa6b71993b46b295cdbbd6be8cc37"},
+    {file = "propcache-0.2.0-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:a3dc1a4b165283bd865e8f8cb5f0c64c05001e0718ed06250d8cac9bec115b48"},
+    {file = "propcache-0.2.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:9e0f07b42d2a50c7dd2d8675d50f7343d998c64008f1da5fef888396b7f84630"},
+    {file = "propcache-0.2.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:e63e3e1e0271f374ed489ff5ee73d4b6e7c60710e1f76af5f0e1a6117cd26394"},
+    {file = "propcache-0.2.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:56bb5c98f058a41bb58eead194b4db8c05b088c93d94d5161728515bd52b052b"},
+    {file = "propcache-0.2.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:7665f04d0c7f26ff8bb534e1c65068409bf4687aa2534faf7104d7182debb336"},
+    {file = "propcache-0.2.0-cp310-cp310-win32.whl", hash = "sha256:7cf18abf9764746b9c8704774d8b06714bcb0a63641518a3a89c7f85cc02c2ad"},
+    {file = "propcache-0.2.0-cp310-cp310-win_amd64.whl", hash = "sha256:cfac69017ef97db2438efb854edf24f5a29fd09a536ff3a992b75990720cdc99"},
+    {file = "propcache-0.2.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:63f13bf09cc3336eb04a837490b8f332e0db41da66995c9fd1ba04552e516354"},
+    {file = "propcache-0.2.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:608cce1da6f2672a56b24a015b42db4ac612ee709f3d29f27a00c943d9e851de"},
+    {file = "propcache-0.2.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:466c219deee4536fbc83c08d09115249db301550625c7fef1c5563a584c9bc87"},
+    {file = "propcache-0.2.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fc2db02409338bf36590aa985a461b2c96fce91f8e7e0f14c50c5fcc4f229016"},
+    {file = "propcache-0.2.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a6ed8db0a556343d566a5c124ee483ae113acc9a557a807d439bcecc44e7dfbb"},
+    {file = "propcache-0.2.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:91997d9cb4a325b60d4e3f20967f8eb08dfcb32b22554d5ef78e6fd1dda743a2"},
+    {file = "propcache-0.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4c7dde9e533c0a49d802b4f3f218fa9ad0a1ce21f2c2eb80d5216565202acab4"},
+    {file = "propcache-0.2.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ffcad6c564fe6b9b8916c1aefbb37a362deebf9394bd2974e9d84232e3e08504"},
+    {file = "propcache-0.2.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:97a58a28bcf63284e8b4d7b460cbee1edaab24634e82059c7b8c09e65284f178"},
+    {file = "propcache-0.2.0-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:945db8ee295d3af9dbdbb698cce9bbc5c59b5c3fe328bbc4387f59a8a35f998d"},
+    {file = "propcache-0.2.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:39e104da444a34830751715f45ef9fc537475ba21b7f1f5b0f4d71a3b60d7fe2"},
+    {file = "propcache-0.2.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:c5ecca8f9bab618340c8e848d340baf68bcd8ad90a8ecd7a4524a81c1764b3db"},
+    {file = "propcache-0.2.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:c436130cc779806bdf5d5fae0d848713105472b8566b75ff70048c47d3961c5b"},
+    {file = "propcache-0.2.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:191db28dc6dcd29d1a3e063c3be0b40688ed76434622c53a284e5427565bbd9b"},
+    {file = "propcache-0.2.0-cp311-cp311-win32.whl", hash = "sha256:5f2564ec89058ee7c7989a7b719115bdfe2a2fb8e7a4543b8d1c0cc4cf6478c1"},
+    {file = "propcache-0.2.0-cp311-cp311-win_amd64.whl", hash = "sha256:6e2e54267980349b723cff366d1e29b138b9a60fa376664a157a342689553f71"},
+    {file = "propcache-0.2.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:2ee7606193fb267be4b2e3b32714f2d58cad27217638db98a60f9efb5efeccc2"},
+    {file = "propcache-0.2.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:91ee8fc02ca52e24bcb77b234f22afc03288e1dafbb1f88fe24db308910c4ac7"},
+    {file = "propcache-0.2.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2e900bad2a8456d00a113cad8c13343f3b1f327534e3589acc2219729237a2e8"},
+    {file = "propcache-0.2.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f52a68c21363c45297aca15561812d542f8fc683c85201df0bebe209e349f793"},
+    {file = "propcache-0.2.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1e41d67757ff4fbc8ef2af99b338bfb955010444b92929e9e55a6d4dcc3c4f09"},
+    {file = "propcache-0.2.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a64e32f8bd94c105cc27f42d3b658902b5bcc947ece3c8fe7bc1b05982f60e89"},
+    {file = "propcache-0.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:55346705687dbd7ef0d77883ab4f6fabc48232f587925bdaf95219bae072491e"},
+    {file = "propcache-0.2.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:00181262b17e517df2cd85656fcd6b4e70946fe62cd625b9d74ac9977b64d8d9"},
+    {file = "propcache-0.2.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6994984550eaf25dd7fc7bd1b700ff45c894149341725bb4edc67f0ffa94efa4"},
+    {file = "propcache-0.2.0-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:56295eb1e5f3aecd516d91b00cfd8bf3a13991de5a479df9e27dd569ea23959c"},
+    {file = "propcache-0.2.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:439e76255daa0f8151d3cb325f6dd4a3e93043e6403e6491813bcaaaa8733887"},
+    {file = "propcache-0.2.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:f6475a1b2ecb310c98c28d271a30df74f9dd436ee46d09236a6b750a7599ce57"},
+    {file = "propcache-0.2.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:3444cdba6628accf384e349014084b1cacd866fbb88433cd9d279d90a54e0b23"},
+    {file = "propcache-0.2.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:4a9d9b4d0a9b38d1c391bb4ad24aa65f306c6f01b512e10a8a34a2dc5675d348"},
+    {file = "propcache-0.2.0-cp312-cp312-win32.whl", hash = "sha256:69d3a98eebae99a420d4b28756c8ce6ea5a29291baf2dc9ff9414b42676f61d5"},
+    {file = "propcache-0.2.0-cp312-cp312-win_amd64.whl", hash = "sha256:ad9c9b99b05f163109466638bd30ada1722abb01bbb85c739c50b6dc11f92dc3"},
+    {file = "propcache-0.2.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:ecddc221a077a8132cf7c747d5352a15ed763b674c0448d811f408bf803d9ad7"},
+    {file = "propcache-0.2.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:0e53cb83fdd61cbd67202735e6a6687a7b491c8742dfc39c9e01e80354956763"},
+    {file = "propcache-0.2.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:92fe151145a990c22cbccf9ae15cae8ae9eddabfc949a219c9f667877e40853d"},
+    {file = "propcache-0.2.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d6a21ef516d36909931a2967621eecb256018aeb11fc48656e3257e73e2e247a"},
+    {file = "propcache-0.2.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3f88a4095e913f98988f5b338c1d4d5d07dbb0b6bad19892fd447484e483ba6b"},
+    {file = "propcache-0.2.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5a5b3bb545ead161be780ee85a2b54fdf7092815995661947812dde94a40f6fb"},
+    {file = "propcache-0.2.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:67aeb72e0f482709991aa91345a831d0b707d16b0257e8ef88a2ad246a7280bf"},
+    {file = "propcache-0.2.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3c997f8c44ec9b9b0bcbf2d422cc00a1d9b9c681f56efa6ca149a941e5560da2"},
+    {file = "propcache-0.2.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:2a66df3d4992bc1d725b9aa803e8c5a66c010c65c741ad901e260ece77f58d2f"},
+    {file = "propcache-0.2.0-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:3ebbcf2a07621f29638799828b8d8668c421bfb94c6cb04269130d8de4fb7136"},
+    {file = "propcache-0.2.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:1235c01ddaa80da8235741e80815ce381c5267f96cc49b1477fdcf8c047ef325"},
+    {file = "propcache-0.2.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:3947483a381259c06921612550867b37d22e1df6d6d7e8361264b6d037595f44"},
+    {file = "propcache-0.2.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:d5bed7f9805cc29c780f3aee05de3262ee7ce1f47083cfe9f77471e9d6777e83"},
+    {file = "propcache-0.2.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:e4a91d44379f45f5e540971d41e4626dacd7f01004826a18cb048e7da7e96544"},
+    {file = "propcache-0.2.0-cp313-cp313-win32.whl", hash = "sha256:f902804113e032e2cdf8c71015651c97af6418363bea8d78dc0911d56c335032"},
+    {file = "propcache-0.2.0-cp313-cp313-win_amd64.whl", hash = "sha256:8f188cfcc64fb1266f4684206c9de0e80f54622c3f22a910cbd200478aeae61e"},
+    {file = "propcache-0.2.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:53d1bd3f979ed529f0805dd35ddaca330f80a9a6d90bc0121d2ff398f8ed8861"},
+    {file = "propcache-0.2.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:83928404adf8fb3d26793665633ea79b7361efa0287dfbd372a7e74311d51ee6"},
+    {file = "propcache-0.2.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:77a86c261679ea5f3896ec060be9dc8e365788248cc1e049632a1be682442063"},
+    {file = "propcache-0.2.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:218db2a3c297a3768c11a34812e63b3ac1c3234c3a086def9c0fee50d35add1f"},
+    {file = "propcache-0.2.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7735e82e3498c27bcb2d17cb65d62c14f1100b71723b68362872bca7d0913d90"},
+    {file = "propcache-0.2.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:20a617c776f520c3875cf4511e0d1db847a076d720714ae35ffe0df3e440be68"},
+    {file = "propcache-0.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:67b69535c870670c9f9b14a75d28baa32221d06f6b6fa6f77a0a13c5a7b0a5b9"},
+    {file = "propcache-0.2.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4569158070180c3855e9c0791c56be3ceeb192defa2cdf6a3f39e54319e56b89"},
+    {file = "propcache-0.2.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:db47514ffdbd91ccdc7e6f8407aac4ee94cc871b15b577c1c324236b013ddd04"},
+    {file = "propcache-0.2.0-cp38-cp38-musllinux_1_2_armv7l.whl", hash = "sha256:2a60ad3e2553a74168d275a0ef35e8c0a965448ffbc3b300ab3a5bb9956c2162"},
+    {file = "propcache-0.2.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:662dd62358bdeaca0aee5761de8727cfd6861432e3bb828dc2a693aa0471a563"},
+    {file = "propcache-0.2.0-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:25a1f88b471b3bc911d18b935ecb7115dff3a192b6fef46f0bfaf71ff4f12418"},
+    {file = "propcache-0.2.0-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:f60f0ac7005b9f5a6091009b09a419ace1610e163fa5deaba5ce3484341840e7"},
+    {file = "propcache-0.2.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:74acd6e291f885678631b7ebc85d2d4aec458dd849b8c841b57ef04047833bed"},
+    {file = "propcache-0.2.0-cp38-cp38-win32.whl", hash = "sha256:d9b6ddac6408194e934002a69bcaadbc88c10b5f38fb9307779d1c629181815d"},
+    {file = "propcache-0.2.0-cp38-cp38-win_amd64.whl", hash = "sha256:676135dcf3262c9c5081cc8f19ad55c8a64e3f7282a21266d05544450bffc3a5"},
+    {file = "propcache-0.2.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:25c8d773a62ce0451b020c7b29a35cfbc05de8b291163a7a0f3b7904f27253e6"},
+    {file = "propcache-0.2.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:375a12d7556d462dc64d70475a9ee5982465fbb3d2b364f16b86ba9135793638"},
+    {file = "propcache-0.2.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:1ec43d76b9677637a89d6ab86e1fef70d739217fefa208c65352ecf0282be957"},
+    {file = "propcache-0.2.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f45eec587dafd4b2d41ac189c2156461ebd0c1082d2fe7013571598abb8505d1"},
+    {file = "propcache-0.2.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bc092ba439d91df90aea38168e11f75c655880c12782facf5cf9c00f3d42b562"},
+    {file = "propcache-0.2.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fa1076244f54bb76e65e22cb6910365779d5c3d71d1f18b275f1dfc7b0d71b4d"},
+    {file = "propcache-0.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:682a7c79a2fbf40f5dbb1eb6bfe2cd865376deeac65acf9beb607505dced9e12"},
+    {file = "propcache-0.2.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8e40876731f99b6f3c897b66b803c9e1c07a989b366c6b5b475fafd1f7ba3fb8"},
+    {file = "propcache-0.2.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:363ea8cd3c5cb6679f1c2f5f1f9669587361c062e4899fce56758efa928728f8"},
+    {file = "propcache-0.2.0-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:140fbf08ab3588b3468932974a9331aff43c0ab8a2ec2c608b6d7d1756dbb6cb"},
+    {file = "propcache-0.2.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:e70fac33e8b4ac63dfc4c956fd7d85a0b1139adcfc0d964ce288b7c527537fea"},
+    {file = "propcache-0.2.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:b33d7a286c0dc1a15f5fc864cc48ae92a846df287ceac2dd499926c3801054a6"},
+    {file = "propcache-0.2.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:f6d5749fdd33d90e34c2efb174c7e236829147a2713334d708746e94c4bde40d"},
+    {file = "propcache-0.2.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:22aa8f2272d81d9317ff5756bb108021a056805ce63dd3630e27d042c8092798"},
+    {file = "propcache-0.2.0-cp39-cp39-win32.whl", hash = "sha256:73e4b40ea0eda421b115248d7e79b59214411109a5bc47d0d48e4c73e3b8fcf9"},
+    {file = "propcache-0.2.0-cp39-cp39-win_amd64.whl", hash = "sha256:9517d5e9e0731957468c29dbfd0f976736a0e55afaea843726e887f36fe017df"},
+    {file = "propcache-0.2.0-py3-none-any.whl", hash = "sha256:2ccc28197af5313706511fab3a8b66dcd6da067a1331372c82ea1cb74285e036"},
+    {file = "propcache-0.2.0.tar.gz", hash = "sha256:df81779732feb9d01e5d513fad0122efb3d53bbc75f61b2a4f29a020bc985e70"},
+]
+
 [[package]]
 name = "psutil"
 version = "5.9.4"
@@ -3307,106 +3429,99 @@ files = [
 
 [[package]]
 name = "yarl"
-version = "1.9.4"
+version = "1.17.2"
 description = "Yet another URL library"
 optional = false
-python-versions = ">=3.7"
+python-versions = ">=3.9"
 files = [
-    {file = "yarl-1.9.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:a8c1df72eb746f4136fe9a2e72b0c9dc1da1cbd23b5372f94b5820ff8ae30e0e"},
-    {file = "yarl-1.9.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:a3a6ed1d525bfb91b3fc9b690c5a21bb52de28c018530ad85093cc488bee2dd2"},
-    {file = "yarl-1.9.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c38c9ddb6103ceae4e4498f9c08fac9b590c5c71b0370f98714768e22ac6fa66"},
-    {file = "yarl-1.9.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d9e09c9d74f4566e905a0b8fa668c58109f7624db96a2171f21747abc7524234"},
-    {file = "yarl-1.9.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b8477c1ee4bd47c57d49621a062121c3023609f7a13b8a46953eb6c9716ca392"},
-    {file = "yarl-1.9.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d5ff2c858f5f6a42c2a8e751100f237c5e869cbde669a724f2062d4c4ef93551"},
-    {file = "yarl-1.9.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:357495293086c5b6d34ca9616a43d329317feab7917518bc97a08f9e55648455"},
-    {file = "yarl-1.9.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:54525ae423d7b7a8ee81ba189f131054defdb122cde31ff17477951464c1691c"},
-    {file = "yarl-1.9.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:801e9264d19643548651b9db361ce3287176671fb0117f96b5ac0ee1c3530d53"},
-    {file = "yarl-1.9.4-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:e516dc8baf7b380e6c1c26792610230f37147bb754d6426462ab115a02944385"},
-    {file = "yarl-1.9.4-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:7d5aaac37d19b2904bb9dfe12cdb08c8443e7ba7d2852894ad448d4b8f442863"},
-    {file = "yarl-1.9.4-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:54beabb809ffcacbd9d28ac57b0db46e42a6e341a030293fb3185c409e626b8b"},
-    {file = "yarl-1.9.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:bac8d525a8dbc2a1507ec731d2867025d11ceadcb4dd421423a5d42c56818541"},
-    {file = "yarl-1.9.4-cp310-cp310-win32.whl", hash = "sha256:7855426dfbddac81896b6e533ebefc0af2f132d4a47340cee6d22cac7190022d"},
-    {file = "yarl-1.9.4-cp310-cp310-win_amd64.whl", hash = "sha256:848cd2a1df56ddbffeb375535fb62c9d1645dde33ca4d51341378b3f5954429b"},
-    {file = "yarl-1.9.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:35a2b9396879ce32754bd457d31a51ff0a9d426fd9e0e3c33394bf4b9036b099"},
-    {file = "yarl-1.9.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4c7d56b293cc071e82532f70adcbd8b61909eec973ae9d2d1f9b233f3d943f2c"},
-    {file = "yarl-1.9.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d8a1c6c0be645c745a081c192e747c5de06e944a0d21245f4cf7c05e457c36e0"},
-    {file = "yarl-1.9.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4b3c1ffe10069f655ea2d731808e76e0f452fc6c749bea04781daf18e6039525"},
-    {file = "yarl-1.9.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:549d19c84c55d11687ddbd47eeb348a89df9cb30e1993f1b128f4685cd0ebbf8"},
-    {file = "yarl-1.9.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a7409f968456111140c1c95301cadf071bd30a81cbd7ab829169fb9e3d72eae9"},
-    {file = "yarl-1.9.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e23a6d84d9d1738dbc6e38167776107e63307dfc8ad108e580548d1f2c587f42"},
-    {file = "yarl-1.9.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d8b889777de69897406c9fb0b76cdf2fd0f31267861ae7501d93003d55f54fbe"},
-    {file = "yarl-1.9.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:03caa9507d3d3c83bca08650678e25364e1843b484f19986a527630ca376ecce"},
-    {file = "yarl-1.9.4-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:4e9035df8d0880b2f1c7f5031f33f69e071dfe72ee9310cfc76f7b605958ceb9"},
-    {file = "yarl-1.9.4-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:c0ec0ed476f77db9fb29bca17f0a8fcc7bc97ad4c6c1d8959c507decb22e8572"},
-    {file = "yarl-1.9.4-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:ee04010f26d5102399bd17f8df8bc38dc7ccd7701dc77f4a68c5b8d733406958"},
-    {file = "yarl-1.9.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49a180c2e0743d5d6e0b4d1a9e5f633c62eca3f8a86ba5dd3c471060e352ca98"},
-    {file = "yarl-1.9.4-cp311-cp311-win32.whl", hash = "sha256:81eb57278deb6098a5b62e88ad8281b2ba09f2f1147c4767522353eaa6260b31"},
-    {file = "yarl-1.9.4-cp311-cp311-win_amd64.whl", hash = "sha256:d1d2532b340b692880261c15aee4dc94dd22ca5d61b9db9a8a361953d36410b1"},
-    {file = "yarl-1.9.4-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:0d2454f0aef65ea81037759be5ca9947539667eecebca092733b2eb43c965a81"},
-    {file = "yarl-1.9.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:44d8ffbb9c06e5a7f529f38f53eda23e50d1ed33c6c869e01481d3fafa6b8142"},
-    {file = "yarl-1.9.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:aaaea1e536f98754a6e5c56091baa1b6ce2f2700cc4a00b0d49eca8dea471074"},
-    {file = "yarl-1.9.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3777ce5536d17989c91696db1d459574e9a9bd37660ea7ee4d3344579bb6f129"},
-    {file = "yarl-1.9.4-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9fc5fc1eeb029757349ad26bbc5880557389a03fa6ada41703db5e068881e5f2"},
-    {file = "yarl-1.9.4-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ea65804b5dc88dacd4a40279af0cdadcfe74b3e5b4c897aa0d81cf86927fee78"},
-    {file = "yarl-1.9.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aa102d6d280a5455ad6a0f9e6d769989638718e938a6a0a2ff3f4a7ff8c62cc4"},
-    {file = "yarl-1.9.4-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:09efe4615ada057ba2d30df871d2f668af661e971dfeedf0c159927d48bbeff0"},
-    {file = "yarl-1.9.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:008d3e808d03ef28542372d01057fd09168419cdc8f848efe2804f894ae03e51"},
-    {file = "yarl-1.9.4-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:6f5cb257bc2ec58f437da2b37a8cd48f666db96d47b8a3115c29f316313654ff"},
-    {file = "yarl-1.9.4-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:992f18e0ea248ee03b5a6e8b3b4738850ae7dbb172cc41c966462801cbf62cf7"},
-    {file = "yarl-1.9.4-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:0e9d124c191d5b881060a9e5060627694c3bdd1fe24c5eecc8d5d7d0eb6faabc"},
-    {file = "yarl-1.9.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:3986b6f41ad22988e53d5778f91855dc0399b043fc8946d4f2e68af22ee9ff10"},
-    {file = "yarl-1.9.4-cp312-cp312-win32.whl", hash = "sha256:4b21516d181cd77ebd06ce160ef8cc2a5e9ad35fb1c5930882baff5ac865eee7"},
-    {file = "yarl-1.9.4-cp312-cp312-win_amd64.whl", hash = "sha256:a9bd00dc3bc395a662900f33f74feb3e757429e545d831eef5bb280252631984"},
-    {file = "yarl-1.9.4-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:63b20738b5aac74e239622d2fe30df4fca4942a86e31bf47a81a0e94c14df94f"},
-    {file = "yarl-1.9.4-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d7d7f7de27b8944f1fee2c26a88b4dabc2409d2fea7a9ed3df79b67277644e17"},
-    {file = "yarl-1.9.4-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c74018551e31269d56fab81a728f683667e7c28c04e807ba08f8c9e3bba32f14"},
-    {file = "yarl-1.9.4-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ca06675212f94e7a610e85ca36948bb8fc023e458dd6c63ef71abfd482481aa5"},
-    {file = "yarl-1.9.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5aef935237d60a51a62b86249839b51345f47564208c6ee615ed2a40878dccdd"},
-    {file = "yarl-1.9.4-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2b134fd795e2322b7684155b7855cc99409d10b2e408056db2b93b51a52accc7"},
-    {file = "yarl-1.9.4-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:d25039a474c4c72a5ad4b52495056f843a7ff07b632c1b92ea9043a3d9950f6e"},
-    {file = "yarl-1.9.4-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:f7d6b36dd2e029b6bcb8a13cf19664c7b8e19ab3a58e0fefbb5b8461447ed5ec"},
-    {file = "yarl-1.9.4-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:957b4774373cf6f709359e5c8c4a0af9f6d7875db657adb0feaf8d6cb3c3964c"},
-    {file = "yarl-1.9.4-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:d7eeb6d22331e2fd42fce928a81c697c9ee2d51400bd1a28803965883e13cead"},
-    {file = "yarl-1.9.4-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:6a962e04b8f91f8c4e5917e518d17958e3bdee71fd1d8b88cdce74dd0ebbf434"},
-    {file = "yarl-1.9.4-cp37-cp37m-win32.whl", hash = "sha256:f3bc6af6e2b8f92eced34ef6a96ffb248e863af20ef4fde9448cc8c9b858b749"},
-    {file = "yarl-1.9.4-cp37-cp37m-win_amd64.whl", hash = "sha256:ad4d7a90a92e528aadf4965d685c17dacff3df282db1121136c382dc0b6014d2"},
-    {file = "yarl-1.9.4-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:ec61d826d80fc293ed46c9dd26995921e3a82146feacd952ef0757236fc137be"},
-    {file = "yarl-1.9.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:8be9e837ea9113676e5754b43b940b50cce76d9ed7d2461df1af39a8ee674d9f"},
-    {file = "yarl-1.9.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:bef596fdaa8f26e3d66af846bbe77057237cb6e8efff8cd7cc8dff9a62278bbf"},
-    {file = "yarl-1.9.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2d47552b6e52c3319fede1b60b3de120fe83bde9b7bddad11a69fb0af7db32f1"},
-    {file = "yarl-1.9.4-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:84fc30f71689d7fc9168b92788abc977dc8cefa806909565fc2951d02f6b7d57"},
-    {file = "yarl-1.9.4-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4aa9741085f635934f3a2583e16fcf62ba835719a8b2b28fb2917bb0537c1dfa"},
-    {file = "yarl-1.9.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:206a55215e6d05dbc6c98ce598a59e6fbd0c493e2de4ea6cc2f4934d5a18d130"},
-    {file = "yarl-1.9.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:07574b007ee20e5c375a8fe4a0789fad26db905f9813be0f9fef5a68080de559"},
-    {file = "yarl-1.9.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:5a2e2433eb9344a163aced6a5f6c9222c0786e5a9e9cac2c89f0b28433f56e23"},
-    {file = "yarl-1.9.4-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:6ad6d10ed9b67a382b45f29ea028f92d25bc0bc1daf6c5b801b90b5aa70fb9ec"},
-    {file = "yarl-1.9.4-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:6fe79f998a4052d79e1c30eeb7d6c1c1056ad33300f682465e1b4e9b5a188b78"},
-    {file = "yarl-1.9.4-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:a825ec844298c791fd28ed14ed1bffc56a98d15b8c58a20e0e08c1f5f2bea1be"},
-    {file = "yarl-1.9.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:8619d6915b3b0b34420cf9b2bb6d81ef59d984cb0fde7544e9ece32b4b3043c3"},
-    {file = "yarl-1.9.4-cp38-cp38-win32.whl", hash = "sha256:686a0c2f85f83463272ddffd4deb5e591c98aac1897d65e92319f729c320eece"},
-    {file = "yarl-1.9.4-cp38-cp38-win_amd64.whl", hash = "sha256:a00862fb23195b6b8322f7d781b0dc1d82cb3bcac346d1e38689370cc1cc398b"},
-    {file = "yarl-1.9.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:604f31d97fa493083ea21bd9b92c419012531c4e17ea6da0f65cacdcf5d0bd27"},
-    {file = "yarl-1.9.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:8a854227cf581330ffa2c4824d96e52ee621dd571078a252c25e3a3b3d94a1b1"},
-    {file = "yarl-1.9.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ba6f52cbc7809cd8d74604cce9c14868306ae4aa0282016b641c661f981a6e91"},
-    {file = "yarl-1.9.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a6327976c7c2f4ee6816eff196e25385ccc02cb81427952414a64811037bbc8b"},
-    {file = "yarl-1.9.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8397a3817d7dcdd14bb266283cd1d6fc7264a48c186b986f32e86d86d35fbac5"},
-    {file = "yarl-1.9.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e0381b4ce23ff92f8170080c97678040fc5b08da85e9e292292aba67fdac6c34"},
-    {file = "yarl-1.9.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:23d32a2594cb5d565d358a92e151315d1b2268bc10f4610d098f96b147370136"},
-    {file = "yarl-1.9.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ddb2a5c08a4eaaba605340fdee8fc08e406c56617566d9643ad8bf6852778fc7"},
-    {file = "yarl-1.9.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:26a1dc6285e03f3cc9e839a2da83bcbf31dcb0d004c72d0730e755b33466c30e"},
-    {file = "yarl-1.9.4-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:18580f672e44ce1238b82f7fb87d727c4a131f3a9d33a5e0e82b793362bf18b4"},
-    {file = "yarl-1.9.4-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:29e0f83f37610f173eb7e7b5562dd71467993495e568e708d99e9d1944f561ec"},
-    {file = "yarl-1.9.4-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:1f23e4fe1e8794f74b6027d7cf19dc25f8b63af1483d91d595d4a07eca1fb26c"},
-    {file = "yarl-1.9.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:db8e58b9d79200c76956cefd14d5c90af54416ff5353c5bfd7cbe58818e26ef0"},
-    {file = "yarl-1.9.4-cp39-cp39-win32.whl", hash = "sha256:c7224cab95645c7ab53791022ae77a4509472613e839dab722a72abe5a684575"},
-    {file = "yarl-1.9.4-cp39-cp39-win_amd64.whl", hash = "sha256:824d6c50492add5da9374875ce72db7a0733b29c2394890aef23d533106e2b15"},
-    {file = "yarl-1.9.4-py3-none-any.whl", hash = "sha256:928cecb0ef9d5a7946eb6ff58417ad2fe9375762382f1bf5c55e61645f2c43ad"},
-    {file = "yarl-1.9.4.tar.gz", hash = "sha256:566db86717cf8080b99b58b083b773a908ae40f06681e87e589a976faf8246bf"},
+    {file = "yarl-1.17.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:93771146ef048b34201bfa382c2bf74c524980870bb278e6df515efaf93699ff"},
+    {file = "yarl-1.17.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:8281db240a1616af2f9c5f71d355057e73a1409c4648c8949901396dc0a3c151"},
+    {file = "yarl-1.17.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:170ed4971bf9058582b01a8338605f4d8c849bd88834061e60e83b52d0c76870"},
+    {file = "yarl-1.17.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bc61b005f6521fcc00ca0d1243559a5850b9dd1e1fe07b891410ee8fe192d0c0"},
+    {file = "yarl-1.17.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:871e1b47eec7b6df76b23c642a81db5dd6536cbef26b7e80e7c56c2fd371382e"},
+    {file = "yarl-1.17.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3a58a2f2ca7aaf22b265388d40232f453f67a6def7355a840b98c2d547bd037f"},
+    {file = "yarl-1.17.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:736bb076f7299c5c55dfef3eb9e96071a795cb08052822c2bb349b06f4cb2e0a"},
+    {file = "yarl-1.17.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8fd51299e21da709eabcd5b2dd60e39090804431292daacbee8d3dabe39a6bc0"},
+    {file = "yarl-1.17.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:358dc7ddf25e79e1cc8ee16d970c23faee84d532b873519c5036dbb858965795"},
+    {file = "yarl-1.17.2-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:50d866f7b1a3f16f98603e095f24c0eeba25eb508c85a2c5939c8b3870ba2df8"},
+    {file = "yarl-1.17.2-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:8b9c4643e7d843a0dca9cd9d610a0876e90a1b2cbc4c5ba7930a0d90baf6903f"},
+    {file = "yarl-1.17.2-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:d63123bfd0dce5f91101e77c8a5427c3872501acece8c90df457b486bc1acd47"},
+    {file = "yarl-1.17.2-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:4e76381be3d8ff96a4e6c77815653063e87555981329cf8f85e5be5abf449021"},
+    {file = "yarl-1.17.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:734144cd2bd633a1516948e477ff6c835041c0536cef1d5b9a823ae29899665b"},
+    {file = "yarl-1.17.2-cp310-cp310-win32.whl", hash = "sha256:26bfb6226e0c157af5da16d2d62258f1ac578d2899130a50433ffee4a5dfa673"},
+    {file = "yarl-1.17.2-cp310-cp310-win_amd64.whl", hash = "sha256:76499469dcc24759399accd85ec27f237d52dec300daaca46a5352fcbebb1071"},
+    {file = "yarl-1.17.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:792155279dc093839e43f85ff7b9b6493a8eaa0af1f94f1f9c6e8f4de8c63500"},
+    {file = "yarl-1.17.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:38bc4ed5cae853409cb193c87c86cd0bc8d3a70fd2268a9807217b9176093ac6"},
+    {file = "yarl-1.17.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4a8c83f6fcdc327783bdc737e8e45b2e909b7bd108c4da1892d3bc59c04a6d84"},
+    {file = "yarl-1.17.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8c6d5fed96f0646bfdf698b0a1cebf32b8aae6892d1bec0c5d2d6e2df44e1e2d"},
+    {file = "yarl-1.17.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:782ca9c58f5c491c7afa55518542b2b005caedaf4685ec814fadfcee51f02493"},
+    {file = "yarl-1.17.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ff6af03cac0d1a4c3c19e5dcc4c05252411bf44ccaa2485e20d0a7c77892ab6e"},
+    {file = "yarl-1.17.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6a3f47930fbbed0f6377639503848134c4aa25426b08778d641491131351c2c8"},
+    {file = "yarl-1.17.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d1fa68a3c921365c5745b4bd3af6221ae1f0ea1bf04b69e94eda60e57958907f"},
+    {file = "yarl-1.17.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:187df91395c11e9f9dc69b38d12406df85aa5865f1766a47907b1cc9855b6303"},
+    {file = "yarl-1.17.2-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:93d1c8cc5bf5df401015c5e2a3ce75a5254a9839e5039c881365d2a9dcfc6dc2"},
+    {file = "yarl-1.17.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:11d86c6145ac5c706c53d484784cf504d7d10fa407cb73b9d20f09ff986059ef"},
+    {file = "yarl-1.17.2-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:c42774d1d1508ec48c3ed29e7b110e33f5e74a20957ea16197dbcce8be6b52ba"},
+    {file = "yarl-1.17.2-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:0c8e589379ef0407b10bed16cc26e7392ef8f86961a706ade0a22309a45414d7"},
+    {file = "yarl-1.17.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:1056cadd5e850a1c026f28e0704ab0a94daaa8f887ece8dfed30f88befb87bb0"},
+    {file = "yarl-1.17.2-cp311-cp311-win32.whl", hash = "sha256:be4c7b1c49d9917c6e95258d3d07f43cfba2c69a6929816e77daf322aaba6628"},
+    {file = "yarl-1.17.2-cp311-cp311-win_amd64.whl", hash = "sha256:ac8eda86cc75859093e9ce390d423aba968f50cf0e481e6c7d7d63f90bae5c9c"},
+    {file = "yarl-1.17.2-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:dd90238d3a77a0e07d4d6ffdebc0c21a9787c5953a508a2231b5f191455f31e9"},
+    {file = "yarl-1.17.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:c74f0b0472ac40b04e6d28532f55cac8090e34c3e81f118d12843e6df14d0909"},
+    {file = "yarl-1.17.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:4d486ddcaca8c68455aa01cf53d28d413fb41a35afc9f6594a730c9779545876"},
+    {file = "yarl-1.17.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f25b7e93f5414b9a983e1a6c1820142c13e1782cc9ed354c25e933aebe97fcf2"},
+    {file = "yarl-1.17.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3a0baff7827a632204060f48dca9e63fbd6a5a0b8790c1a2adfb25dc2c9c0d50"},
+    {file = "yarl-1.17.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:460024cacfc3246cc4d9f47a7fc860e4fcea7d1dc651e1256510d8c3c9c7cde0"},
+    {file = "yarl-1.17.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5870d620b23b956f72bafed6a0ba9a62edb5f2ef78a8849b7615bd9433384171"},
+    {file = "yarl-1.17.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2941756754a10e799e5b87e2319bbec481ed0957421fba0e7b9fb1c11e40509f"},
+    {file = "yarl-1.17.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:9611b83810a74a46be88847e0ea616794c406dbcb4e25405e52bff8f4bee2d0a"},
+    {file = "yarl-1.17.2-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:cd7e35818d2328b679a13268d9ea505c85cd773572ebb7a0da7ccbca77b6a52e"},
+    {file = "yarl-1.17.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:6b981316fcd940f085f646b822c2ff2b8b813cbd61281acad229ea3cbaabeb6b"},
+    {file = "yarl-1.17.2-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:688058e89f512fb7541cb85c2f149c292d3fa22f981d5a5453b40c5da49eb9e8"},
+    {file = "yarl-1.17.2-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:56afb44a12b0864d17b597210d63a5b88915d680f6484d8d202ed68ade38673d"},
+    {file = "yarl-1.17.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:17931dfbb84ae18b287279c1f92b76a3abcd9a49cd69b92e946035cff06bcd20"},
+    {file = "yarl-1.17.2-cp312-cp312-win32.whl", hash = "sha256:ff8d95e06546c3a8c188f68040e9d0360feb67ba8498baf018918f669f7bc39b"},
+    {file = "yarl-1.17.2-cp312-cp312-win_amd64.whl", hash = "sha256:4c840cc11163d3c01a9d8aad227683c48cd3e5be5a785921bcc2a8b4b758c4f3"},
+    {file = "yarl-1.17.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:3294f787a437cb5d81846de3a6697f0c35ecff37a932d73b1fe62490bef69211"},
+    {file = "yarl-1.17.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f1e7fedb09c059efee2533119666ca7e1a2610072076926fa028c2ba5dfeb78c"},
+    {file = "yarl-1.17.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:da9d3061e61e5ae3f753654813bc1cd1c70e02fb72cf871bd6daf78443e9e2b1"},
+    {file = "yarl-1.17.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:91c012dceadc695ccf69301bfdccd1fc4472ad714fe2dd3c5ab4d2046afddf29"},
+    {file = "yarl-1.17.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f11fd61d72d93ac23718d393d2a64469af40be2116b24da0a4ca6922df26807e"},
+    {file = "yarl-1.17.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:46c465ad06971abcf46dd532f77560181387b4eea59084434bdff97524444032"},
+    {file = "yarl-1.17.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ef6eee1a61638d29cd7c85f7fd3ac7b22b4c0fabc8fd00a712b727a3e73b0685"},
+    {file = "yarl-1.17.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4434b739a8a101a837caeaa0137e0e38cb4ea561f39cb8960f3b1e7f4967a3fc"},
+    {file = "yarl-1.17.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:752485cbbb50c1e20908450ff4f94217acba9358ebdce0d8106510859d6eb19a"},
+    {file = "yarl-1.17.2-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:17791acaa0c0f89323c57da7b9a79f2174e26d5debbc8c02d84ebd80c2b7bff8"},
+    {file = "yarl-1.17.2-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:5c6ea72fe619fee5e6b5d4040a451d45d8175f560b11b3d3e044cd24b2720526"},
+    {file = "yarl-1.17.2-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:db5ac3871ed76340210fe028f535392f097fb31b875354bcb69162bba2632ef4"},
+    {file = "yarl-1.17.2-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:7a1606ba68e311576bcb1672b2a1543417e7e0aa4c85e9e718ba6466952476c0"},
+    {file = "yarl-1.17.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:9bc27dd5cfdbe3dc7f381b05e6260ca6da41931a6e582267d5ca540270afeeb2"},
+    {file = "yarl-1.17.2-cp313-cp313-win32.whl", hash = "sha256:52492b87d5877ec405542f43cd3da80bdcb2d0c2fbc73236526e5f2c28e6db28"},
+    {file = "yarl-1.17.2-cp313-cp313-win_amd64.whl", hash = "sha256:8e1bf59e035534ba4077f5361d8d5d9194149f9ed4f823d1ee29ef3e8964ace3"},
+    {file = "yarl-1.17.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:c556fbc6820b6e2cda1ca675c5fa5589cf188f8da6b33e9fc05b002e603e44fa"},
+    {file = "yarl-1.17.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:f2f44a4247461965fed18b2573f3a9eb5e2c3cad225201ee858726cde610daca"},
+    {file = "yarl-1.17.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3a3ede8c248f36b60227eb777eac1dbc2f1022dc4d741b177c4379ca8e75571a"},
+    {file = "yarl-1.17.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2654caaf5584449d49c94a6b382b3cb4a246c090e72453493ea168b931206a4d"},
+    {file = "yarl-1.17.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0d41c684f286ce41fa05ab6af70f32d6da1b6f0457459a56cf9e393c1c0b2217"},
+    {file = "yarl-1.17.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2270d590997445a0dc29afa92e5534bfea76ba3aea026289e811bf9ed4b65a7f"},
+    {file = "yarl-1.17.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:18662443c6c3707e2fc7fad184b4dc32dd428710bbe72e1bce7fe1988d4aa654"},
+    {file = "yarl-1.17.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:75ac158560dec3ed72f6d604c81090ec44529cfb8169b05ae6fcb3e986b325d9"},
+    {file = "yarl-1.17.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:1fee66b32e79264f428dc8da18396ad59cc48eef3c9c13844adec890cd339db5"},
+    {file = "yarl-1.17.2-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:585ce7cd97be8f538345de47b279b879e091c8b86d9dbc6d98a96a7ad78876a3"},
+    {file = "yarl-1.17.2-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:c019abc2eca67dfa4d8fb72ba924871d764ec3c92b86d5b53b405ad3d6aa56b0"},
+    {file = "yarl-1.17.2-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:c6e659b9a24d145e271c2faf3fa6dd1fcb3e5d3f4e17273d9e0350b6ab0fe6e2"},
+    {file = "yarl-1.17.2-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:d17832ba39374134c10e82d137e372b5f7478c4cceeb19d02ae3e3d1daed8721"},
+    {file = "yarl-1.17.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:bc3003710e335e3f842ae3fd78efa55f11a863a89a72e9a07da214db3bf7e1f8"},
+    {file = "yarl-1.17.2-cp39-cp39-win32.whl", hash = "sha256:f5ffc6b7ace5b22d9e73b2a4c7305740a339fbd55301d52735f73e21d9eb3130"},
+    {file = "yarl-1.17.2-cp39-cp39-win_amd64.whl", hash = "sha256:48e424347a45568413deec6f6ee2d720de2cc0385019bedf44cd93e8638aa0ed"},
+    {file = "yarl-1.17.2-py3-none-any.whl", hash = "sha256:dd7abf4f717e33b7487121faf23560b3a50924f80e4bef62b22dab441ded8f3b"},
+    {file = "yarl-1.17.2.tar.gz", hash = "sha256:753eaaa0c7195244c84b5cc159dc8204b7fd99f716f11198f999f2332a86b178"},
 ]
 
 [package.dependencies]
 idna = ">=2.0"
 multidict = ">=4.0"
+propcache = ">=0.2.0"
 
 [[package]]
 name = "zipp"
@@ -3484,4 +3599,4 @@ cffi = ["cffi (>=1.11)"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "c656496f9fbb7c29b2df3143c1d72c95b5e121cb6340134c0b8d070f54a08508"
+content-hash = "8cb9c38d83eec441391c0528ac2fbefde18c734373b2399e07c69382044e8ced"
diff --git a/pyproject.toml b/pyproject.toml
index 9ea42bf46f..197946fff8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -32,7 +32,7 @@ psutil = "^5.9.4"
 types-psutil = "^5.9.5.12"
 types-toml = "^0.10.8.6"
 pytest-httpserver = "^1.0.8"
-aiohttp = "3.10.2"
+aiohttp = "3.10.11"
 pytest-rerunfailures = "^13.0"
 types-pytest-lazy-fixture = "^0.6.3.3"
 pytest-split = "^0.8.1"

From 5e3fbef7210a84870cb012837db6830aeab3d38d Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Tue, 19 Nov 2024 14:10:09 -0500
Subject: [PATCH 03/24] fix(pageserver): queue stopped error should be ignored
 during create timeline (#9767)

close https://github.com/neondatabase/neon/issues/9730

The test case checks whether anything goes wrong when the pageserver
restarts *while timeline creation has not yet completed*. A "queue is
stopped" error is therefore expected in this case, but it should be
categorized as a shutdown error instead of a real error.

## Summary of changes

* More comments for the test case.
* Queue stopped error will now be forwarded as
CreateTimelineError::ShuttingDown.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 pageserver/src/tenant.rs            | 6 ++++++
 test_runner/regress/test_tenants.py | 4 ++++
 2 files changed, 10 insertions(+)

diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index e88dee7c6c..46317e93ee 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -2446,6 +2446,12 @@ impl Tenant {
             .remote_client
             .wait_completion()
             .await
+            .map_err(|e| match e {
+                WaitCompletionError::NotInitialized(
+                    e, // If the queue is already stopped, it's a shutdown error.
+                ) if e.is_stopping() => CreateTimelineError::ShuttingDown,
+                e => CreateTimelineError::Other(e.into()),
+            })
             .context("wait for timeline initial uploads to complete")?;
 
         // The creating task is responsible for activating the timeline.
diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py
index 5a499ea98b..158c3fddb0 100644
--- a/test_runner/regress/test_tenants.py
+++ b/test_runner/regress/test_tenants.py
@@ -369,12 +369,16 @@ def test_create_churn_during_restart(neon_env_builder: NeonEnvBuilder):
     - Bad response codes during shutdown (e.g. returning 500 instead of 503)
     - Issues where a tenant is still starting up while we receive a request for it
     - Issues with interrupting/resuming tenant/timeline creation in shutdown
+    - Issues with a timeline is not created successfully because of restart.
     """
     env = neon_env_builder.init_configs()
     env.start()
     tenant_id: TenantId = env.initial_tenant
     timeline_id = env.initial_timeline
 
+    # At this point, the initial tenant/timeline might not have been created successfully,
+    # and this is the case we want to test.
+
     # Multiple creation requests which race will generate this error on the pageserver
     # and storage controller respectively
     env.pageserver.allowed_errors.append(".*Conflict: Tenant is already being modified.*")

From b092126c94fc2af37188ad05e5951ae10c84813a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Tue, 19 Nov 2024 20:10:53 +0100
Subject: [PATCH 04/24] scrubber: fix parsing issue with Azure (#9797)

Apparently Azure returns timeline entries ending with `/`, which confuses
the parsing. So only parse the part before the first `/`.

Part of https://github.com/neondatabase/cloud/issues/19963
---
 storage_scrubber/src/metadata_stream.rs | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/storage_scrubber/src/metadata_stream.rs b/storage_scrubber/src/metadata_stream.rs
index f896cff2d5..efda7c213d 100644
--- a/storage_scrubber/src/metadata_stream.rs
+++ b/storage_scrubber/src/metadata_stream.rs
@@ -60,7 +60,7 @@ pub async fn stream_tenant_shards<'a>(
 
             first_part
                 .parse::<TenantShardId>()
-                .with_context(|| format!("Incorrect entry id str: {first_part}"))
+                .with_context(|| format!("Incorrect tenant entry id str: {first_part}"))
         })
         .collect::<Vec<_>>();
 
@@ -114,9 +114,10 @@ pub async fn stream_tenant_timelines<'a>(
                 prefix.get_path().as_str().strip_prefix(prefix_str)
             })
             .map(|entry_id_str| {
-                entry_id_str
+                let first_part = entry_id_str.split('/').next().unwrap();
+                first_part
                     .parse::<TimelineId>()
-                    .with_context(|| format!("Incorrect entry id str: {entry_id_str}"))
+                    .with_context(|| format!("Incorrect timeline entry id str: {entry_id_str}"))
             });
 
         for i in new_entry_ids {

From b22a84a7bf2ccae30243be81439cc284835a37f1 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Tue, 19 Nov 2024 14:38:41 -0500
Subject: [PATCH 05/24] feat(pageserver): support key range for manual
 compaction trigger (#9723)

Part of https://github.com/neondatabase/neon/issues/9114. We want to be
able to run partial gc-compaction in tests. In the future, we can also
expand this functionality to legacy compaction, so that we can trigger
compaction for a specific key range.

## Summary of changes

* Support passing compaction key range through pageserver routes.
* Refactor the input parameters of compaction-related functions to take the
new `CompactOptions`.
* Add tests for partial compaction. Note that the test may or may not
trigger compaction based on GC horizon. We need to improve the test case
to ensure things always get below the gc_horizon and the gc-compaction
can be triggered.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 libs/utils/src/http/json.rs                  | 22 ++++++++++
 pageserver/src/http/routes.rs                | 15 +++++--
 pageserver/src/tenant.rs                     | 42 ++++++++++++-------
 pageserver/src/tenant/timeline.rs            | 36 +++++++++++++++-
 pageserver/src/tenant/timeline/compaction.rs | 43 +++++++++++++++-----
 test_runner/fixtures/pageserver/http.py      |  2 +
 test_runner/regress/test_compaction.py       | 39 ++++++++++++++++++
 7 files changed, 170 insertions(+), 29 deletions(-)

diff --git a/libs/utils/src/http/json.rs b/libs/utils/src/http/json.rs
index 6c25440b42..e53231f313 100644
--- a/libs/utils/src/http/json.rs
+++ b/libs/utils/src/http/json.rs
@@ -5,6 +5,7 @@ use serde::{Deserialize, Serialize};
 
 use super::error::ApiError;
 
+/// Parse a json request body and deserialize it to the type `T`.
 pub async fn json_request<T: for<'de> Deserialize<'de>>(
     request: &mut Request<Body>,
 ) -> Result<T, ApiError> {
@@ -27,6 +28,27 @@ pub async fn json_request<T: for<'de> Deserialize<'de>>(
         .map_err(ApiError::BadRequest)
 }
 
+/// Parse a json request body and deserialize it to the type `T`. If the body is empty, return `T::default`.
+pub async fn json_request_maybe<T: for<'de> Deserialize<'de> + Default>(
+    request: &mut Request<Body>,
+) -> Result<T, ApiError> {
+    let body = hyper::body::aggregate(request.body_mut())
+        .await
+        .context("Failed to read request body")
+        .map_err(ApiError::BadRequest)?;
+
+    if body.remaining() == 0 {
+        return Ok(T::default());
+    }
+
+    let mut deser = serde_json::de::Deserializer::from_reader(body.reader());
+
+    serde_path_to_error::deserialize(&mut deser)
+        // intentionally stringify because the debug version is not helpful in python logs
+        .map_err(|e| anyhow::anyhow!("Failed to parse json request: {e}"))
+        .map_err(ApiError::BadRequest)
+}
+
 pub fn json_response<T: Serialize>(
     status: StatusCode,
     data: T,
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index ab170679ba..306b0f35ab 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -83,6 +83,8 @@ use crate::tenant::storage_layer::LayerName;
 use crate::tenant::timeline::offload::offload_timeline;
 use crate::tenant::timeline::offload::OffloadError;
 use crate::tenant::timeline::CompactFlags;
+use crate::tenant::timeline::CompactOptions;
+use crate::tenant::timeline::CompactRange;
 use crate::tenant::timeline::CompactionError;
 use crate::tenant::timeline::Timeline;
 use crate::tenant::GetTimelineError;
@@ -100,7 +102,7 @@ use utils::{
     http::{
         endpoint::{self, attach_openapi_ui, auth_middleware, check_permission_with},
         error::{ApiError, HttpErrorBody},
-        json::{json_request, json_response},
+        json::{json_request, json_request_maybe, json_response},
         request::parse_request_param,
         RequestExt, RouterBuilder,
     },
@@ -1927,13 +1929,15 @@ async fn timeline_gc_handler(
 
 // Run compaction immediately on given timeline.
 async fn timeline_compact_handler(
-    request: Request<Body>,
+    mut request: Request<Body>,
     cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
     let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
     let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
     check_permission(&request, Some(tenant_shard_id.tenant_id))?;
 
+    let compact_range = json_request_maybe::<Option<CompactRange>>(&mut request).await?;
+
     let state = get_state(&request);
 
     let mut flags = EnumSet::empty();
@@ -1957,11 +1961,16 @@ async fn timeline_compact_handler(
     let wait_until_uploaded =
         parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false);
 
+    let options = CompactOptions {
+        compact_range,
+        flags,
+    };
+
     async {
         let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
         let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?;
         timeline
-            .compact(&cancel, flags, &ctx)
+            .compact_with_options(&cancel, options, &ctx)
             .await
             .map_err(|e| ApiError::InternalServerError(e.into()))?;
         if wait_until_uploaded {
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 46317e93ee..37bf83c984 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -5254,7 +5254,7 @@ mod tests {
     use storage_layer::PersistentLayerKey;
     use tests::storage_layer::ValuesReconstructState;
     use tests::timeline::{GetVectoredError, ShutdownMode};
-    use timeline::DeltaLayerTestDesc;
+    use timeline::{CompactOptions, DeltaLayerTestDesc};
     use utils::id::TenantId;
 
     #[cfg(feature = "testing")]
@@ -7728,7 +7728,7 @@ mod tests {
 
         let cancel = CancellationToken::new();
         tline
-            .compact_with_gc(&cancel, EnumSet::new(), &ctx)
+            .compact_with_gc(&cancel, CompactOptions::default(), &ctx)
             .await
             .unwrap();
 
@@ -7805,7 +7805,7 @@ mod tests {
             guard.cutoffs.space = Lsn(0x40);
         }
         tline
-            .compact_with_gc(&cancel, EnumSet::new(), &ctx)
+            .compact_with_gc(&cancel, CompactOptions::default(), &ctx)
             .await
             .unwrap();
 
@@ -8237,7 +8237,7 @@ mod tests {
 
         let cancel = CancellationToken::new();
         tline
-            .compact_with_gc(&cancel, EnumSet::new(), &ctx)
+            .compact_with_gc(&cancel, CompactOptions::default(), &ctx)
             .await
             .unwrap();
 
@@ -8266,7 +8266,7 @@ mod tests {
             guard.cutoffs.space = Lsn(0x40);
         }
         tline
-            .compact_with_gc(&cancel, EnumSet::new(), &ctx)
+            .compact_with_gc(&cancel, CompactOptions::default(), &ctx)
             .await
             .unwrap();
 
@@ -8819,7 +8819,14 @@ mod tests {
         dryrun_flags.insert(CompactFlags::DryRun);
 
         tline
-            .compact_with_gc(&cancel, dryrun_flags, &ctx)
+            .compact_with_gc(
+                &cancel,
+                CompactOptions {
+                    flags: dryrun_flags,
+                    compact_range: None,
+                },
+                &ctx,
+            )
             .await
             .unwrap();
         // We expect layer map to be the same b/c the dry run flag, but we don't know whether there will be other background jobs
@@ -8827,14 +8834,14 @@ mod tests {
         verify_result().await;
 
         tline
-            .compact_with_gc(&cancel, EnumSet::new(), &ctx)
+            .compact_with_gc(&cancel, CompactOptions::default(), &ctx)
             .await
             .unwrap();
         verify_result().await;
 
         // compact again
         tline
-            .compact_with_gc(&cancel, EnumSet::new(), &ctx)
+            .compact_with_gc(&cancel, CompactOptions::default(), &ctx)
             .await
             .unwrap();
         verify_result().await;
@@ -8847,14 +8854,14 @@ mod tests {
             guard.cutoffs.space = Lsn(0x38);
         }
         tline
-            .compact_with_gc(&cancel, EnumSet::new(), &ctx)
+            .compact_with_gc(&cancel, CompactOptions::default(), &ctx)
             .await
             .unwrap();
         verify_result().await; // no wals between 0x30 and 0x38, so we should obtain the same result
 
         // not increasing the GC horizon and compact again
         tline
-            .compact_with_gc(&cancel, EnumSet::new(), &ctx)
+            .compact_with_gc(&cancel, CompactOptions::default(), &ctx)
             .await
             .unwrap();
         verify_result().await;
@@ -9048,7 +9055,14 @@ mod tests {
         dryrun_flags.insert(CompactFlags::DryRun);
 
         tline
-            .compact_with_gc(&cancel, dryrun_flags, &ctx)
+            .compact_with_gc(
+                &cancel,
+                CompactOptions {
+                    flags: dryrun_flags,
+                    compact_range: None,
+                },
+                &ctx,
+            )
             .await
             .unwrap();
         // We expect layer map to be the same b/c the dry run flag, but we don't know whether there will be other background jobs
@@ -9056,14 +9070,14 @@ mod tests {
         verify_result().await;
 
         tline
-            .compact_with_gc(&cancel, EnumSet::new(), &ctx)
+            .compact_with_gc(&cancel, CompactOptions::default(), &ctx)
             .await
             .unwrap();
         verify_result().await;
 
         // compact again
         tline
-            .compact_with_gc(&cancel, EnumSet::new(), &ctx)
+            .compact_with_gc(&cancel, CompactOptions::default(), &ctx)
             .await
             .unwrap();
         verify_result().await;
@@ -9248,7 +9262,7 @@ mod tests {
 
         let cancel = CancellationToken::new();
         branch_tline
-            .compact_with_gc(&cancel, EnumSet::new(), &ctx)
+            .compact_with_gc(&cancel, CompactOptions::default(), &ctx)
             .await
             .unwrap();
 
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 5547bc2c7a..0eb3de21e9 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -774,6 +774,21 @@ pub(crate) enum CompactFlags {
     DryRun,
 }
 
+#[serde_with::serde_as]
+#[derive(Debug, Clone, serde::Deserialize)]
+pub(crate) struct CompactRange {
+    #[serde_as(as = "serde_with::DisplayFromStr")]
+    pub start: Key,
+    #[serde_as(as = "serde_with::DisplayFromStr")]
+    pub end: Key,
+}
+
+#[derive(Clone, Default)]
+pub(crate) struct CompactOptions {
+    pub flags: EnumSet<CompactFlags>,
+    pub compact_range: Option<CompactRange>,
+}
+
 impl std::fmt::Debug for Timeline {
     fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
         write!(f, "Timeline<{}>", self.timeline_id)
@@ -1612,6 +1627,25 @@ impl Timeline {
         cancel: &CancellationToken,
         flags: EnumSet<CompactFlags>,
         ctx: &RequestContext,
+    ) -> Result<bool, CompactionError> {
+        self.compact_with_options(
+            cancel,
+            CompactOptions {
+                flags,
+                compact_range: None,
+            },
+            ctx,
+        )
+        .await
+    }
+
+    /// Outermost timeline compaction operation; downloads needed layers. Returns whether we have pending
+    /// compaction tasks.
+    pub(crate) async fn compact_with_options(
+        self: &Arc<Self>,
+        cancel: &CancellationToken,
+        options: CompactOptions,
+        ctx: &RequestContext,
     ) -> Result<bool, CompactionError> {
         // most likely the cancellation token is from background task, but in tests it could be the
         // request task as well.
@@ -1649,7 +1683,7 @@ impl Timeline {
                 self.compact_tiered(cancel, ctx).await?;
                 Ok(false)
             }
-            CompactionAlgorithm::Legacy => self.compact_legacy(cancel, flags, ctx).await,
+            CompactionAlgorithm::Legacy => self.compact_legacy(cancel, options, ctx).await,
         }
     }
 
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index b30e380de5..ecd68ba55e 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -10,7 +10,7 @@ use std::sync::Arc;
 
 use super::layer_manager::LayerManager;
 use super::{
-    CompactFlags, CreateImageLayersError, DurationRecorder, ImageLayerCreationMode,
+    CompactFlags, CompactOptions, CreateImageLayersError, DurationRecorder, ImageLayerCreationMode,
     RecordedDuration, Timeline,
 };
 
@@ -273,22 +273,32 @@ impl Timeline {
     pub(crate) async fn compact_legacy(
         self: &Arc<Self>,
         cancel: &CancellationToken,
-        flags: EnumSet<CompactFlags>,
+        options: CompactOptions,
         ctx: &RequestContext,
     ) -> Result<bool, CompactionError> {
-        if flags.contains(CompactFlags::EnhancedGcBottomMostCompaction) {
-            self.compact_with_gc(cancel, flags, ctx)
+        if options
+            .flags
+            .contains(CompactFlags::EnhancedGcBottomMostCompaction)
+        {
+            self.compact_with_gc(cancel, options, ctx)
                 .await
                 .map_err(CompactionError::Other)?;
             return Ok(false);
         }
 
-        if flags.contains(CompactFlags::DryRun) {
+        if options.flags.contains(CompactFlags::DryRun) {
             return Err(CompactionError::Other(anyhow!(
                 "dry-run mode is not supported for legacy compaction for now"
             )));
         }
 
+        if options.compact_range.is_some() {
+            // maybe useful in the future? could implement this at some point
+            return Err(CompactionError::Other(anyhow!(
+                "compaction range is not supported for legacy compaction for now"
+            )));
+        }
+
         // High level strategy for compaction / image creation:
         //
         // 1. First, calculate the desired "partitioning" of the
@@ -338,7 +348,7 @@ impl Timeline {
             .repartition(
                 self.get_last_record_lsn(),
                 self.get_compaction_target_size(),
-                flags,
+                options.flags,
                 ctx,
             )
             .await
@@ -354,7 +364,7 @@ impl Timeline {
                 let fully_compacted = self
                     .compact_level0(
                         target_file_size,
-                        flags.contains(CompactFlags::ForceL0Compaction),
+                        options.flags.contains(CompactFlags::ForceL0Compaction),
                         ctx,
                     )
                     .await?;
@@ -372,7 +382,10 @@ impl Timeline {
                         .create_image_layers(
                             &partitioning,
                             lsn,
-                            if flags.contains(CompactFlags::ForceImageLayerCreation) {
+                            if options
+                                .flags
+                                .contains(CompactFlags::ForceImageLayerCreation)
+                            {
                                 ImageLayerCreationMode::Force
                             } else {
                                 ImageLayerCreationMode::Try
@@ -1736,11 +1749,19 @@ impl Timeline {
     pub(crate) async fn compact_with_gc(
         self: &Arc<Self>,
         cancel: &CancellationToken,
-        flags: EnumSet<CompactFlags>,
+        options: CompactOptions,
         ctx: &RequestContext,
     ) -> anyhow::Result<()> {
-        self.partial_compact_with_gc(Key::MIN..Key::MAX, cancel, flags, ctx)
-            .await
+        self.partial_compact_with_gc(
+            options
+                .compact_range
+                .map(|range| range.start..range.end)
+                .unwrap_or_else(|| Key::MIN..Key::MAX),
+            cancel,
+            options.flags,
+            ctx,
+        )
+        .await
     }
 
     /// An experimental compaction building block that combines compaction with garbage collection.
diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index d1a9b5921a..01583757fa 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -665,6 +665,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         force_l0_compaction=False,
         wait_until_uploaded=False,
         enhanced_gc_bottom_most_compaction=False,
+        body: Optional[dict[str, Any]] = None,
     ):
         self.is_testing_enabled_or_skip()
         query = {}
@@ -683,6 +684,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         res = self.put(
             f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/compact",
             params=query,
+            json=body,
         )
         log.info(f"Got compact request response code: {res.status_code}")
         self.verbose_error(res)
diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py
index a02d0f6b98..48950a5a50 100644
--- a/test_runner/regress/test_compaction.py
+++ b/test_runner/regress/test_compaction.py
@@ -116,6 +116,45 @@ page_cache_size=10
     assert vectored_average < 8
 
 
+def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder):
+    env = neon_env_builder.init_start(initial_tenant_conf=AGGRESIVE_COMPACTION_TENANT_CONF)
+
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+
+    row_count = 1000
+    churn_rounds = 10
+
+    ps_http = env.pageserver.http_client()
+
+    workload = Workload(env, tenant_id, timeline_id)
+    workload.init(env.pageserver.id)
+
+    log.info("Writing initial data ...")
+    workload.write_rows(row_count, env.pageserver.id)
+
+    for i in range(1, churn_rounds + 1):
+        if i % 10 == 0:
+            log.info(f"Running churn round {i}/{churn_rounds} ...")
+
+        workload.churn_rows(row_count, env.pageserver.id)
+        # Force L0 compaction to ensure the number of layers is within bounds, so that gc-compaction can run.
+        ps_http.timeline_compact(tenant_id, timeline_id, force_l0_compaction=True)
+        assert ps_http.perf_info(tenant_id, timeline_id)[0]["num_of_l0"] <= 1
+        ps_http.timeline_compact(
+            tenant_id,
+            timeline_id,
+            enhanced_gc_bottom_most_compaction=True,
+            body={
+                "start": "000000000000000000000000000000000000",
+                "end": "030000000000000000000000000000000000",
+            },
+        )
+
+    log.info("Validating at workload end ...")
+    workload.validate(env.pageserver.id)
+
+
 # Stripe sizes in number of pages.
 TINY_STRIPES = 16
 LARGE_STRIPES = 32768

From 770ac34ae6137bfb3c7dab9536a2943e209f21d0 Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik <knizhnik@garret.ru>
Date: Tue, 19 Nov 2024 22:29:57 +0200
Subject: [PATCH 06/24] Register custom xlog reader callbacks for on-demand WAL
 download in StartupDecodingContext (#9007)

## Problem

See https://github.com/neondatabase/neon/issues/8931
On-demand WAL download callbacks are not set in all cases where WAL is
accessed by logical replication.

## Summary of changes

Set custom xlog reader callbacks in StartupDecodingContext

Related changes in Postgres modules:

https://github.com/neondatabase/postgres/pull/495
https://github.com/neondatabase/postgres/pull/496
https://github.com/neondatabase/postgres/pull/497
https://github.com/neondatabase/postgres/pull/498

## Checklist before requesting a review

- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.

## Checklist before merging

- [ ] Do not forget to reformat commit message to not include the above
checklist

---------

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
---
 pgxn/neon/neon.c                              |  4 +--
 .../regress/test_ondemand_wal_download.py     | 27 +++++++++++++++++++
 vendor/postgres-v14                           |  2 +-
 vendor/postgres-v15                           |  2 +-
 vendor/postgres-v16                           |  2 +-
 vendor/postgres-v17                           |  2 +-
 vendor/revisions.json                         |  8 +++---
 7 files changed, 36 insertions(+), 11 deletions(-)
 create mode 100644 test_runner/regress/test_ondemand_wal_download.py

diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c
index f207ed61f9..51b9f58bbc 100644
--- a/pgxn/neon/neon.c
+++ b/pgxn/neon/neon.c
@@ -421,9 +421,7 @@ _PG_init(void)
 
 	pg_init_libpagestore();
 	pg_init_walproposer();
-	WalSender_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines;
-	LogicalFuncs_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines;
-	SlotFuncs_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines;
+	Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines;
 
 	InitUnstableExtensionsSupport();
 	InitLogicalReplicationMonitor();
diff --git a/test_runner/regress/test_ondemand_wal_download.py b/test_runner/regress/test_ondemand_wal_download.py
new file mode 100644
index 0000000000..a7eb3e6625
--- /dev/null
+++ b/test_runner/regress/test_ondemand_wal_download.py
@@ -0,0 +1,27 @@
+from fixtures.neon_fixtures import NeonEnv
+
+
+def test_on_demand_wal_download(neon_simple_env: NeonEnv):
+    env = neon_simple_env
+    ep = env.endpoints.create_start(
+        branch_name="main",
+        endpoint_id="primary",
+        config_lines=[
+            "max_wal_size=32MB",
+            "min_wal_size=32MB",
+            "neon.logical_replication_max_snap_files=10000",
+        ],
+    )
+
+    con = ep.connect()
+    cur = con.cursor()
+    cur.execute("CREATE TABLE t(pk bigint primary key, payload text)")
+    cur.execute("ALTER TABLE t ALTER payload SET STORAGE external")
+    cur.execute("select pg_create_logical_replication_slot('myslot', 'test_decoding', false, true)")
+    cur.execute("insert into t values (generate_series(1,100000),repeat('?',10000))")
+
+    ep.stop("fast")
+    ep.start()
+    con = ep.connect()
+    cur = con.cursor()
+    cur.execute("select pg_replication_slot_advance('myslot', pg_current_wal_insert_lsn())")
diff --git a/vendor/postgres-v14 b/vendor/postgres-v14
index e54af35045..aeecd27b1f 160000
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
@@ -1 +1 @@
-Subproject commit e54af3504513b1f44c0e0f68791a0d6d4210e948
+Subproject commit aeecd27b1f0775b606409d1cbb9c8aa9853a82af
diff --git a/vendor/postgres-v15 b/vendor/postgres-v15
index 29bf1f04a5..544620db4c 160000
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
@@ -1 +1 @@
-Subproject commit 29bf1f04a5628618b4c7972fed6f87065e3750ce
+Subproject commit 544620db4ca6945be4f1f686a7fbd2cdfb0bf96f
diff --git a/vendor/postgres-v16 b/vendor/postgres-v16
index b7e9ac3eb9..3cc152ae2d 160000
--- a/vendor/postgres-v16
+++ b/vendor/postgres-v16
@@ -1 +1 @@
-Subproject commit b7e9ac3eb9c5f43c443ebc76ddf06d5038c9bb34
+Subproject commit 3cc152ae2d17b19679c7102486bdb94677705c02
diff --git a/vendor/postgres-v17 b/vendor/postgres-v17
index a05dc1378d..e5d795a1a0 160000
--- a/vendor/postgres-v17
+++ b/vendor/postgres-v17
@@ -1 +1 @@
-Subproject commit a05dc1378dd822276dc99cb5e888f905d3527597
+Subproject commit e5d795a1a0c25da907176d37c905badab70e00c0
diff --git a/vendor/revisions.json b/vendor/revisions.json
index 7243ba8716..a13ef29e45 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,18 +1,18 @@
 {
   "v17": [
     "17.2",
-    "a05dc1378dd822276dc99cb5e888f905d3527597"
+    "e5d795a1a0c25da907176d37c905badab70e00c0"
   ],
   "v16": [
     "16.6",
-    "b7e9ac3eb9c5f43c443ebc76ddf06d5038c9bb34"
+    "3cc152ae2d17b19679c7102486bdb94677705c02"
   ],
   "v15": [
     "15.10",
-    "29bf1f04a5628618b4c7972fed6f87065e3750ce"
+    "544620db4ca6945be4f1f686a7fbd2cdfb0bf96f"
   ],
   "v14": [
     "14.15",
-    "e54af3504513b1f44c0e0f68791a0d6d4210e948"
+    "aeecd27b1f0775b606409d1cbb9c8aa9853a82af"
   ]
 }

From 725e0a1ac9d1b409c57d2e7d87ac16d9c3d9f91b Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Tue, 19 Nov 2024 23:03:15 +0000
Subject: [PATCH 07/24] CI(release): create reusable workflow for releases
 (#9806)

## Problem

We have a bunch of duplicated code for automated releases. There will be
even more, once we have `release-compute` branch
(https://github.com/neondatabase/neon/pull/9637).

Another issue with the current `release` workflow is that it creates a
PR from the main as is. If we create 2 different releases from the
same commit, GitHub could mix up results from different PRs.

## Summary of changes
- Create a reusable workflow for releases
- Create an empty commit to differentiate releases
---
 .github/workflows/_create-release-pr.yml | 79 ++++++++++++++++++++++
 .github/workflows/release.yml            | 84 ++++--------------------
 2 files changed, 93 insertions(+), 70 deletions(-)
 create mode 100644 .github/workflows/_create-release-pr.yml

diff --git a/.github/workflows/_create-release-pr.yml b/.github/workflows/_create-release-pr.yml
new file mode 100644
index 0000000000..cc6994397f
--- /dev/null
+++ b/.github/workflows/_create-release-pr.yml
@@ -0,0 +1,79 @@
+name: Create Release PR
+
+on:
+  workflow_call:
+    inputs:
+      component-name:
+        description: 'Component name'
+        required: true
+        type: string
+      release-branch:
+        description: 'Release branch'
+        required: true
+        type: string
+    secrets:
+      ci-access-token:
+        description: 'CI access token'
+        required: true
+
+defaults:
+  run:
+    shell: bash -euo pipefail {0}
+
+jobs:
+  create-storage-release-branch:
+    runs-on: ubuntu-22.04
+
+    permissions:
+      contents: write # for `git push`
+
+    steps:
+    - uses: actions/checkout@v4
+      with:
+        ref: main
+
+    - name: Set variables
+      id: vars
+      env:
+        COMPONENT_NAME: ${{ inputs.component-name }}
+        RELEASE_BRANCH: ${{ inputs.release-branch }}
+      run: |
+        today=$(date +'%Y-%m-%d')
+        echo "title=${COMPONENT_NAME} release ${today}" | tee -a ${GITHUB_OUTPUT}
+        echo "rc-branch=rc/${RELEASE_BRANCH}/${today}"  | tee -a ${GITHUB_OUTPUT}
+
+    - name: Configure git
+      run: |
+        git config user.name "github-actions[bot]"
+        git config user.email "41898282+github-actions[bot]@users.noreply.github.com"
+
+    - name: Create RC branch
+      env:
+        RC_BRANCH: ${{ steps.vars.outputs.rc-branch }}
+        TITLE: ${{ steps.vars.outputs.title }}
+      run: |
+        git checkout -b "${RC_BRANCH}"
+
+        # create an empty commit to distinguish workflow runs
+        # from other possible releases from the same commit
+        git commit --allow-empty -m "${TITLE}"
+
+        git push origin "${RC_BRANCH}"
+
+    - name: Create a PR into ${{ inputs.release-branch }}
+      env:
+        GH_TOKEN: ${{ secrets.ci-access-token }}
+        RC_BRANCH: ${{ steps.vars.outputs.rc-branch }}
+        RELEASE_BRANCH: ${{ inputs.release-branch }}
+        TITLE: ${{ steps.vars.outputs.title }}
+      run: |
+        cat << EOF > body.md
+          ## ${TITLE}
+
+          **Please merge this Pull Request using 'Create a merge commit' button**
+        EOF
+
+        gh pr create --title "${TITLE}" \
+                     --body-file "body.md" \
+                     --head "${RC_BRANCH}" \
+                     --base "${RELEASE_BRANCH}"
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 56ef6f4bbb..11f010b6d4 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -26,82 +26,26 @@ defaults:
 jobs:
   create-storage-release-branch:
     if: ${{ github.event.schedule == '0 6 * * MON' || format('{0}', inputs.create-storage-release-branch) == 'true' }}
-    runs-on: ubuntu-22.04
 
     permissions:
-      contents: write # for `git push`
+      contents: write
 
-    steps:
-    - name: Check out code
-      uses: actions/checkout@v4
-      with:
-        ref: main
-
-    - name: Set environment variables
-      run: |
-        echo "RELEASE_DATE=$(date +'%Y-%m-%d')" | tee -a $GITHUB_ENV
-        echo "RELEASE_BRANCH=rc/$(date +'%Y-%m-%d')" | tee -a $GITHUB_ENV
-
-    - name: Create release branch
-      run: git checkout -b $RELEASE_BRANCH
-
-    - name: Push new branch
-      run: git push origin $RELEASE_BRANCH
-
-    - name: Create pull request into release
-      env:
-        GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
-      run: |
-        TITLE="Storage & Compute release ${RELEASE_DATE}"
-
-        cat << EOF > body.md
-          ## ${TITLE}
-
-          **Please merge this Pull Request using 'Create a merge commit' button**
-        EOF
-
-        gh pr create --title "${TITLE}" \
-                     --body-file "body.md" \
-                     --head "${RELEASE_BRANCH}" \
-                     --base "release"
+    uses: ./.github/workflows/_create-release-pr.yml
+    with:
+      component-name: 'Storage & Compute'
+      release-branch: 'release'
+    secrets:
+      ci-access-token: ${{ secrets.CI_ACCESS_TOKEN }}
 
   create-proxy-release-branch:
     if: ${{ github.event.schedule == '0 6 * * THU' || format('{0}', inputs.create-proxy-release-branch) == 'true' }}
-    runs-on: ubuntu-22.04
 
     permissions:
-      contents: write # for `git push`
+      contents: write
 
-    steps:
-    - name: Check out code
-      uses: actions/checkout@v4
-      with:
-        ref: main
-
-    - name: Set environment variables
-      run: |
-        echo "RELEASE_DATE=$(date +'%Y-%m-%d')" | tee -a $GITHUB_ENV
-        echo "RELEASE_BRANCH=rc/proxy/$(date +'%Y-%m-%d')" | tee -a $GITHUB_ENV
-
-    - name: Create release branch
-      run: git checkout -b $RELEASE_BRANCH
-
-    - name: Push new branch
-      run: git push origin $RELEASE_BRANCH
-
-    - name: Create pull request into release
-      env:
-        GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
-      run: |
-        TITLE="Proxy release ${RELEASE_DATE}"
-
-        cat << EOF > body.md
-          ## ${TITLE}
-
-          **Please merge this Pull Request using 'Create a merge commit' button**
-        EOF
-
-        gh pr create --title "${TITLE}" \
-                     --body-file "body.md" \
-                     --head "${RELEASE_BRANCH}" \
-                     --base "release-proxy"
+    uses: ./.github/workflows/_create-release-pr.yml
+    with:
+      component-name: 'Proxy'
+      release-branch: 'release-proxy'
+    secrets:
+      ci-access-token: ${{ secrets.CI_ACCESS_TOKEN }}

From 2281a02c49fd396ef9b06fafa35028c33eea8b3d Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Wed, 20 Nov 2024 00:30:24 +0000
Subject: [PATCH 08/24] CODEOWNERS: add developer-productivity team (#9810)

Notify @neondatabase/developer-productivity team about changes in CI
(i.e. in `.github/` directory)
---
 CODEOWNERS | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/CODEOWNERS b/CODEOWNERS
index f8ed4be816..21b0e7c51f 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -1,6 +1,5 @@
+/.github/ @neondatabase/developer-productivity
 /compute_tools/ @neondatabase/control-plane @neondatabase/compute
-/storage_controller @neondatabase/storage
-/storage_scrubber @neondatabase/storage
 /libs/pageserver_api/ @neondatabase/storage
 /libs/postgres_ffi/ @neondatabase/compute @neondatabase/storage
 /libs/remote_storage/ @neondatabase/storage
@@ -11,4 +10,6 @@
 /pgxn/neon/ @neondatabase/compute @neondatabase/storage
 /proxy/ @neondatabase/proxy
 /safekeeper/ @neondatabase/storage
+/storage_controller @neondatabase/storage
+/storage_scrubber @neondatabase/storage
 /vendor/ @neondatabase/compute

From ea1858e3b66fa058ce8ddfb6f37b364154dd20a6 Mon Sep 17 00:00:00 2001
From: Matthias van de Meent <boekewurm@gmail.com>
Date: Wed, 20 Nov 2024 02:14:58 +0100
Subject: [PATCH 09/24] compute_ctl: Streamline and Pipeline startup SQL
 (#9717)

Before, compute_ctl didn't have a good registry for what command would
run when, depending exclusively on sync code to apply changes. When
users have many databases/roles to manage, this step can take a
substantial amount of time, breaking assumptions about low (re)start
times in other systems.

This commit reduces the time compute_ctl takes to restart when changes
must be applied, by making all commands more or less blind writes, and
applying these commands in an asynchronous context, only waiting for
completion once we know the commands have all been sent.

Additionally, this reduces time spent by batching per-database
operations where previously we would create a new SQL connection for
every user-database operation we planned to execute.
---
 compute_tools/src/catalog.rs                  |  44 +-
 compute_tools/src/checker.rs                  |  28 -
 compute_tools/src/compute.rs                  | 397 ++++++++--
 compute_tools/src/lib.rs                      |   1 +
 compute_tools/src/pg_helpers.rs               |  39 +-
 compute_tools/src/spec.rs                     | 634 +---------------
 compute_tools/src/spec_apply.rs               | 680 ++++++++++++++++++
 .../src/sql/add_availabilitycheck_tables.sql  |  18 +
 .../src/sql/anon_ext_fn_reassign.sql          |  12 +
 compute_tools/src/sql/default_grants.sql      |  30 +
 .../src/sql/set_public_schema_owner.sql       |  23 +
 .../src/sql/unset_template_for_drop_dbs.sql   |  12 +
 12 files changed, 1146 insertions(+), 772 deletions(-)
 create mode 100644 compute_tools/src/spec_apply.rs
 create mode 100644 compute_tools/src/sql/add_availabilitycheck_tables.sql
 create mode 100644 compute_tools/src/sql/anon_ext_fn_reassign.sql
 create mode 100644 compute_tools/src/sql/default_grants.sql
 create mode 100644 compute_tools/src/sql/set_public_schema_owner.sql
 create mode 100644 compute_tools/src/sql/unset_template_for_drop_dbs.sql

diff --git a/compute_tools/src/catalog.rs b/compute_tools/src/catalog.rs
index 4fefa831e0..2f6f82dd39 100644
--- a/compute_tools/src/catalog.rs
+++ b/compute_tools/src/catalog.rs
@@ -1,38 +1,40 @@
-use compute_api::{
-    responses::CatalogObjects,
-    spec::{Database, Role},
-};
+use compute_api::responses::CatalogObjects;
 use futures::Stream;
-use postgres::{Client, NoTls};
+use postgres::NoTls;
 use std::{path::Path, process::Stdio, result::Result, sync::Arc};
 use tokio::{
     io::{AsyncBufReadExt, BufReader},
     process::Command,
-    task,
+    spawn,
 };
+use tokio_postgres::connect;
 use tokio_stream::{self as stream, StreamExt};
 use tokio_util::codec::{BytesCodec, FramedRead};
 use tracing::warn;
 
-use crate::{
-    compute::ComputeNode,
-    pg_helpers::{get_existing_dbs, get_existing_roles},
-};
+use crate::compute::ComputeNode;
+use crate::pg_helpers::{get_existing_dbs_async, get_existing_roles_async};
 
 pub async fn get_dbs_and_roles(compute: &Arc<ComputeNode>) -> anyhow::Result<CatalogObjects> {
     let connstr = compute.connstr.clone();
-    task::spawn_blocking(move || {
-        let mut client = Client::connect(connstr.as_str(), NoTls)?;
-        let roles: Vec<Role>;
-        {
-            let mut xact = client.transaction()?;
-            roles = get_existing_roles(&mut xact)?;
-        }
-        let databases: Vec<Database> = get_existing_dbs(&mut client)?.values().cloned().collect();
 
-        Ok(CatalogObjects { roles, databases })
-    })
-    .await?
+    let (client, connection): (tokio_postgres::Client, _) =
+        connect(connstr.as_str(), NoTls).await?;
+
+    spawn(async move {
+        if let Err(e) = connection.await {
+            eprintln!("connection error: {}", e);
+        }
+    });
+
+    let roles = get_existing_roles_async(&client).await?;
+
+    let databases = get_existing_dbs_async(&client)
+        .await?
+        .into_values()
+        .collect();
+
+    Ok(CatalogObjects { roles, databases })
 }
 
 #[derive(Debug, thiserror::Error)]
diff --git a/compute_tools/src/checker.rs b/compute_tools/src/checker.rs
index d76eaad0a0..cec2b1bed8 100644
--- a/compute_tools/src/checker.rs
+++ b/compute_tools/src/checker.rs
@@ -1,37 +1,9 @@
 use anyhow::{anyhow, Ok, Result};
-use postgres::Client;
 use tokio_postgres::NoTls;
 use tracing::{error, instrument, warn};
 
 use crate::compute::ComputeNode;
 
-/// Create a special service table for availability checks
-/// only if it does not exist already.
-pub fn create_availability_check_data(client: &mut Client) -> Result<()> {
-    let query = "
-        DO $$
-        BEGIN
-            IF NOT EXISTS(
-                SELECT 1
-                FROM pg_catalog.pg_tables
-                WHERE tablename = 'health_check'
-            )
-            THEN
-            CREATE TABLE health_check (
-                id serial primary key,
-                updated_at timestamptz default now()
-            );
-            INSERT INTO health_check VALUES (1, now())
-                ON CONFLICT (id) DO UPDATE
-                 SET updated_at = now();
-            END IF;
-        END
-        $$;";
-    client.execute(query, &[])?;
-
-    Ok(())
-}
-
 /// Update timestamp in a row in a special service table to check
 /// that we can actually write some data in this particular timeline.
 #[instrument(skip_all)]
diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index 0a8cb14058..4f67425ba8 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -1,20 +1,21 @@
-use std::collections::HashMap;
+use std::collections::{HashMap, HashSet};
 use std::env;
 use std::fs;
+use std::iter::once;
 use std::os::unix::fs::{symlink, PermissionsExt};
 use std::path::Path;
 use std::process::{Command, Stdio};
 use std::str::FromStr;
 use std::sync::atomic::AtomicU32;
 use std::sync::atomic::Ordering;
-use std::sync::{Condvar, Mutex, RwLock};
+use std::sync::{Arc, Condvar, Mutex, RwLock};
 use std::thread;
 use std::time::Duration;
 use std::time::Instant;
 
 use anyhow::{Context, Result};
 use chrono::{DateTime, Utc};
-use compute_api::spec::PgIdent;
+use compute_api::spec::{PgIdent, Role};
 use futures::future::join_all;
 use futures::stream::FuturesUnordered;
 use futures::StreamExt;
@@ -31,15 +32,23 @@ use compute_api::spec::{ComputeFeature, ComputeMode, ComputeSpec, ExtVersion};
 use utils::measured_stream::MeasuredReader;
 
 use nix::sys::signal::{kill, Signal};
-
 use remote_storage::{DownloadError, RemotePath};
+use tokio::spawn;
+use url::Url;
 
-use crate::checker::create_availability_check_data;
 use crate::installed_extensions::get_installed_extensions_sync;
 use crate::local_proxy;
-use crate::logger::inlinify;
 use crate::pg_helpers::*;
 use crate::spec::*;
+use crate::spec_apply::ApplySpecPhase::{
+    CreateAndAlterDatabases, CreateAndAlterRoles, CreateAvailabilityCheck, CreateSuperUser,
+    DropInvalidDatabases, DropRoles, HandleNeonExtension, HandleOtherExtensions,
+    RenameAndDeleteDatabases, RenameRoles, RunInEachDatabase,
+};
+use crate::spec_apply::PerDatabasePhase::{
+    ChangeSchemaPerms, DeleteDBRoleReferences, HandleAnonExtension,
+};
+use crate::spec_apply::{apply_operations, MutableApplyContext, DB};
 use crate::sync_sk::{check_if_synced, ping_safekeeper};
 use crate::{config, extension_server};
 
@@ -224,10 +233,7 @@ fn maybe_cgexec(cmd: &str) -> Command {
     }
 }
 
-/// Create special neon_superuser role, that's a slightly nerfed version of a real superuser
-/// that we give to customers
-#[instrument(skip_all)]
-fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
+pub(crate) fn construct_superuser_query(spec: &ComputeSpec) -> String {
     let roles = spec
         .cluster
         .roles
@@ -296,11 +302,8 @@ fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()>
             $$;"#,
         roles_decl, database_decl,
     );
-    info!("Neon superuser created: {}", inlinify(&query));
-    client
-        .simple_query(&query)
-        .map_err(|e| anyhow::anyhow!(e).context(query))?;
-    Ok(())
+
+    query
 }
 
 impl ComputeNode {
@@ -813,21 +816,14 @@ impl ComputeNode {
         Ok(())
     }
 
-    /// Do initial configuration of the already started Postgres.
-    #[instrument(skip_all)]
-    pub fn apply_config(&self, compute_state: &ComputeState) -> Result<()> {
-        // If connection fails,
-        // it may be the old node with `zenith_admin` superuser.
-        //
-        // In this case we need to connect with old `zenith_admin` name
-        // and create new user. We cannot simply rename connected user,
-        // but we can create a new one and grant it all privileges.
-        let mut connstr = self.connstr.clone();
+    async fn get_maintenance_client(url: &Url) -> Result<tokio_postgres::Client> {
+        let mut connstr = url.clone();
+
         connstr
             .query_pairs_mut()
             .append_pair("application_name", "apply_config");
 
-        let mut client = match Client::connect(connstr.as_str(), NoTls) {
+        let (client, conn) = match tokio_postgres::connect(connstr.as_str(), NoTls).await {
             Err(e) => match e.code() {
                 Some(&SqlState::INVALID_PASSWORD)
                 | Some(&SqlState::INVALID_AUTHORIZATION_SPECIFICATION) => {
@@ -845,8 +841,8 @@ impl ComputeNode {
                     let mut client =
                         Client::connect(zenith_admin_connstr.as_str(), NoTls)
                             .context("broken cloud_admin credential: tried connecting with cloud_admin but could not authenticate, and zenith_admin does not work either")?;
-                    // Disable forwarding so that users don't get a cloud_admin role
 
+                    // Disable forwarding so that users don't get a cloud_admin role
                     let mut func = || {
                         client.simple_query("SET neon.forward_ddl = false")?;
                         client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?;
@@ -858,49 +854,309 @@ impl ComputeNode {
                     drop(client);
 
                     // reconnect with connstring with expected name
-                    Client::connect(connstr.as_str(), NoTls)?
+                    tokio_postgres::connect(connstr.as_str(), NoTls).await?
                 }
                 _ => return Err(e.into()),
             },
-            Ok(client) => client,
+            Ok((client, conn)) => (client, conn),
         };
 
-        // Disable DDL forwarding because control plane already knows about these roles/databases.
+        spawn(async move {
+            if let Err(e) = conn.await {
+                error!("maintenance client connection error: {}", e);
+            }
+        });
+
+        // Disable DDL forwarding because control plane already knows about the roles/databases
+        // we're about to modify.
         client
             .simple_query("SET neon.forward_ddl = false")
+            .await
             .context("apply_config SET neon.forward_ddl = false")?;
 
-        // Proceed with post-startup configuration. Note, that order of operations is important.
-        let spec = &compute_state.pspec.as_ref().expect("spec must be set").spec;
-        create_neon_superuser(spec, &mut client).context("apply_config create_neon_superuser")?;
-        cleanup_instance(&mut client).context("apply_config cleanup_instance")?;
-        handle_roles(spec, &mut client).context("apply_config handle_roles")?;
-        handle_databases(spec, &mut client).context("apply_config handle_databases")?;
-        handle_role_deletions(spec, connstr.as_str(), &mut client)
-            .context("apply_config handle_role_deletions")?;
-        handle_grants(
-            spec,
-            &mut client,
-            connstr.as_str(),
-            self.has_feature(ComputeFeature::AnonExtension),
-        )
-        .context("apply_config handle_grants")?;
-        handle_extensions(spec, &mut client).context("apply_config handle_extensions")?;
-        handle_extension_neon(&mut client).context("apply_config handle_extension_neon")?;
-        create_availability_check_data(&mut client)
-            .context("apply_config create_availability_check_data")?;
+        Ok(client)
+    }
 
-        // 'Close' connection
-        drop(client);
+    /// Apply the spec to the running PostgreSQL instance.
+    /// The caller can decide to run with multiple clients in parallel, or
+    /// with a single client.  Either way, the commands executed will be the same, and
+    /// only commands run in different databases are parallelized.
+    #[instrument(skip_all)]
+    pub fn apply_spec_sql(
+        &self,
+        spec: Arc<ComputeSpec>,
+        url: Arc<Url>,
+        concurrency: usize,
+    ) -> Result<()> {
+        let rt = tokio::runtime::Builder::new_multi_thread()
+            .enable_all()
+            .build()?;
 
-        if let Some(ref local_proxy) = spec.local_proxy_config {
+        info!("Applying config with max {} concurrency", concurrency);
+        debug!("Config: {:?}", spec);
+
+        rt.block_on(async {
+            // Proceed with post-startup configuration. Note that order of operations is important.
+            let client = Self::get_maintenance_client(&url).await?;
+            let spec = spec.clone();
+
+            let databases = get_existing_dbs_async(&client).await?;
+            let roles = get_existing_roles_async(&client)
+                .await?
+                .into_iter()
+                .map(|role| (role.name.clone(), role))
+                .collect::<HashMap<String, Role>>();
+
+            let jwks_roles = Arc::new(
+                spec.as_ref()
+                    .local_proxy_config
+                    .iter()
+                    .flat_map(|it| &it.jwks)
+                    .flatten()
+                    .flat_map(|setting| &setting.role_names)
+                    .cloned()
+                    .collect::<HashSet<_>>(),
+            );
+
+            let ctx = Arc::new(tokio::sync::RwLock::new(MutableApplyContext {
+                roles,
+                dbs: databases,
+            }));
+
+            for phase in [
+                CreateSuperUser,
+                DropInvalidDatabases,
+                RenameRoles,
+                CreateAndAlterRoles,
+                RenameAndDeleteDatabases,
+                CreateAndAlterDatabases,
+            ] {
+                debug!("Applying phase {:?}", &phase);
+                apply_operations(
+                    spec.clone(),
+                    ctx.clone(),
+                    jwks_roles.clone(),
+                    phase,
+                    || async { Ok(&client) },
+                )
+                .await?;
+            }
+
+            let concurrency_token = Arc::new(tokio::sync::Semaphore::new(concurrency));
+
+            let db_processes = spec
+                .cluster
+                .databases
+                .iter()
+                .map(|db| DB::new(db.clone()))
+                // also run the per-database phases against the system database
+                .chain(once(DB::SystemDB))
+                .map(|db| {
+                    let spec = spec.clone();
+                    let ctx = ctx.clone();
+                    let jwks_roles = jwks_roles.clone();
+                    let mut url = url.as_ref().clone();
+                    let concurrency_token = concurrency_token.clone();
+                    let db = db.clone();
+
+                    debug!("Applying per-database phases for Database {:?}", &db);
+
+                    match &db {
+                        DB::SystemDB => {}
+                        DB::UserDB(db) => {
+                            url.set_path(db.name.as_str());
+                        }
+                    }
+
+                    let url = Arc::new(url);
+                    let fut = Self::apply_spec_sql_db(
+                        spec.clone(),
+                        url,
+                        ctx.clone(),
+                        jwks_roles.clone(),
+                        concurrency_token.clone(),
+                        db,
+                    );
+
+                    Ok(spawn(fut))
+                })
+                .collect::<Vec<Result<_, anyhow::Error>>>();
+
+            for process in db_processes.into_iter() {
+                let handle = process?;
+                handle.await??;
+            }
+
+            for phase in vec![
+                HandleOtherExtensions,
+                HandleNeonExtension,
+                CreateAvailabilityCheck,
+                DropRoles,
+            ] {
+                debug!("Applying phase {:?}", &phase);
+                apply_operations(
+                    spec.clone(),
+                    ctx.clone(),
+                    jwks_roles.clone(),
+                    phase,
+                    || async { Ok(&client) },
+                )
+                .await?;
+            }
+
+            Ok::<(), anyhow::Error>(())
+        })?;
+
+        Ok(())
+    }
+
+    /// Apply SQL migrations of the RunInEachDatabase phase.
+    ///
+    /// May opt to not connect to databases that don't have any scheduled
+    /// operations.  The function is concurrency-controlled with the provided
+    /// semaphore.  The caller has to make sure the semaphore isn't exhausted.
+    async fn apply_spec_sql_db(
+        spec: Arc<ComputeSpec>,
+        url: Arc<Url>,
+        ctx: Arc<tokio::sync::RwLock<MutableApplyContext>>,
+        jwks_roles: Arc<HashSet<String>>,
+        concurrency_token: Arc<tokio::sync::Semaphore>,
+        db: DB,
+    ) -> Result<()> {
+        let _permit = concurrency_token.acquire().await?;
+
+        let mut client_conn = None;
+
+        for subphase in [
+            DeleteDBRoleReferences,
+            ChangeSchemaPerms,
+            HandleAnonExtension,
+        ] {
+            apply_operations(
+                spec.clone(),
+                ctx.clone(),
+                jwks_roles.clone(),
+                RunInEachDatabase {
+                    db: db.clone(),
+                    subphase,
+                },
+                // Only connect if apply_operation actually wants a connection.
+                // It's quite possible this database doesn't need any queries,
+                // so by not connecting we save time and effort connecting to
+                // that database.
+                || async {
+                    if client_conn.is_none() {
+                        let db_client = Self::get_maintenance_client(&url).await?;
+                        client_conn.replace(db_client);
+                    }
+                    let client = client_conn.as_ref().unwrap();
+                    Ok(client)
+                },
+            )
+            .await?;
+        }
+
+        drop(client_conn);
+
+        Ok::<(), anyhow::Error>(())
+    }
+
+    /// Do initial configuration of the already started Postgres.
+    #[instrument(skip_all)]
+    pub fn apply_config(&self, compute_state: &ComputeState) -> Result<()> {
+        // If connection fails,
+        // it may be the old node with `zenith_admin` superuser.
+        //
+        // In this case we need to connect with old `zenith_admin` name
+        // and create new user. We cannot simply rename connected user,
+        // but we can create a new one and grant it all privileges.
+        let mut url = self.connstr.clone();
+        url.query_pairs_mut()
+            .append_pair("application_name", "apply_config");
+
+        let url = Arc::new(url);
+        let spec = Arc::new(
+            compute_state
+                .pspec
+                .as_ref()
+                .expect("spec must be set")
+                .spec
+                .clone(),
+        );
+
+        // Choose how many concurrent connections to use for applying the spec changes.
+        // If the cluster is not currently Running we don't have to deal with user connections,
+        // and can thus use all `max_connections` connection slots. However, that's generally not
+        // very efficient, so we still limit it to a smaller number.
+        let max_concurrent_connections = if compute_state.status != ComputeStatus::Running {
+            // If the settings contain 'max_connections', use that as template
+            if let Some(config) = spec.cluster.settings.find("max_connections") {
+                config.parse::<usize>().ok()
+            } else {
+                // Otherwise, try to find the setting in the postgresql_conf string
+                spec.cluster
+                    .postgresql_conf
+                    .iter()
+                    .flat_map(|conf| conf.split("\n"))
+                    .filter_map(|line| {
+                        if !line.contains("max_connections") {
+                            return None;
+                        }
+
+                        let (key, value) = line.split_once("=")?;
+                        let key = key
+                            .trim_start_matches(char::is_whitespace)
+                            .trim_end_matches(char::is_whitespace);
+
+                        let value = value
+                            .trim_start_matches(char::is_whitespace)
+                            .trim_end_matches(char::is_whitespace);
+
+                        if key != "max_connections" {
+                            return None;
+                        }
+
+                        value.parse::<usize>().ok()
+                    })
+                    .next()
+            }
+            // If max_connections is present, use at most 1/3rd of that.
+            // When max_connections is lower than 30, try to use at least 10 connections, but
+            // never more than max_connections.
+            .map(|limit| match limit {
+                0..10 => limit,
+                10..30 => 10,
+                30.. => limit / 3,
+            })
+            // If we didn't find max_connections, default to 10 concurrent connections.
+            .unwrap_or(10)
+        } else {
+            // state == Running
+            // Because the cluster is already in the Running state, we should assume users are
+            // already connected to the cluster, and high concurrency could negatively
+            // impact user connectivity. Therefore, we can limit concurrency to the number of
+            // reserved superuser connections, which users wouldn't be able to use anyway.
+            spec.cluster
+                .settings
+                .find("superuser_reserved_connections")
+                .iter()
+                .filter_map(|val| val.parse::<usize>().ok())
+                .map(|val| if val > 1 { val - 1 } else { 1 })
+                .last()
+                .unwrap_or(3)
+        };
+
+        // Merge-apply spec & changes to PostgreSQL state.
+        self.apply_spec_sql(spec.clone(), url.clone(), max_concurrent_connections)?;
+
+        if let Some(ref local_proxy) = &spec.clone().local_proxy_config {
             info!("configuring local_proxy");
             local_proxy::configure(local_proxy).context("apply_config local_proxy")?;
         }
 
         // Run migrations separately to not hold up cold starts
         thread::spawn(move || {
-            let mut connstr = connstr.clone();
+            let mut connstr = url.as_ref().clone();
             connstr
                 .query_pairs_mut()
                 .append_pair("application_name", "migrations");
@@ -908,7 +1164,8 @@ impl ComputeNode {
             let mut client = Client::connect(connstr.as_str(), NoTls)?;
             handle_migrations(&mut client).context("apply_config handle_migrations")
         });
-        Ok(())
+
+        Ok::<(), anyhow::Error>(())
     }
 
     // Wrapped this around `pg_ctl reload`, but right now we don't use
@@ -971,32 +1228,16 @@ impl ComputeNode {
         config::with_compute_ctl_tmp_override(pgdata_path, "neon.max_cluster_size=-1", || {
             self.pg_reload_conf()?;
 
-            let mut client = Client::connect(self.connstr.as_str(), NoTls)?;
-
-            // Proceed with post-startup configuration. Note, that order of operations is important.
-            // Disable DDL forwarding because control plane already knows about these roles/databases.
             if spec.mode == ComputeMode::Primary {
-                client.simple_query("SET neon.forward_ddl = false")?;
-                cleanup_instance(&mut client)?;
-                handle_roles(&spec, &mut client)?;
-                handle_databases(&spec, &mut client)?;
-                handle_role_deletions(&spec, self.connstr.as_str(), &mut client)?;
-                handle_grants(
-                    &spec,
-                    &mut client,
-                    self.connstr.as_str(),
-                    self.has_feature(ComputeFeature::AnonExtension),
-                )?;
-                handle_extensions(&spec, &mut client)?;
-                handle_extension_neon(&mut client)?;
-                // We can skip handle_migrations here because a new migration can only appear
-                // if we have a new version of the compute_ctl binary, which can only happen
-                // if compute got restarted, in which case we'll end up inside of apply_config
-                // instead of reconfigure.
-            }
+                let mut url = self.connstr.clone();
+                url.query_pairs_mut()
+                    .append_pair("application_name", "apply_config");
+                let url = Arc::new(url);
 
-            // 'Close' connection
-            drop(client);
+                let spec = Arc::new(spec.clone());
+
+                self.apply_spec_sql(spec, url, 1)?;
+            }
 
             Ok(())
         })?;
diff --git a/compute_tools/src/lib.rs b/compute_tools/src/lib.rs
index d27ae58fa2..ee4cf2dfa5 100644
--- a/compute_tools/src/lib.rs
+++ b/compute_tools/src/lib.rs
@@ -23,5 +23,6 @@ pub mod monitor;
 pub mod params;
 pub mod pg_helpers;
 pub mod spec;
+mod spec_apply;
 pub mod swap;
 pub mod sync_sk;
diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs
index b2dc265864..4a1e5ee0e8 100644
--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -10,9 +10,9 @@ use std::thread::JoinHandle;
 use std::time::{Duration, Instant};
 
 use anyhow::{bail, Result};
+use futures::StreamExt;
 use ini::Ini;
 use notify::{RecursiveMode, Watcher};
-use postgres::{Client, Transaction};
 use tokio::io::AsyncBufReadExt;
 use tokio::time::timeout;
 use tokio_postgres::NoTls;
@@ -197,27 +197,34 @@ impl Escaping for PgIdent {
 }
 
 /// Build a list of existing Postgres roles
-pub fn get_existing_roles(xact: &mut Transaction<'_>) -> Result<Vec<Role>> {
-    let postgres_roles = xact
-        .query("SELECT rolname, rolpassword FROM pg_catalog.pg_authid", &[])?
-        .iter()
+pub async fn get_existing_roles_async(client: &tokio_postgres::Client) -> Result<Vec<Role>> {
+    let postgres_roles = client
+        .query_raw::<str, &String, &[String; 0]>(
+            "SELECT rolname, rolpassword FROM pg_catalog.pg_authid",
+            &[],
+        )
+        .await?
+        .filter_map(|row| async { row.ok() })
         .map(|row| Role {
             name: row.get("rolname"),
             encrypted_password: row.get("rolpassword"),
             options: None,
         })
-        .collect();
+        .collect()
+        .await;
 
     Ok(postgres_roles)
 }
 
 /// Build a list of existing Postgres databases
-pub fn get_existing_dbs(client: &mut Client) -> Result<HashMap<String, Database>> {
+pub async fn get_existing_dbs_async(
+    client: &tokio_postgres::Client,
+) -> Result<HashMap<String, Database>> {
     // `pg_database.datconnlimit = -2` means that the database is in the
     // invalid state. See:
     //   https://github.com/postgres/postgres/commit/a4b4cc1d60f7e8ccfcc8ff8cb80c28ee411ad9a9
-    let postgres_dbs: Vec<Database> = client
-        .query(
+    let rowstream = client
+        .query_raw::<str, &String, &[String; 0]>(
             "SELECT
                 datname AS name,
                 datdba::regrole::text AS owner,
@@ -226,8 +233,11 @@ pub fn get_existing_dbs(client: &mut Client) -> Result<HashMap<String, Database>
             FROM
                 pg_catalog.pg_database;",
             &[],
-        )?
-        .iter()
+        )
+        .await?;
+
+    let dbs_map = rowstream
+        .filter_map(|r| async { r.ok() })
         .map(|row| Database {
             name: row.get("name"),
             owner: row.get("owner"),
@@ -235,12 +245,9 @@ pub fn get_existing_dbs(client: &mut Client) -> Result<HashMap<String, Database>
             invalid: row.get("invalid"),
             options: None,
         })
-        .collect();
-
-    let dbs_map = postgres_dbs
-        .iter()
         .map(|db| (db.name.clone(), db.clone()))
-        .collect::<HashMap<_, _>>();
+        .collect::<HashMap<_, _>>()
+        .await;
 
     Ok(dbs_map)
 }
diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs
index 73f3d1006a..c7d2deb090 100644
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -1,22 +1,17 @@
-use std::collections::HashSet;
+use anyhow::{anyhow, bail, Result};
+use postgres::Client;
+use reqwest::StatusCode;
 use std::fs::File;
 use std::path::Path;
-use std::str::FromStr;
-
-use anyhow::{anyhow, bail, Context, Result};
-use postgres::config::Config;
-use postgres::{Client, NoTls};
-use reqwest::StatusCode;
-use tracing::{error, info, info_span, instrument, span_enabled, warn, Level};
+use tracing::{error, info, instrument, warn};
 
 use crate::config;
-use crate::logger::inlinify;
 use crate::migration::MigrationRunner;
 use crate::params::PG_HBA_ALL_MD5;
 use crate::pg_helpers::*;
 
 use compute_api::responses::{ControlPlaneComputeStatus, ControlPlaneSpecResponse};
-use compute_api::spec::{ComputeSpec, PgIdent, Role};
+use compute_api::spec::ComputeSpec;
 
 // Do control plane request and return response if any. In case of error it
 // returns a bool flag indicating whether it makes sense to retry the request
@@ -151,625 +146,6 @@ pub fn add_standby_signal(pgdata_path: &Path) -> Result<()> {
     Ok(())
 }
 
-/// Compute could be unexpectedly shut down, for example, during the
-/// database dropping. This leaves the database in the invalid state,
-/// which prevents new db creation with the same name. This function
-/// will clean it up before proceeding with catalog updates. All
-/// possible future cleanup operations may go here too.
-#[instrument(skip_all)]
-pub fn cleanup_instance(client: &mut Client) -> Result<()> {
-    let existing_dbs = get_existing_dbs(client)?;
-
-    for (_, db) in existing_dbs {
-        if db.invalid {
-            // After recent commit in Postgres, interrupted DROP DATABASE
-            // leaves the database in the invalid state. According to the
-            // commit message, the only option for user is to drop it again.
-            // See:
-            //   https://github.com/postgres/postgres/commit/a4b4cc1d60f7e8ccfcc8ff8cb80c28ee411ad9a9
-            //
-            // Postgres Neon extension is done the way, that db is de-registered
-            // in the control plane metadata only after it is dropped. So there is
-            // a chance that it still thinks that db should exist. This means
-            // that it will be re-created by `handle_databases()`. Yet, it's fine
-            // as user can just repeat drop (in vanilla Postgres they would need
-            // to do the same, btw).
-            let query = format!("DROP DATABASE IF EXISTS {}", db.name.pg_quote());
-            info!("dropping invalid database {}", db.name);
-            client.execute(query.as_str(), &[])?;
-        }
-    }
-
-    Ok(())
-}
-
-/// Given a cluster spec json and open transaction it handles roles creation,
-/// deletion and update.
-#[instrument(skip_all)]
-pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
-    let mut xact = client.transaction()?;
-    let existing_roles: Vec<Role> = get_existing_roles(&mut xact)?;
-
-    let mut jwks_roles = HashSet::new();
-    if let Some(local_proxy) = &spec.local_proxy_config {
-        for jwks_setting in local_proxy.jwks.iter().flatten() {
-            for role_name in &jwks_setting.role_names {
-                jwks_roles.insert(role_name.clone());
-            }
-        }
-    }
-
-    // Print a list of existing Postgres roles (only in debug mode)
-    if span_enabled!(Level::INFO) {
-        let mut vec = Vec::new();
-        for r in &existing_roles {
-            vec.push(format!(
-                "{}:{}",
-                r.name,
-                if r.encrypted_password.is_some() {
-                    "[FILTERED]"
-                } else {
-                    "(null)"
-                }
-            ));
-        }
-
-        info!("postgres roles (total {}): {:?}", vec.len(), vec);
-    }
-
-    // Process delta operations first
-    if let Some(ops) = &spec.delta_operations {
-        info!("processing role renames");
-        for op in ops {
-            match op.action.as_ref() {
-                "delete_role" => {
-                    // no-op now, roles will be deleted at the end of configuration
-                }
-                // Renaming role drops its password, since role name is
-                // used as a salt there.  It is important that this role
-                // is recorded with a new `name` in the `roles` list.
-                // Follow up roles update will set the new password.
-                "rename_role" => {
-                    let new_name = op.new_name.as_ref().unwrap();
-
-                    // XXX: with a limited number of roles it is fine, but consider making it a HashMap
-                    if existing_roles.iter().any(|r| r.name == op.name) {
-                        let query: String = format!(
-                            "ALTER ROLE {} RENAME TO {}",
-                            op.name.pg_quote(),
-                            new_name.pg_quote()
-                        );
-
-                        warn!("renaming role '{}' to '{}'", op.name, new_name);
-                        xact.execute(query.as_str(), &[])?;
-                    }
-                }
-                _ => {}
-            }
-        }
-    }
-
-    // Refresh Postgres roles info to handle possible roles renaming
-    let existing_roles: Vec<Role> = get_existing_roles(&mut xact)?;
-
-    info!(
-        "handling cluster spec roles (total {})",
-        spec.cluster.roles.len()
-    );
-    for role in &spec.cluster.roles {
-        let name = &role.name;
-        // XXX: with a limited number of roles it is fine, but consider making it a HashMap
-        let pg_role = existing_roles.iter().find(|r| r.name == *name);
-
-        enum RoleAction {
-            None,
-            Update,
-            Create,
-        }
-        let action = if let Some(r) = pg_role {
-            if (r.encrypted_password.is_none() && role.encrypted_password.is_some())
-                || (r.encrypted_password.is_some() && role.encrypted_password.is_none())
-            {
-                RoleAction::Update
-            } else if let Some(pg_pwd) = &r.encrypted_password {
-                // Check whether password changed or not (trim 'md5' prefix first if any)
-                //
-                // This is a backward compatibility hack, which comes from the times when we were using
-                // md5 for everyone and hashes were stored in the console db without md5 prefix. So when
-                // role comes from the control-plane (json spec) `Role.encrypted_password` doesn't have md5 prefix,
-                // but when role comes from Postgres (`get_existing_roles` / `existing_roles`) it has this prefix.
-                // Here is the only place so far where we compare hashes, so it seems to be the best candidate
-                // to place this compatibility layer.
-                let pg_pwd = if let Some(stripped) = pg_pwd.strip_prefix("md5") {
-                    stripped
-                } else {
-                    pg_pwd
-                };
-                if pg_pwd != *role.encrypted_password.as_ref().unwrap() {
-                    RoleAction::Update
-                } else {
-                    RoleAction::None
-                }
-            } else {
-                RoleAction::None
-            }
-        } else {
-            RoleAction::Create
-        };
-
-        match action {
-            RoleAction::None => {}
-            RoleAction::Update => {
-                // This can be run on /every/ role! Not just ones created through the console.
-                // This means that if you add some funny ALTER here that adds a permission,
-                // this will get run even on user-created roles! This will result in different
-                // behavior before and after a spec gets reapplied. The below ALTER as it stands
-                // now only grants LOGIN and changes the password. Please do not allow this branch
-                // to do anything silly.
-                let mut query: String = format!("ALTER ROLE {} ", name.pg_quote());
-                query.push_str(&role.to_pg_options());
-                xact.execute(query.as_str(), &[])?;
-            }
-            RoleAction::Create => {
-                // This branch only runs when roles are created through the console, so it is
-                // safe to add more permissions here. BYPASSRLS and REPLICATION are inherited
-                // from neon_superuser.
-                let mut query: String = format!(
-                    "CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser",
-                    name.pg_quote()
-                );
-                if jwks_roles.contains(name.as_str()) {
-                    query = format!("CREATE ROLE {}", name.pg_quote());
-                }
-                info!("running role create query: '{}'", &query);
-                query.push_str(&role.to_pg_options());
-                xact.execute(query.as_str(), &[])?;
-            }
-        }
-
-        if span_enabled!(Level::INFO) {
-            let pwd = if role.encrypted_password.is_some() {
-                "[FILTERED]"
-            } else {
-                "(null)"
-            };
-            let action_str = match action {
-                RoleAction::None => "",
-                RoleAction::Create => " -> create",
-                RoleAction::Update => " -> update",
-            };
-            info!(" - {}:{}{}", name, pwd, action_str);
-        }
-    }
-
-    xact.commit()?;
-
-    Ok(())
-}
-
-/// Reassign all dependent objects and delete requested roles.
-#[instrument(skip_all)]
-pub fn handle_role_deletions(spec: &ComputeSpec, connstr: &str, client: &mut Client) -> Result<()> {
-    if let Some(ops) = &spec.delta_operations {
-        // First, reassign all dependent objects to db owners.
-        info!("reassigning dependent objects of to-be-deleted roles");
-
-        // Fetch existing roles. We could've exported and used `existing_roles` from
-        // `handle_roles()`, but we only make this list there before creating new roles.
-        // Which is probably fine as we never create to-be-deleted roles, but that'd
-        // just look a bit untidy. Anyway, the entire `pg_roles` should be in shared
-        // buffers already, so this shouldn't be a big deal.
-        let mut xact = client.transaction()?;
-        let existing_roles: Vec<Role> = get_existing_roles(&mut xact)?;
-        xact.commit()?;
-
-        for op in ops {
-            // Check that role is still present in Postgres, as this could be a
-            // restart with the same spec after role deletion.
-            if op.action == "delete_role" && existing_roles.iter().any(|r| r.name == op.name) {
-                reassign_owned_objects(spec, connstr, &op.name)?;
-            }
-        }
-
-        // Second, proceed with role deletions.
-        info!("processing role deletions");
-        let mut xact = client.transaction()?;
-        for op in ops {
-            // We do not check either role exists or not,
-            // Postgres will take care of it for us
-            if op.action == "delete_role" {
-                let query: String = format!("DROP ROLE IF EXISTS {}", &op.name.pg_quote());
-
-                warn!("deleting role '{}'", &op.name);
-                xact.execute(query.as_str(), &[])?;
-            }
-        }
-        xact.commit()?;
-    }
-
-    Ok(())
-}
-
-fn reassign_owned_objects_in_one_db(
-    conf: Config,
-    role_name: &PgIdent,
-    db_owner: &PgIdent,
-) -> Result<()> {
-    let mut client = conf.connect(NoTls)?;
-
-    // This will reassign all dependent objects to the db owner
-    let reassign_query = format!(
-        "REASSIGN OWNED BY {} TO {}",
-        role_name.pg_quote(),
-        db_owner.pg_quote()
-    );
-    info!(
-        "reassigning objects owned by '{}' in db '{}' to '{}'",
-        role_name,
-        conf.get_dbname().unwrap_or(""),
-        db_owner
-    );
-    client.simple_query(&reassign_query)?;
-
-    // This now will only drop privileges of the role
-    let drop_query = format!("DROP OWNED BY {}", role_name.pg_quote());
-    client.simple_query(&drop_query)?;
-    Ok(())
-}
-
-// Reassign all owned objects in all databases to the owner of the database.
-fn reassign_owned_objects(spec: &ComputeSpec, connstr: &str, role_name: &PgIdent) -> Result<()> {
-    for db in &spec.cluster.databases {
-        if db.owner != *role_name {
-            let mut conf = Config::from_str(connstr)?;
-            conf.dbname(&db.name);
-            reassign_owned_objects_in_one_db(conf, role_name, &db.owner)?;
-        }
-    }
-
-    // Also handle case when there are no databases in the spec.
-    // In this case we need to reassign objects in the default database.
-    let conf = Config::from_str(connstr)?;
-    let db_owner = PgIdent::from_str("cloud_admin")?;
-    reassign_owned_objects_in_one_db(conf, role_name, &db_owner)?;
-
-    Ok(())
-}
-
-/// It follows mostly the same logic as `handle_roles()` excepting that we
-/// does not use an explicit transactions block, since major database operations
-/// like `CREATE DATABASE` and `DROP DATABASE` do not support it. Statement-level
-/// atomicity should be enough here due to the order of operations and various checks,
-/// which together provide us idempotency.
-#[instrument(skip_all)]
-pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
-    let existing_dbs = get_existing_dbs(client)?;
-
-    // Print a list of existing Postgres databases (only in debug mode)
-    if span_enabled!(Level::INFO) {
-        let mut vec = Vec::new();
-        for (dbname, db) in &existing_dbs {
-            vec.push(format!("{}:{}", dbname, db.owner));
-        }
-        info!("postgres databases (total {}): {:?}", vec.len(), vec);
-    }
-
-    // Process delta operations first
-    if let Some(ops) = &spec.delta_operations {
-        info!("processing delta operations on databases");
-        for op in ops {
-            match op.action.as_ref() {
-                // We do not check either DB exists or not,
-                // Postgres will take care of it for us
-                "delete_db" => {
-                    // In Postgres we can't drop a database if it is a template.
-                    // So we need to unset the template flag first, but it could
-                    // be a retry, so we could've already dropped the database.
-                    // Check that database exists first to make it idempotent.
-                    let unset_template_query: String = format!(
-                        "
-                        DO $$
-                        BEGIN
-                            IF EXISTS(
-                                SELECT 1
-                                FROM pg_catalog.pg_database
-                                WHERE datname = {}
-                            )
-                            THEN
-                            ALTER DATABASE {} is_template false;
-                            END IF;
-                        END
-                        $$;",
-                        escape_literal(&op.name),
-                        &op.name.pg_quote()
-                    );
-                    // Use FORCE to drop database even if there are active connections.
-                    // We run this from `cloud_admin`, so it should have enough privileges.
-                    // NB: there could be other db states, which prevent us from dropping
-                    // the database. For example, if db is used by any active subscription
-                    // or replication slot.
-                    // TODO: deal with it once we allow logical replication. Proper fix should
-                    // involve returning an error code to the control plane, so it could
-                    // figure out that this is a non-retryable error, return it to the user
-                    // and fail operation permanently.
-                    let drop_db_query: String = format!(
-                        "DROP DATABASE IF EXISTS {} WITH (FORCE)",
-                        &op.name.pg_quote()
-                    );
-
-                    warn!("deleting database '{}'", &op.name);
-                    client.execute(unset_template_query.as_str(), &[])?;
-                    client.execute(drop_db_query.as_str(), &[])?;
-                }
-                "rename_db" => {
-                    let new_name = op.new_name.as_ref().unwrap();
-
-                    if existing_dbs.contains_key(&op.name) {
-                        let query: String = format!(
-                            "ALTER DATABASE {} RENAME TO {}",
-                            op.name.pg_quote(),
-                            new_name.pg_quote()
-                        );
-
-                        warn!("renaming database '{}' to '{}'", op.name, new_name);
-                        client.execute(query.as_str(), &[])?;
-                    }
-                }
-                _ => {}
-            }
-        }
-    }
-
-    // Refresh Postgres databases info to handle possible renames
-    let existing_dbs = get_existing_dbs(client)?;
-
-    info!(
-        "handling cluster spec databases (total {})",
-        spec.cluster.databases.len()
-    );
-    for db in &spec.cluster.databases {
-        let name = &db.name;
-        let pg_db = existing_dbs.get(name);
-
-        enum DatabaseAction {
-            None,
-            Update,
-            Create,
-        }
-        let action = if let Some(r) = pg_db {
-            // XXX: db owner name is returned as quoted string from Postgres,
-            // when quoting is needed.
-            let new_owner = if r.owner.starts_with('"') {
-                db.owner.pg_quote()
-            } else {
-                db.owner.clone()
-            };
-
-            if new_owner != r.owner {
-                // Update the owner
-                DatabaseAction::Update
-            } else {
-                DatabaseAction::None
-            }
-        } else {
-            DatabaseAction::Create
-        };
-
-        match action {
-            DatabaseAction::None => {}
-            DatabaseAction::Update => {
-                let query: String = format!(
-                    "ALTER DATABASE {} OWNER TO {}",
-                    name.pg_quote(),
-                    db.owner.pg_quote()
-                );
-                let _guard = info_span!("executing", query).entered();
-                client.execute(query.as_str(), &[])?;
-            }
-            DatabaseAction::Create => {
-                let mut query: String = format!("CREATE DATABASE {} ", name.pg_quote());
-                query.push_str(&db.to_pg_options());
-                let _guard = info_span!("executing", query).entered();
-                client.execute(query.as_str(), &[])?;
-                let grant_query: String = format!(
-                    "GRANT ALL PRIVILEGES ON DATABASE {} TO neon_superuser",
-                    name.pg_quote()
-                );
-                client.execute(grant_query.as_str(), &[])?;
-            }
-        };
-
-        if span_enabled!(Level::INFO) {
-            let action_str = match action {
-                DatabaseAction::None => "",
-                DatabaseAction::Create => " -> create",
-                DatabaseAction::Update => " -> update",
-            };
-            info!(" - {}:{}{}", db.name, db.owner, action_str);
-        }
-    }
-
-    Ok(())
-}
-
-/// Grant CREATE ON DATABASE to the database owner and do some other alters and grants
-/// to allow users creating trusted extensions and re-creating `public` schema, for example.
-#[instrument(skip_all)]
-pub fn handle_grants(
-    spec: &ComputeSpec,
-    client: &mut Client,
-    connstr: &str,
-    enable_anon_extension: bool,
-) -> Result<()> {
-    info!("modifying database permissions");
-    let existing_dbs = get_existing_dbs(client)?;
-
-    // Do some per-database access adjustments. We'd better do this at db creation time,
-    // but CREATE DATABASE isn't transactional. So we cannot create db + do some grants
-    // atomically.
-    for db in &spec.cluster.databases {
-        match existing_dbs.get(&db.name) {
-            Some(pg_db) => {
-                if pg_db.restrict_conn || pg_db.invalid {
-                    info!(
-                        "skipping grants for db {} (invalid: {}, connections not allowed: {})",
-                        db.name, pg_db.invalid, pg_db.restrict_conn
-                    );
-                    continue;
-                }
-            }
-            None => {
-                bail!(
-                    "database {} doesn't exist in Postgres after handle_databases()",
-                    db.name
-                );
-            }
-        }
-
-        let mut conf = Config::from_str(connstr)?;
-        conf.dbname(&db.name);
-
-        let mut db_client = conf.connect(NoTls)?;
-
-        // This will only change ownership on the schema itself, not the objects
-        // inside it. Without it owner of the `public` schema will be `cloud_admin`
-        // and database owner cannot do anything with it. SQL procedure ensures
-        // that it won't error out if schema `public` doesn't exist.
-        let alter_query = format!(
-            "DO $$\n\
-                DECLARE\n\
-                    schema_owner TEXT;\n\
-                BEGIN\n\
-                    IF EXISTS(\n\
-                        SELECT nspname\n\
-                        FROM pg_catalog.pg_namespace\n\
-                        WHERE nspname = 'public'\n\
-                    )\n\
-                    THEN\n\
-                        SELECT nspowner::regrole::text\n\
-                            FROM pg_catalog.pg_namespace\n\
-                            WHERE nspname = 'public'\n\
-                            INTO schema_owner;\n\
-                \n\
-                        IF schema_owner = 'cloud_admin' OR schema_owner = 'zenith_admin'\n\
-                        THEN\n\
-                            ALTER SCHEMA public OWNER TO {};\n\
-                        END IF;\n\
-                    END IF;\n\
-                END\n\
-            $$;",
-            db.owner.pg_quote()
-        );
-        db_client.simple_query(&alter_query)?;
-
-        // Explicitly grant CREATE ON SCHEMA PUBLIC to the web_access user.
-        // This is needed because since postgres 15 this privilege is removed by default.
-        // TODO: web_access isn't created for almost 1 year. It could be that we have
-        // active users of 1 year old projects, but hopefully not, so check it and
-        // remove this code if possible. The worst thing that could happen is that
-        // user won't be able to use public schema in NEW databases created in the
-        // very OLD project.
-        //
-        // Also, alter default permissions so that relations created by extensions can be
-        // used by neon_superuser without permission issues.
-        let grant_query = "DO $$\n\
-                BEGIN\n\
-                    IF EXISTS(\n\
-                        SELECT nspname\n\
-                        FROM pg_catalog.pg_namespace\n\
-                        WHERE nspname = 'public'\n\
-                    ) AND\n\
-                    current_setting('server_version_num')::int/10000 >= 15\n\
-                    THEN\n\
-                        IF EXISTS(\n\
-                            SELECT rolname\n\
-                            FROM pg_catalog.pg_roles\n\
-                            WHERE rolname = 'web_access'\n\
-                        )\n\
-                        THEN\n\
-                            GRANT CREATE ON SCHEMA public TO web_access;\n\
-                        END IF;\n\
-                    END IF;\n\
-                    IF EXISTS(\n\
-                        SELECT nspname\n\
-                        FROM pg_catalog.pg_namespace\n\
-                        WHERE nspname = 'public'\n\
-                    )\n\
-                    THEN\n\
-                        ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser WITH GRANT OPTION;\n\
-                        ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO neon_superuser WITH GRANT OPTION;\n\
-                    END IF;\n\
-                END\n\
-            $$;"
-        .to_string();
-
-        info!(
-            "grant query for db {} : {}",
-            &db.name,
-            inlinify(&grant_query)
-        );
-        db_client.simple_query(&grant_query)?;
-
-        // it is important to run this after all grants
-        if enable_anon_extension {
-            handle_extension_anon(spec, &db.owner, &mut db_client, false)
-                .context("handle_grants handle_extension_anon")?;
-        }
-    }
-
-    Ok(())
-}
-
-/// Create required system extensions
-#[instrument(skip_all)]
-pub fn handle_extensions(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
-    if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") {
-        if libs.contains("pg_stat_statements") {
-            // Create extension only if this compute really needs it
-            let query = "CREATE EXTENSION IF NOT EXISTS pg_stat_statements";
-            info!("creating system extensions with query: {}", query);
-            client.simple_query(query)?;
-        }
-    }
-
-    Ok(())
-}
-
-/// Run CREATE and ALTER EXTENSION neon UPDATE for postgres database
-#[instrument(skip_all)]
-pub fn handle_extension_neon(client: &mut Client) -> Result<()> {
-    info!("handle extension neon");
-
-    let mut query = "CREATE SCHEMA IF NOT EXISTS neon";
-    client.simple_query(query)?;
-
-    query = "CREATE EXTENSION IF NOT EXISTS neon WITH SCHEMA neon";
-    info!("create neon extension with query: {}", query);
-    client.simple_query(query)?;
-
-    query = "UPDATE pg_extension SET extrelocatable = true WHERE extname = 'neon'";
-    client.simple_query(query)?;
-
-    query = "ALTER EXTENSION neon SET SCHEMA neon";
-    info!("alter neon extension schema with query: {}", query);
-    client.simple_query(query)?;
-
-    // this will be a no-op if extension is already up to date,
-    // which may happen in two cases:
-    // - extension was just installed
-    // - extension was already installed and is up to date
-    let query = "ALTER EXTENSION neon UPDATE";
-    info!("update neon extension version with query: {}", query);
-    if let Err(e) = client.simple_query(query) {
-        error!(
-            "failed to upgrade neon extension during `handle_extension_neon`: {}",
-            e
-        );
-    }
-
-    Ok(())
-}
-
 #[instrument(skip_all)]
 pub fn handle_neon_extension_upgrade(client: &mut Client) -> Result<()> {
     info!("handle neon extension upgrade");
diff --git a/compute_tools/src/spec_apply.rs b/compute_tools/src/spec_apply.rs
new file mode 100644
index 0000000000..7308d5d36e
--- /dev/null
+++ b/compute_tools/src/spec_apply.rs
@@ -0,0 +1,680 @@
+use std::collections::{HashMap, HashSet};
+use std::fmt::{Debug, Formatter};
+use std::future::Future;
+use std::iter::empty;
+use std::iter::once;
+use std::sync::Arc;
+
+use crate::compute::construct_superuser_query;
+use crate::pg_helpers::{escape_literal, DatabaseExt, Escaping, GenericOptionsSearch, RoleExt};
+use anyhow::{bail, Result};
+use compute_api::spec::{ComputeFeature, ComputeSpec, Database, PgIdent, Role};
+use futures::future::join_all;
+use tokio::sync::RwLock;
+use tokio_postgres::Client;
+use tracing::{debug, info_span, Instrument};
+
+/// Database targeted by `ApplySpecPhase::RunInEachDatabase`.
+#[derive(Clone)]
+pub enum DB {
+    SystemDB,
+    UserDB(Database),
+}
+
+impl DB {
+    /// Wraps `db` in the `UserDB` variant.
+    pub fn new(db: Database) -> DB {
+        DB::UserDB(db)
+    }
+
+    /// True only for a `UserDB` whose `owner` equals `role`; never for `SystemDB`.
+    pub fn is_owned_by(&self, role: &PgIdent) -> bool {
+        matches!(self, DB::UserDB(user_db) if &user_db.owner == role)
+    }
+}
+
+impl Debug for DB {
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        match self {
+            DB::SystemDB => f.debug_tuple("SystemDB").finish(),
+            DB::UserDB(db) => f.debug_tuple("UserDB").field(&db.name).finish(), // name only
+        }
+    }
+}
+
+#[derive(Copy, Clone, Debug)]
+pub enum PerDatabasePhase {
+    DeleteDBRoleReferences,
+    ChangeSchemaPerms,
+    HandleAnonExtension,
+}
+
+#[derive(Clone, Debug)]
+pub enum ApplySpecPhase {
+    CreateSuperUser,
+    DropInvalidDatabases,
+    RenameRoles,
+    CreateAndAlterRoles,
+    RenameAndDeleteDatabases,
+    CreateAndAlterDatabases,
+    RunInEachDatabase { db: DB, subphase: PerDatabasePhase }, // runs in the database given by `db`
+    HandleOtherExtensions,
+    HandleNeonExtension,
+    CreateAvailabilityCheck,
+    DropRoles,
+}
+
+pub struct Operation {
+    pub query: String, // SQL text; `apply_operations` sends it via `simple_query`
+    pub comment: Option<String>, // optional label, recorded on the operation's tracing span
+}
+
+pub struct MutableApplyContext {
+    pub roles: HashMap<String, Role>, // keyed by role name; re-keyed when a role is renamed
+    pub dbs: HashMap<String, Database>, // looked up by database name; invalid dbs are removed
+}
+
+/// Apply the operations that belong to the given spec apply phase.
+///
+/// Commands within a single phase are executed in order of Iterator yield.
+/// Commands of ApplySpecPhase::RunInEachDatabase will execute in the database
+/// indicated by its `db` field, and can share a single client for all changes
+/// to that database. `client` is only invoked if the phase has operations.
+///
+/// Notes:
+/// - Commands are pipelined, and thus may cause incomplete apply if one
+///   command of many fails.
+/// - Failing commands will fail the phase's apply step once the return value
+///   is processed.
+/// - No timeouts have (yet) been implemented.
+/// - The caller is responsible for limiting and/or applying concurrency.
+pub async fn apply_operations<'a, Fut, F>(
+    spec: Arc<ComputeSpec>,
+    ctx: Arc<RwLock<MutableApplyContext>>,
+    jwks_roles: Arc<HashSet<String>>,
+    apply_spec_phase: ApplySpecPhase,
+    client: F,
+) -> Result<()>
+where
+    F: FnOnce() -> Fut,
+    Fut: Future<Output = Result<&'a Client>>,
+{
+    debug!("Starting phase {:?}", &apply_spec_phase);
+    let span = info_span!("db_apply_changes", phase=?apply_spec_phase);
+    let span2 = span.clone();
+    async move {
+        debug!("Processing phase {:?}", &apply_spec_phase);
+        let ctx = ctx;
+
+        let mut ops = get_operations(&spec, &ctx, &jwks_roles, &apply_spec_phase)
+            .await?
+            .peekable();
+
+        // Return (and by doing so, skip requesting the PostgreSQL client) if
+        // we don't have any operations scheduled.
+        if ops.peek().is_none() {
+            return Ok(());
+        }
+
+        let client = client().await?;
+
+        debug!("Applying phase {:?}", &apply_spec_phase);
+
+        let active_queries = ops
+            .map(|op| {
+                let Operation { comment, query } = op;
+                let inspan = match comment {
+                    None => span.clone(),
+                    Some(comment) => info_span!("db_apply_operation", comment),
+                };
+
+                async {
+                    let query = query;
+                    let res = client.simple_query(&query).await;
+                    debug!(
+                        "{} {}",
+                        if res.is_ok() {
+                            "successfully executed"
+                        } else {
+                            "failed to execute"
+                        },
+                        query
+                    );
+                    res
+                }
+                .instrument(inspan)
+            })
+            .collect::<Vec<_>>();
+
+        drop(ctx);
+
+        for it in join_all(active_queries).await {
+            drop(it?);
+        }
+
+        debug!("Completed phase {:?}", &apply_spec_phase);
+
+        Ok(())
+    }
+    .instrument(span2)
+    .await
+}
+
+/// Create the lazy stream of operations to be executed for that phase of
+/// applying changes. Phases that read or update `ctx` while yielding move
+/// their lock guard into the returned iterator. Errors only when
+/// `ChangeSchemaPerms` targets a database missing from `ctx`.
+/// In the future we may generate a single stream of changes and then
+/// sort/merge/batch execution; per-phase streams already improve batching.
+async fn get_operations<'a>(
+    spec: &'a ComputeSpec,
+    ctx: &'a RwLock<MutableApplyContext>,
+    jwks_roles: &'a HashSet<String>,
+    apply_spec_phase: &'a ApplySpecPhase,
+) -> Result<Box<dyn Iterator<Item = Operation> + 'a + Send>> {
+    match apply_spec_phase {
+        ApplySpecPhase::CreateSuperUser => {
+            let query = construct_superuser_query(spec);
+
+            Ok(Box::new(once(Operation {
+                query,
+                comment: None,
+            })))
+        }
+        ApplySpecPhase::DropInvalidDatabases => {
+            let mut ctx = ctx.write().await;
+            let databases = &mut ctx.dbs;
+
+            let keys: Vec<_> = databases
+                .iter()
+                .filter(|(_, db)| db.invalid)
+                .map(|(dbname, _)| dbname.clone())
+                .collect();
+
+            // After a recent commit in Postgres, an interrupted DROP DATABASE
+            // leaves the database in the invalid state. According to the
+            // commit message, the only option for the user is to drop it again.
+            // See:
+            //   https://github.com/postgres/postgres/commit/a4b4cc1d60f7e8ccfcc8ff8cb80c28ee411ad9a9
+            //
+            // The Postgres Neon extension is designed so that a db is de-registered
+            // in the control plane metadata only after it is dropped. So there is
+            // a chance that it still thinks that the db should exist. This means
+            // that it will be re-created by the `CreateAndAlterDatabases` phase. This
+            // is fine, as the user can just drop the database again (in vanilla
+            // Postgres they would need to do the same).
+            let operations = keys
+                .into_iter()
+                .filter_map(move |dbname| ctx.dbs.remove(&dbname))
+                .map(|db| Operation {
+                    query: format!("DROP DATABASE IF EXISTS {}", db.name.pg_quote()),
+                    comment: Some(format!("Dropping invalid database {}", db.name)),
+                });
+
+            Ok(Box::new(operations))
+        }
+        ApplySpecPhase::RenameRoles => {
+            let mut ctx = ctx.write().await;
+
+            let operations = spec
+                .delta_operations
+                .iter()
+                .flatten()
+                .filter(|op| op.action == "rename_role")
+                .filter_map(move |op| {
+                    let roles = &mut ctx.roles;
+                    // A role that is not known has nothing to rename; skip it.
+                    if !roles.contains_key(op.name.as_str()) {
+                        None
+                    } else {
+                        let new_name = op.new_name.as_ref().unwrap();
+                        let mut role = roles.remove(op.name.as_str()).unwrap();
+                        // Mirror the rename in the tracked role map.
+                        role.name = new_name.clone();
+                        role.encrypted_password = None;
+                        roles.insert(role.name.clone(), role);
+
+                        Some(Operation {
+                            query: format!(
+                                "ALTER ROLE {} RENAME TO {}",
+                                op.name.pg_quote(),
+                                new_name.pg_quote()
+                            ),
+                            comment: Some(format!("renaming role '{}' to '{}'", op.name, new_name)),
+                        })
+                    }
+                });
+
+            Ok(Box::new(operations))
+        }
+        ApplySpecPhase::CreateAndAlterRoles => {
+            let mut ctx = ctx.write().await;
+
+            let operations = spec.cluster.roles
+                .iter()
+                .filter_map(move |role| {
+                    let roles = &mut ctx.roles;
+                    let db_role = roles.get(&role.name);
+
+                    match db_role {
+                        Some(db_role) => {
+                            if db_role.encrypted_password != role.encrypted_password {
+                                // This can be run on /every/ role! Not just ones created through the console.
+                                // This means that if you add some funny ALTER here that adds a permission,
+                                // this will get run even on user-created roles! This will result in different
+                                // behavior before and after a spec gets reapplied. The below ALTER as it stands
+                                // now only grants LOGIN and changes the password. Please do not allow this branch
+                                // to do anything silly.
+                                Some(Operation {
+                                    query: format!(
+                                        "ALTER ROLE {} {}",
+                                        role.name.pg_quote(),
+                                        role.to_pg_options(),
+                                    ),
+                                    comment: None,
+                                })
+                            } else {
+                                None
+                            }
+                        }
+                        None => {
+                            let query = if !jwks_roles.contains(role.name.as_str()) {
+                                format!(
+                                    "CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser {}",
+                                    role.name.pg_quote(),
+                                    role.to_pg_options(),
+                                )
+                            } else {
+                                format!(
+                                    "CREATE ROLE {} {}",
+                                    role.name.pg_quote(),
+                                    role.to_pg_options(),
+                                )
+                            };
+                            Some(Operation {
+                                query,
+                                comment: Some(format!("creating role {}", role.name)),
+                            })
+                        }
+                    }
+                });
+
+            Ok(Box::new(operations))
+        }
+        ApplySpecPhase::RenameAndDeleteDatabases => {
+            let mut ctx = ctx.write().await;
+
+            let operations = spec
+                .delta_operations
+                .iter()
+                .flatten()
+                .filter_map(move |op| {
+                    let databases = &mut ctx.dbs;
+                    match op.action.as_str() {
+                        // We do not check whether the DB exists or not,
+                        // Postgres will take care of it for us
+                        "delete_db" => {
+                            // In Postgres we can't drop a database if it is a template.
+                            // So we need to unset the template flag first, but it could
+                            // be a retry, so we could've already dropped the database.
+                            // Check that database exists first to make it idempotent.
+                            let unset_template_query: String = format!(
+                                include_str!("sql/unset_template_for_drop_dbs.sql"),
+                                datname_str = escape_literal(&op.name),
+                                datname = &op.name.pg_quote()
+                            );
+
+                            // Use FORCE to drop database even if there are active connections.
+                            // We run this from `cloud_admin`, so it should have enough privileges.
+                            // NB: there could be other db states, which prevent us from dropping
+                            // the database. For example, if db is used by any active subscription
+                            // or replication slot.
+                            // TODO: deal with it once we allow logical replication. Proper fix should
+                            // involve returning an error code to the control plane, so it could
+                            // figure out that this is a non-retryable error, return it to the user
+                            // and fail operation permanently.
+                            let drop_db_query: String = format!(
+                                "DROP DATABASE IF EXISTS {} WITH (FORCE)",
+                                &op.name.pg_quote()
+                            );
+
+                            databases.remove(&op.name);
+
+                            Some(vec![
+                                Operation {
+                                    query: unset_template_query,
+                                    comment: Some(format!(
+                                        "optionally clearing template flags for DB {}",
+                                        op.name,
+                                    )),
+                                },
+                                Operation {
+                                    query: drop_db_query,
+                                    comment: Some(format!("deleting database {}", op.name,)),
+                                },
+                            ])
+                        }
+                        "rename_db" => {
+                            if let Some(mut db) = databases.remove(&op.name) {
+                                // update state of known databases
+                                let new_name = op.new_name.as_ref().unwrap();
+                                db.name = new_name.clone();
+                                databases.insert(db.name.clone(), db);
+
+                                Some(vec![Operation {
+                                    query: format!(
+                                        "ALTER DATABASE {} RENAME TO {}",
+                                        op.name.pg_quote(),
+                                        new_name.pg_quote(),
+                                    ),
+                                    comment: Some(format!(
+                                        "renaming database '{}' to '{}'",
+                                        op.name, new_name
+                                    )),
+                                }])
+                            } else {
+                                None
+                            }
+                        }
+                        _ => None,
+                    }
+                })
+                .flatten();
+
+            Ok(Box::new(operations))
+        }
+        ApplySpecPhase::CreateAndAlterDatabases => {
+            let mut ctx = ctx.write().await;
+
+            let operations = spec
+                .cluster
+                .databases
+                .iter()
+                .filter_map(move |db| {
+                    let databases = &mut ctx.dbs;
+                    if let Some(edb) = databases.get_mut(&db.name) {
+                        let change_owner = if edb.owner.starts_with('"') {
+                            db.owner.pg_quote() != edb.owner
+                        } else {
+                            db.owner != edb.owner
+                        };
+
+                        edb.owner = db.owner.clone();
+
+                        if change_owner {
+                            Some(vec![Operation {
+                                query: format!(
+                                    "ALTER DATABASE {} OWNER TO {}",
+                                    db.name.pg_quote(),
+                                    db.owner.pg_quote()
+                                ),
+                                comment: Some(format!(
+                                    "changing database owner of database {} to {}",
+                                    db.name, db.owner
+                                )),
+                            }])
+                        } else {
+                            None
+                        }
+                    } else {
+                        databases.insert(db.name.clone(), db.clone());
+
+                        Some(vec![
+                            Operation {
+                                query: format!(
+                                    "CREATE DATABASE {} {}",
+                                    db.name.pg_quote(),
+                                    db.to_pg_options(),
+                                ),
+                                comment: None,
+                            },
+                            Operation {
+                                query: format!(
+                                    "GRANT ALL PRIVILEGES ON DATABASE {} TO neon_superuser",
+                                    db.name.pg_quote()
+                                ),
+                                comment: None,
+                            },
+                        ])
+                    }
+                })
+                .flatten();
+
+            Ok(Box::new(operations))
+        }
+        ApplySpecPhase::RunInEachDatabase { db, subphase } => {
+            match subphase {
+                PerDatabasePhase::DeleteDBRoleReferences => {
+                    let ctx = ctx.read().await;
+
+                    let operations =
+                        spec.delta_operations
+                            .iter()
+                            .flatten()
+                            .filter(|op| op.action == "delete_role")
+                            .filter_map(move |op| {
+                                if db.is_owned_by(&op.name) {
+                                    return None;
+                                }
+                                if !ctx.roles.contains_key(&op.name) {
+                                    return None;
+                                }
+                                let quoted = op.name.pg_quote();
+                                let new_owner = match &db {
+                                    DB::SystemDB => PgIdent::from("cloud_admin").pg_quote(),
+                                    DB::UserDB(db) => db.owner.pg_quote(),
+                                };
+
+                                Some(vec![
+                                    // This will reassign all dependent objects to the db owner
+                                    Operation {
+                                        query: format!(
+                                            "REASSIGN OWNED BY {} TO {}",
+                                            quoted, new_owner,
+                                        ),
+                                        comment: None,
+                                    },
+                                    // This now will only drop privileges of the role
+                                    Operation {
+                                        query: format!("DROP OWNED BY {}", quoted),
+                                        comment: None,
+                                    },
+                                ])
+                            })
+                            .flatten();
+
+                    Ok(Box::new(operations))
+                }
+                PerDatabasePhase::ChangeSchemaPerms => {
+                    let ctx = ctx.read().await;
+                    let databases = &ctx.dbs;
+
+                    let db = match &db {
+                        // ignore schema permissions on the system database
+                        DB::SystemDB => return Ok(Box::new(empty())),
+                        DB::UserDB(db) => db,
+                    };
+
+                    // A database missing from `ctx` cannot have its schema
+                    // permissions changed, so fail the phase.
+                    let Some(edb) = databases.get(&db.name) else {
+                        bail!("database {} doesn't exist in PostgreSQL", db.name);
+                    };
+
+                    if edb.restrict_conn || edb.invalid {
+                        return Ok(Box::new(empty()));
+                    }
+
+                    let operations = vec![
+                        Operation {
+                            query: format!(
+                                include_str!("sql/set_public_schema_owner.sql"),
+                                db_owner = db.owner.pg_quote()
+                            ),
+                            comment: None,
+                        },
+                        Operation {
+                            query: String::from(include_str!("sql/default_grants.sql")),
+                            comment: None,
+                        },
+                    ]
+                    .into_iter();
+
+                    Ok(Box::new(operations))
+                }
+                PerDatabasePhase::HandleAnonExtension => {
+                    // Only install Anon into user databases
+                    let db = match &db {
+                        DB::SystemDB => return Ok(Box::new(empty())),
+                        DB::UserDB(db) => db,
+                    };
+                    // Never install Anon when it's not enabled as feature
+                    if !spec.features.contains(&ComputeFeature::AnonExtension) {
+                        return Ok(Box::new(empty()));
+                    }
+
+                    // Only install Anon when it's added in preload libraries
+                    let opt_libs = spec.cluster.settings.find("shared_preload_libraries");
+
+                    let libs = match opt_libs {
+                        Some(libs) => libs,
+                        None => return Ok(Box::new(empty())),
+                    };
+
+                    if !libs.contains("anon") {
+                        return Ok(Box::new(empty()));
+                    }
+
+                    let db_owner = db.owner.pg_quote();
+
+                    let operations = vec![
+                        // Create anon extension if this compute needs it
+                        // Users cannot create it themselves, because superuser is required.
+                        Operation {
+                            query: String::from("CREATE EXTENSION IF NOT EXISTS anon CASCADE"),
+                            comment: Some(String::from("creating anon extension")),
+                        },
+                        // Initialize anon extension
+                        // This also requires superuser privileges, so users cannot do it themselves.
+                        Operation {
+                            query: String::from("SELECT anon.init()"),
+                            comment: Some(String::from("initializing anon extension data")),
+                        },
+                        Operation {
+                            query: format!("GRANT ALL ON SCHEMA anon TO {}", db_owner),
+                            comment: Some(String::from(
+                                "granting anon extension schema permissions",
+                            )),
+                        },
+                        Operation {
+                            query: format!(
+                                "GRANT ALL ON ALL FUNCTIONS IN SCHEMA anon TO {}",
+                                db_owner
+                            ),
+                            comment: Some(String::from(
+                                "granting anon extension schema functions permissions",
+                            )),
+                        },
+                        // We need this, because some functions are defined as SECURITY DEFINER.
+                        // In Postgres SECURITY DEFINER functions are executed with the privileges
+                        // of the owner.
+                        // In anon extension this it is needed to access some GUCs, which are only accessible to
+                        // superuser. But we've patched postgres to allow db_owner to access them as well.
+                        // So we need to change owner of these functions to db_owner.
+                        Operation {
+                            query: format!(
+                                include_str!("sql/anon_ext_fn_reassign.sql"),
+                                db_owner = db_owner,
+                            ),
+                            comment: Some(String::from(
+                                "change anon extension functions owner to database_owner",
+                            )),
+                        },
+                        Operation {
+                            query: format!(
+                                "GRANT ALL ON ALL TABLES IN SCHEMA anon TO {}",
+                                db_owner,
+                            ),
+                            comment: Some(String::from(
+                                "granting anon extension tables permissions",
+                            )),
+                        },
+                        Operation {
+                            query: format!(
+                                "GRANT ALL ON ALL SEQUENCES IN SCHEMA anon TO {}",
+                                db_owner,
+                            ),
+                            comment: Some(String::from(
+                                "granting anon extension sequences permissions",
+                            )),
+                        },
+                    ]
+                    .into_iter();
+
+                    Ok(Box::new(operations))
+                }
+            }
+        }
+        // Interestingly, we only install p_s_s in the main database, even when
+        // it's preloaded.
+        ApplySpecPhase::HandleOtherExtensions => {
+            if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") {
+                if libs.contains("pg_stat_statements") {
+                    return Ok(Box::new(once(Operation {
+                        query: String::from("CREATE EXTENSION IF NOT EXISTS pg_stat_statements"),
+                        comment: Some(String::from("create system extensions")),
+                    })));
+                }
+            }
+            Ok(Box::new(empty()))
+        }
+        ApplySpecPhase::HandleNeonExtension => {
+            let operations = vec![
+                Operation {
+                    query: String::from("CREATE SCHEMA IF NOT EXISTS neon"),
+                    comment: Some(String::from("init: add schema for extension")),
+                },
+                Operation {
+                    query: String::from("CREATE EXTENSION IF NOT EXISTS neon WITH SCHEMA neon"),
+                    comment: Some(String::from(
+                        "init: install the extension if not already installed",
+                    )),
+                },
+                Operation {
+                    query: String::from(
+                        "UPDATE pg_extension SET extrelocatable = true WHERE extname = 'neon'",
+                    ),
+                    comment: Some(String::from("compat/fix: make neon relocatable")),
+                },
+                Operation {
+                    query: String::from("ALTER EXTENSION neon SET SCHEMA neon"),
+                    comment: Some(String::from("compat/fix: alter neon extension schema")),
+                },
+                Operation {
+                    query: String::from("ALTER EXTENSION neon UPDATE"),
+                    comment: Some(String::from("compat/update: update neon extension version")),
+                },
+            ]
+            .into_iter();
+
+            Ok(Box::new(operations))
+        }
+        ApplySpecPhase::CreateAvailabilityCheck => Ok(Box::new(once(Operation {
+            query: String::from(include_str!("sql/add_availabilitycheck_tables.sql")),
+            comment: None,
+        }))),
+        ApplySpecPhase::DropRoles => {
+            let operations = spec
+                .delta_operations
+                .iter()
+                .flatten()
+                .filter(|op| op.action == "delete_role")
+                .map(|op| Operation {
+                    query: format!("DROP ROLE IF EXISTS {}", op.name.pg_quote()),
+                    comment: None,
+                });
+
+            Ok(Box::new(operations))
+        }
+    }
+}
diff --git a/compute_tools/src/sql/add_availabilitycheck_tables.sql b/compute_tools/src/sql/add_availabilitycheck_tables.sql
new file mode 100644
index 0000000000..7c60690c78
--- /dev/null
+++ b/compute_tools/src/sql/add_availabilitycheck_tables.sql
@@ -0,0 +1,18 @@
+DO $$
+BEGIN
+    IF NOT EXISTS( -- do nothing when a table named health_check is already listed
+        SELECT 1
+        FROM pg_catalog.pg_tables
+        WHERE tablename = 'health_check' -- no schema filter: the name is matched in any schema
+    )
+    THEN
+    CREATE TABLE health_check (
+        id serial primary key,
+        updated_at timestamptz default now()
+    );
+    INSERT INTO health_check VALUES (1, now()) -- seed the row with id 1
+        ON CONFLICT (id) DO UPDATE
+         SET updated_at = now();
+    END IF;
+END
+$$
\ No newline at end of file
diff --git a/compute_tools/src/sql/anon_ext_fn_reassign.sql b/compute_tools/src/sql/anon_ext_fn_reassign.sql
new file mode 100644
index 0000000000..3d7b15c590
--- /dev/null
+++ b/compute_tools/src/sql/anon_ext_fn_reassign.sql
@@ -0,0 +1,12 @@
+DO $$
+DECLARE
+    query varchar; -- one generated ALTER FUNCTION statement per loop iteration
+BEGIN
+    FOR query IN SELECT 'ALTER FUNCTION '||quote_ident(nsp.nspname)||'.'||quote_ident(p.proname)||'('||pg_get_function_identity_arguments(p.oid)||') OWNER TO {db_owner};'
+    FROM pg_proc p
+        JOIN pg_namespace nsp ON p.pronamespace = nsp.oid
+    WHERE nsp.nspname = 'anon' LOOP -- every pg_proc entry in the anon schema
+        EXECUTE query;
+    END LOOP;
+END
+$$;
diff --git a/compute_tools/src/sql/default_grants.sql b/compute_tools/src/sql/default_grants.sql
new file mode 100644
index 0000000000..58ebb0690b
--- /dev/null
+++ b/compute_tools/src/sql/default_grants.sql
@@ -0,0 +1,30 @@
+DO
+$$
+    BEGIN
+        IF EXISTS( -- first grant: needs the public schema to exist ...
+            SELECT nspname
+            FROM pg_catalog.pg_namespace
+            WHERE nspname = 'public'
+        ) AND
+           current_setting('server_version_num')::int / 10000 >= 15 -- ... and server_version_num / 10000 of at least 15
+        THEN
+            IF EXISTS( -- grant only when a role named web_access exists
+                SELECT rolname
+                FROM pg_catalog.pg_roles
+                WHERE rolname = 'web_access'
+            )
+            THEN
+                GRANT CREATE ON SCHEMA public TO web_access;
+            END IF;
+        END IF;
+        IF EXISTS( -- second part: checked separately, without the version condition
+            SELECT nspname
+            FROM pg_catalog.pg_namespace
+            WHERE nspname = 'public'
+        )
+        THEN
+            ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser WITH GRANT OPTION;
+            ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO neon_superuser WITH GRANT OPTION;
+        END IF;
+    END
+$$;
\ No newline at end of file
diff --git a/compute_tools/src/sql/set_public_schema_owner.sql b/compute_tools/src/sql/set_public_schema_owner.sql
new file mode 100644
index 0000000000..fd061a713e
--- /dev/null
+++ b/compute_tools/src/sql/set_public_schema_owner.sql
@@ -0,0 +1,23 @@
+DO
+$$
+    DECLARE
+        schema_owner TEXT; -- owner of the public schema, rendered as a role name
+    BEGIN
+        IF EXISTS( -- nothing to do when there is no public schema
+            SELECT nspname
+            FROM pg_catalog.pg_namespace
+            WHERE nspname = 'public'
+        )
+        THEN
+            SELECT nspowner::regrole::text
+            FROM pg_catalog.pg_namespace
+            WHERE nspname = 'public'
+            INTO schema_owner;
+
+            IF schema_owner = 'cloud_admin' OR schema_owner = 'zenith_admin' -- change the owner only when it is one of these two roles
+            THEN
+                ALTER SCHEMA public OWNER TO {db_owner}; -- db_owner is a template placeholder filled in by the Rust caller
+            END IF;
+        END IF;
+    END
+$$;
\ No newline at end of file
diff --git a/compute_tools/src/sql/unset_template_for_drop_dbs.sql b/compute_tools/src/sql/unset_template_for_drop_dbs.sql
new file mode 100644
index 0000000000..6c4343a589
--- /dev/null
+++ b/compute_tools/src/sql/unset_template_for_drop_dbs.sql
@@ -0,0 +1,12 @@
+DO $$
+    BEGIN
+        IF EXISTS( -- only touch the database when it is still present in pg_database
+            SELECT 1
+            FROM pg_catalog.pg_database
+            WHERE datname = {datname_str} -- placeholder: the caller passes the name as an escaped literal
+        )
+        THEN
+            ALTER DATABASE {datname} is_template false; -- placeholder: the caller passes the name as a quoted identifier
+        END IF;
+    END
+$$;
\ No newline at end of file

From 0a499a317614a049bab4e1166984557789460793 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Wed, 20 Nov 2024 06:44:23 +0100
Subject: [PATCH 10/24] Don't preload offloaded timelines (#9646)

Timeline preloading currently also preloads offloaded timelines, which
includes downloading their `index-part.json`. That download is wasteful,
so avoid it. The same goes for the remote client, which we would just
discard immediately afterwards.

Part of #8088

---------

Co-authored-by: Christian Schwarz <christian@neon.tech>
---
 pageserver/src/tenant.rs | 71 +++++++++++++++++++++++-----------------
 1 file changed, 41 insertions(+), 30 deletions(-)

diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 37bf83c984..8e9e3890ba 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -249,7 +249,8 @@ struct TimelinePreload {
 
 pub(crate) struct TenantPreload {
     tenant_manifest: TenantManifest,
-    timelines: HashMap<TimelineId, TimelinePreload>,
+    /// Map from timeline ID to a possible timeline preload. It is None iff the timeline is offloaded according to the manifest.
+    timelines: HashMap<TimelineId, Option<TimelinePreload>>,
 }
 
 /// When we spawn a tenant, there is a special mode for tenant creation that
@@ -1397,7 +1398,7 @@ impl Tenant {
         // Get list of remote timelines
         // download index files for every tenant timeline
         info!("listing remote timelines");
-        let (remote_timeline_ids, other_keys) = remote_timeline_client::list_remote_timelines(
+        let (mut remote_timeline_ids, other_keys) = remote_timeline_client::list_remote_timelines(
             remote_storage,
             self.tenant_shard_id,
             cancel.clone(),
@@ -1431,11 +1432,27 @@ impl Tenant {
             warn!("Unexpected non timeline key {k}");
         }
 
+        // Avoid downloading IndexPart of offloaded timelines.
+        let mut offloaded_with_prefix = HashSet::new();
+        for offloaded in tenant_manifest.offloaded_timelines.iter() {
+            if remote_timeline_ids.remove(&offloaded.timeline_id) {
+                offloaded_with_prefix.insert(offloaded.timeline_id);
+            } else {
+                // We'll take care later of timelines in the manifest without a prefix
+            }
+        }
+
+        let timelines = self
+            .load_timelines_metadata(remote_timeline_ids, remote_storage, cancel)
+            .await?;
+
         Ok(TenantPreload {
             tenant_manifest,
-            timelines: self
-                .load_timelines_metadata(remote_timeline_ids, remote_storage, cancel)
-                .await?,
+            timelines: timelines
+                .into_iter()
+                .map(|(id, tl)| (id, Some(tl)))
+                .chain(offloaded_with_prefix.into_iter().map(|id| (id, None)))
+                .collect(),
         })
     }
 
@@ -1466,6 +1483,19 @@ impl Tenant {
             offloaded_timelines_list.push((timeline_id, Arc::new(offloaded_timeline)));
             offloaded_timeline_ids.insert(timeline_id);
         }
+        // Complete deletions for offloaded timeline id's from manifest.
+        // The manifest will be uploaded later in this function.
+        offloaded_timelines_list
+            .retain(|(offloaded_id, offloaded)| {
+                // Existence of a timeline is finally determined by the existence of an index-part.json in remote storage.
+                // If there are dangling references in another location, they need to be cleaned up.
+                let delete = !preload.timelines.contains_key(offloaded_id);
+                if delete {
+                    tracing::info!("Removing offloaded timeline {offloaded_id} from manifest as no remote prefix was found");
+                    offloaded.defuse_for_tenant_drop();
+                }
+                !delete
+        });
 
         let mut timelines_to_resume_deletions = vec![];
 
@@ -1473,10 +1503,9 @@ impl Tenant {
         let mut timeline_ancestors = HashMap::new();
         let mut existent_timelines = HashSet::new();
         for (timeline_id, preload) in preload.timelines {
-            if offloaded_timeline_ids.remove(&timeline_id) {
-                // The timeline is offloaded, skip loading it.
-                continue;
-            }
+            let Some(preload) = preload else { continue };
+            // This is an invariant of the `preload` function's API
+            assert!(!offloaded_timeline_ids.contains(&timeline_id));
             let index_part = match preload.index_part {
                 Ok(i) => {
                     debug!("remote index part exists for timeline {timeline_id}");
@@ -1586,31 +1615,13 @@ impl Tenant {
             .context("resume_deletion")
             .map_err(LoadLocalTimelineError::ResumeDeletion)?;
         }
-        // Complete deletions for offloaded timeline id's.
-        offloaded_timelines_list
-            .retain(|(offloaded_id, offloaded)| {
-                // At this point, offloaded_timeline_ids has the list of all offloaded timelines
-                // without a prefix in S3, so they are inexistent.
-                // In the end, existence of a timeline is finally determined by the existence of an index-part.json in remote storage.
-                // If there is a dangling reference in another location, they need to be cleaned up.
-                let delete = offloaded_timeline_ids.contains(offloaded_id);
-                if delete {
-                    tracing::info!("Removing offloaded timeline {offloaded_id} from manifest as no remote prefix was found");
-                    offloaded.defuse_for_tenant_drop();
-                }
-                !delete
-        });
-        if !offloaded_timelines_list.is_empty() {
-            tracing::info!(
-                "Tenant has {} offloaded timelines",
-                offloaded_timelines_list.len()
-            );
-        }
+        let needs_manifest_upload =
+            offloaded_timelines_list.len() != preload.tenant_manifest.offloaded_timelines.len();
         {
             let mut offloaded_timelines_accessor = self.timelines_offloaded.lock().unwrap();
             offloaded_timelines_accessor.extend(offloaded_timelines_list.into_iter());
         }
-        if !offloaded_timeline_ids.is_empty() {
+        if needs_manifest_upload {
             self.store_tenant_manifest().await?;
         }
 

From 3ae0b2149e1a80975e098a4156b424e636cc550f Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Wed, 20 Nov 2024 10:14:28 +0000
Subject: [PATCH 11/24] chore(proxy): demote a ton of logs for successful
 connection attempts (#9803)

See https://github.com/neondatabase/cloud/issues/14378

In collaboration with @cloneable and @awarus, we sifted through logs and
simply demoted some logs to debug. This is not at all finished and there
are more logs to review, but we ran out of time in the session we
organised. In any slightly more nuanced cases, we didn't touch the log,
instead leaving a TODO comment.
---
 proxy/src/auth/backend/classic.rs      |  8 +++++---
 proxy/src/auth/backend/hacks.rs        |  2 +-
 proxy/src/auth/backend/mod.rs          |  7 ++++---
 proxy/src/auth/credentials.rs          | 10 +++++-----
 proxy/src/auth/flow.rs                 |  2 ++
 proxy/src/bin/local_proxy.rs           |  1 +
 proxy/src/bin/proxy.rs                 |  1 +
 proxy/src/cancellation.rs              | 10 +++++-----
 proxy/src/compute.rs                   |  5 +++--
 proxy/src/console_redirect_proxy.rs    |  2 +-
 proxy/src/context/mod.rs               |  9 +++++++--
 proxy/src/control_plane/client/mod.rs  |  4 ++--
 proxy/src/control_plane/client/neon.rs | 13 +++++++++----
 proxy/src/stream.rs                    |  1 +
 14 files changed, 47 insertions(+), 28 deletions(-)

diff --git a/proxy/src/auth/backend/classic.rs b/proxy/src/auth/backend/classic.rs
index 6d26c99832..87a02133c8 100644
--- a/proxy/src/auth/backend/classic.rs
+++ b/proxy/src/auth/backend/classic.rs
@@ -1,5 +1,5 @@
 use tokio::io::{AsyncRead, AsyncWrite};
-use tracing::{info, warn};
+use tracing::{debug, info, warn};
 
 use super::{ComputeCredentials, ComputeUserInfo};
 use crate::auth::backend::ComputeCredentialKeys;
@@ -21,11 +21,11 @@ pub(super) async fn authenticate(
     let scram_keys = match secret {
         #[cfg(any(test, feature = "testing"))]
         AuthSecret::Md5(_) => {
-            info!("auth endpoint chooses MD5");
+            debug!("auth endpoint chooses MD5");
             return Err(auth::AuthError::bad_auth_method("MD5"));
         }
         AuthSecret::Scram(secret) => {
-            info!("auth endpoint chooses SCRAM");
+            debug!("auth endpoint chooses SCRAM");
             let scram = auth::Scram(&secret, ctx);
 
             let auth_outcome = tokio::time::timeout(
@@ -50,6 +50,8 @@ pub(super) async fn authenticate(
             let client_key = match auth_outcome {
                 sasl::Outcome::Success(key) => key,
                 sasl::Outcome::Failure(reason) => {
+                    // TODO: warnings?
+                    // TODO: should we get rid of this because double logging?
                     info!("auth backend failed with an error: {reason}");
                     return Err(auth::AuthError::password_failed(&*creds.user));
                 }
diff --git a/proxy/src/auth/backend/hacks.rs b/proxy/src/auth/backend/hacks.rs
index 1411d908a5..e651df1d34 100644
--- a/proxy/src/auth/backend/hacks.rs
+++ b/proxy/src/auth/backend/hacks.rs
@@ -73,7 +73,7 @@ pub(crate) async fn password_hack_no_authentication(
         .get_password()
         .await?;
 
-    info!(project = &*payload.endpoint, "received missing parameter");
+    debug!(project = &*payload.endpoint, "received missing parameter");
 
     // Report tentative success; compute node will check the password anyway.
     Ok((
diff --git a/proxy/src/auth/backend/mod.rs b/proxy/src/auth/backend/mod.rs
index 242fe99de2..83c72e7be0 100644
--- a/proxy/src/auth/backend/mod.rs
+++ b/proxy/src/auth/backend/mod.rs
@@ -14,7 +14,7 @@ use ipnet::{Ipv4Net, Ipv6Net};
 use local::LocalBackend;
 use tokio::io::{AsyncRead, AsyncWrite};
 use tokio_postgres::config::AuthKeys;
-use tracing::{info, warn};
+use tracing::{debug, info, warn};
 
 use crate::auth::credentials::check_peer_addr_is_in_list;
 use crate::auth::{self, validate_password_and_exchange, AuthError, ComputeUserInfoMaybeEndpoint};
@@ -286,7 +286,7 @@ async fn auth_quirks(
         Ok(info) => (info, None),
     };
 
-    info!("fetching user's authentication info");
+    debug!("fetching user's authentication info");
     let (allowed_ips, maybe_secret) = api.get_allowed_ips_and_secret(ctx, &info).await?;
 
     // check allowed list
@@ -404,7 +404,7 @@ impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint> {
     ) -> auth::Result<Backend<'a, ComputeCredentials>> {
         let res = match self {
             Self::ControlPlane(api, user_info) => {
-                info!(
+                debug!(
                     user = &*user_info.user,
                     project = user_info.endpoint(),
                     "performing authentication using the console"
@@ -427,6 +427,7 @@ impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint> {
             }
         };
 
+        // TODO: replace with some metric
         info!("user successfully authenticated");
         Ok(res)
     }
diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs
index ddecae6af5..dab9007400 100644
--- a/proxy/src/auth/credentials.rs
+++ b/proxy/src/auth/credentials.rs
@@ -7,7 +7,7 @@ use std::str::FromStr;
 use itertools::Itertools;
 use pq_proto::StartupMessageParams;
 use thiserror::Error;
-use tracing::{info, warn};
+use tracing::{debug, warn};
 
 use crate::auth::password_hack::parse_endpoint_param;
 use crate::context::RequestMonitoring;
@@ -147,22 +147,22 @@ impl ComputeUserInfoMaybeEndpoint {
         }
 
         let metrics = Metrics::get();
-        info!(%user, "credentials");
+        debug!(%user, "credentials");
         if sni.is_some() {
-            info!("Connection with sni");
+            debug!("Connection with sni");
             metrics.proxy.accepted_connections_by_sni.inc(SniKind::Sni);
         } else if endpoint.is_some() {
             metrics
                 .proxy
                 .accepted_connections_by_sni
                 .inc(SniKind::NoSni);
-            info!("Connection without sni");
+            debug!("Connection without sni");
         } else {
             metrics
                 .proxy
                 .accepted_connections_by_sni
                 .inc(SniKind::PasswordHack);
-            info!("Connection with password hack");
+            debug!("Connection with password hack");
         }
 
         let options = NeonOptions::parse_params(params);
diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs
index 6294549ff6..1740b59b14 100644
--- a/proxy/src/auth/flow.rs
+++ b/proxy/src/auth/flow.rs
@@ -178,6 +178,8 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, Scram<'_>> {
             SCRAM_SHA_256_PLUS => ctx.set_auth_method(crate::context::AuthMethod::ScramSha256Plus),
             _ => {}
         }
+
+        // TODO: make this a metric instead
         info!("client chooses {}", sasl.method);
 
         let outcome = sasl::SaslStream::new(self.stream, sasl.message)
diff --git a/proxy/src/bin/local_proxy.rs b/proxy/src/bin/local_proxy.rs
index 41b0e11e85..c4ec1300f2 100644
--- a/proxy/src/bin/local_proxy.rs
+++ b/proxy/src/bin/local_proxy.rs
@@ -125,6 +125,7 @@ async fn main() -> anyhow::Result<()> {
 
     Metrics::install(Arc::new(ThreadPoolMetrics::new(0)));
 
+    // TODO: refactor these to use labels
     debug!("Version: {GIT_VERSION}");
     debug!("Build_tag: {BUILD_TAG}");
     let neon_metrics = ::metrics::NeonMetrics::new(::metrics::BuildInfo {
diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs
index fda5b25961..232721338d 100644
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -288,6 +288,7 @@ async fn main() -> anyhow::Result<()> {
     let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook();
     let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);
 
+    // TODO: refactor these to use labels
     info!("Version: {GIT_VERSION}");
     info!("Build_tag: {BUILD_TAG}");
     let neon_metrics = ::metrics::NeonMetrics::new(::metrics::BuildInfo {
diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs
index db0970adcb..3ad2d55b53 100644
--- a/proxy/src/cancellation.rs
+++ b/proxy/src/cancellation.rs
@@ -7,7 +7,7 @@ use thiserror::Error;
 use tokio::net::TcpStream;
 use tokio::sync::Mutex;
 use tokio_postgres::{CancelToken, NoTls};
-use tracing::info;
+use tracing::{debug, info};
 use uuid::Uuid;
 
 use crate::error::ReportableError;
@@ -73,7 +73,7 @@ impl<P: CancellationPublisher> CancellationHandler<P> {
             break key;
         };
 
-        info!("registered new query cancellation key {key}");
+        debug!("registered new query cancellation key {key}");
         Session {
             key,
             cancellation_handler: self,
@@ -165,7 +165,7 @@ impl CancelClosure {
     pub(crate) async fn try_cancel_query(self) -> Result<(), CancelError> {
         let socket = TcpStream::connect(self.socket_addr).await?;
         self.cancel_token.cancel_query_raw(socket, NoTls).await?;
-        info!("query was cancelled");
+        debug!("query was cancelled");
         Ok(())
     }
 }
@@ -182,7 +182,7 @@ impl<P> Session<P> {
     /// Store the cancel token for the given session.
     /// This enables query cancellation in `crate::proxy::prepare_client_connection`.
     pub(crate) fn enable_query_cancellation(&self, cancel_closure: CancelClosure) -> CancelKeyData {
-        info!("enabling query cancellation for this session");
+        debug!("enabling query cancellation for this session");
         self.cancellation_handler
             .map
             .insert(self.key, Some(cancel_closure));
@@ -194,7 +194,7 @@ impl<P> Session<P> {
 impl<P> Drop for Session<P> {
     fn drop(&mut self) {
         self.cancellation_handler.map.remove(&self.key);
-        info!("dropped query cancellation key {}", &self.key);
+        debug!("dropped query cancellation key {}", &self.key);
     }
 }
 
diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs
index ca4a348ed8..b8876b44eb 100644
--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -14,7 +14,7 @@ use thiserror::Error;
 use tokio::net::TcpStream;
 use tokio_postgres::tls::MakeTlsConnect;
 use tokio_postgres_rustls::MakeRustlsConnect;
-use tracing::{error, info, warn};
+use tracing::{debug, error, info, warn};
 
 use crate::auth::parse_endpoint_param;
 use crate::cancellation::CancelClosure;
@@ -213,7 +213,7 @@ impl ConnCfg {
         };
 
         let connect_once = |host, port| {
-            info!("trying to connect to compute node at {host}:{port}");
+            debug!("trying to connect to compute node at {host}:{port}");
             connect_with_timeout(host, port).and_then(|socket| async {
                 let socket_addr = socket.peer_addr()?;
                 // This prevents load balancer from severing the connection.
@@ -328,6 +328,7 @@ impl ConnCfg {
         tracing::Span::current().record("pid", tracing::field::display(client.get_process_id()));
         let stream = connection.stream.into_inner();
 
+        // TODO: lots of useful info but maybe we can move it elsewhere (eg traces?)
         info!(
             cold_start_info = ctx.cold_start_info().as_str(),
             "connected to compute node at {host} ({socket_addr}) sslmode={:?}",
diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs
index cc456f3667..8e71f552a5 100644
--- a/proxy/src/console_redirect_proxy.rs
+++ b/proxy/src/console_redirect_proxy.rs
@@ -146,7 +146,7 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
     stream: S,
     conn_gauge: NumClientConnectionsGuard<'static>,
 ) -> Result<Option<ProxyPassthrough<CancellationHandlerMainInternal, S>>, ClientRequestError> {
-    info!(
+    debug!(
         protocol = %ctx.protocol(),
         "handling interactive connection from client"
     );
diff --git a/proxy/src/context/mod.rs b/proxy/src/context/mod.rs
index 6cf99c0c97..d057ee0bfd 100644
--- a/proxy/src/context/mod.rs
+++ b/proxy/src/context/mod.rs
@@ -8,7 +8,7 @@ use pq_proto::StartupMessageParams;
 use smol_str::SmolStr;
 use tokio::sync::mpsc;
 use tracing::field::display;
-use tracing::{debug, info, info_span, Span};
+use tracing::{debug, info_span, Span};
 use try_lock::TryLock;
 use uuid::Uuid;
 
@@ -122,6 +122,7 @@ impl RequestMonitoring {
         protocol: Protocol,
         region: &'static str,
     ) -> Self {
+        // TODO: be careful with long lived spans
         let span = info_span!(
             "connect_request",
             %protocol,
@@ -384,6 +385,10 @@ impl RequestMonitoringInner {
         } else {
             ConnectOutcome::Failed
         };
+
+        // TODO: get rid of entirely/refactor
+        // check for false positives
+        // AND false negatives
         if let Some(rejected) = self.rejected {
             let ep = self
                 .endpoint_id
@@ -391,7 +396,7 @@ impl RequestMonitoringInner {
                 .map(|x| x.as_str())
                 .unwrap_or_default();
             // This makes sense only if cache is disabled
-            info!(
+            debug!(
                 ?outcome,
                 ?rejected,
                 ?ep,
diff --git a/proxy/src/control_plane/client/mod.rs b/proxy/src/control_plane/client/mod.rs
index e388d8a538..50903e2f1e 100644
--- a/proxy/src/control_plane/client/mod.rs
+++ b/proxy/src/control_plane/client/mod.rs
@@ -8,7 +8,7 @@ use std::time::Duration;
 
 use dashmap::DashMap;
 use tokio::time::Instant;
-use tracing::info;
+use tracing::{debug, info};
 
 use crate::auth::backend::jwt::{AuthRule, FetchAuthRules, FetchAuthRulesError};
 use crate::auth::backend::ComputeUserInfo;
@@ -214,7 +214,7 @@ impl<K: Hash + Eq + Clone> ApiLocks<K> {
         self.metrics
             .semaphore_acquire_seconds
             .observe(now.elapsed().as_secs_f64());
-        info!("acquired permit {:?}", now.elapsed().as_secs_f64());
+        debug!("acquired permit {:?}", now.elapsed().as_secs_f64());
         Ok(WakeComputePermit { permit: permit? })
     }
 
diff --git a/proxy/src/control_plane/client/neon.rs b/proxy/src/control_plane/client/neon.rs
index 26ff4e1402..8f4ae13f33 100644
--- a/proxy/src/control_plane/client/neon.rs
+++ b/proxy/src/control_plane/client/neon.rs
@@ -73,6 +73,8 @@ impl NeonControlPlaneClient {
             .endpoints_cache
             .is_valid(ctx, &user_info.endpoint.normalize())
         {
+            // TODO: refactor this because it's weird
+            // this is a failure to authenticate but we return Ok.
             info!("endpoint is not valid, skipping the request");
             return Ok(AuthInfo::default());
         }
@@ -92,7 +94,7 @@ impl NeonControlPlaneClient {
                 ])
                 .build()?;
 
-            info!(url = request.url().as_str(), "sending http request");
+            debug!(url = request.url().as_str(), "sending http request");
             let start = Instant::now();
             let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane);
             let response = self.endpoint.execute(request).await?;
@@ -104,10 +106,12 @@ impl NeonControlPlaneClient {
                 // TODO(anna): retry
                 Err(e) => {
                     return if e.get_reason().is_not_found() {
+                        // TODO: refactor this because it's weird
+                        // this is a failure to authenticate but we return Ok.
                         Ok(AuthInfo::default())
                     } else {
                         Err(e.into())
-                    }
+                    };
                 }
             };
 
@@ -163,7 +167,7 @@ impl NeonControlPlaneClient {
                 .build()
                 .map_err(GetEndpointJwksError::RequestBuild)?;
 
-            info!(url = request.url().as_str(), "sending http request");
+            debug!(url = request.url().as_str(), "sending http request");
             let start = Instant::now();
             let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane);
             let response = self
@@ -220,7 +224,7 @@ impl NeonControlPlaneClient {
 
             let request = request_builder.build()?;
 
-            info!(url = request.url().as_str(), "sending http request");
+            debug!(url = request.url().as_str(), "sending http request");
             let start = Instant::now();
             let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane);
             let response = self.endpoint.execute(request).await?;
@@ -249,6 +253,7 @@ impl NeonControlPlaneClient {
             Ok(node)
         }
         .map_err(crate::error::log_error)
+        // TODO: redo this span stuff
         .instrument(info_span!("http", id = request_id))
         .await
     }
diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs
index 89df48c5d3..11f426819d 100644
--- a/proxy/src/stream.rs
+++ b/proxy/src/stream.rs
@@ -133,6 +133,7 @@ impl<S: AsyncWrite + Unpin> PqStream<S> {
         msg: &'static str,
         error_kind: ErrorKind,
     ) -> Result<T, ReportedError> {
+        // TODO: only log this for actually interesting errors
         tracing::info!(
             kind = error_kind.to_metric_label(),
             msg,

From 33dce25af8ea722b3bf53467616fb5156dd41249 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Wed, 20 Nov 2024 11:07:45 +0000
Subject: [PATCH 12/24] safekeeper: block deletion on protocol handler shutdown
 (#9364)

## Problem

Two recently observed log errors indicate safekeeper tasks for a
timeline running after that timeline's deletion has started.
- https://github.com/neondatabase/neon/issues/8972
- https://github.com/neondatabase/neon/issues/8974

These code paths do not have a mechanism that coordinates task shutdown
with the overall shutdown of the timeline.

## Summary of changes

- Add a `Gate` to `Timeline`
- Take the gate as part of resident timeline guard: any code that holds
a guard over a timeline staying resident should also hold a guard over
the timeline's total lifetime.
- Take the gate from the wal removal task
- Respect Timeline::cancel in WAL send/recv code, so that we do not
block shutdown indefinitely.
- Add a test that deletes timelines with open pageserver+compute
connections, to check these get torn down as expected.

There is some risk to introducing gates: if there is code holding a gate
which does not properly respect a cancellation token, it can cause
shutdown hangs. The risk of this for safekeepers is lower in practice
than it is for other services, because in a healthy timeline deletion,
the compute is shut down first, then the timeline is deleted on the
pageserver, and finally it is deleted on the safekeepers -- that makes
it much less likely that some protocol handler will still be running.

Closes: #8972
Closes: #8974
---
 libs/postgres_backend/src/lib.rs         |   7 +-
 safekeeper/src/receive_wal.rs            |  29 +++++-
 safekeeper/src/send_wal.rs               |  37 ++++---
 safekeeper/src/timeline.rs               | 120 +++++++++--------------
 safekeeper/src/timeline_guard.rs         |  11 ++-
 safekeeper/src/timeline_manager.rs       |  55 +++++++++--
 safekeeper/src/timelines_global_map.rs   |   4 +-
 safekeeper/src/wal_backup.rs             |  42 +++++---
 test_runner/regress/test_wal_acceptor.py |  83 ++++++++++++++++
 9 files changed, 270 insertions(+), 118 deletions(-)

diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs
index 9075a019b4..8c024375c1 100644
--- a/libs/postgres_backend/src/lib.rs
+++ b/libs/postgres_backend/src/lib.rs
@@ -834,7 +834,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
         use CopyStreamHandlerEnd::*;
 
         let expected_end = match &end {
-            ServerInitiated(_) | CopyDone | CopyFail | Terminate | EOF => true,
+            ServerInitiated(_) | CopyDone | CopyFail | Terminate | EOF | Cancelled => true,
             CopyStreamHandlerEnd::Disconnected(ConnectionError::Io(io_error))
                 if is_expected_io_error(io_error) =>
             {
@@ -874,6 +874,9 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
             // message from server' when it receives ErrorResponse (anything but
             // CopyData/CopyDone) back.
             CopyFail => Some((end.to_string(), SQLSTATE_SUCCESSFUL_COMPLETION)),
+
+            // When cancelled, send no response: we must not risk blocking on sending that response
+            Cancelled => None,
             _ => None,
         };
         if let Some((err, errcode)) = err_to_send_and_errcode {
@@ -1051,6 +1054,8 @@ pub enum CopyStreamHandlerEnd {
     /// The connection was lost
     #[error("connection error: {0}")]
     Disconnected(#[from] ConnectionError),
+    #[error("Shutdown")]
+    Cancelled,
     /// Some other error
     #[error(transparent)]
     Other(#[from] anyhow::Error),
diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs
index 2edcc4ef6f..bfa1764abf 100644
--- a/safekeeper/src/receive_wal.rs
+++ b/safekeeper/src/receive_wal.rs
@@ -239,6 +239,10 @@ impl SafekeeperPostgresHandler {
         pgb: &mut PostgresBackend<IO>,
         tli: &mut Option<WalResidentTimeline>,
     ) -> Result<(), CopyStreamHandlerEnd> {
+        // The `tli` parameter is only used for passing _out_ a timeline; one should
+        // not have been passed in.
+        assert!(tli.is_none());
+
         // Notify the libpq client that it's allowed to send `CopyData` messages
         pgb.write_message(&BeMessage::CopyBothResponse).await?;
 
@@ -256,6 +260,7 @@ impl SafekeeperPostgresHandler {
         // sends, so this avoids deadlocks.
         let mut pgb_reader = pgb.split().context("START_WAL_PUSH split")?;
         let peer_addr = *pgb.get_peer_addr();
+
         let mut network_reader = NetworkReader {
             ttid: self.ttid,
             conn_id: self.conn_id,
@@ -275,10 +280,14 @@ impl SafekeeperPostgresHandler {
                     .subscribe();
             *tli = Some(timeline.wal_residence_guard().await?);
 
+            let timeline_cancel = timeline.cancel.clone();
             tokio::select! {
                 // todo: add read|write .context to these errors
                 r = network_reader.run(msg_tx, msg_rx, reply_tx, timeline, next_msg) => r,
                 r = network_write(pgb, reply_rx, pageserver_feedback_rx) => r,
+                _ = timeline_cancel.cancelled() => {
+                    return Err(CopyStreamHandlerEnd::Cancelled);
+                }
             }
         } else {
             res.map(|_| ())
@@ -303,7 +312,7 @@ impl SafekeeperPostgresHandler {
 
                 // Otherwise, WalAcceptor thread must have errored.
                 match wal_acceptor_res {
-                    Ok(Ok(_)) => Ok(()), // can't happen currently; would be if we add graceful termination
+                    Ok(Ok(_)) => Ok(()), // Clean shutdown
                     Ok(Err(e)) => Err(CopyStreamHandlerEnd::Other(e.context("WAL acceptor"))),
                     Err(_) => Err(CopyStreamHandlerEnd::Other(anyhow!(
                         "WalAcceptor task panicked",
@@ -356,6 +365,7 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> {
         Ok((tli, next_msg))
     }
 
+    /// This function is cancellation-safe (only does network I/O and channel read/writes).
     async fn run(
         self,
         msg_tx: Sender<ProposerAcceptorMessage>,
@@ -397,6 +407,7 @@ async fn read_network_loop<IO: AsyncRead + AsyncWrite + Unpin>(
     loop {
         let started = Instant::now();
         let size = next_msg.size();
+
         match msg_tx.send_timeout(next_msg, SLOW_THRESHOLD).await {
             Ok(()) => {}
             // Slow send, log a message and keep trying. Log context has timeline ID.
@@ -428,6 +439,8 @@ async fn read_network_loop<IO: AsyncRead + AsyncWrite + Unpin>(
 /// Read replies from WalAcceptor and pass them back to socket. Returns Ok(())
 /// if reply_rx closed; it must mean WalAcceptor terminated, joining it should
 /// tell the error.
+///
+/// This function is cancellation-safe (only does network I/O and channel read/writes).
 async fn network_write<IO: AsyncRead + AsyncWrite + Unpin>(
     pgb_writer: &mut PostgresBackend<IO>,
     mut reply_rx: Receiver<AcceptorProposerMessage>,
@@ -461,7 +474,7 @@ async fn network_write<IO: AsyncRead + AsyncWrite + Unpin>(
                         Some(AcceptorProposerMessage::AppendResponse(append_response))
                     }
                     _ => None,
-                }
+                },
         };
 
         let Some(msg) = msg else {
@@ -527,6 +540,10 @@ impl WalAcceptor {
 
     /// The main loop. Returns Ok(()) if either msg_rx or reply_tx got closed;
     /// it must mean that network thread terminated.
+    ///
+    /// This function is *not* cancellation safe, it does local disk I/O: it should always
+    /// be allowed to run to completion. It respects Timeline::cancel and shuts down cleanly
+    /// when that gets triggered.
     async fn run(&mut self) -> anyhow::Result<()> {
         let walreceiver_guard = self.tli.get_walreceivers().register(self.conn_id);
 
@@ -541,7 +558,7 @@ impl WalAcceptor {
         // Tracks whether we have unflushed appends.
         let mut dirty = false;
 
-        loop {
+        while !self.tli.is_cancelled() {
             let reply = tokio::select! {
                 // Process inbound message.
                 msg = self.msg_rx.recv() => {
@@ -599,6 +616,10 @@ impl WalAcceptor {
                     WAL_RECEIVER_QUEUE_DEPTH.observe(self.msg_rx.len() as f64);
                     None // no reply
                 }
+
+                _ = self.tli.cancel.cancelled() => {
+                    break;
+                }
             };
 
             // Send reply, if any.
@@ -610,7 +631,7 @@ impl WalAcceptor {
         }
 
         // Flush WAL on disconnect, see https://github.com/neondatabase/neon/issues/9259.
-        if dirty {
+        if dirty && !self.tli.cancel.is_cancelled() {
             self.tli
                 .process_msg(&ProposerAcceptorMessage::FlushWAL)
                 .await?;
diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs
index 6d94ff98b1..aa65ec851b 100644
--- a/safekeeper/src/send_wal.rs
+++ b/safekeeper/src/send_wal.rs
@@ -456,6 +456,8 @@ impl SafekeeperPostgresHandler {
         // not synchronized with sends, so this avoids deadlocks.
         let reader = pgb.split().context("START_REPLICATION split")?;
 
+        let tli_cancel = tli.cancel.clone();
+
         let mut sender = WalSender {
             pgb,
             // should succeed since we're already holding another guard
@@ -479,6 +481,9 @@ impl SafekeeperPostgresHandler {
             // todo: add read|write .context to these errors
             r = sender.run() => r,
             r = reply_reader.run() => r,
+            _ = tli_cancel.cancelled() => {
+                return Err(CopyStreamHandlerEnd::Cancelled);
+            }
         };
 
         let ws_state = ws_guard
@@ -557,6 +562,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> WalSender<'_, IO> {
     /// Send WAL until
     /// - an error occurs
     /// - receiver is caughtup and there is no computes (if streaming up to commit_lsn)
+    /// - timeline's cancellation token fires
     ///
     /// Err(CopyStreamHandlerEnd) is always returned; Result is used only for ?
     /// convenience.
@@ -601,15 +607,14 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> WalSender<'_, IO> {
             };
             let send_buf = &send_buf[..send_size];
 
-            // and send it
-            self.pgb
-                .write_message(&BeMessage::XLogData(XLogDataBody {
-                    wal_start: self.start_pos.0,
-                    wal_end: self.end_pos.0,
-                    timestamp: get_current_timestamp(),
-                    data: send_buf,
-                }))
-                .await?;
+            // and send it, while respecting Timeline::cancel
+            let msg = BeMessage::XLogData(XLogDataBody {
+                wal_start: self.start_pos.0,
+                wal_end: self.end_pos.0,
+                timestamp: get_current_timestamp(),
+                data: send_buf,
+            });
+            self.pgb.write_message(&msg).await?;
 
             if let Some(appname) = &self.appname {
                 if appname == "replica" {
@@ -674,13 +679,13 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> WalSender<'_, IO> {
                 }
             }
 
-            self.pgb
-                .write_message(&BeMessage::KeepAlive(WalSndKeepAlive {
-                    wal_end: self.end_pos.0,
-                    timestamp: get_current_timestamp(),
-                    request_reply: true,
-                }))
-                .await?;
+            let msg = BeMessage::KeepAlive(WalSndKeepAlive {
+                wal_end: self.end_pos.0,
+                timestamp: get_current_timestamp(),
+                request_reply: true,
+            });
+
+            self.pgb.write_message(&msg).await?;
         }
     }
 
diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs
index 85add6bfea..ef928f7633 100644
--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -9,6 +9,7 @@ use serde::{Deserialize, Serialize};
 use tokio::fs::{self};
 use tokio_util::sync::CancellationToken;
 use utils::id::TenantId;
+use utils::sync::gate::Gate;
 
 use std::cmp::max;
 use std::ops::{Deref, DerefMut};
@@ -467,6 +468,10 @@ pub struct Timeline {
     timeline_dir: Utf8PathBuf,
     manager_ctl: ManagerCtl,
 
+    /// Hold this gate from code that depends on the Timeline's non-shut-down state.  While holding
+    /// this gate, you must respect [`Timeline::cancel`]
+    pub(crate) gate: Gate,
+
     /// Delete/cancel will trigger this, background tasks should drop out as soon as it fires
     pub(crate) cancel: CancellationToken,
 
@@ -508,6 +513,7 @@ impl Timeline {
             mutex: RwLock::new(shared_state),
             walsenders: WalSenders::new(walreceivers.clone()),
             walreceivers,
+            gate: Default::default(),
             cancel: CancellationToken::default(),
             manager_ctl: ManagerCtl::new(),
             broker_active: AtomicBool::new(false),
@@ -533,56 +539,6 @@ impl Timeline {
         ))
     }
 
-    /// Initialize fresh timeline on disk and start background tasks. If init
-    /// fails, timeline is cancelled and cannot be used anymore.
-    ///
-    /// Init is transactional, so if it fails, created files will be deleted,
-    /// and state on disk should remain unchanged.
-    pub async fn init_new(
-        self: &Arc<Timeline>,
-        shared_state: &mut WriteGuardSharedState<'_>,
-        conf: &SafeKeeperConf,
-        broker_active_set: Arc<TimelinesSet>,
-        partial_backup_rate_limiter: RateLimiter,
-    ) -> Result<()> {
-        match fs::metadata(&self.timeline_dir).await {
-            Ok(_) => {
-                // Timeline directory exists on disk, we should leave state unchanged
-                // and return error.
-                bail!(TimelineError::Invalid(self.ttid));
-            }
-            Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
-            Err(e) => {
-                return Err(e.into());
-            }
-        }
-
-        // Create timeline directory.
-        fs::create_dir_all(&self.timeline_dir).await?;
-
-        // Write timeline to disk and start background tasks.
-        if let Err(e) = shared_state.sk.state_mut().flush().await {
-            // Bootstrap failed, cancel timeline and remove timeline directory.
-            self.cancel(shared_state);
-
-            if let Err(fs_err) = fs::remove_dir_all(&self.timeline_dir).await {
-                warn!(
-                    "failed to remove timeline {} directory after bootstrap failure: {}",
-                    self.ttid, fs_err
-                );
-            }
-
-            return Err(e);
-        }
-        self.bootstrap(
-            shared_state,
-            conf,
-            broker_active_set,
-            partial_backup_rate_limiter,
-        );
-        Ok(())
-    }
-
     /// Bootstrap new or existing timeline starting background tasks.
     pub fn bootstrap(
         self: &Arc<Timeline>,
@@ -593,33 +549,61 @@ impl Timeline {
     ) {
         let (tx, rx) = self.manager_ctl.bootstrap_manager();
 
+        let Ok(gate_guard) = self.gate.enter() else {
+            // Init raced with shutdown
+            return;
+        };
+
         // Start manager task which will monitor timeline state and update
         // background tasks.
-        tokio::spawn(timeline_manager::main_task(
-            ManagerTimeline { tli: self.clone() },
-            conf.clone(),
-            broker_active_set,
-            tx,
-            rx,
-            partial_backup_rate_limiter,
-        ));
+        tokio::spawn({
+            let this = self.clone();
+            let conf = conf.clone();
+            async move {
+                let _gate_guard = gate_guard;
+                timeline_manager::main_task(
+                    ManagerTimeline { tli: this },
+                    conf,
+                    broker_active_set,
+                    tx,
+                    rx,
+                    partial_backup_rate_limiter,
+                )
+                .await
+            }
+        });
+    }
+
+    /// Background timeline activities (which hold Timeline::gate) will no
+    /// longer run once this function completes.
+    pub async fn shutdown(&self) {
+        info!("timeline {} shutting down", self.ttid);
+        self.cancel.cancel();
+
+        // Wait for any concurrent tasks to stop using this timeline, to avoid e.g. attempts
+        // to read deleted files.
+        self.gate.close().await;
     }
 
     /// Delete timeline from disk completely, by removing timeline directory.
-    /// Background timeline activities will stop eventually.
     ///
     /// Also deletes WAL in s3. Might fail if e.g. s3 is unavailable, but
     /// deletion API endpoint is retriable.
+    ///
+    /// Timeline must be in shut-down state (i.e. call [`Self::shutdown`] first)
     pub async fn delete(
         &self,
         shared_state: &mut WriteGuardSharedState<'_>,
         only_local: bool,
     ) -> Result<bool> {
-        self.cancel(shared_state);
+        // Assert that [`Self::shutdown`] was already called
+        assert!(self.cancel.is_cancelled());
+        assert!(self.gate.close_complete());
+
+        // Close associated FDs. Nobody will be able to touch timeline data once
+        // it is cancelled, so WAL storage won't be opened again.
+        shared_state.sk.close_wal_store();
 
-        // TODO: It's better to wait for s3 offloader termination before
-        // removing data from s3. Though since s3 doesn't have transactions it
-        // still wouldn't guarantee absense of data after removal.
         let conf = GlobalTimelines::get_global_config();
         if !only_local && conf.is_wal_backup_enabled() {
             // Note: we concurrently delete remote storage data from multiple
@@ -631,16 +615,6 @@ impl Timeline {
         Ok(dir_existed)
     }
 
-    /// Cancel timeline to prevent further usage. Background tasks will stop
-    /// eventually after receiving cancellation signal.
-    fn cancel(&self, shared_state: &mut WriteGuardSharedState<'_>) {
-        info!("timeline {} is cancelled", self.ttid);
-        self.cancel.cancel();
-        // Close associated FDs. Nobody will be able to touch timeline data once
-        // it is cancelled, so WAL storage won't be opened again.
-        shared_state.sk.close_wal_store();
-    }
-
     /// Returns if timeline is cancelled.
     pub fn is_cancelled(&self) -> bool {
         self.cancel.is_cancelled()
diff --git a/safekeeper/src/timeline_guard.rs b/safekeeper/src/timeline_guard.rs
index 1ddac573d2..9102a40df8 100644
--- a/safekeeper/src/timeline_guard.rs
+++ b/safekeeper/src/timeline_guard.rs
@@ -7,6 +7,7 @@
 use std::collections::HashSet;
 
 use tracing::debug;
+use utils::sync::gate::GateGuard;
 
 use crate::timeline_manager::ManagerCtlMessage;
 
@@ -16,6 +17,12 @@ pub struct GuardId(u64);
 pub struct ResidenceGuard {
     manager_tx: tokio::sync::mpsc::UnboundedSender<ManagerCtlMessage>,
     guard_id: GuardId,
+
+    /// [`ResidenceGuard`] represents a guarantee that a timeline's data remains resident,
+    /// which by extension also means the timeline is not shut down (since after shut down
+    /// our data may be deleted). Therefore everyone holding a residence guard must also
+    /// hold a guard on [`crate::timeline::Timeline::gate`]
+    _gate_guard: GateGuard,
 }
 
 impl Drop for ResidenceGuard {
@@ -52,7 +59,8 @@ impl AccessService {
         self.guards.is_empty()
     }
 
-    pub(crate) fn create_guard(&mut self) -> ResidenceGuard {
+    /// `timeline_gate_guard` is a guarantee that the timeline is not shut down
+    pub(crate) fn create_guard(&mut self, timeline_gate_guard: GateGuard) -> ResidenceGuard {
         let guard_id = self.next_guard_id;
         self.next_guard_id += 1;
         self.guards.insert(guard_id);
@@ -63,6 +71,7 @@ impl AccessService {
         ResidenceGuard {
             manager_tx: self.manager_tx.clone(),
             guard_id,
+            _gate_guard: timeline_gate_guard,
         }
     }
 
diff --git a/safekeeper/src/timeline_manager.rs b/safekeeper/src/timeline_manager.rs
index e9fed21bf5..c02fb904cf 100644
--- a/safekeeper/src/timeline_manager.rs
+++ b/safekeeper/src/timeline_manager.rs
@@ -266,8 +266,10 @@ pub async fn main_task(
 
     // Start recovery task which always runs on the timeline.
     if !mgr.is_offloaded && mgr.conf.peer_recovery_enabled {
-        let tli = mgr.wal_resident_timeline();
-        mgr.recovery_task = Some(tokio::spawn(recovery_main(tli, mgr.conf.clone())));
+        // Recovery task is only spawned if we can get a residence guard (i.e. timeline is not already shutting down)
+        if let Ok(tli) = mgr.wal_resident_timeline() {
+            mgr.recovery_task = Some(tokio::spawn(recovery_main(tli, mgr.conf.clone())));
+        }
     }
 
     // If timeline is evicted, reflect that in the metric.
@@ -375,6 +377,13 @@ pub async fn main_task(
 
     // shutdown background tasks
     if mgr.conf.is_wal_backup_enabled() {
+        if let Some(backup_task) = mgr.backup_task.take() {
+            // If we fell through here, then the timeline is shutting down. This is important
+            // because otherwise joining on the wal_backup handle might hang.
+            assert!(mgr.tli.cancel.is_cancelled());
+
+            backup_task.join().await;
+        }
         wal_backup::update_task(&mut mgr, false, &last_state).await;
     }
 
@@ -442,10 +451,18 @@ impl Manager {
     /// Get a WalResidentTimeline.
     /// Manager code must use this function instead of one from `Timeline`
     /// directly, because it will deadlock.
-    pub(crate) fn wal_resident_timeline(&mut self) -> WalResidentTimeline {
+    ///
+    /// This function is fallible because the guard may not be created if the timeline is
+    /// shutting down.
+    pub(crate) fn wal_resident_timeline(&mut self) -> anyhow::Result<WalResidentTimeline> {
         assert!(!self.is_offloaded);
-        let guard = self.access_service.create_guard();
-        WalResidentTimeline::new(self.tli.clone(), guard)
+        let guard = self.access_service.create_guard(
+            self.tli
+                .gate
+                .enter()
+                .map_err(|_| anyhow::anyhow!("Timeline shutting down"))?,
+        );
+        Ok(WalResidentTimeline::new(self.tli.clone(), guard))
     }
 
     /// Get a snapshot of the timeline state.
@@ -559,6 +576,11 @@ impl Manager {
 
         if removal_horizon_segno > self.last_removed_segno {
             // we need to remove WAL
+            let Ok(timeline_gate_guard) = self.tli.gate.enter() else {
+                tracing::info!("Timeline shutdown, not spawning WAL removal task");
+                return;
+            };
+
             let remover = match self.tli.read_shared_state().await.sk {
                 StateSK::Loaded(ref sk) => {
                     crate::wal_storage::Storage::remove_up_to(&sk.wal_store, removal_horizon_segno)
@@ -573,6 +595,8 @@ impl Manager {
 
             self.wal_removal_task = Some(tokio::spawn(
                 async move {
+                    let _timeline_gate_guard = timeline_gate_guard;
+
                     remover.await?;
                     Ok(removal_horizon_segno)
                 }
@@ -619,10 +643,15 @@ impl Manager {
             return;
         }
 
+        let Ok(resident) = self.wal_resident_timeline() else {
+            // Shutting down
+            return;
+        };
+
         // Get WalResidentTimeline and start partial backup task.
         let cancel = CancellationToken::new();
         let handle = tokio::spawn(wal_backup_partial::main_task(
-            self.wal_resident_timeline(),
+            resident,
             self.conf.clone(),
             self.global_rate_limiter.clone(),
             cancel.clone(),
@@ -664,7 +693,7 @@ impl Manager {
             self.partial_backup_task = None;
         }
 
-        let tli = self.wal_resident_timeline();
+        let tli = self.wal_resident_timeline()?;
         let mut partial_backup = PartialBackup::new(tli, self.conf.clone()).await;
         // Reset might fail e.g. when cfile is already reset but s3 removal
         // failed, so set manager state to None beforehand. In any case caller
@@ -688,7 +717,12 @@ impl Manager {
                 let guard = if self.is_offloaded {
                     Err(anyhow::anyhow!("timeline is offloaded, can't get a guard"))
                 } else {
-                    Ok(self.access_service.create_guard())
+                    match self.tli.gate.enter() {
+                        Ok(gate_guard) => Ok(self.access_service.create_guard(gate_guard)),
+                        Err(_) => Err(anyhow::anyhow!(
+                            "timeline is shutting down, can't get a guard"
+                        )),
+                    }
                 };
 
                 if tx.send(guard).is_err() {
@@ -699,7 +733,10 @@ impl Manager {
                 let result = if self.is_offloaded {
                     None
                 } else {
-                    Some(self.access_service.create_guard())
+                    match self.tli.gate.enter() {
+                        Ok(gate_guard) => Some(self.access_service.create_guard(gate_guard)),
+                        Err(_) => None,
+                    }
                 };
 
                 if tx.send(result).is_err() {
diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs
index 33d94da034..067945fd5f 100644
--- a/safekeeper/src/timelines_global_map.rs
+++ b/safekeeper/src/timelines_global_map.rs
@@ -457,10 +457,12 @@ impl GlobalTimelines {
             Ok(timeline) => {
                 let was_active = timeline.broker_active.load(Ordering::Relaxed);
 
+                info!("deleting timeline {}, only_local={}", ttid, only_local);
+                timeline.shutdown().await;
+
                 // Take a lock and finish the deletion holding this mutex.
                 let mut shared_state = timeline.write_shared_state().await;
 
-                info!("deleting timeline {}, only_local={}", ttid, only_local);
                 let dir_existed = timeline.delete(&mut shared_state, only_local).await?;
 
                 Ok(TimelineDeleteForceResult {
diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs
index 6c87e5a926..34b5dbeaa1 100644
--- a/safekeeper/src/wal_backup.rs
+++ b/safekeeper/src/wal_backup.rs
@@ -25,7 +25,6 @@ use tokio::fs::File;
 use tokio::select;
 use tokio::sync::mpsc::{self, Receiver, Sender};
 use tokio::sync::{watch, OnceCell};
-use tokio::time::sleep;
 use tracing::*;
 
 use utils::{id::TenantTimelineId, lsn::Lsn};
@@ -46,6 +45,14 @@ pub struct WalBackupTaskHandle {
     handle: JoinHandle<()>,
 }
 
+impl WalBackupTaskHandle {
+    pub(crate) async fn join(self) {
+        if let Err(e) = self.handle.await {
+            error!("WAL backup task panicked: {}", e);
+        }
+    }
+}
+
 /// Do we have anything to upload to S3, i.e. should safekeepers run backup activity?
 pub(crate) fn is_wal_backup_required(
     wal_seg_size: usize,
@@ -74,11 +81,12 @@ pub(crate) async fn update_task(mgr: &mut Manager, need_backup: bool, state: &St
 
             let (shutdown_tx, shutdown_rx) = mpsc::channel(1);
 
-            let async_task = backup_task_main(
-                mgr.wal_resident_timeline(),
-                mgr.conf.backup_parallel_jobs,
-                shutdown_rx,
-            );
+            let Ok(resident) = mgr.wal_resident_timeline() else {
+                info!("Timeline shut down");
+                return;
+            };
+
+            let async_task = backup_task_main(resident, mgr.conf.backup_parallel_jobs, shutdown_rx);
 
             let handle = if mgr.conf.current_thread_runtime {
                 tokio::spawn(async_task)
@@ -108,9 +116,7 @@ async fn shut_down_task(entry: &mut Option<WalBackupTaskHandle>) {
         // Tell the task to shutdown. Error means task exited earlier, that's ok.
         let _ = wb_handle.shutdown_tx.send(()).await;
         // Await the task itself. TODO: restart panicked tasks earlier.
-        if let Err(e) = wb_handle.handle.await {
-            warn!("WAL backup task panicked: {}", e);
-        }
+        wb_handle.join().await;
     }
 }
 
@@ -214,6 +220,7 @@ async fn backup_task_main(
     let _guard = WAL_BACKUP_TASKS.guard();
     info!("started");
 
+    let cancel = tli.tli.cancel.clone();
     let mut wb = WalBackupTask {
         wal_seg_size: tli.get_wal_seg_size().await,
         commit_lsn_watch_rx: tli.get_commit_lsn_watch_rx(),
@@ -230,25 +237,34 @@ async fn backup_task_main(
         _ = wb.run() => {}
         _ = shutdown_rx.recv() => {
             canceled = true;
+        },
+        _ = cancel.cancelled() => {
+            canceled = true;
         }
     }
     info!("task {}", if canceled { "canceled" } else { "terminated" });
 }
 
 impl WalBackupTask {
+    /// This function must be called from a select! that also respects self.timeline's
+    /// cancellation token.  This is done in [`backup_task_main`].
+    ///
+    /// The future returned by this function is safe to drop at any time because it
+    /// does not write to local disk.
     async fn run(&mut self) {
         let mut backup_lsn = Lsn(0);
 
         let mut retry_attempt = 0u32;
         // offload loop
-        loop {
+        while !self.timeline.cancel.is_cancelled() {
             if retry_attempt == 0 {
                 // wait for new WAL to arrive
                 if let Err(e) = self.commit_lsn_watch_rx.changed().await {
-                    // should never happen, as we hold Arc to timeline.
+                    // should never happen, as we hold Arc to timeline and transmitter's lifetime
+                    // is within Timeline's
                     error!("commit_lsn watch shut down: {:?}", e);
                     return;
-                }
+                };
             } else {
                 // or just sleep if we errored previously
                 let mut retry_delay = UPLOAD_FAILURE_RETRY_MAX_MS;
@@ -256,7 +272,7 @@ impl WalBackupTask {
                 {
                     retry_delay = min(retry_delay, backoff_delay);
                 }
-                sleep(Duration::from_millis(retry_delay)).await;
+                tokio::time::sleep(Duration::from_millis(retry_delay)).await;
             }
 
             let commit_lsn = *self.commit_lsn_watch_rx.borrow();
diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py
index 0676b3dd9a..6eaaa3c37f 100644
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -1784,6 +1784,89 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
             cur.execute("INSERT INTO t (key) VALUES (123)")
 
 
+def test_delete_timeline_under_load(neon_env_builder: NeonEnvBuilder):
+    """
+    Test deleting timelines on a safekeeper while they're under load.
+
+    This should not happen under normal operation, but it can happen if
+    there is some rogue compute/pageserver that is writing/reading to a
+    safekeeper that we're migrating a timeline away from, or if the timeline
+    is being deleted while such a rogue client is running.
+    """
+    neon_env_builder.auth_enabled = True
+    env = neon_env_builder.init_start()
+
+    # Create two endpoints that will generate load
+    timeline_id_a = env.create_branch("deleteme_a")
+    timeline_id_b = env.create_branch("deleteme_b")
+
+    endpoint_a = env.endpoints.create("deleteme_a")
+    endpoint_a.start()
+    endpoint_b = env.endpoints.create("deleteme_b")
+    endpoint_b.start()
+
+    # Get tenant and timeline IDs
+    tenant_id = env.initial_tenant
+
+    # Start generating load on both timelines
+    def generate_load(endpoint: Endpoint):
+        with closing(endpoint.connect()) as conn:
+            with conn.cursor() as cur:
+                cur.execute("CREATE TABLE IF NOT EXISTS t(key int, value text)")
+                while True:
+                    try:
+                        cur.execute("INSERT INTO t SELECT generate_series(1,1000), 'data'")
+                    except:  # noqa
+                        # Ignore errors since timeline may be deleted
+                        break
+
+    t_a = threading.Thread(target=generate_load, args=(endpoint_a,))
+    t_b = threading.Thread(target=generate_load, args=(endpoint_b,))
+    try:
+        t_a.start()
+        t_b.start()
+
+        # Let the load run for a bit
+        log.info("Warming up...")
+        time.sleep(2)
+
+        # Safekeeper errors will propagate to the pageserver: it is correct that these are
+        # logged at error severity because they indicate the pageserver is trying to read
+        # a timeline that it shouldn't.
+        env.pageserver.allowed_errors.extend(
+            [
+                ".*Timeline.*was cancelled.*",
+                ".*Timeline.*was not found.*",
+            ]
+        )
+
+        # Try deleting timelines while under load
+        sk = env.safekeepers[0]
+        sk_http = sk.http_client(auth_token=env.auth_keys.generate_tenant_token(tenant_id))
+
+        # Delete first timeline
+        log.info(f"Deleting {timeline_id_a}...")
+        assert sk_http.timeline_delete(tenant_id, timeline_id_a, only_local=True)["dir_existed"]
+
+        # Delete second timeline
+        log.info(f"Deleting {timeline_id_b}...")
+        assert sk_http.timeline_delete(tenant_id, timeline_id_b, only_local=True)["dir_existed"]
+
+        # Verify timelines are gone from disk
+        sk_data_dir = sk.data_dir
+        assert not (sk_data_dir / str(tenant_id) / str(timeline_id_a)).exists()
+        # assert not (sk_data_dir / str(tenant_id) / str(timeline_id_b)).exists()
+
+    finally:
+        log.info("Stopping endpoints...")
+        # Stop endpoints with immediate mode because we deleted the timeline out from under the compute, which may cause it to hang
+        endpoint_a.stop(mode="immediate")
+        endpoint_b.stop(mode="immediate")
+        log.info("Joining threads...")
+        t_a.join()
+        t_b.join()
+
+
 # Basic pull_timeline test.
 # When live_sk_change is False, compute is restarted to change set of
 # safekeepers; otherwise it is live reload.

From 94e4a0e2a0d43e066bd006a68eb147333ab0d074 Mon Sep 17 00:00:00 2001
From: Fedor Dikarev <fedor@neon.tech>
Date: Wed, 20 Nov 2024 13:04:14 +0100
Subject: [PATCH 13/24] update macos version for runner (#9817)

Closes: https://github.com/neondatabase/neon/issues/9816

Run macOS builds on `macos-15`.
As `pkg-config` is bundled in the runner image, don't install it with `brew`.
---
 .github/workflows/neon_extra_builds.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml
index e827539c80..092831adb9 100644
--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -38,7 +38,7 @@ jobs:
       contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
       github.ref_name == 'main'
     timeout-minutes: 90
-    runs-on: macos-14
+    runs-on: macos-15
 
     env:
       # Use release build only, to have less debug info around
@@ -52,7 +52,7 @@ jobs:
           submodules: true
 
       - name: Install macOS postgres dependencies
-        run: brew install flex bison openssl protobuf icu4c pkg-config
+        run: brew install flex bison openssl protobuf icu4c
 
       - name: Set pg 14 revision for caching
         id: pg_v14_rev

From 46beecacce50bf1d113dbb6f31fe2283a598adf7 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Wed, 20 Nov 2024 12:23:41 +0000
Subject: [PATCH 14/24] CI(benchmarking): route test failures to
 on-call-qa-staging-stream (#9813)

## Problem

We want to keep `#on-call-staging-stream` channel close to the prod one
and redirect notifications from failing benchmarks to another channel
for investigation.

## Summary of changes
- Send notifications regarding failures in `benchmarking` job to
`#on-call-qa-staging-stream`
- Send notifications regarding failures in `periodic_pagebench` job to
`#on-call-qa-staging-stream`
---
 .github/workflows/benchmarking.yml       | 12 ++++++------
 .github/workflows/periodic_pagebench.yml |  8 ++++----
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml
index 0289f552f9..acea859b4d 100644
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -158,7 +158,7 @@ jobs:
       if: ${{ github.event.schedule && failure() }}
       uses: slackapi/slack-github-action@v1
       with:
-        channel-id: "C033QLM5P7D" # dev-staging-stream
+        channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream
         slack-message: |
           Periodic perf testing: ${{ job.status }}
           <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
@@ -506,7 +506,7 @@ jobs:
       if: ${{ github.event.schedule && failure() }}
       uses: slackapi/slack-github-action@v1
       with:
-        channel-id: "C033QLM5P7D" # dev-staging-stream
+        channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream
         slack-message: |
           Periodic perf testing on ${{ matrix.platform }}: ${{ job.status }}
           <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
@@ -643,7 +643,7 @@ jobs:
       if: ${{ github.event.schedule && failure() }}
       uses: slackapi/slack-github-action@v1
       with:
-        channel-id: "C033QLM5P7D" # dev-staging-stream
+        channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream
         slack-message: |
           Periodic perf testing on ${{ env.PLATFORM }}: ${{ job.status }}
           <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
@@ -759,7 +759,7 @@ jobs:
       if: ${{ github.event.schedule && failure() }}
       uses: slackapi/slack-github-action@v1
       with:
-        channel-id: "C033QLM5P7D" # dev-staging-stream
+        channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream
         slack-message: |
           Periodic OLAP perf testing on ${{ matrix.platform }}: ${{ job.status }}
           <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
@@ -874,7 +874,7 @@ jobs:
       if: ${{ github.event.schedule && failure() }}
       uses: slackapi/slack-github-action@v1
       with:
-        channel-id: "C033QLM5P7D" # dev-staging-stream
+        channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream
         slack-message: |
           Periodic TPC-H perf testing on ${{ matrix.platform }}: ${{ job.status }}
           <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
@@ -974,7 +974,7 @@ jobs:
       if: ${{ github.event.schedule && failure() }}
       uses: slackapi/slack-github-action@v1
       with:
-        channel-id: "C033QLM5P7D" # dev-staging-stream
+        channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream
         slack-message: |
           Periodic TPC-H perf testing on ${{ matrix.platform }}: ${{ job.status }}
           <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
diff --git a/.github/workflows/periodic_pagebench.yml b/.github/workflows/periodic_pagebench.yml
index 615937b5a1..1cce348ae2 100644
--- a/.github/workflows/periodic_pagebench.yml
+++ b/.github/workflows/periodic_pagebench.yml
@@ -72,7 +72,7 @@ jobs:
           echo "COMMIT_HASH=$INPUT_COMMIT_HASH" >> $GITHUB_ENV
         fi
 
-    - name: Start Bench with run_id   
+    - name: Start Bench with run_id
       run: |
         curl -k -X 'POST' \
         "${EC2_MACHINE_URL_US}/start_test/${GITHUB_RUN_ID}" \
@@ -116,7 +116,7 @@ jobs:
         -H 'accept: application/gzip' \
         -H "Authorization: Bearer $API_KEY" \
         --output "test_log_${GITHUB_RUN_ID}.gz"
-    
+
     - name: Unzip Test Log and Print it into this job's log
       if: always() && steps.poll_step.outputs.too_many_runs != 'true'
       run: |
@@ -134,13 +134,13 @@ jobs:
       if: ${{ github.event.schedule && failure() }}
       uses: slackapi/slack-github-action@v1
       with:
-        channel-id: "C033QLM5P7D" # dev-staging-stream
+        channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream
         slack-message: "Periodic pagebench testing on dedicated hardware: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
       env:
         SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
 
     - name: Cleanup Test Resources
-      if: always() 
+      if: always()
       run: |
         curl -k -X 'POST' \
         "${EC2_MACHINE_URL_US}/cleanup_test/${GITHUB_RUN_ID}" \

From 899933e159b56d8cfe92995befff8b37d6eb55b8 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Wed, 20 Nov 2024 12:48:21 +0000
Subject: [PATCH 15/24] scan_log_for_errors: check that regex is correct
 (#9815)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Problem

I've noticed that we have 2 flaky tests which failed with error:
```
re.error: missing ), unterminated subpattern at position 21
```

- `test_timeline_archival_chaos` — has already been fixed
- `test_sharded_tad_interleaved_after_partial_success` — I didn't manage
to find the incorrect regex

[Internal link](https://neonprod.grafana.net/goto/yfmVHV7NR?orgId=1)

## Summary of changes
- Wrap `re.match` in `try..except` block and print incorrect regex
---
 test_runner/fixtures/pageserver/allowed_errors.py | 10 ++++++++--
 test_runner/fixtures/utils.py                     | 10 ++++++++--
 2 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py
index d05704c8e0..5059039678 100755
--- a/test_runner/fixtures/pageserver/allowed_errors.py
+++ b/test_runner/fixtures/pageserver/allowed_errors.py
@@ -25,8 +25,14 @@ def scan_pageserver_log_for_errors(
 
             # It's an ERROR or WARN. Is it in the allow-list?
             for a in allowed_errors:
-                if re.match(a, line):
-                    break
+                try:
+                    if re.match(a, line):
+                        break
+                # We can switch `re.error` with `re.PatternError` after 3.13
+                # https://docs.python.org/3/library/re.html#re.PatternError
+                except re.error:
+                    print(f"Invalid regex: '{a}'", file=sys.stderr)
+                    raise
             else:
                 errors.append((lineno, line))
     return errors
diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py
index 96a651f0db..bb45385ea6 100644
--- a/test_runner/fixtures/utils.py
+++ b/test_runner/fixtures/utils.py
@@ -495,8 +495,14 @@ def scan_log_for_errors(input: Iterable[str], allowed_errors: list[str]) -> list
 
             # It's an ERROR or WARN. Is it in the allow-list?
             for a in allowed_errors:
-                if re.match(a, line):
-                    break
+                try:
+                    if re.match(a, line):
+                        break
+                # We can switch `re.error` with `re.PatternError` after 3.13
+                # https://docs.python.org/3/library/re.html#re.PatternError
+                except re.error:
+                    log.error(f"Invalid regex: '{a}'")
+                    raise
             else:
                 errors.append((lineno, line))
     return errors

From bf7d859a8bdb26a6ac4ce1c17fec948d7bcecdcb Mon Sep 17 00:00:00 2001
From: Folke Behrens <folke@neon.tech>
Date: Wed, 20 Nov 2024 13:50:36 +0100
Subject: [PATCH 16/24] proxy: Rename RequestMonitoring to RequestContext
 (#9805)

## Problem

It is called context/ctx everywhere, and the Monitoring suffix needlessly
causes confusion with proper monitoring code.

## Summary of changes

* Rename RequestMonitoring to RequestContext
* Rename RequestMonitoringInner to RequestContextInner
---
 proxy/src/auth/backend/classic.rs          |  4 +--
 proxy/src/auth/backend/console_redirect.rs |  8 +++---
 proxy/src/auth/backend/hacks.rs            |  6 ++--
 proxy/src/auth/backend/jwt.rs              | 26 +++++++++---------
 proxy/src/auth/backend/local.rs            |  4 +--
 proxy/src/auth/backend/mod.rs              | 32 +++++++++++-----------
 proxy/src/auth/credentials.rs              | 30 ++++++++++----------
 proxy/src/auth/flow.rs                     |  4 +--
 proxy/src/bin/pg_sni_router.rs             |  8 +++---
 proxy/src/cache/endpoints.rs               |  4 +--
 proxy/src/compute.rs                       |  4 +--
 proxy/src/console_redirect_proxy.rs        |  6 ++--
 proxy/src/context/mod.rs                   | 22 +++++++--------
 proxy/src/context/parquet.rs               |  6 ++--
 proxy/src/control_plane/client/mock.rs     | 10 +++----
 proxy/src/control_plane/client/mod.rs      | 12 ++++----
 proxy/src/control_plane/client/neon.rs     | 16 +++++------
 proxy/src/control_plane/mod.rs             | 12 ++++----
 proxy/src/proxy/connect_compute.rs         | 10 +++----
 proxy/src/proxy/handshake.rs               |  4 +--
 proxy/src/proxy/mod.rs                     |  6 ++--
 proxy/src/proxy/tests/mitm.rs              |  2 +-
 proxy/src/proxy/tests/mod.rs               | 27 +++++++++---------
 proxy/src/proxy/wake_compute.rs            |  4 +--
 proxy/src/serverless/backend.rs            | 16 +++++------
 proxy/src/serverless/conn_pool.rs          |  4 +--
 proxy/src/serverless/conn_pool_lib.rs      |  4 +--
 proxy/src/serverless/http_conn_pool.rs     |  6 ++--
 proxy/src/serverless/local_conn_pool.rs    |  6 ++--
 proxy/src/serverless/mod.rs                |  6 ++--
 proxy/src/serverless/sql_over_http.rs      | 12 ++++----
 proxy/src/serverless/websocket.rs          |  4 +--
 32 files changed, 162 insertions(+), 163 deletions(-)

diff --git a/proxy/src/auth/backend/classic.rs b/proxy/src/auth/backend/classic.rs
index 87a02133c8..491b272ac4 100644
--- a/proxy/src/auth/backend/classic.rs
+++ b/proxy/src/auth/backend/classic.rs
@@ -5,13 +5,13 @@ use super::{ComputeCredentials, ComputeUserInfo};
 use crate::auth::backend::ComputeCredentialKeys;
 use crate::auth::{self, AuthFlow};
 use crate::config::AuthenticationConfig;
-use crate::context::RequestMonitoring;
+use crate::context::RequestContext;
 use crate::control_plane::AuthSecret;
 use crate::stream::{PqStream, Stream};
 use crate::{compute, sasl};
 
 pub(super) async fn authenticate(
-    ctx: &RequestMonitoring,
+    ctx: &RequestContext,
     creds: ComputeUserInfo,
     client: &mut PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
     config: &'static AuthenticationConfig,
diff --git a/proxy/src/auth/backend/console_redirect.rs b/proxy/src/auth/backend/console_redirect.rs
index e25dc3d45e..5772471486 100644
--- a/proxy/src/auth/backend/console_redirect.rs
+++ b/proxy/src/auth/backend/console_redirect.rs
@@ -8,7 +8,7 @@ use tracing::{info, info_span};
 use super::ComputeCredentialKeys;
 use crate::cache::Cached;
 use crate::config::AuthenticationConfig;
-use crate::context::RequestMonitoring;
+use crate::context::RequestContext;
 use crate::control_plane::{self, CachedNodeInfo, NodeInfo};
 use crate::error::{ReportableError, UserFacingError};
 use crate::proxy::connect_compute::ComputeConnectBackend;
@@ -71,7 +71,7 @@ impl ConsoleRedirectBackend {
 
     pub(crate) async fn authenticate(
         &self,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         auth_config: &'static AuthenticationConfig,
         client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
     ) -> auth::Result<ConsoleRedirectNodeInfo> {
@@ -87,7 +87,7 @@ pub struct ConsoleRedirectNodeInfo(pub(super) NodeInfo);
 impl ComputeConnectBackend for ConsoleRedirectNodeInfo {
     async fn wake_compute(
         &self,
-        _ctx: &RequestMonitoring,
+        _ctx: &RequestContext,
     ) -> Result<CachedNodeInfo, control_plane::errors::WakeComputeError> {
         Ok(Cached::new_uncached(self.0.clone()))
     }
@@ -98,7 +98,7 @@ impl ComputeConnectBackend for ConsoleRedirectNodeInfo {
 }
 
 async fn authenticate(
-    ctx: &RequestMonitoring,
+    ctx: &RequestContext,
     auth_config: &'static AuthenticationConfig,
     link_uri: &reqwest::Url,
     client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
diff --git a/proxy/src/auth/backend/hacks.rs b/proxy/src/auth/backend/hacks.rs
index e651df1d34..3316543022 100644
--- a/proxy/src/auth/backend/hacks.rs
+++ b/proxy/src/auth/backend/hacks.rs
@@ -4,7 +4,7 @@ use tracing::{debug, info};
 use super::{ComputeCredentials, ComputeUserInfo, ComputeUserInfoNoEndpoint};
 use crate::auth::{self, AuthFlow};
 use crate::config::AuthenticationConfig;
-use crate::context::RequestMonitoring;
+use crate::context::RequestContext;
 use crate::control_plane::AuthSecret;
 use crate::intern::EndpointIdInt;
 use crate::sasl;
@@ -15,7 +15,7 @@ use crate::stream::{self, Stream};
 /// These properties are benefical for serverless JS workers, so we
 /// use this mechanism for websocket connections.
 pub(crate) async fn authenticate_cleartext(
-    ctx: &RequestMonitoring,
+    ctx: &RequestContext,
     info: ComputeUserInfo,
     client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
     secret: AuthSecret,
@@ -57,7 +57,7 @@ pub(crate) async fn authenticate_cleartext(
 /// Similar to [`authenticate_cleartext`], but there's a specific password format,
 /// and passwords are not yet validated (we don't know how to validate them!)
 pub(crate) async fn password_hack_no_authentication(
-    ctx: &RequestMonitoring,
+    ctx: &RequestContext,
     info: ComputeUserInfoNoEndpoint,
     client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
 ) -> auth::Result<(ComputeUserInfo, Vec<u8>)> {
diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs
index bfc674139b..f721d81aa2 100644
--- a/proxy/src/auth/backend/jwt.rs
+++ b/proxy/src/auth/backend/jwt.rs
@@ -17,7 +17,7 @@ use thiserror::Error;
 use tokio::time::Instant;
 
 use crate::auth::backend::ComputeCredentialKeys;
-use crate::context::RequestMonitoring;
+use crate::context::RequestContext;
 use crate::control_plane::errors::GetEndpointJwksError;
 use crate::http::read_body_with_limit;
 use crate::intern::RoleNameInt;
@@ -39,7 +39,7 @@ const JWKS_FETCH_RETRIES: u32 = 3;
 pub(crate) trait FetchAuthRules: Clone + Send + Sync + 'static {
     fn fetch_auth_rules(
         &self,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         endpoint: EndpointId,
     ) -> impl Future<Output = Result<Vec<AuthRule>, FetchAuthRulesError>> + Send;
 }
@@ -144,7 +144,7 @@ impl JwkCacheEntryLock {
     async fn renew_jwks<F: FetchAuthRules>(
         &self,
         _permit: JwkRenewalPermit<'_>,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         client: &reqwest_middleware::ClientWithMiddleware,
         endpoint: EndpointId,
         auth_rules: &F,
@@ -261,7 +261,7 @@ impl JwkCacheEntryLock {
 
     async fn get_or_update_jwk_cache<F: FetchAuthRules>(
         self: &Arc<Self>,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         client: &reqwest_middleware::ClientWithMiddleware,
         endpoint: EndpointId,
         fetch: &F,
@@ -314,7 +314,7 @@ impl JwkCacheEntryLock {
 
     async fn check_jwt<F: FetchAuthRules>(
         self: &Arc<Self>,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         jwt: &str,
         client: &reqwest_middleware::ClientWithMiddleware,
         endpoint: EndpointId,
@@ -409,7 +409,7 @@ impl JwkCacheEntryLock {
 impl JwkCache {
     pub(crate) async fn check_jwt<F: FetchAuthRules>(
         &self,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         endpoint: EndpointId,
         role_name: &RoleName,
         fetch: &F,
@@ -941,7 +941,7 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL
     impl FetchAuthRules for Fetch {
         async fn fetch_auth_rules(
             &self,
-            _ctx: &RequestMonitoring,
+            _ctx: &RequestContext,
             _endpoint: EndpointId,
         ) -> Result<Vec<AuthRule>, FetchAuthRulesError> {
             Ok(self.0.clone())
@@ -1039,7 +1039,7 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL
             for token in &tokens {
                 jwk_cache
                     .check_jwt(
-                        &RequestMonitoring::test(),
+                        &RequestContext::test(),
                         endpoint.clone(),
                         role,
                         &fetch,
@@ -1097,7 +1097,7 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL
 
         jwk_cache
             .check_jwt(
-                &RequestMonitoring::test(),
+                &RequestContext::test(),
                 endpoint.clone(),
                 &role_name,
                 &fetch,
@@ -1136,7 +1136,7 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL
 
         let ep = EndpointId::from("ep");
 
-        let ctx = RequestMonitoring::test();
+        let ctx = RequestContext::test();
         let err = jwk_cache
             .check_jwt(&ctx, ep, &role, &fetch, &bad_jwt)
             .await
@@ -1175,7 +1175,7 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL
         // this role_name is not accepted
         let bad_role_name = RoleName::from("cloud_admin");
 
-        let ctx = RequestMonitoring::test();
+        let ctx = RequestContext::test();
         let err = jwk_cache
             .check_jwt(&ctx, ep, &bad_role_name, &fetch, &jwt)
             .await
@@ -1268,7 +1268,7 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL
 
         let ep = EndpointId::from("ep");
 
-        let ctx = RequestMonitoring::test();
+        let ctx = RequestContext::test();
         for test in table {
             let jwt = new_custom_ec_jwt("1".into(), &key, test.body);
 
@@ -1336,7 +1336,7 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL
 
         jwk_cache
             .check_jwt(
-                &RequestMonitoring::test(),
+                &RequestContext::test(),
                 endpoint.clone(),
                 &role_name,
                 &fetch,
diff --git a/proxy/src/auth/backend/local.rs b/proxy/src/auth/backend/local.rs
index f9cb085daf..32e0f53615 100644
--- a/proxy/src/auth/backend/local.rs
+++ b/proxy/src/auth/backend/local.rs
@@ -7,7 +7,7 @@ use super::jwt::{AuthRule, FetchAuthRules};
 use crate::auth::backend::jwt::FetchAuthRulesError;
 use crate::compute::ConnCfg;
 use crate::compute_ctl::ComputeCtlApi;
-use crate::context::RequestMonitoring;
+use crate::context::RequestContext;
 use crate::control_plane::messages::{ColdStartInfo, EndpointJwksResponse, MetricsAuxInfo};
 use crate::control_plane::NodeInfo;
 use crate::http;
@@ -56,7 +56,7 @@ pub static JWKS_ROLE_MAP: ArcSwapOption<EndpointJwksResponse> = ArcSwapOption::c
 impl FetchAuthRules for StaticAuthRules {
     async fn fetch_auth_rules(
         &self,
-        _ctx: &RequestMonitoring,
+        _ctx: &RequestContext,
         _endpoint: EndpointId,
     ) -> Result<Vec<AuthRule>, FetchAuthRulesError> {
         let mappings = JWKS_ROLE_MAP.load();
diff --git a/proxy/src/auth/backend/mod.rs b/proxy/src/auth/backend/mod.rs
index 83c72e7be0..57ecd5e499 100644
--- a/proxy/src/auth/backend/mod.rs
+++ b/proxy/src/auth/backend/mod.rs
@@ -20,7 +20,7 @@ use crate::auth::credentials::check_peer_addr_is_in_list;
 use crate::auth::{self, validate_password_and_exchange, AuthError, ComputeUserInfoMaybeEndpoint};
 use crate::cache::Cached;
 use crate::config::AuthenticationConfig;
-use crate::context::RequestMonitoring;
+use crate::context::RequestContext;
 use crate::control_plane::client::ControlPlaneClient;
 use crate::control_plane::errors::GetAuthInfoError;
 use crate::control_plane::{
@@ -210,7 +210,7 @@ impl RateBucketInfo {
 impl AuthenticationConfig {
     pub(crate) fn check_rate_limit(
         &self,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         secret: AuthSecret,
         endpoint: &EndpointId,
         is_cleartext: bool,
@@ -265,7 +265,7 @@ impl AuthenticationConfig {
 ///
 /// All authentication flows will emit an AuthenticationOk message if successful.
 async fn auth_quirks(
-    ctx: &RequestMonitoring,
+    ctx: &RequestContext,
     api: &impl control_plane::ControlPlaneApi,
     user_info: ComputeUserInfoMaybeEndpoint,
     client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
@@ -343,7 +343,7 @@ async fn auth_quirks(
 }
 
 async fn authenticate_with_secret(
-    ctx: &RequestMonitoring,
+    ctx: &RequestContext,
     secret: AuthSecret,
     info: ComputeUserInfo,
     client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
@@ -396,7 +396,7 @@ impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint> {
     #[tracing::instrument(fields(allow_cleartext = allow_cleartext), skip_all)]
     pub(crate) async fn authenticate(
         self,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
         allow_cleartext: bool,
         config: &'static AuthenticationConfig,
@@ -436,7 +436,7 @@ impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint> {
 impl Backend<'_, ComputeUserInfo> {
     pub(crate) async fn get_role_secret(
         &self,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
     ) -> Result<CachedRoleSecret, GetAuthInfoError> {
         match self {
             Self::ControlPlane(api, user_info) => api.get_role_secret(ctx, user_info).await,
@@ -446,7 +446,7 @@ impl Backend<'_, ComputeUserInfo> {
 
     pub(crate) async fn get_allowed_ips_and_secret(
         &self,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
     ) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), GetAuthInfoError> {
         match self {
             Self::ControlPlane(api, user_info) => {
@@ -461,7 +461,7 @@ impl Backend<'_, ComputeUserInfo> {
 impl ComputeConnectBackend for Backend<'_, ComputeCredentials> {
     async fn wake_compute(
         &self,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
     ) -> Result<CachedNodeInfo, control_plane::errors::WakeComputeError> {
         match self {
             Self::ControlPlane(api, creds) => api.wake_compute(ctx, &creds.info).await,
@@ -497,7 +497,7 @@ mod tests {
     use crate::auth::backend::MaskedIp;
     use crate::auth::{ComputeUserInfoMaybeEndpoint, IpPattern};
     use crate::config::AuthenticationConfig;
-    use crate::context::RequestMonitoring;
+    use crate::context::RequestContext;
     use crate::control_plane::{self, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret};
     use crate::proxy::NeonOptions;
     use crate::rate_limiter::{EndpointRateLimiter, RateBucketInfo};
@@ -513,7 +513,7 @@ mod tests {
     impl control_plane::ControlPlaneApi for Auth {
         async fn get_role_secret(
             &self,
-            _ctx: &RequestMonitoring,
+            _ctx: &RequestContext,
             _user_info: &super::ComputeUserInfo,
         ) -> Result<CachedRoleSecret, control_plane::errors::GetAuthInfoError> {
             Ok(CachedRoleSecret::new_uncached(Some(self.secret.clone())))
@@ -521,7 +521,7 @@ mod tests {
 
         async fn get_allowed_ips_and_secret(
             &self,
-            _ctx: &RequestMonitoring,
+            _ctx: &RequestContext,
             _user_info: &super::ComputeUserInfo,
         ) -> Result<
             (CachedAllowedIps, Option<CachedRoleSecret>),
@@ -535,7 +535,7 @@ mod tests {
 
         async fn get_endpoint_jwks(
             &self,
-            _ctx: &RequestMonitoring,
+            _ctx: &RequestContext,
             _endpoint: crate::types::EndpointId,
         ) -> Result<Vec<super::jwt::AuthRule>, control_plane::errors::GetEndpointJwksError>
         {
@@ -544,7 +544,7 @@ mod tests {
 
         async fn wake_compute(
             &self,
-            _ctx: &RequestMonitoring,
+            _ctx: &RequestContext,
             _user_info: &super::ComputeUserInfo,
         ) -> Result<CachedNodeInfo, control_plane::errors::WakeComputeError> {
             unimplemented!()
@@ -623,7 +623,7 @@ mod tests {
         let (mut client, server) = tokio::io::duplex(1024);
         let mut stream = PqStream::new(Stream::from_raw(server));
 
-        let ctx = RequestMonitoring::test();
+        let ctx = RequestContext::test();
         let api = Auth {
             ips: vec![],
             secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()),
@@ -700,7 +700,7 @@ mod tests {
         let (mut client, server) = tokio::io::duplex(1024);
         let mut stream = PqStream::new(Stream::from_raw(server));
 
-        let ctx = RequestMonitoring::test();
+        let ctx = RequestContext::test();
         let api = Auth {
             ips: vec![],
             secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()),
@@ -752,7 +752,7 @@ mod tests {
         let (mut client, server) = tokio::io::duplex(1024);
         let mut stream = PqStream::new(Stream::from_raw(server));
 
-        let ctx = RequestMonitoring::test();
+        let ctx = RequestContext::test();
         let api = Auth {
             ips: vec![],
             secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()),
diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs
index dab9007400..f6bce9f2d8 100644
--- a/proxy/src/auth/credentials.rs
+++ b/proxy/src/auth/credentials.rs
@@ -10,7 +10,7 @@ use thiserror::Error;
 use tracing::{debug, warn};
 
 use crate::auth::password_hack::parse_endpoint_param;
-use crate::context::RequestMonitoring;
+use crate::context::RequestContext;
 use crate::error::{ReportableError, UserFacingError};
 use crate::metrics::{Metrics, SniKind};
 use crate::proxy::NeonOptions;
@@ -86,7 +86,7 @@ pub(crate) fn endpoint_sni(
 
 impl ComputeUserInfoMaybeEndpoint {
     pub(crate) fn parse(
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         params: &StartupMessageParams,
         sni: Option<&str>,
         common_names: Option<&HashSet<String>>,
@@ -260,7 +260,7 @@ mod tests {
     fn parse_bare_minimum() -> anyhow::Result<()> {
         // According to postgresql, only `user` should be required.
         let options = StartupMessageParams::new([("user", "john_doe")]);
-        let ctx = RequestMonitoring::test();
+        let ctx = RequestContext::test();
         let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?;
         assert_eq!(user_info.user, "john_doe");
         assert_eq!(user_info.endpoint_id, None);
@@ -275,7 +275,7 @@ mod tests {
             ("database", "world"), // should be ignored
             ("foo", "bar"),        // should be ignored
         ]);
-        let ctx = RequestMonitoring::test();
+        let ctx = RequestContext::test();
         let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?;
         assert_eq!(user_info.user, "john_doe");
         assert_eq!(user_info.endpoint_id, None);
@@ -290,7 +290,7 @@ mod tests {
         let sni = Some("foo.localhost");
         let common_names = Some(["localhost".into()].into());
 
-        let ctx = RequestMonitoring::test();
+        let ctx = RequestContext::test();
         let user_info =
             ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?;
         assert_eq!(user_info.user, "john_doe");
@@ -307,7 +307,7 @@ mod tests {
             ("options", "-ckey=1 project=bar -c geqo=off"),
         ]);
 
-        let ctx = RequestMonitoring::test();
+        let ctx = RequestContext::test();
         let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?;
         assert_eq!(user_info.user, "john_doe");
         assert_eq!(user_info.endpoint_id.as_deref(), Some("bar"));
@@ -322,7 +322,7 @@ mod tests {
             ("options", "-ckey=1 endpoint=bar -c geqo=off"),
         ]);
 
-        let ctx = RequestMonitoring::test();
+        let ctx = RequestContext::test();
         let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?;
         assert_eq!(user_info.user, "john_doe");
         assert_eq!(user_info.endpoint_id.as_deref(), Some("bar"));
@@ -340,7 +340,7 @@ mod tests {
             ),
         ]);
 
-        let ctx = RequestMonitoring::test();
+        let ctx = RequestContext::test();
         let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?;
         assert_eq!(user_info.user, "john_doe");
         assert!(user_info.endpoint_id.is_none());
@@ -355,7 +355,7 @@ mod tests {
             ("options", "-ckey=1 endpoint=bar project=foo -c geqo=off"),
         ]);
 
-        let ctx = RequestMonitoring::test();
+        let ctx = RequestContext::test();
         let user_info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, None, None)?;
         assert_eq!(user_info.user, "john_doe");
         assert!(user_info.endpoint_id.is_none());
@@ -370,7 +370,7 @@ mod tests {
         let sni = Some("baz.localhost");
         let common_names = Some(["localhost".into()].into());
 
-        let ctx = RequestMonitoring::test();
+        let ctx = RequestContext::test();
         let user_info =
             ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?;
         assert_eq!(user_info.user, "john_doe");
@@ -385,14 +385,14 @@ mod tests {
 
         let common_names = Some(["a.com".into(), "b.com".into()].into());
         let sni = Some("p1.a.com");
-        let ctx = RequestMonitoring::test();
+        let ctx = RequestContext::test();
         let user_info =
             ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?;
         assert_eq!(user_info.endpoint_id.as_deref(), Some("p1"));
 
         let common_names = Some(["a.com".into(), "b.com".into()].into());
         let sni = Some("p1.b.com");
-        let ctx = RequestMonitoring::test();
+        let ctx = RequestContext::test();
         let user_info =
             ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?;
         assert_eq!(user_info.endpoint_id.as_deref(), Some("p1"));
@@ -408,7 +408,7 @@ mod tests {
         let sni = Some("second.localhost");
         let common_names = Some(["localhost".into()].into());
 
-        let ctx = RequestMonitoring::test();
+        let ctx = RequestContext::test();
         let err = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())
             .expect_err("should fail");
         match err {
@@ -427,7 +427,7 @@ mod tests {
         let sni = Some("project.localhost");
         let common_names = Some(["example.com".into()].into());
 
-        let ctx = RequestMonitoring::test();
+        let ctx = RequestContext::test();
         let err = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())
             .expect_err("should fail");
         match err {
@@ -447,7 +447,7 @@ mod tests {
 
         let sni = Some("project.localhost");
         let common_names = Some(["localhost".into()].into());
-        let ctx = RequestMonitoring::test();
+        let ctx = RequestContext::test();
         let user_info =
             ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())?;
         assert_eq!(user_info.endpoint_id.as_deref(), Some("project"));
diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs
index 1740b59b14..9c6ce151cb 100644
--- a/proxy/src/auth/flow.rs
+++ b/proxy/src/auth/flow.rs
@@ -11,7 +11,7 @@ use tracing::info;
 use super::backend::ComputeCredentialKeys;
 use super::{AuthError, PasswordHackPayload};
 use crate::config::TlsServerEndPoint;
-use crate::context::RequestMonitoring;
+use crate::context::RequestContext;
 use crate::control_plane::AuthSecret;
 use crate::intern::EndpointIdInt;
 use crate::sasl;
@@ -32,7 +32,7 @@ pub(crate) struct Begin;
 /// Use [SCRAM](crate::scram)-based auth in [`AuthFlow`].
 pub(crate) struct Scram<'a>(
     pub(crate) &'a scram::ServerSecret,
-    pub(crate) &'a RequestMonitoring,
+    pub(crate) &'a RequestContext,
 );
 
 impl AuthMethod for Scram<'_> {
diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs
index ef5b5e8509..623a0fd3b2 100644
--- a/proxy/src/bin/pg_sni_router.rs
+++ b/proxy/src/bin/pg_sni_router.rs
@@ -11,7 +11,7 @@ use futures::future::Either;
 use futures::TryFutureExt;
 use itertools::Itertools;
 use proxy::config::TlsServerEndPoint;
-use proxy::context::RequestMonitoring;
+use proxy::context::RequestContext;
 use proxy::metrics::{Metrics, ThreadPoolMetrics};
 use proxy::protocol2::ConnectionInfo;
 use proxy::proxy::{copy_bidirectional_client_compute, run_until_cancelled, ErrorSource};
@@ -177,7 +177,7 @@ async fn task_main(
                     .context("failed to set socket option")?;
 
                 info!(%peer_addr, "serving");
-                let ctx = RequestMonitoring::new(
+                let ctx = RequestContext::new(
                     session_id,
                     ConnectionInfo {
                         addr: peer_addr,
@@ -208,7 +208,7 @@ async fn task_main(
 const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)";
 
 async fn ssl_handshake<S: AsyncRead + AsyncWrite + Unpin>(
-    ctx: &RequestMonitoring,
+    ctx: &RequestContext,
     raw_stream: S,
     tls_config: Arc<rustls::ServerConfig>,
     tls_server_end_point: TlsServerEndPoint,
@@ -259,7 +259,7 @@ async fn ssl_handshake<S: AsyncRead + AsyncWrite + Unpin>(
 }
 
 async fn handle_client(
-    ctx: RequestMonitoring,
+    ctx: RequestContext,
     dest_suffix: Arc<String>,
     tls_config: Arc<rustls::ServerConfig>,
     tls_server_end_point: TlsServerEndPoint,
diff --git a/proxy/src/cache/endpoints.rs b/proxy/src/cache/endpoints.rs
index 07769e053c..20db1fbb14 100644
--- a/proxy/src/cache/endpoints.rs
+++ b/proxy/src/cache/endpoints.rs
@@ -11,7 +11,7 @@ use tokio_util::sync::CancellationToken;
 use tracing::info;
 
 use crate::config::EndpointCacheConfig;
-use crate::context::RequestMonitoring;
+use crate::context::RequestContext;
 use crate::intern::{BranchIdInt, EndpointIdInt, ProjectIdInt};
 use crate::metrics::{Metrics, RedisErrors, RedisEventsCount};
 use crate::rate_limiter::GlobalRateLimiter;
@@ -75,7 +75,7 @@ impl EndpointsCache {
         }
     }
 
-    pub(crate) fn is_valid(&self, ctx: &RequestMonitoring, endpoint: &EndpointId) -> bool {
+    pub(crate) fn is_valid(&self, ctx: &RequestContext, endpoint: &EndpointId) -> bool {
         if !self.ready.load(Ordering::Acquire) {
             // the endpoint cache is not yet fully initialised.
             return true;
diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs
index b8876b44eb..e7fbe9ab47 100644
--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -18,7 +18,7 @@ use tracing::{debug, error, info, warn};
 
 use crate::auth::parse_endpoint_param;
 use crate::cancellation::CancelClosure;
-use crate::context::RequestMonitoring;
+use crate::context::RequestContext;
 use crate::control_plane::client::ApiLockError;
 use crate::control_plane::errors::WakeComputeError;
 use crate::control_plane::messages::MetricsAuxInfo;
@@ -286,7 +286,7 @@ impl ConnCfg {
     /// Connect to a corresponding compute node.
     pub(crate) async fn connect(
         &self,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         allow_self_signed_compute: bool,
         aux: MetricsAuxInfo,
         timeout: Duration,
diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs
index 8e71f552a5..c88b2936db 100644
--- a/proxy/src/console_redirect_proxy.rs
+++ b/proxy/src/console_redirect_proxy.rs
@@ -8,7 +8,7 @@ use tracing::{debug, error, info, Instrument};
 use crate::auth::backend::ConsoleRedirectBackend;
 use crate::cancellation::{CancellationHandlerMain, CancellationHandlerMainInternal};
 use crate::config::{ProxyConfig, ProxyProtocolV2};
-use crate::context::RequestMonitoring;
+use crate::context::RequestContext;
 use crate::error::ReportableError;
 use crate::metrics::{Metrics, NumClientConnectionsGuard};
 use crate::protocol2::{read_proxy_protocol, ConnectHeader, ConnectionInfo};
@@ -82,7 +82,7 @@ pub async fn task_main(
                 }
             };
 
-            let ctx = RequestMonitoring::new(
+            let ctx = RequestContext::new(
                 session_id,
                 peer_addr,
                 crate::metrics::Protocol::Tcp,
@@ -141,7 +141,7 @@ pub async fn task_main(
 pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
     config: &'static ProxyConfig,
     backend: &'static ConsoleRedirectBackend,
-    ctx: &RequestMonitoring,
+    ctx: &RequestContext,
     cancellation_handler: Arc<CancellationHandlerMain>,
     stream: S,
     conn_gauge: NumClientConnectionsGuard<'static>,
diff --git a/proxy/src/context/mod.rs b/proxy/src/context/mod.rs
index d057ee0bfd..6d2d2d51ce 100644
--- a/proxy/src/context/mod.rs
+++ b/proxy/src/context/mod.rs
@@ -32,15 +32,15 @@ pub(crate) static LOG_CHAN_DISCONNECT: OnceCell<mpsc::WeakUnboundedSender<Reques
 ///
 /// This data should **not** be used for connection logic, only for observability and limiting purposes.
 /// All connection logic should instead use strongly typed state machines, not a bunch of Options.
-pub struct RequestMonitoring(
+pub struct RequestContext(
     /// To allow easier use of the ctx object, we have interior mutability.
     /// I would typically use a RefCell but that would break the `Send` requirements
     /// so we need something with thread-safety. `TryLock` is a cheap alternative
     /// that offers similar semantics to a `RefCell` but with synchronisation.
-    TryLock<RequestMonitoringInner>,
+    TryLock<RequestContextInner>,
 );
 
-struct RequestMonitoringInner {
+struct RequestContextInner {
     pub(crate) conn_info: ConnectionInfo,
     pub(crate) session_id: Uuid,
     pub(crate) protocol: Protocol,
@@ -81,10 +81,10 @@ pub(crate) enum AuthMethod {
     Cleartext,
 }
 
-impl Clone for RequestMonitoring {
+impl Clone for RequestContext {
     fn clone(&self) -> Self {
         let inner = self.0.try_lock().expect("should not deadlock");
-        let new = RequestMonitoringInner {
+        let new = RequestContextInner {
             conn_info: inner.conn_info.clone(),
             session_id: inner.session_id,
             protocol: inner.protocol,
@@ -115,7 +115,7 @@ impl Clone for RequestMonitoring {
     }
 }
 
-impl RequestMonitoring {
+impl RequestContext {
     pub fn new(
         session_id: Uuid,
         conn_info: ConnectionInfo,
@@ -132,7 +132,7 @@ impl RequestMonitoring {
             role = tracing::field::Empty,
         );
 
-        let inner = RequestMonitoringInner {
+        let inner = RequestContextInner {
             conn_info,
             session_id,
             protocol,
@@ -168,7 +168,7 @@ impl RequestMonitoring {
         let ip = IpAddr::from([127, 0, 0, 1]);
         let addr = SocketAddr::new(ip, 5432);
         let conn_info = ConnectionInfo { addr, extra: None };
-        RequestMonitoring::new(Uuid::now_v7(), conn_info, Protocol::Tcp, "test")
+        RequestContext::new(Uuid::now_v7(), conn_info, Protocol::Tcp, "test")
     }
 
     pub(crate) fn console_application_name(&self) -> String {
@@ -325,7 +325,7 @@ impl RequestMonitoring {
 }
 
 pub(crate) struct LatencyTimerPause<'a> {
-    ctx: &'a RequestMonitoring,
+    ctx: &'a RequestContext,
     start: tokio::time::Instant,
     waiting_for: Waiting,
 }
@@ -341,7 +341,7 @@ impl Drop for LatencyTimerPause<'_> {
     }
 }
 
-impl RequestMonitoringInner {
+impl RequestContextInner {
     fn set_cold_start_info(&mut self, info: ColdStartInfo) {
         self.cold_start_info = info;
         self.latency_timer.cold_start_info(info);
@@ -430,7 +430,7 @@ impl RequestMonitoringInner {
     }
 }
 
-impl Drop for RequestMonitoringInner {
+impl Drop for RequestContextInner {
     fn drop(&mut self) {
         if self.sender.is_some() {
             self.log_connect();
diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs
index 4112de646f..9bf3a275bb 100644
--- a/proxy/src/context/parquet.rs
+++ b/proxy/src/context/parquet.rs
@@ -20,7 +20,7 @@ use tokio_util::sync::CancellationToken;
 use tracing::{debug, info, Span};
 use utils::backoff;
 
-use super::{RequestMonitoringInner, LOG_CHAN};
+use super::{RequestContextInner, LOG_CHAN};
 use crate::config::remote_storage_from_toml;
 use crate::context::LOG_CHAN_DISCONNECT;
 
@@ -117,8 +117,8 @@ impl serde::Serialize for Options<'_> {
     }
 }
 
-impl From<&RequestMonitoringInner> for RequestData {
-    fn from(value: &RequestMonitoringInner) -> Self {
+impl From<&RequestContextInner> for RequestData {
+    fn from(value: &RequestContextInner) -> Self {
         Self {
             session_id: value.session_id,
             peer_addr: value.conn_info.addr.ip().to_string(),
diff --git a/proxy/src/control_plane/client/mock.rs b/proxy/src/control_plane/client/mock.rs
index fd333d2aac..500acad50f 100644
--- a/proxy/src/control_plane/client/mock.rs
+++ b/proxy/src/control_plane/client/mock.rs
@@ -13,7 +13,7 @@ use crate::auth::backend::jwt::AuthRule;
 use crate::auth::backend::ComputeUserInfo;
 use crate::auth::IpPattern;
 use crate::cache::Cached;
-use crate::context::RequestMonitoring;
+use crate::context::RequestContext;
 use crate::control_plane::client::{CachedAllowedIps, CachedRoleSecret};
 use crate::control_plane::errors::{
     ControlPlaneError, GetAuthInfoError, GetEndpointJwksError, WakeComputeError,
@@ -206,7 +206,7 @@ impl super::ControlPlaneApi for MockControlPlane {
     #[tracing::instrument(skip_all)]
     async fn get_role_secret(
         &self,
-        _ctx: &RequestMonitoring,
+        _ctx: &RequestContext,
         user_info: &ComputeUserInfo,
     ) -> Result<CachedRoleSecret, GetAuthInfoError> {
         Ok(CachedRoleSecret::new_uncached(
@@ -216,7 +216,7 @@ impl super::ControlPlaneApi for MockControlPlane {
 
     async fn get_allowed_ips_and_secret(
         &self,
-        _ctx: &RequestMonitoring,
+        _ctx: &RequestContext,
         user_info: &ComputeUserInfo,
     ) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), GetAuthInfoError> {
         Ok((
@@ -229,7 +229,7 @@ impl super::ControlPlaneApi for MockControlPlane {
 
     async fn get_endpoint_jwks(
         &self,
-        _ctx: &RequestMonitoring,
+        _ctx: &RequestContext,
         endpoint: EndpointId,
     ) -> Result<Vec<AuthRule>, GetEndpointJwksError> {
         self.do_get_endpoint_jwks(endpoint).await
@@ -238,7 +238,7 @@ impl super::ControlPlaneApi for MockControlPlane {
     #[tracing::instrument(skip_all)]
     async fn wake_compute(
         &self,
-        _ctx: &RequestMonitoring,
+        _ctx: &RequestContext,
         _user_info: &ComputeUserInfo,
     ) -> Result<CachedNodeInfo, WakeComputeError> {
         self.do_wake_compute().map_ok(Cached::new_uncached).await
diff --git a/proxy/src/control_plane/client/mod.rs b/proxy/src/control_plane/client/mod.rs
index 50903e2f1e..f8f74372f0 100644
--- a/proxy/src/control_plane/client/mod.rs
+++ b/proxy/src/control_plane/client/mod.rs
@@ -15,7 +15,7 @@ use crate::auth::backend::ComputeUserInfo;
 use crate::cache::endpoints::EndpointsCache;
 use crate::cache::project_info::ProjectInfoCacheImpl;
 use crate::config::{CacheOptions, EndpointCacheConfig, ProjectInfoCacheOptions};
-use crate::context::RequestMonitoring;
+use crate::context::RequestContext;
 use crate::control_plane::{
     errors, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, ControlPlaneApi, NodeInfoCache,
 };
@@ -41,7 +41,7 @@ pub enum ControlPlaneClient {
 impl ControlPlaneApi for ControlPlaneClient {
     async fn get_role_secret(
         &self,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         user_info: &ComputeUserInfo,
     ) -> Result<CachedRoleSecret, errors::GetAuthInfoError> {
         match self {
@@ -57,7 +57,7 @@ impl ControlPlaneApi for ControlPlaneClient {
 
     async fn get_allowed_ips_and_secret(
         &self,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         user_info: &ComputeUserInfo,
     ) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), errors::GetAuthInfoError> {
         match self {
@@ -71,7 +71,7 @@ impl ControlPlaneApi for ControlPlaneClient {
 
     async fn get_endpoint_jwks(
         &self,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         endpoint: EndpointId,
     ) -> Result<Vec<AuthRule>, errors::GetEndpointJwksError> {
         match self {
@@ -85,7 +85,7 @@ impl ControlPlaneApi for ControlPlaneClient {
 
     async fn wake_compute(
         &self,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         user_info: &ComputeUserInfo,
     ) -> Result<CachedNodeInfo, errors::WakeComputeError> {
         match self {
@@ -271,7 +271,7 @@ impl WakeComputePermit {
 impl FetchAuthRules for ControlPlaneClient {
     async fn fetch_auth_rules(
         &self,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         endpoint: EndpointId,
     ) -> Result<Vec<AuthRule>, FetchAuthRulesError> {
         self.get_endpoint_jwks(ctx, endpoint)
diff --git a/proxy/src/control_plane/client/neon.rs b/proxy/src/control_plane/client/neon.rs
index 8f4ae13f33..53f9234926 100644
--- a/proxy/src/control_plane/client/neon.rs
+++ b/proxy/src/control_plane/client/neon.rs
@@ -14,7 +14,7 @@ use super::super::messages::{ControlPlaneErrorMessage, GetRoleSecret, WakeComput
 use crate::auth::backend::jwt::AuthRule;
 use crate::auth::backend::ComputeUserInfo;
 use crate::cache::Cached;
-use crate::context::RequestMonitoring;
+use crate::context::RequestContext;
 use crate::control_plane::caches::ApiCaches;
 use crate::control_plane::errors::{
     ControlPlaneError, GetAuthInfoError, GetEndpointJwksError, WakeComputeError,
@@ -65,7 +65,7 @@ impl NeonControlPlaneClient {
 
     async fn do_get_auth_info(
         &self,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         user_info: &ComputeUserInfo,
     ) -> Result<AuthInfo, GetAuthInfoError> {
         if !self
@@ -141,7 +141,7 @@ impl NeonControlPlaneClient {
 
     async fn do_get_endpoint_jwks(
         &self,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         endpoint: EndpointId,
     ) -> Result<Vec<AuthRule>, GetEndpointJwksError> {
         if !self
@@ -200,7 +200,7 @@ impl NeonControlPlaneClient {
 
     async fn do_wake_compute(
         &self,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         user_info: &ComputeUserInfo,
     ) -> Result<NodeInfo, WakeComputeError> {
         let request_id = ctx.session_id().to_string();
@@ -263,7 +263,7 @@ impl super::ControlPlaneApi for NeonControlPlaneClient {
     #[tracing::instrument(skip_all)]
     async fn get_role_secret(
         &self,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         user_info: &ComputeUserInfo,
     ) -> Result<CachedRoleSecret, GetAuthInfoError> {
         let normalized_ep = &user_info.endpoint.normalize();
@@ -297,7 +297,7 @@ impl super::ControlPlaneApi for NeonControlPlaneClient {
 
     async fn get_allowed_ips_and_secret(
         &self,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         user_info: &ComputeUserInfo,
     ) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), GetAuthInfoError> {
         let normalized_ep = &user_info.endpoint.normalize();
@@ -339,7 +339,7 @@ impl super::ControlPlaneApi for NeonControlPlaneClient {
     #[tracing::instrument(skip_all)]
     async fn get_endpoint_jwks(
         &self,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         endpoint: EndpointId,
     ) -> Result<Vec<AuthRule>, GetEndpointJwksError> {
         self.do_get_endpoint_jwks(ctx, endpoint).await
@@ -348,7 +348,7 @@ impl super::ControlPlaneApi for NeonControlPlaneClient {
     #[tracing::instrument(skip_all)]
     async fn wake_compute(
         &self,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         user_info: &ComputeUserInfo,
     ) -> Result<CachedNodeInfo, WakeComputeError> {
         let key = user_info.endpoint_cache_key();
diff --git a/proxy/src/control_plane/mod.rs b/proxy/src/control_plane/mod.rs
index 70607ac0d0..41972e4e44 100644
--- a/proxy/src/control_plane/mod.rs
+++ b/proxy/src/control_plane/mod.rs
@@ -17,7 +17,7 @@ use crate::auth::backend::{ComputeCredentialKeys, ComputeUserInfo};
 use crate::auth::IpPattern;
 use crate::cache::project_info::ProjectInfoCacheImpl;
 use crate::cache::{Cached, TimedLru};
-use crate::context::RequestMonitoring;
+use crate::context::RequestContext;
 use crate::control_plane::messages::{ControlPlaneErrorMessage, MetricsAuxInfo};
 use crate::intern::ProjectIdInt;
 use crate::types::{EndpointCacheKey, EndpointId};
@@ -75,7 +75,7 @@ pub(crate) struct NodeInfo {
 impl NodeInfo {
     pub(crate) async fn connect(
         &self,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         timeout: Duration,
     ) -> Result<compute::PostgresConnection, compute::ConnectionError> {
         self.config
@@ -116,26 +116,26 @@ pub(crate) trait ControlPlaneApi {
     /// We still have to mock the scram to avoid leaking information that user doesn't exist.
     async fn get_role_secret(
         &self,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         user_info: &ComputeUserInfo,
     ) -> Result<CachedRoleSecret, errors::GetAuthInfoError>;
 
     async fn get_allowed_ips_and_secret(
         &self,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         user_info: &ComputeUserInfo,
     ) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), errors::GetAuthInfoError>;
 
     async fn get_endpoint_jwks(
         &self,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         endpoint: EndpointId,
     ) -> Result<Vec<AuthRule>, errors::GetEndpointJwksError>;
 
     /// Wake up the compute node and return the corresponding connection info.
     async fn wake_compute(
         &self,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         user_info: &ComputeUserInfo,
     ) -> Result<CachedNodeInfo, errors::WakeComputeError>;
 }
diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs
index 659b7afa68..b30aec09c1 100644
--- a/proxy/src/proxy/connect_compute.rs
+++ b/proxy/src/proxy/connect_compute.rs
@@ -7,7 +7,7 @@ use super::retry::ShouldRetryWakeCompute;
 use crate::auth::backend::ComputeCredentialKeys;
 use crate::compute::{self, PostgresConnection, COULD_NOT_CONNECT};
 use crate::config::RetryConfig;
-use crate::context::RequestMonitoring;
+use crate::context::RequestContext;
 use crate::control_plane::errors::WakeComputeError;
 use crate::control_plane::locks::ApiLocks;
 use crate::control_plane::{self, CachedNodeInfo, NodeInfo};
@@ -47,7 +47,7 @@ pub(crate) trait ConnectMechanism {
     type Error: From<Self::ConnectError>;
     async fn connect_once(
         &self,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         node_info: &control_plane::CachedNodeInfo,
         timeout: time::Duration,
     ) -> Result<Self::Connection, Self::ConnectError>;
@@ -59,7 +59,7 @@ pub(crate) trait ConnectMechanism {
 pub(crate) trait ComputeConnectBackend {
     async fn wake_compute(
         &self,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
     ) -> Result<CachedNodeInfo, control_plane::errors::WakeComputeError>;
 
     fn get_keys(&self) -> &ComputeCredentialKeys;
@@ -82,7 +82,7 @@ impl ConnectMechanism for TcpMechanism<'_> {
     #[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)]
     async fn connect_once(
         &self,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         node_info: &control_plane::CachedNodeInfo,
         timeout: time::Duration,
     ) -> Result<PostgresConnection, Self::Error> {
@@ -99,7 +99,7 @@ impl ConnectMechanism for TcpMechanism<'_> {
 /// Try to connect to the compute node, retrying if necessary.
 #[tracing::instrument(skip_all)]
 pub(crate) async fn connect_to_compute<M: ConnectMechanism, B: ComputeConnectBackend>(
-    ctx: &RequestMonitoring,
+    ctx: &RequestContext,
     mechanism: &M,
     user_info: &B,
     allow_self_signed_compute: bool,
diff --git a/proxy/src/proxy/handshake.rs b/proxy/src/proxy/handshake.rs
index a67f1b8112..3ada3a9995 100644
--- a/proxy/src/proxy/handshake.rs
+++ b/proxy/src/proxy/handshake.rs
@@ -9,7 +9,7 @@ use tracing::{info, warn};
 
 use crate::auth::endpoint_sni;
 use crate::config::{TlsConfig, PG_ALPN_PROTOCOL};
-use crate::context::RequestMonitoring;
+use crate::context::RequestContext;
 use crate::error::ReportableError;
 use crate::metrics::Metrics;
 use crate::proxy::ERR_INSECURE_CONNECTION;
@@ -66,7 +66,7 @@ pub(crate) enum HandshakeData<S> {
 /// we also take an extra care of propagating only the select handshake errors to client.
 #[tracing::instrument(skip_all)]
 pub(crate) async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
-    ctx: &RequestMonitoring,
+    ctx: &RequestContext,
     stream: S,
     mut tls: Option<&TlsConfig>,
     record_handshake_error: bool,
diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs
index 17721c23d5..4be4006d15 100644
--- a/proxy/src/proxy/mod.rs
+++ b/proxy/src/proxy/mod.rs
@@ -25,7 +25,7 @@ use self::connect_compute::{connect_to_compute, TcpMechanism};
 use self::passthrough::ProxyPassthrough;
 use crate::cancellation::{self, CancellationHandlerMain, CancellationHandlerMainInternal};
 use crate::config::{ProxyConfig, ProxyProtocolV2, TlsConfig};
-use crate::context::RequestMonitoring;
+use crate::context::RequestContext;
 use crate::error::ReportableError;
 use crate::metrics::{Metrics, NumClientConnectionsGuard};
 use crate::protocol2::{read_proxy_protocol, ConnectHeader, ConnectionInfo};
@@ -117,7 +117,7 @@ pub async fn task_main(
                 }
             };
 
-            let ctx = RequestMonitoring::new(
+            let ctx = RequestContext::new(
                 session_id,
                 conn_info,
                 crate::metrics::Protocol::Tcp,
@@ -247,7 +247,7 @@ impl ReportableError for ClientRequestError {
 pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
     config: &'static ProxyConfig,
     auth_backend: &'static auth::Backend<'static, ()>,
-    ctx: &RequestMonitoring,
+    ctx: &RequestContext,
     cancellation_handler: Arc<CancellationHandlerMain>,
     stream: S,
     mode: ClientMode,
diff --git a/proxy/src/proxy/tests/mitm.rs b/proxy/src/proxy/tests/mitm.rs
index df9f79a7e3..fe211adfeb 100644
--- a/proxy/src/proxy/tests/mitm.rs
+++ b/proxy/src/proxy/tests/mitm.rs
@@ -36,7 +36,7 @@ async fn proxy_mitm(
         // begin handshake with end_server
         let end_server = connect_tls(server2, client_config2.make_tls_connect().unwrap()).await;
         let (end_client, startup) = match handshake(
-            &RequestMonitoring::test(),
+            &RequestContext::test(),
             client1,
             Some(&server_config1),
             false,
diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs
index be821925b5..3de8ca8736 100644
--- a/proxy/src/proxy/tests/mod.rs
+++ b/proxy/src/proxy/tests/mod.rs
@@ -162,7 +162,7 @@ impl TestAuth for Scram {
         stream: &mut PqStream<Stream<S>>,
     ) -> anyhow::Result<()> {
         let outcome = auth::AuthFlow::new(stream)
-            .begin(auth::Scram(&self.0, &RequestMonitoring::test()))
+            .begin(auth::Scram(&self.0, &RequestContext::test()))
             .await?
             .authenticate()
             .await?;
@@ -182,11 +182,10 @@ async fn dummy_proxy(
     auth: impl TestAuth + Send,
 ) -> anyhow::Result<()> {
     let (client, _) = read_proxy_protocol(client).await?;
-    let mut stream =
-        match handshake(&RequestMonitoring::test(), client, tls.as_ref(), false).await? {
-            HandshakeData::Startup(stream, _) => stream,
-            HandshakeData::Cancel(_) => bail!("cancellation not supported"),
-        };
+    let mut stream = match handshake(&RequestContext::test(), client, tls.as_ref(), false).await? {
+        HandshakeData::Startup(stream, _) => stream,
+        HandshakeData::Cancel(_) => bail!("cancellation not supported"),
+    };
 
     auth.authenticate(&mut stream).await?;
 
@@ -466,7 +465,7 @@ impl ConnectMechanism for TestConnectMechanism {
 
     async fn connect_once(
         &self,
-        _ctx: &RequestMonitoring,
+        _ctx: &RequestContext,
         _node_info: &control_plane::CachedNodeInfo,
         _timeout: std::time::Duration,
     ) -> Result<Self::Connection, Self::ConnectError> {
@@ -581,7 +580,7 @@ fn helper_create_connect_info(
 async fn connect_to_compute_success() {
     let _ = env_logger::try_init();
     use ConnectAction::*;
-    let ctx = RequestMonitoring::test();
+    let ctx = RequestContext::test();
     let mechanism = TestConnectMechanism::new(vec![Wake, Connect]);
     let user_info = helper_create_connect_info(&mechanism);
     let config = RetryConfig {
@@ -599,7 +598,7 @@ async fn connect_to_compute_success() {
 async fn connect_to_compute_retry() {
     let _ = env_logger::try_init();
     use ConnectAction::*;
-    let ctx = RequestMonitoring::test();
+    let ctx = RequestContext::test();
     let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Connect]);
     let user_info = helper_create_connect_info(&mechanism);
     let config = RetryConfig {
@@ -618,7 +617,7 @@ async fn connect_to_compute_retry() {
 async fn connect_to_compute_non_retry_1() {
     let _ = env_logger::try_init();
     use ConnectAction::*;
-    let ctx = RequestMonitoring::test();
+    let ctx = RequestContext::test();
     let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Fail]);
     let user_info = helper_create_connect_info(&mechanism);
     let config = RetryConfig {
@@ -637,7 +636,7 @@ async fn connect_to_compute_non_retry_1() {
 async fn connect_to_compute_non_retry_2() {
     let _ = env_logger::try_init();
     use ConnectAction::*;
-    let ctx = RequestMonitoring::test();
+    let ctx = RequestContext::test();
     let mechanism = TestConnectMechanism::new(vec![Wake, Fail, Wake, Connect]);
     let user_info = helper_create_connect_info(&mechanism);
     let config = RetryConfig {
@@ -657,7 +656,7 @@ async fn connect_to_compute_non_retry_3() {
     let _ = env_logger::try_init();
     tokio::time::pause();
     use ConnectAction::*;
-    let ctx = RequestMonitoring::test();
+    let ctx = RequestContext::test();
     let mechanism =
         TestConnectMechanism::new(vec![Wake, Retry, Wake, Retry, Retry, Retry, Retry, Retry]);
     let user_info = helper_create_connect_info(&mechanism);
@@ -689,7 +688,7 @@ async fn connect_to_compute_non_retry_3() {
 async fn wake_retry() {
     let _ = env_logger::try_init();
     use ConnectAction::*;
-    let ctx = RequestMonitoring::test();
+    let ctx = RequestContext::test();
     let mechanism = TestConnectMechanism::new(vec![WakeRetry, Wake, Connect]);
     let user_info = helper_create_connect_info(&mechanism);
     let config = RetryConfig {
@@ -708,7 +707,7 @@ async fn wake_retry() {
 async fn wake_non_retry() {
     let _ = env_logger::try_init();
     use ConnectAction::*;
-    let ctx = RequestMonitoring::test();
+    let ctx = RequestContext::test();
     let mechanism = TestConnectMechanism::new(vec![WakeRetry, WakeFail]);
     let user_info = helper_create_connect_info(&mechanism);
     let config = RetryConfig {
diff --git a/proxy/src/proxy/wake_compute.rs b/proxy/src/proxy/wake_compute.rs
index f9f46bb66c..d09e0b1f41 100644
--- a/proxy/src/proxy/wake_compute.rs
+++ b/proxy/src/proxy/wake_compute.rs
@@ -2,7 +2,7 @@ use tracing::{error, info, warn};
 
 use super::connect_compute::ComputeConnectBackend;
 use crate::config::RetryConfig;
-use crate::context::RequestMonitoring;
+use crate::context::RequestContext;
 use crate::control_plane::errors::WakeComputeError;
 use crate::control_plane::CachedNodeInfo;
 use crate::error::ReportableError;
@@ -13,7 +13,7 @@ use crate::proxy::retry::{retry_after, should_retry};
 
 pub(crate) async fn wake_compute<B: ComputeConnectBackend>(
     num_retries: &mut u32,
-    ctx: &RequestMonitoring,
+    ctx: &RequestContext,
     api: &B,
     config: RetryConfig,
 ) -> Result<CachedNodeInfo, WakeComputeError> {
diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs
index 5e9fd151ae..d9dcf6fbb7 100644
--- a/proxy/src/serverless/backend.rs
+++ b/proxy/src/serverless/backend.rs
@@ -23,7 +23,7 @@ use crate::compute_ctl::{
     ComputeCtlError, ExtensionInstallRequest, Privilege, SetRoleGrantsRequest,
 };
 use crate::config::ProxyConfig;
-use crate::context::RequestMonitoring;
+use crate::context::RequestContext;
 use crate::control_plane::client::ApiLockError;
 use crate::control_plane::errors::{GetAuthInfoError, WakeComputeError};
 use crate::control_plane::locks::ApiLocks;
@@ -48,7 +48,7 @@ pub(crate) struct PoolingBackend {
 impl PoolingBackend {
     pub(crate) async fn authenticate_with_password(
         &self,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         user_info: &ComputeUserInfo,
         password: &[u8],
     ) -> Result<ComputeCredentials, AuthError> {
@@ -110,7 +110,7 @@ impl PoolingBackend {
 
     pub(crate) async fn authenticate_with_jwt(
         &self,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         user_info: &ComputeUserInfo,
         jwt: String,
     ) -> Result<ComputeCredentials, AuthError> {
@@ -161,7 +161,7 @@ impl PoolingBackend {
     #[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)]
     pub(crate) async fn connect_to_compute(
         &self,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         conn_info: ConnInfo,
         keys: ComputeCredentials,
         force_new: bool,
@@ -201,7 +201,7 @@ impl PoolingBackend {
     #[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)]
     pub(crate) async fn connect_to_local_proxy(
         &self,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         conn_info: ConnInfo,
     ) -> Result<http_conn_pool::Client<Send>, HttpConnError> {
         info!("pool: looking for an existing connection");
@@ -249,7 +249,7 @@ impl PoolingBackend {
     #[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)]
     pub(crate) async fn connect_to_local_postgres(
         &self,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         conn_info: ConnInfo,
     ) -> Result<Client<tokio_postgres::Client>, HttpConnError> {
         if let Some(client) = self.local_pool.get(ctx, &conn_info)? {
@@ -490,7 +490,7 @@ impl ConnectMechanism for TokioMechanism {
 
     async fn connect_once(
         &self,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         node_info: &CachedNodeInfo,
         timeout: Duration,
     ) -> Result<Self::Connection, Self::ConnectError> {
@@ -540,7 +540,7 @@ impl ConnectMechanism for HyperMechanism {
 
     async fn connect_once(
         &self,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         node_info: &CachedNodeInfo,
         timeout: Duration,
     ) -> Result<Self::Connection, Self::ConnectError> {
diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs
index 1845603bf7..07ba1ae9af 100644
--- a/proxy/src/serverless/conn_pool.rs
+++ b/proxy/src/serverless/conn_pool.rs
@@ -21,7 +21,7 @@ use {
 use super::conn_pool_lib::{
     Client, ClientDataEnum, ClientInnerCommon, ClientInnerExt, ConnInfo, GlobalConnPool,
 };
-use crate::context::RequestMonitoring;
+use crate::context::RequestContext;
 use crate::control_plane::messages::MetricsAuxInfo;
 use crate::metrics::Metrics;
 
@@ -53,7 +53,7 @@ impl fmt::Display for ConnInfo {
 
 pub(crate) fn poll_client<C: ClientInnerExt>(
     global_pool: Arc<GlobalConnPool<C>>,
-    ctx: &RequestMonitoring,
+    ctx: &RequestContext,
     conn_info: ConnInfo,
     client: C,
     mut connection: tokio_postgres::Connection<Socket, NoTlsStream>,
diff --git a/proxy/src/serverless/conn_pool_lib.rs b/proxy/src/serverless/conn_pool_lib.rs
index 61c39c32c9..fe3c422c3b 100644
--- a/proxy/src/serverless/conn_pool_lib.rs
+++ b/proxy/src/serverless/conn_pool_lib.rs
@@ -15,7 +15,7 @@ use super::conn_pool::ClientDataRemote;
 use super::http_conn_pool::ClientDataHttp;
 use super::local_conn_pool::ClientDataLocal;
 use crate::auth::backend::ComputeUserInfo;
-use crate::context::RequestMonitoring;
+use crate::context::RequestContext;
 use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo};
 use crate::metrics::{HttpEndpointPoolsGuard, Metrics};
 use crate::types::{DbName, EndpointCacheKey, RoleName};
@@ -380,7 +380,7 @@ impl<C: ClientInnerExt> GlobalConnPool<C> {
 
     pub(crate) fn get(
         self: &Arc<Self>,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         conn_info: &ConnInfo,
     ) -> Result<Option<Client<C>>, HttpConnError> {
         let mut client: Option<ClientInnerCommon<C>> = None;
diff --git a/proxy/src/serverless/http_conn_pool.rs b/proxy/src/serverless/http_conn_pool.rs
index a1d4473b01..bc86c4b1cd 100644
--- a/proxy/src/serverless/http_conn_pool.rs
+++ b/proxy/src/serverless/http_conn_pool.rs
@@ -12,7 +12,7 @@ use tracing::{debug, error, info, info_span, Instrument};
 
 use super::backend::HttpConnError;
 use super::conn_pool_lib::{ClientInnerExt, ConnInfo};
-use crate::context::RequestMonitoring;
+use crate::context::RequestContext;
 use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo};
 use crate::metrics::{HttpEndpointPoolsGuard, Metrics};
 use crate::types::EndpointCacheKey;
@@ -212,7 +212,7 @@ impl<C: ClientInnerExt + Clone> GlobalConnPool<C> {
     #[expect(unused_results)]
     pub(crate) fn get(
         self: &Arc<Self>,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         conn_info: &ConnInfo,
     ) -> Result<Option<Client<C>>, HttpConnError> {
         let result: Result<Option<Client<C>>, HttpConnError>;
@@ -280,7 +280,7 @@ impl<C: ClientInnerExt + Clone> GlobalConnPool<C> {
 
 pub(crate) fn poll_http2_client(
     global_pool: Arc<GlobalConnPool<Send>>,
-    ctx: &RequestMonitoring,
+    ctx: &RequestContext,
     conn_info: &ConnInfo,
     client: Send,
     connection: Connect,
diff --git a/proxy/src/serverless/local_conn_pool.rs b/proxy/src/serverless/local_conn_pool.rs
index 99d4329f88..cadcbd7530 100644
--- a/proxy/src/serverless/local_conn_pool.rs
+++ b/proxy/src/serverless/local_conn_pool.rs
@@ -36,7 +36,7 @@ use super::conn_pool_lib::{
     Client, ClientDataEnum, ClientInnerCommon, ClientInnerExt, ConnInfo, DbUserConn,
     EndpointConnPool,
 };
-use crate::context::RequestMonitoring;
+use crate::context::RequestContext;
 use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo};
 use crate::metrics::Metrics;
 
@@ -88,7 +88,7 @@ impl<C: ClientInnerExt> LocalConnPool<C> {
 
     pub(crate) fn get(
         self: &Arc<Self>,
-        ctx: &RequestMonitoring,
+        ctx: &RequestContext,
         conn_info: &ConnInfo,
     ) -> Result<Option<Client<C>>, HttpConnError> {
         let client = self
@@ -159,7 +159,7 @@ impl<C: ClientInnerExt> LocalConnPool<C> {
 #[allow(clippy::too_many_arguments)]
 pub(crate) fn poll_client<C: ClientInnerExt>(
     global_pool: Arc<LocalConnPool<C>>,
-    ctx: &RequestMonitoring,
+    ctx: &RequestContext,
     conn_info: ConnInfo,
     client: C,
     mut connection: tokio_postgres::Connection<Socket, NoTlsStream>,
diff --git a/proxy/src/serverless/mod.rs b/proxy/src/serverless/mod.rs
index cf758855fa..59247f03bf 100644
--- a/proxy/src/serverless/mod.rs
+++ b/proxy/src/serverless/mod.rs
@@ -45,7 +45,7 @@ use utils::http::error::ApiError;
 
 use crate::cancellation::CancellationHandlerMain;
 use crate::config::{ProxyConfig, ProxyProtocolV2};
-use crate::context::RequestMonitoring;
+use crate::context::RequestContext;
 use crate::metrics::Metrics;
 use crate::protocol2::{read_proxy_protocol, ChainRW, ConnectHeader, ConnectionInfo};
 use crate::proxy::run_until_cancelled;
@@ -423,7 +423,7 @@ async fn request_handler(
     if config.http_config.accept_websockets
         && framed_websockets::upgrade::is_upgrade_request(&request)
     {
-        let ctx = RequestMonitoring::new(
+        let ctx = RequestContext::new(
             session_id,
             conn_info,
             crate::metrics::Protocol::Ws,
@@ -458,7 +458,7 @@ async fn request_handler(
         // Return the response so the spawned future can continue.
         Ok(response.map(|b| b.map_err(|x| match x {}).boxed()))
     } else if request.uri().path() == "/sql" && *request.method() == Method::POST {
-        let ctx = RequestMonitoring::new(
+        let ctx = RequestContext::new(
             session_id,
             conn_info,
             crate::metrics::Protocol::Http,
diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs
index f0975617d4..36d8595902 100644
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -34,7 +34,7 @@ use super::json::{json_to_pg_text, pg_text_row_to_json, JsonConversionError};
 use crate::auth::backend::{ComputeCredentialKeys, ComputeUserInfo};
 use crate::auth::{endpoint_sni, ComputeUserInfoParseError};
 use crate::config::{AuthenticationConfig, HttpConfig, ProxyConfig, TlsConfig};
-use crate::context::RequestMonitoring;
+use crate::context::RequestContext;
 use crate::error::{ErrorKind, ReportableError, UserFacingError};
 use crate::metrics::{HttpDirection, Metrics};
 use crate::proxy::{run_until_cancelled, NeonOptions};
@@ -133,7 +133,7 @@ impl UserFacingError for ConnInfoError {
 
 fn get_conn_info(
     config: &'static AuthenticationConfig,
-    ctx: &RequestMonitoring,
+    ctx: &RequestContext,
     headers: &HeaderMap,
     tls: Option<&TlsConfig>,
 ) -> Result<ConnInfoWithAuth, ConnInfoError> {
@@ -240,7 +240,7 @@ fn get_conn_info(
 
 pub(crate) async fn handle(
     config: &'static ProxyConfig,
-    ctx: RequestMonitoring,
+    ctx: RequestContext,
     request: Request<Incoming>,
     backend: Arc<PoolingBackend>,
     cancel: CancellationToken,
@@ -516,7 +516,7 @@ fn map_isolation_level_to_headers(level: IsolationLevel) -> Option<HeaderValue>
 async fn handle_inner(
     cancel: CancellationToken,
     config: &'static ProxyConfig,
-    ctx: &RequestMonitoring,
+    ctx: &RequestContext,
     request: Request<Incoming>,
     backend: Arc<PoolingBackend>,
 ) -> Result<Response<BoxBody<Bytes, hyper::Error>>, SqlOverHttpError> {
@@ -562,7 +562,7 @@ async fn handle_inner(
 async fn handle_db_inner(
     cancel: CancellationToken,
     config: &'static ProxyConfig,
-    ctx: &RequestMonitoring,
+    ctx: &RequestContext,
     request: Request<Incoming>,
     conn_info: ConnInfo,
     auth: AuthData,
@@ -733,7 +733,7 @@ pub(crate) fn uuid_to_header_value(id: Uuid) -> HeaderValue {
 }
 
 async fn handle_auth_broker_inner(
-    ctx: &RequestMonitoring,
+    ctx: &RequestContext,
     request: Request<Incoming>,
     conn_info: ConnInfo,
     jwt: String,
diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs
index ba36116c2c..4088fea835 100644
--- a/proxy/src/serverless/websocket.rs
+++ b/proxy/src/serverless/websocket.rs
@@ -14,7 +14,7 @@ use tracing::warn;
 
 use crate::cancellation::CancellationHandlerMain;
 use crate::config::ProxyConfig;
-use crate::context::RequestMonitoring;
+use crate::context::RequestContext;
 use crate::error::{io_error, ReportableError};
 use crate::metrics::Metrics;
 use crate::proxy::{handle_client, ClientMode, ErrorSource};
@@ -126,7 +126,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AsyncBufRead for WebSocketRw<S> {
 pub(crate) async fn serve_websocket(
     config: &'static ProxyConfig,
     auth_backend: &'static crate::auth::Backend<'static, ()>,
-    ctx: RequestMonitoring,
+    ctx: RequestContext,
     websocket: OnUpgrade,
     cancellation_handler: Arc<CancellationHandlerMain>,
     endpoint_rate_limiter: Arc<EndpointRateLimiter>,

From 593e35027a088cadeb74b8c6e6f08877495986b3 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Wed, 20 Nov 2024 14:57:59 +0000
Subject: [PATCH 17/24] tests: use fewer pageservers in
 test_sharding_split_smoke (#9804)

## Problem

This test uses a gratuitous number of pageservers (16). This works fine
when there are plenty of system resources, but causes issues on test
runners that have limited resources and run many tests concurrently.

Related: https://github.com/neondatabase/neon/issues/9802

## Summary of changes

- Split from 2 shards to 4, instead of 4 to 8
- Don't give every shard a separate pageserver; let two locations share
each pageserver.

Net result is 4 pageservers instead of 16
---
 test_runner/regress/test_sharding.py | 39 ++++++++++------------------
 1 file changed, 14 insertions(+), 25 deletions(-)

diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py
index 0a4a53356d..84737fc81e 100644
--- a/test_runner/regress/test_sharding.py
+++ b/test_runner/regress/test_sharding.py
@@ -515,11 +515,12 @@ def test_sharding_split_smoke(
 
     """
 
-    # We will start with 4 shards and split into 8, then migrate all those
-    # 8 shards onto separate pageservers
-    shard_count = 4
-    split_shard_count = 8
-    neon_env_builder.num_pageservers = split_shard_count * 2
+    # Shard count we start with
+    shard_count = 2
+    # Shard count we split into
+    split_shard_count = 4
+    # We will have 2 shards per pageserver once done (including secondaries)
+    neon_env_builder.num_pageservers = split_shard_count
 
     # 1MiB stripes: enable getting some meaningful data distribution without
     # writing large quantities of data in this test.  The stripe size is given
@@ -591,7 +592,7 @@ def test_sharding_split_smoke(
 
     workload.validate()
 
-    assert len(pre_split_pageserver_ids) == 4
+    assert len(pre_split_pageserver_ids) == shard_count
 
     def shards_on_disk(shard_ids):
         for pageserver in env.pageservers:
@@ -654,9 +655,9 @@ def test_sharding_split_smoke(
     # - shard_count reconciles for the original setup of the tenant
     # - shard_count reconciles for detaching the original secondary locations during split
     # - split_shard_count reconciles during shard splitting, for setting up secondaries.
-    # - shard_count of the child shards will need to fail over to their secondaries
-    # - shard_count of the child shard secondary locations will get moved to emptier nodes
-    expect_reconciles = shard_count * 2 + split_shard_count + shard_count * 2
+    # - split_shard_count/2 of the child shards will need to fail over to their secondaries (since we have 8 shards and 4 pageservers, only 4 will move)
+    expect_reconciles = shard_count * 2 + split_shard_count + split_shard_count / 2
+
     reconcile_ok = env.storage_controller.get_metric_value(
         "storage_controller_reconcile_complete_total", filter={"status": "ok"}
     )
@@ -720,22 +721,10 @@ def test_sharding_split_smoke(
     # dominated by shard count.
     log.info(f"total: {total}")
     assert total == {
-        1: 1,
-        2: 1,
-        3: 1,
-        4: 1,
-        5: 1,
-        6: 1,
-        7: 1,
-        8: 1,
-        9: 1,
-        10: 1,
-        11: 1,
-        12: 1,
-        13: 1,
-        14: 1,
-        15: 1,
-        16: 1,
+        1: 2,
+        2: 2,
+        3: 2,
+        4: 2,
     }
 
     # The controller is not required to lay out the attached locations in any particular way, but

From 67f5f83edcf50e14fb269cd8919bbda3601fdaf0 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Wed, 20 Nov 2024 15:56:14 +0000
Subject: [PATCH 18/24] pageserver: avoid reading SLRU blocks for GC on shards
 >0  (#9423)

## Problem

SLRU blocks, which can add up to several gigabytes, are currently
ingested by all shards, multiplying their capacity cost by the shard
count and slowing down ingest. We do this because all shards need the
SLRU pages to do timestamp->LSN lookup for GC.

Related: https://github.com/neondatabase/neon/issues/7512

## Summary of changes

- On non-zero shards, learn the GC offset from shard 0's index instead
of calculating it.
- Add a test `test_sharding_gc` that exercises this
- Do GC in test_pg_regress as a general smoke test that GC functions run
(e.g. this would fail if we were using SLRUs we didn't have)

In this PR we are still ingesting SLRUs everywhere, but not using them
any more. Part 2 PR (https://github.com/neondatabase/neon/pull/9786)
makes the change to not store them at all.

## Checklist before requesting a review

- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.

## Checklist before merging

- [ ] Do not forget to reformat commit message to not include the above
checklist
---
 .../src/tenant/remote_timeline_client.rs      |  23 ++++
 pageserver/src/tenant/timeline.rs             | 116 +++++++++++++-----
 test_runner/fixtures/remote_storage.py        |  16 ++-
 test_runner/regress/test_pg_regress.py        |   4 +-
 test_runner/regress/test_sharding.py          | 110 ++++++++++++++++-
 5 files changed, 228 insertions(+), 41 deletions(-)

diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index 94f42c7827..b910a40547 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -197,6 +197,7 @@ use utils::backoff::{
     self, exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS,
 };
 use utils::pausable_failpoint;
+use utils::shard::ShardNumber;
 
 use std::collections::{HashMap, VecDeque};
 use std::sync::atomic::{AtomicU32, Ordering};
@@ -2231,6 +2232,28 @@ impl RemoteTimelineClient {
             UploadQueue::Initialized(x) => x.no_pending_work(),
         }
     }
+
+    /// 'foreign' in the sense that it does not belong to this tenant shard.  This method
+    /// is used during GC for other shards to get the index of shard zero.
+    pub(crate) async fn download_foreign_index(
+        &self,
+        shard_number: ShardNumber,
+        cancel: &CancellationToken,
+    ) -> Result<(IndexPart, Generation, std::time::SystemTime), DownloadError> {
+        let foreign_shard_id = TenantShardId {
+            shard_number,
+            shard_count: self.tenant_shard_id.shard_count,
+            tenant_id: self.tenant_shard_id.tenant_id,
+        };
+        download_index_part(
+            &self.storage_impl,
+            &foreign_shard_id,
+            &self.timeline_id,
+            Generation::MAX,
+            cancel,
+        )
+        .await
+    }
 }
 
 pub(crate) struct UploadQueueAccessor<'a> {
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 0eb3de21e9..a4289a222f 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -38,6 +38,7 @@ use pageserver_api::{
     shard::{ShardIdentity, ShardNumber, TenantShardId},
 };
 use rand::Rng;
+use remote_storage::DownloadError;
 use serde_with::serde_as;
 use storage_broker::BrokerClientChannel;
 use tokio::{
@@ -4821,6 +4822,86 @@ impl Timeline {
         Ok(())
     }
 
+    async fn find_gc_time_cutoff(
+        &self,
+        pitr: Duration,
+        cancel: &CancellationToken,
+        ctx: &RequestContext,
+    ) -> Result<Option<Lsn>, PageReconstructError> {
+        debug_assert_current_span_has_tenant_and_timeline_id();
+        if self.shard_identity.is_shard_zero() {
+            // Shard Zero has SLRU data and can calculate the PITR time -> LSN mapping itself
+            let now = SystemTime::now();
+            let time_range = if pitr == Duration::ZERO {
+                humantime::parse_duration(DEFAULT_PITR_INTERVAL).expect("constant is invalid")
+            } else {
+                pitr
+            };
+
+            // If PITR is so large or `now` is so small that this underflows, we will retain no history (highly unexpected case)
+            let time_cutoff = now.checked_sub(time_range).unwrap_or(now);
+            let timestamp = to_pg_timestamp(time_cutoff);
+
+            let time_cutoff = match self.find_lsn_for_timestamp(timestamp, cancel, ctx).await? {
+                LsnForTimestamp::Present(lsn) => Some(lsn),
+                LsnForTimestamp::Future(lsn) => {
+                    // The timestamp is in the future. That sounds impossible,
+                    // but what it really means is that there hasn't been
+                    // any commits since the cutoff timestamp.
+                    //
+                    // In this case we should use the LSN of the most recent commit,
+                    // which is implicitly the last LSN in the log.
+                    debug!("future({})", lsn);
+                    Some(self.get_last_record_lsn())
+                }
+                LsnForTimestamp::Past(lsn) => {
+                    debug!("past({})", lsn);
+                    None
+                }
+                LsnForTimestamp::NoData(lsn) => {
+                    debug!("nodata({})", lsn);
+                    None
+                }
+            };
+            Ok(time_cutoff)
+        } else {
+            // Shards other than shard zero cannot do timestamp->lsn lookups, and must instead learn their GC cutoff
+            // from shard zero's index.  The index doesn't explicitly tell us the time cutoff, but we may assume that
+            // the point up to which shard zero's last_gc_cutoff has advanced will either be the time cutoff, or a
+            // space cutoff that we would also have respected ourselves.
+            match self
+                .remote_client
+                .download_foreign_index(ShardNumber(0), cancel)
+                .await
+            {
+                Ok((index_part, index_generation, _index_mtime)) => {
+                    tracing::info!("GC loaded shard zero metadata (gen {index_generation:?}): latest_gc_cutoff_lsn: {}",
+                        index_part.metadata.latest_gc_cutoff_lsn());
+                    Ok(Some(index_part.metadata.latest_gc_cutoff_lsn()))
+                }
+                Err(DownloadError::NotFound) => {
+                    // This is unexpected, because during timeline creations shard zero persists to remote
+                    // storage before other shards are called, and during timeline deletion non-zeroth shards are
+                    // deleted before the zeroth one.  However, it should be harmless: if we somehow end up in this
+                    // state, then shard zero should _eventually_ write an index when it GCs.
+                    tracing::warn!("GC couldn't find shard zero's index for timeline");
+                    Ok(None)
+                }
+                Err(e) => {
+                    // TODO: this function should return a different error type than page reconstruct error
+                    Err(PageReconstructError::Other(anyhow::anyhow!(e)))
+                }
+            }
+
+            // TODO: after reading shard zero's GC cutoff, we should validate its generation with the storage
+            // controller.  Otherwise, it is possible that we see the GC cutoff go backwards while shard zero
+            // is going through a migration if we read the old location's index and it has GC'd ahead of the
+            // new location.  This is legal in principle, but problematic in practice because it might result
+            // in a timeline creation succeeding on shard zero ('s new location) but then failing on other shards
+            // because they have GC'd past the branch point.
+        }
+    }
+
     /// Find the Lsns above which layer files need to be retained on
     /// garbage collection.
     ///
@@ -4863,40 +4944,7 @@ impl Timeline {
         // - if PITR interval is set, then this is our cutoff.
         // - if PITR interval is not set, then we do a lookup
         //   based on DEFAULT_PITR_INTERVAL, so that size-based retention does not result in keeping history around permanently on idle databases.
-        let time_cutoff = {
-            let now = SystemTime::now();
-            let time_range = if pitr == Duration::ZERO {
-                humantime::parse_duration(DEFAULT_PITR_INTERVAL).expect("constant is invalid")
-            } else {
-                pitr
-            };
-
-            // If PITR is so large or `now` is so small that this underflows, we will retain no history (highly unexpected case)
-            let time_cutoff = now.checked_sub(time_range).unwrap_or(now);
-            let timestamp = to_pg_timestamp(time_cutoff);
-
-            match self.find_lsn_for_timestamp(timestamp, cancel, ctx).await? {
-                LsnForTimestamp::Present(lsn) => Some(lsn),
-                LsnForTimestamp::Future(lsn) => {
-                    // The timestamp is in the future. That sounds impossible,
-                    // but what it really means is that there hasn't been
-                    // any commits since the cutoff timestamp.
-                    //
-                    // In this case we should use the LSN of the most recent commit,
-                    // which is implicitly the last LSN in the log.
-                    debug!("future({})", lsn);
-                    Some(self.get_last_record_lsn())
-                }
-                LsnForTimestamp::Past(lsn) => {
-                    debug!("past({})", lsn);
-                    None
-                }
-                LsnForTimestamp::NoData(lsn) => {
-                    debug!("nodata({})", lsn);
-                    None
-                }
-            }
-        };
+        let time_cutoff = self.find_gc_time_cutoff(pitr, cancel, ctx).await?;
 
         Ok(match (pitr, time_cutoff) {
             (Duration::ZERO, Some(time_cutoff)) => {
diff --git a/test_runner/fixtures/remote_storage.py b/test_runner/fixtures/remote_storage.py
index 7024953661..c630ea98b4 100644
--- a/test_runner/fixtures/remote_storage.py
+++ b/test_runner/fixtures/remote_storage.py
@@ -77,14 +77,16 @@ class MockS3Server:
 class LocalFsStorage:
     root: Path
 
-    def tenant_path(self, tenant_id: TenantId) -> Path:
+    def tenant_path(self, tenant_id: Union[TenantId, TenantShardId]) -> Path:
         return self.root / "tenants" / str(tenant_id)
 
-    def timeline_path(self, tenant_id: TenantId, timeline_id: TimelineId) -> Path:
+    def timeline_path(
+        self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId
+    ) -> Path:
         return self.tenant_path(tenant_id) / "timelines" / str(timeline_id)
 
     def timeline_latest_generation(
-        self, tenant_id: TenantId, timeline_id: TimelineId
+        self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId
     ) -> Optional[int]:
         timeline_files = os.listdir(self.timeline_path(tenant_id, timeline_id))
         index_parts = [f for f in timeline_files if f.startswith("index_part")]
@@ -102,7 +104,9 @@ class LocalFsStorage:
             raise RuntimeError(f"No index_part found for {tenant_id}/{timeline_id}")
         return generations[-1]
 
-    def index_path(self, tenant_id: TenantId, timeline_id: TimelineId) -> Path:
+    def index_path(
+        self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId
+    ) -> Path:
         latest_gen = self.timeline_latest_generation(tenant_id, timeline_id)
         if latest_gen is None:
             filename = TIMELINE_INDEX_PART_FILE_NAME
@@ -126,7 +130,9 @@ class LocalFsStorage:
         filename = f"{local_name}-{generation:08x}"
         return self.timeline_path(tenant_id, timeline_id) / filename
 
-    def index_content(self, tenant_id: TenantId, timeline_id: TimelineId) -> Any:
+    def index_content(
+        self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId
+    ) -> Any:
         with self.index_path(tenant_id, timeline_id).open("r") as f:
             return json.load(f)
 
diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py
index f4698191eb..6a5e388c53 100644
--- a/test_runner/regress/test_pg_regress.py
+++ b/test_runner/regress/test_pg_regress.py
@@ -110,13 +110,15 @@ def post_checks(env: NeonEnv, test_output_dir: Path, db_name: str, endpoint: End
 
     check_restored_datadir_content(test_output_dir, env, endpoint, ignored_files=ignored_files)
 
-    # Ensure that compaction works, on a timeline containing all the diversity that postgres regression tests create.
+    # Ensure that compaction/GC works, on a timeline containing all the diversity that postgres regression tests create.
     # There should have been compactions mid-test as well, this final check is in addition those.
     for shard, pageserver in tenant_get_shards(env, env.initial_tenant):
         pageserver.http_client().timeline_checkpoint(
             shard, env.initial_timeline, force_repartition=True, force_image_layer_creation=True
         )
 
+        pageserver.http_client().timeline_gc(shard, env.initial_timeline, None)
+
 
 # Run the main PostgreSQL regression tests, in src/test/regress.
 #
diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py
index 84737fc81e..3194fe6ec4 100644
--- a/test_runner/regress/test_sharding.py
+++ b/test_runner/regress/test_sharding.py
@@ -19,7 +19,7 @@ from fixtures.neon_fixtures import (
     wait_for_last_flush_lsn,
 )
 from fixtures.pageserver.utils import assert_prefix_empty, assert_prefix_not_empty
-from fixtures.remote_storage import s3_storage
+from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind, s3_storage
 from fixtures.utils import skip_in_debug_build, wait_until
 from fixtures.workload import Workload
 from pytest_httpserver import HTTPServer
@@ -1674,3 +1674,111 @@ def test_top_tenants(neon_env_builder: NeonEnvBuilder):
     )
     assert len(top["shards"]) == n_tenants - 4
     assert set(i["id"] for i in top["shards"]) == set(str(i[0]) for i in tenants[4:])
+
+
+def test_sharding_gc(
+    neon_env_builder: NeonEnvBuilder,
+):
+    """
+    Exercise GC in a sharded tenant: because only shard 0 holds SLRU content, it acts as
+    the "leader" for GC, and other shards read its index to learn what LSN they should
+    GC up to.
+    """
+
+    shard_count = 4
+    neon_env_builder.num_pageservers = shard_count
+    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
+
+    TENANT_CONF = {
+        # small checkpointing and compaction targets to ensure we generate many upload operations
+        "checkpoint_distance": 128 * 1024,
+        "compaction_threshold": 1,
+        "compaction_target_size": 128 * 1024,
+        # A short PITR horizon, so that we won't have to sleep too long in the test to wait for it to
+        # happen.
+        "pitr_interval": "1s",
+        # disable background compaction and GC. We invoke it manually when we want it to happen.
+        "gc_period": "0s",
+        "compaction_period": "0s",
+        # Disable automatic creation of image layers, as we will create them explicitly when we want them
+        "image_creation_threshold": 9999,
+        "image_layer_creation_check_threshold": 0,
+        "lsn_lease_length": "0s",
+    }
+    env = neon_env_builder.init_start(
+        initial_tenant_shard_count=shard_count, initial_tenant_conf=TENANT_CONF
+    )
+
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+
+    # Create a branch and write some data
+    workload = Workload(env, tenant_id, timeline_id)
+    initial_lsn = Lsn(workload.endpoint().safe_psql("SELECT pg_current_wal_lsn()")[0][0])
+    log.info(f"Started at LSN: {initial_lsn}")
+
+    workload.init()
+
+    # Write enough data to generate multiple layers
+    for _i in range(10):
+        last_lsn = workload.write_rows(32)
+
+    assert last_lsn > initial_lsn
+
+    log.info(f"Wrote up to last LSN: {last_lsn}")
+
+    # Do full image layer generation. When we subsequently wait for PITR, all historic deltas
+    # should be GC-able
+    for shard_number in range(shard_count):
+        shard = TenantShardId(tenant_id, shard_number, shard_count)
+        env.get_tenant_pageserver(shard).http_client().timeline_compact(
+            shard, timeline_id, force_image_layer_creation=True
+        )
+
+    workload.churn_rows(32)
+
+    time.sleep(5)
+
+    # Invoke GC on a non-zero shard and verify its GC cutoff LSN does not advance
+    shard_one = TenantShardId(tenant_id, 1, shard_count)
+    env.get_tenant_pageserver(shard_one).http_client().timeline_gc(
+        shard_one, timeline_id, gc_horizon=None
+    )
+
+    # Check shard 1's index - GC cutoff LSN should not have advanced
+    assert isinstance(env.pageserver_remote_storage, LocalFsStorage)
+    shard_1_index = env.pageserver_remote_storage.index_content(
+        tenant_id=shard_one, timeline_id=timeline_id
+    )
+    shard_1_gc_cutoff_lsn = Lsn(shard_1_index["metadata_bytes"]["latest_gc_cutoff_lsn"])
+    log.info(f"Shard 1 cutoff LSN: {shard_1_gc_cutoff_lsn}")
+    assert shard_1_gc_cutoff_lsn <= last_lsn
+
+    shard_zero = TenantShardId(tenant_id, 0, shard_count)
+    env.get_tenant_pageserver(shard_zero).http_client().timeline_gc(
+        shard_zero, timeline_id, gc_horizon=None
+    )
+
+    # TODO: observe that GC LSN of shard 0 has moved forward in remote storage
+    assert isinstance(env.pageserver_remote_storage, LocalFsStorage)
+    shard_0_index = env.pageserver_remote_storage.index_content(
+        tenant_id=shard_zero, timeline_id=timeline_id
+    )
+    shard_0_gc_cutoff_lsn = Lsn(shard_0_index["metadata_bytes"]["latest_gc_cutoff_lsn"])
+    log.info(f"Shard 0 cutoff LSN: {shard_0_gc_cutoff_lsn}")
+    assert shard_0_gc_cutoff_lsn >= last_lsn
+
+    # Invoke GC on all other shards and verify their GC cutoff LSNs
+    for shard_number in range(1, shard_count):
+        shard = TenantShardId(tenant_id, shard_number, shard_count)
+        env.get_tenant_pageserver(shard).http_client().timeline_gc(
+            shard, timeline_id, gc_horizon=None
+        )
+
+        # Verify GC cutoff LSN advanced to match shard 0
+        shard_index = env.pageserver_remote_storage.index_content(
+            tenant_id=shard, timeline_id=timeline_id
+        )
+        shard_gc_cutoff_lsn = Lsn(shard_index["metadata_bytes"]["latest_gc_cutoff_lsn"])
+        log.info(f"Shard {shard_number} cutoff LSN: {shard_gc_cutoff_lsn}")
+        assert shard_gc_cutoff_lsn == shard_0_gc_cutoff_lsn

From 5ff2f1ee7d2b4b2b6dfb017dfecd5ae3f59cb404 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Wed, 20 Nov 2024 17:31:55 +0000
Subject: [PATCH 19/24] pageserver: enable compaction to proceed while
 live-migrating (#5397)

## Problem

Long ago, in #5299, the tenant states for migration were added, but
respected only in a coarse-grained way: when hinted not to do deletions,
tenants will just avoid doing all GC or compaction.

Skipping compaction is not necessary for AttachedMulti, as we will soon
become the primary attached location, and it is not a waste of resources
to proceed with compaction. Instead, per the RFC
(https://github.com/neondatabase/neon/pull/5029/files), deletions should
be queued up in this state, and executed later when we switch to
AttachedSingle.

Avoiding compaction in AttachedMulti can have an operational impact if a
tenant is under significant write load, as a long-running migration can
result in a large accumulation of delta layers with commensurate impact
on read latency.

Closes: https://github.com/neondatabase/neon/issues/5396

## Summary of changes

- Add a 'config' part to RemoteTimelineClient so that it can be aware of
the mode of the tenant it belongs to, and wire this through for
construction + updates
- Add a special buffer for delayed deletions, and when in AttachedMulti
route deletions here instead of into the main remote client queue. This
is drained when transitioning to AttachedSingle. If the tenant is
detached or our process dies before then, then these objects are leaked.
- As a quality of life improvement, also use the remote timeline
client's knowledge of the tenant state to avoid submitting remote
consistent LSN updates for validation when in AttachedStale (as we know
these will fail)

## Checklist before requesting a review

- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.

## Checklist before merging

- [ ] Do not forget to reformat commit message to not include the above
checklist
---
 pageserver/src/tenant.rs                      |  25 +++-
 .../src/tenant/remote_timeline_client.rs      | 122 ++++++++++++++++--
 pageserver/src/tenant/timeline.rs             |   9 +-
 pageserver/src/tenant/timeline/delete.rs      |   2 +-
 pageserver/src/tenant/upload_queue.rs         |   7 +-
 .../regress/test_pageserver_secondary.py      |  24 ++++
 6 files changed, 167 insertions(+), 22 deletions(-)

diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 8e9e3890ba..2e5f69e3c9 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -189,6 +189,7 @@ pub struct TenantSharedResources {
 /// A [`Tenant`] is really an _attached_ tenant.  The configuration
 /// for an attached tenant is a subset of the [`LocationConf`], represented
 /// in this struct.
+#[derive(Clone)]
 pub(super) struct AttachedTenantConf {
     tenant_conf: TenantConfOpt,
     location: AttachedLocationConfig,
@@ -1807,6 +1808,7 @@ impl Tenant {
             self.tenant_shard_id,
             timeline_id,
             self.generation,
+            &self.tenant_conf.load().location,
         )
     }
 
@@ -2527,6 +2529,10 @@ impl Tenant {
         {
             let conf = self.tenant_conf.load();
 
+            // If we may not delete layers, then simply skip GC.  Even though a tenant
+            // in AttachedMulti state could do GC and just enqueue the blocked deletions,
+            // the only advantage to doing it is to perhaps shrink the LayerMap metadata
+            // a bit sooner than we would achieve by waiting for AttachedSingle status.
             if !conf.location.may_delete_layers_hint() {
                 info!("Skipping GC in location state {:?}", conf.location);
                 return Ok(GcResult::default());
@@ -2568,7 +2574,14 @@ impl Tenant {
 
         {
             let conf = self.tenant_conf.load();
-            if !conf.location.may_delete_layers_hint() || !conf.location.may_upload_layers_hint() {
+
+            // Note that compaction usually requires deletions, but we don't respect
+            // may_delete_layers_hint here: that is because tenants in AttachedMulti
+            // should proceed with compaction even if they can't do deletion, to avoid
+            // accumulating dangerously deep stacks of L0 layers.  Deletions will be
+            // enqueued inside RemoteTimelineClient, and executed later if/when we transition
+            // to AttachedSingle state.
+            if !conf.location.may_upload_layers_hint() {
                 info!("Skipping compaction in location state {:?}", conf.location);
                 return Ok(false);
             }
@@ -3446,6 +3459,7 @@ impl Tenant {
         // this race is not possible if both request types come from the storage
         // controller (as they should!) because an exclusive op lock is required
         // on the storage controller side.
+
         self.tenant_conf.rcu(|inner| {
             Arc::new(AttachedTenantConf {
                 tenant_conf: new_tenant_conf.clone(),
@@ -3455,20 +3469,22 @@ impl Tenant {
             })
         });
 
+        let updated = self.tenant_conf.load().clone();
+
         self.tenant_conf_updated(&new_tenant_conf);
         // Don't hold self.timelines.lock() during the notifies.
         // There's no risk of deadlock right now, but there could be if we consolidate
         // mutexes in struct Timeline in the future.
         let timelines = self.list_timelines();
         for timeline in timelines {
-            timeline.tenant_conf_updated(&new_tenant_conf);
+            timeline.tenant_conf_updated(&updated);
         }
     }
 
     pub(crate) fn set_new_location_config(&self, new_conf: AttachedTenantConf) {
         let new_tenant_conf = new_conf.tenant_conf.clone();
 
-        self.tenant_conf.store(Arc::new(new_conf));
+        self.tenant_conf.store(Arc::new(new_conf.clone()));
 
         self.tenant_conf_updated(&new_tenant_conf);
         // Don't hold self.timelines.lock() during the notifies.
@@ -3476,7 +3492,7 @@ impl Tenant {
         // mutexes in struct Timeline in the future.
         let timelines = self.list_timelines();
         for timeline in timelines {
-            timeline.tenant_conf_updated(&new_tenant_conf);
+            timeline.tenant_conf_updated(&new_conf);
         }
     }
 
@@ -4544,6 +4560,7 @@ impl Tenant {
             self.tenant_shard_id,
             timeline_id,
             self.generation,
+            &self.tenant_conf.load().location,
         )
     }
 
diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index b910a40547..377bc23542 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -241,6 +241,7 @@ use utils::id::{TenantId, TimelineId};
 
 use self::index::IndexPart;
 
+use super::config::AttachedLocationConfig;
 use super::metadata::MetadataUpdate;
 use super::storage_layer::{Layer, LayerName, ResidentLayer};
 use super::upload_queue::{NotInitialized, SetDeletedFlagProgress};
@@ -302,6 +303,36 @@ pub enum WaitCompletionError {
 #[derive(Debug, thiserror::Error)]
 #[error("Upload queue either in unexpected state or hasn't downloaded manifest yet")]
 pub struct UploadQueueNotReadyError;
+/// Behavioral modes that enable seamless live migration.
+///
+/// See docs/rfcs/028-pageserver-migration.md to understand how these fit in.
+struct RemoteTimelineClientConfig {
+    /// If this is false, then updates to remote_consistent_lsn are dropped rather
+    /// than being submitted to DeletionQueue for validation.  This behavior is
+    /// used when a tenant attachment is known to have a stale generation number,
+    /// such that validation attempts will always fail.  This is not necessary
+    /// for correctness, but avoids spamming error statistics with failed validations
+    /// when doing migrations of tenants.
+    process_remote_consistent_lsn_updates: bool,
+
+    /// If this is true, then object deletions are held in a buffer in RemoteTimelineClient
+    /// rather than being submitted to the DeletionQueue.  This behavior is used when a tenant
+    /// is known to be multi-attached, in order to avoid disrupting other attached tenants
+    /// whose generations' metadata refers to the deleted objects.
+    block_deletions: bool,
+}
+
+/// RemoteTimelineClientConfig's state is entirely driven by LocationConf, but we do
+/// not carry the entire LocationConf structure: it's much more than we need.  The From
+/// impl extracts the subset of the LocationConf that is interesting to RemoteTimelineClient.
+impl From<&AttachedLocationConfig> for RemoteTimelineClientConfig {
+    fn from(lc: &AttachedLocationConfig) -> Self {
+        Self {
+            block_deletions: !lc.may_delete_layers_hint(),
+            process_remote_consistent_lsn_updates: lc.may_upload_layers_hint(),
+        }
+    }
+}
 
 /// A client for accessing a timeline's data in remote storage.
 ///
@@ -322,7 +353,7 @@ pub struct UploadQueueNotReadyError;
 /// in the index part file, whenever timeline metadata is uploaded.
 ///
 /// Downloads are not queued, they are performed immediately.
-pub struct RemoteTimelineClient {
+pub(crate) struct RemoteTimelineClient {
     conf: &'static PageServerConf,
 
     runtime: tokio::runtime::Handle,
@@ -339,6 +370,9 @@ pub struct RemoteTimelineClient {
 
     deletion_queue_client: DeletionQueueClient,
 
+    /// Subset of tenant configuration used to control upload behaviors during migrations
+    config: std::sync::RwLock<RemoteTimelineClientConfig>,
+
     cancel: CancellationToken,
 }
 
@@ -349,13 +383,14 @@ impl RemoteTimelineClient {
     /// Note: the caller must initialize the upload queue before any uploads can be scheduled,
     /// by calling init_upload_queue.
     ///
-    pub fn new(
+    pub(crate) fn new(
         remote_storage: GenericRemoteStorage,
         deletion_queue_client: DeletionQueueClient,
         conf: &'static PageServerConf,
         tenant_shard_id: TenantShardId,
         timeline_id: TimelineId,
         generation: Generation,
+        location_conf: &AttachedLocationConfig,
     ) -> RemoteTimelineClient {
         RemoteTimelineClient {
             conf,
@@ -375,6 +410,7 @@ impl RemoteTimelineClient {
                 &tenant_shard_id,
                 &timeline_id,
             )),
+            config: std::sync::RwLock::new(RemoteTimelineClientConfig::from(location_conf)),
             cancel: CancellationToken::new(),
         }
     }
@@ -430,6 +466,43 @@ impl RemoteTimelineClient {
         Ok(())
     }
 
+    /// Notify this client of a change to its parent tenant's config, as this may cause us to
+    /// take action (unblocking deletions when transitioning from AttachedMulti to AttachedSingle)
+    pub(super) fn update_config(&self, location_conf: &AttachedLocationConfig) {
+        let new_conf = RemoteTimelineClientConfig::from(location_conf);
+        let unblocked = !new_conf.block_deletions;
+
+        // Update config before draining deletions, so that we don't race with more being
+        // inserted.  This can result in deletions happening out of order, but that does not
+        // violate any invariants: deletions only need to be ordered relative to upload of the index
+        // that dereferences the deleted objects, and we are not changing that order.
+        *self.config.write().unwrap() = new_conf;
+
+        if unblocked {
+            // If we may now delete layers, drain any that were blocked in our old
+            // configuration state
+            let mut queue_locked = self.upload_queue.lock().unwrap();
+
+            if let Ok(queue) = queue_locked.initialized_mut() {
+                let blocked_deletions = std::mem::take(&mut queue.blocked_deletions);
+                for d in blocked_deletions {
+                    if let Err(e) = self.deletion_queue_client.push_layers_sync(
+                        self.tenant_shard_id,
+                        self.timeline_id,
+                        self.generation,
+                        d.layers,
+                    ) {
+                        // This could happen if the pageserver is shut down while a tenant
+                        // is transitioning from a deletion-blocked state: we will leak some
+                        // S3 objects in this case.
+                        warn!("Failed to drain blocked deletions: {}", e);
+                        break;
+                    }
+                }
+            }
+        }
+    }
+
     /// Returns `None` if nothing is yet uplodaded, `Some(disk_consistent_lsn)` otherwise.
     pub fn remote_consistent_lsn_projected(&self) -> Option<Lsn> {
         match &mut *self.upload_queue.lock().unwrap() {
@@ -1913,16 +1986,24 @@ impl RemoteTimelineClient {
                     res
                 }
                 UploadOp::Delete(delete) => {
-                    pausable_failpoint!("before-delete-layer-pausable");
-                    self.deletion_queue_client
-                        .push_layers(
-                            self.tenant_shard_id,
-                            self.timeline_id,
-                            self.generation,
-                            delete.layers.clone(),
-                        )
-                        .await
-                        .map_err(|e| anyhow::anyhow!(e))
+                    if self.config.read().unwrap().block_deletions {
+                        let mut queue_locked = self.upload_queue.lock().unwrap();
+                        if let Ok(queue) = queue_locked.initialized_mut() {
+                            queue.blocked_deletions.push(delete.clone());
+                        }
+                        Ok(())
+                    } else {
+                        pausable_failpoint!("before-delete-layer-pausable");
+                        self.deletion_queue_client
+                            .push_layers(
+                                self.tenant_shard_id,
+                                self.timeline_id,
+                                self.generation,
+                                delete.layers.clone(),
+                            )
+                            .await
+                            .map_err(|e| anyhow::anyhow!(e))
+                    }
                 }
                 unexpected @ UploadOp::Barrier(_) | unexpected @ UploadOp::Shutdown => {
                     // unreachable. Barrier operations are handled synchronously in
@@ -2029,8 +2110,16 @@ impl RemoteTimelineClient {
                         // Legacy mode: skip validating generation
                         upload_queue.visible_remote_consistent_lsn.store(lsn);
                         None
-                    } else {
+                    } else if self
+                        .config
+                        .read()
+                        .unwrap()
+                        .process_remote_consistent_lsn_updates
+                    {
                         Some((lsn, upload_queue.visible_remote_consistent_lsn.clone()))
+                    } else {
+                        // Our config disables remote_consistent_lsn updates: drop it.
+                        None
                     }
                 }
                 UploadOp::Delete(_) => {
@@ -2167,6 +2256,7 @@ impl RemoteTimelineClient {
                         queued_operations: VecDeque::default(),
                         #[cfg(feature = "testing")]
                         dangling_files: HashMap::default(),
+                        blocked_deletions: Vec::new(),
                         shutting_down: false,
                         shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)),
                     };
@@ -2402,6 +2492,7 @@ mod tests {
     use crate::{
         context::RequestContext,
         tenant::{
+            config::AttachmentMode,
             harness::{TenantHarness, TIMELINE_ID},
             storage_layer::layer::local_layer_path,
             Tenant, Timeline,
@@ -2487,6 +2578,10 @@ mod tests {
 
         /// Construct a RemoteTimelineClient in an arbitrary generation
         fn build_client(&self, generation: Generation) -> Arc<RemoteTimelineClient> {
+            let location_conf = AttachedLocationConfig {
+                generation,
+                attach_mode: AttachmentMode::Single,
+            };
             Arc::new(RemoteTimelineClient {
                 conf: self.harness.conf,
                 runtime: tokio::runtime::Handle::current(),
@@ -2500,6 +2595,7 @@ mod tests {
                     &self.harness.tenant_shard_id,
                     &TIMELINE_ID,
                 )),
+                config: std::sync::RwLock::new(RemoteTimelineClientConfig::from(&location_conf)),
                 cancel: CancellationToken::new(),
             })
         }
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index a4289a222f..95864af4d0 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -273,7 +273,7 @@ pub struct Timeline {
 
     /// Remote storage client.
     /// See [`remote_timeline_client`](super::remote_timeline_client) module comment for details.
-    pub remote_client: Arc<RemoteTimelineClient>,
+    pub(crate) remote_client: Arc<RemoteTimelineClient>,
 
     // What page versions do we hold in the repository? If we get a
     // request > last_record_lsn, we need to wait until we receive all
@@ -2172,14 +2172,14 @@ impl Timeline {
             )
     }
 
-    pub(super) fn tenant_conf_updated(&self, new_conf: &TenantConfOpt) {
+    pub(super) fn tenant_conf_updated(&self, new_conf: &AttachedTenantConf) {
         // NB: Most tenant conf options are read by background loops, so,
         // changes will automatically be picked up.
 
         // The threshold is embedded in the metric. So, we need to update it.
         {
             let new_threshold = Self::get_evictions_low_residence_duration_metric_threshold(
-                new_conf,
+                &new_conf.tenant_conf,
                 &self.conf.default_tenant_conf,
             );
 
@@ -2187,6 +2187,9 @@ impl Timeline {
             let shard_id_str = format!("{}", self.tenant_shard_id.shard_slug());
 
             let timeline_id_str = self.timeline_id.to_string();
+
+            self.remote_client.update_config(&new_conf.location);
+
             self.metrics
                 .evictions_with_low_residence_duration
                 .write()
diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs
index 13a8dfa51a..67fc710c44 100644
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -283,7 +283,7 @@ impl DeleteTimelineFlow {
 
     /// Shortcut to create Timeline in stopping state and spawn deletion task.
     #[instrument(skip_all, fields(%timeline_id))]
-    pub async fn resume_deletion(
+    pub(crate) async fn resume_deletion(
         tenant: Arc<Tenant>,
         timeline_id: TimelineId,
         local_metadata: &TimelineMetadata,
diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs
index 592f41cb21..f14bf2f8c3 100644
--- a/pageserver/src/tenant/upload_queue.rs
+++ b/pageserver/src/tenant/upload_queue.rs
@@ -88,6 +88,9 @@ pub(crate) struct UploadQueueInitialized {
     #[cfg(feature = "testing")]
     pub(crate) dangling_files: HashMap<LayerName, Generation>,
 
+    /// Deletions that are blocked by the tenant configuration
+    pub(crate) blocked_deletions: Vec<Delete>,
+
     /// Set to true when we have inserted the `UploadOp::Shutdown` into the `inprogress_tasks`.
     pub(crate) shutting_down: bool,
 
@@ -180,6 +183,7 @@ impl UploadQueue {
             queued_operations: VecDeque::new(),
             #[cfg(feature = "testing")]
             dangling_files: HashMap::new(),
+            blocked_deletions: Vec::new(),
             shutting_down: false,
             shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)),
         };
@@ -220,6 +224,7 @@ impl UploadQueue {
             queued_operations: VecDeque::new(),
             #[cfg(feature = "testing")]
             dangling_files: HashMap::new(),
+            blocked_deletions: Vec::new(),
             shutting_down: false,
             shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)),
         };
@@ -270,7 +275,7 @@ pub(crate) struct UploadTask {
 
 /// A deletion of some layers within the lifetime of a timeline.  This is not used
 /// for timeline deletion, which skips this queue and goes directly to DeletionQueue.
-#[derive(Debug)]
+#[derive(Debug, Clone)]
 pub(crate) struct Delete {
     pub(crate) layers: Vec<(LayerName, LayerFileMetadata)>,
 }
diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py
index d4aef96735..12134048e6 100644
--- a/test_runner/regress/test_pageserver_secondary.py
+++ b/test_runner/regress/test_pageserver_secondary.py
@@ -365,6 +365,19 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder):
     workload.validate(pageserver_a.id)
     workload.validate(pageserver_b.id)
 
+    # Force compaction on destination pageserver
+    pageserver_b.http_client().timeline_compact(tenant_id, timeline_id, force_l0_compaction=True)
+
+    # Destination pageserver is in AttachedMulti, it should have generated deletions but
+    # not enqueued them yet.
+    # Check deletion metrics via prometheus - should be 0 since we're in AttachedMulti
+    assert (
+        pageserver_b.http_client().get_metric_value(
+            "pageserver_deletion_queue_submitted_total",
+        )
+        == 0
+    )
+
     # Revert the origin to secondary
     log.info("Setting origin to Secondary")
     pageserver_a.tenant_location_configure(
@@ -389,6 +402,17 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder):
         },
     )
 
+    # Transition to AttachedSingle should have drained deletions generated by doing a compaction
+    # while in AttachedMulti.
+    def blocked_deletions_drained():
+        submitted = pageserver_b.http_client().get_metric_value(
+            "pageserver_deletion_queue_submitted_total"
+        )
+        assert submitted is not None
+        assert submitted > 0
+
+    wait_until(10, 0.1, blocked_deletions_drained)
+
     workload.churn_rows(64, pageserver_b.id)
     workload.validate(pageserver_b.id)
     del workload

From f36f0068b83bd536d33c49b238d964dcd96c9479 Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Wed, 20 Nov 2024 17:50:39 +0000
Subject: [PATCH 20/24] chore(proxy): demote more logs during successful
 connection attempts (#9828)

Follow up to #9803

See https://github.com/neondatabase/cloud/issues/14378

In collaboration with @cloneable and @awarus, we sifted through logs and
simply demoted some logs to debug. This is not at all finished and there
are more logs to review, but we ran out of time in the session we
organised. In any slightly more nuanced cases, we didn't touch the log,
instead leaving a TODO comment.

I've also slightly refactored the sql-over-http body read/length reject
code. I can split that into a separate PR. It just felt natural after I
switched to `read_body_with_limit` as we discussed during the meet.
---
 libs/pq_proto/src/lib.rs                      |  1 +
 proxy/src/bin/local_proxy.rs                  |  2 +-
 proxy/src/bin/proxy.rs                        |  2 +-
 proxy/src/config.rs                           |  2 +-
 proxy/src/control_plane/client/neon.rs        |  1 +
 proxy/src/http/mod.rs                         | 10 +--
 proxy/src/proxy/connect_compute.rs            | 15 +++--
 proxy/src/proxy/copy_bidirectional.rs         |  2 +
 proxy/src/proxy/handshake.rs                  | 10 ++-
 proxy/src/proxy/mod.rs                        |  2 +-
 proxy/src/proxy/passthrough.rs                |  4 +-
 proxy/src/proxy/wake_compute.rs               |  7 +-
 proxy/src/rate_limiter/limit_algorithm.rs     |  6 +-
 .../src/rate_limiter/limit_algorithm/aimd.rs  | 24 ++++---
 proxy/src/redis/cancellation_publisher.rs     |  3 +-
 proxy/src/serverless/backend.rs               |  8 +--
 proxy/src/serverless/conn_pool.rs             |  2 +-
 proxy/src/serverless/conn_pool_lib.rs         |  4 +-
 proxy/src/serverless/http_conn_pool.rs        |  2 +-
 proxy/src/serverless/local_conn_pool.rs       |  4 +-
 proxy/src/serverless/sql_over_http.rs         | 66 +++++++++++--------
 21 files changed, 104 insertions(+), 73 deletions(-)

diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs
index 9ffaaba584..b9e5387d86 100644
--- a/libs/pq_proto/src/lib.rs
+++ b/libs/pq_proto/src/lib.rs
@@ -184,6 +184,7 @@ pub struct CancelKeyData {
 
 impl fmt::Display for CancelKeyData {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        // TODO: this is producing strange results, with 0xffffffff........ always in the logs.
         let hi = (self.backend_pid as u64) << 32;
         let lo = self.cancel_key as u64;
         let id = hi | lo;
diff --git a/proxy/src/bin/local_proxy.rs b/proxy/src/bin/local_proxy.rs
index c4ec1300f2..968682cf0f 100644
--- a/proxy/src/bin/local_proxy.rs
+++ b/proxy/src/bin/local_proxy.rs
@@ -111,7 +111,7 @@ struct SqlOverHttpArgs {
     sql_over_http_cancel_set_shards: usize,
 
     #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB
-    sql_over_http_max_request_size_bytes: u64,
+    sql_over_http_max_request_size_bytes: usize,
 
     #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB
     sql_over_http_max_response_size_bytes: usize,
diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs
index 232721338d..45fbe4a398 100644
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -276,7 +276,7 @@ struct SqlOverHttpArgs {
     sql_over_http_cancel_set_shards: usize,
 
     #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB
-    sql_over_http_max_request_size_bytes: u64,
+    sql_over_http_max_request_size_bytes: usize,
 
     #[clap(long, default_value_t = 10 * 1024 * 1024)] // 10 MiB
     sql_over_http_max_response_size_bytes: usize,
diff --git a/proxy/src/config.rs b/proxy/src/config.rs
index b048c9d389..8bc8e3f96f 100644
--- a/proxy/src/config.rs
+++ b/proxy/src/config.rs
@@ -64,7 +64,7 @@ pub struct HttpConfig {
     pub pool_options: GlobalConnPoolOptions,
     pub cancel_set: CancelSet,
     pub client_conn_threshold: u64,
-    pub max_request_size_bytes: u64,
+    pub max_request_size_bytes: usize,
     pub max_response_size_bytes: usize,
 }
 
diff --git a/proxy/src/control_plane/client/neon.rs b/proxy/src/control_plane/client/neon.rs
index 53f9234926..757ea6720a 100644
--- a/proxy/src/control_plane/client/neon.rs
+++ b/proxy/src/control_plane/client/neon.rs
@@ -380,6 +380,7 @@ impl super::ControlPlaneApi for NeonControlPlaneClient {
         // after getting back a permit - it's possible the cache was filled
         // double check
         if permit.should_check_cache() {
+            // TODO: if there is something in the cache, mark the permit as success.
             check_cache!();
         }
 
diff --git a/proxy/src/http/mod.rs b/proxy/src/http/mod.rs
index b1642cedb3..ed88c77256 100644
--- a/proxy/src/http/mod.rs
+++ b/proxy/src/http/mod.rs
@@ -122,18 +122,18 @@ impl Endpoint {
 }
 
 #[derive(Error, Debug)]
-pub(crate) enum ReadBodyError {
+pub(crate) enum ReadBodyError<E> {
     #[error("Content length exceeds limit of {limit} bytes")]
     BodyTooLarge { limit: usize },
 
     #[error(transparent)]
-    Read(#[from] reqwest::Error),
+    Read(#[from] E),
 }
 
-pub(crate) async fn read_body_with_limit(
-    mut b: impl Body<Data = Bytes, Error = reqwest::Error> + Unpin,
+pub(crate) async fn read_body_with_limit<E>(
+    mut b: impl Body<Data = Bytes, Error = E> + Unpin,
     limit: usize,
-) -> Result<Vec<u8>, ReadBodyError> {
+) -> Result<Vec<u8>, ReadBodyError<E>> {
     // We could use `b.limited().collect().await.to_bytes()` here
     // but this ends up being slightly more efficient as far as I can tell.
 
diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs
index b30aec09c1..2e759b0894 100644
--- a/proxy/src/proxy/connect_compute.rs
+++ b/proxy/src/proxy/connect_compute.rs
@@ -117,7 +117,6 @@ where
     node_info.set_keys(user_info.get_keys());
     node_info.allow_self_signed_compute = allow_self_signed_compute;
     mechanism.update_connect_config(&mut node_info.config);
-    let retry_type = RetryType::ConnectToCompute;
 
     // try once
     let err = match mechanism
@@ -129,7 +128,7 @@ where
             Metrics::get().proxy.retries_metric.observe(
                 RetriesMetricGroup {
                     outcome: ConnectOutcome::Success,
-                    retry_type,
+                    retry_type: RetryType::ConnectToCompute,
                 },
                 num_retries.into(),
             );
@@ -147,7 +146,7 @@ where
             Metrics::get().proxy.retries_metric.observe(
                 RetriesMetricGroup {
                     outcome: ConnectOutcome::Failed,
-                    retry_type,
+                    retry_type: RetryType::ConnectToCompute,
                 },
                 num_retries.into(),
             );
@@ -156,8 +155,9 @@ where
         node_info
     } else {
         // if we failed to connect, it's likely that the compute node was suspended, wake a new compute node
-        info!("compute node's state has likely changed; requesting a wake-up");
+        debug!("compute node's state has likely changed; requesting a wake-up");
         let old_node_info = invalidate_cache(node_info);
+        // TODO: increment num_retries?
         let mut node_info =
             wake_compute(&mut num_retries, ctx, user_info, wake_compute_retry_config).await?;
         node_info.reuse_settings(old_node_info);
@@ -169,7 +169,7 @@ where
     // now that we have a new node, try connect to it repeatedly.
     // this can error for a few reasons, for instance:
     // * DNS connection settings haven't quite propagated yet
-    info!("wake_compute success. attempting to connect");
+    debug!("wake_compute success. attempting to connect");
     num_retries = 1;
     loop {
         match mechanism
@@ -181,10 +181,11 @@ where
                 Metrics::get().proxy.retries_metric.observe(
                     RetriesMetricGroup {
                         outcome: ConnectOutcome::Success,
-                        retry_type,
+                        retry_type: RetryType::ConnectToCompute,
                     },
                     num_retries.into(),
                 );
+                // TODO: is this necessary? We have a metric.
                 info!(?num_retries, "connected to compute node after");
                 return Ok(res);
             }
@@ -194,7 +195,7 @@ where
                     Metrics::get().proxy.retries_metric.observe(
                         RetriesMetricGroup {
                             outcome: ConnectOutcome::Failed,
-                            retry_type,
+                            retry_type: RetryType::ConnectToCompute,
                         },
                         num_retries.into(),
                     );
diff --git a/proxy/src/proxy/copy_bidirectional.rs b/proxy/src/proxy/copy_bidirectional.rs
index 91a3ceff75..4e4af88634 100644
--- a/proxy/src/proxy/copy_bidirectional.rs
+++ b/proxy/src/proxy/copy_bidirectional.rs
@@ -87,6 +87,8 @@ where
             transfer_one_direction(cx, &mut compute_to_client, compute, client)
                 .map_err(ErrorSource::from_compute)?;
 
+        // TODO: 1 info log, with an enum label for close direction.
+
         // Early termination checks from compute to client.
         if let TransferState::Done(_) = compute_to_client {
             if let TransferState::Running(buf) = &client_to_compute {
diff --git a/proxy/src/proxy/handshake.rs b/proxy/src/proxy/handshake.rs
index 3ada3a9995..e27c211932 100644
--- a/proxy/src/proxy/handshake.rs
+++ b/proxy/src/proxy/handshake.rs
@@ -5,7 +5,7 @@ use pq_proto::{
 };
 use thiserror::Error;
 use tokio::io::{AsyncRead, AsyncWrite};
-use tracing::{info, warn};
+use tracing::{debug, info, warn};
 
 use crate::auth::endpoint_sni;
 use crate::config::{TlsConfig, PG_ALPN_PROTOCOL};
@@ -199,6 +199,8 @@ pub(crate) async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
                         .await?;
                 }
 
+                // This log highlights the start of the connection.
+                // This contains useful information for debugging, not logged elsewhere, like role name and endpoint id.
                 info!(
                     ?version,
                     ?params,
@@ -211,7 +213,7 @@ pub(crate) async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
             FeStartupPacket::StartupMessage { params, version }
                 if version.major() == 3 && version > PG_PROTOCOL_LATEST =>
             {
-                warn!(?version, "unsupported minor version");
+                debug!(?version, "unsupported minor version");
 
                 // no protocol extensions are supported.
                 // <https://github.com/postgres/postgres/blob/ca481d3c9ab7bf69ff0c8d71ad3951d407f6a33c/src/backend/tcop/backend_startup.c#L744-L753>
@@ -233,14 +235,16 @@ pub(crate) async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
 
                 info!(
                     ?version,
+                    ?params,
                     session_type = "normal",
                     "successful handshake; unsupported minor version requested"
                 );
                 break Ok(HandshakeData::Startup(stream, params));
             }
-            FeStartupPacket::StartupMessage { version, .. } => {
+            FeStartupPacket::StartupMessage { version, params } => {
                 warn!(
                     ?version,
+                    ?params,
                     session_type = "normal",
                     "unsuccessful handshake; unsupported version"
                 );
diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs
index 4be4006d15..9415b54a4a 100644
--- a/proxy/src/proxy/mod.rs
+++ b/proxy/src/proxy/mod.rs
@@ -254,7 +254,7 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
     endpoint_rate_limiter: Arc<EndpointRateLimiter>,
     conn_gauge: NumClientConnectionsGuard<'static>,
 ) -> Result<Option<ProxyPassthrough<CancellationHandlerMainInternal, S>>, ClientRequestError> {
-    info!(
+    debug!(
         protocol = %ctx.protocol(),
         "handling interactive connection from client"
     );
diff --git a/proxy/src/proxy/passthrough.rs b/proxy/src/proxy/passthrough.rs
index e3b4730982..5e07c8eeae 100644
--- a/proxy/src/proxy/passthrough.rs
+++ b/proxy/src/proxy/passthrough.rs
@@ -1,5 +1,5 @@
 use tokio::io::{AsyncRead, AsyncWrite};
-use tracing::info;
+use tracing::debug;
 use utils::measured_stream::MeasuredStream;
 
 use super::copy_bidirectional::ErrorSource;
@@ -45,7 +45,7 @@ pub(crate) async fn proxy_pass(
     );
 
     // Starting from here we only proxy the client's traffic.
-    info!("performing the proxy pass...");
+    debug!("performing the proxy pass...");
     let _ = crate::proxy::copy_bidirectional::copy_bidirectional_client_compute(
         &mut client,
         &mut compute,
diff --git a/proxy/src/proxy/wake_compute.rs b/proxy/src/proxy/wake_compute.rs
index d09e0b1f41..8a672d48dc 100644
--- a/proxy/src/proxy/wake_compute.rs
+++ b/proxy/src/proxy/wake_compute.rs
@@ -17,7 +17,6 @@ pub(crate) async fn wake_compute<B: ComputeConnectBackend>(
     api: &B,
     config: RetryConfig,
 ) -> Result<CachedNodeInfo, WakeComputeError> {
-    let retry_type = RetryType::WakeCompute;
     loop {
         match api.wake_compute(ctx).await {
             Err(e) if !should_retry(&e, *num_retries, config) => {
@@ -26,7 +25,7 @@ pub(crate) async fn wake_compute<B: ComputeConnectBackend>(
                 Metrics::get().proxy.retries_metric.observe(
                     RetriesMetricGroup {
                         outcome: ConnectOutcome::Failed,
-                        retry_type,
+                        retry_type: RetryType::WakeCompute,
                     },
                     (*num_retries).into(),
                 );
@@ -40,10 +39,12 @@ pub(crate) async fn wake_compute<B: ComputeConnectBackend>(
                 Metrics::get().proxy.retries_metric.observe(
                     RetriesMetricGroup {
                         outcome: ConnectOutcome::Success,
-                        retry_type,
+                        retry_type: RetryType::WakeCompute,
                     },
                     (*num_retries).into(),
                 );
+                // TODO: is this necessary? We have a metric.
+                // TODO: this log line is misleading as "wake_compute" might return cached (and stale) info.
                 info!(?num_retries, "compute node woken up after");
                 return Ok(n);
             }
diff --git a/proxy/src/rate_limiter/limit_algorithm.rs b/proxy/src/rate_limiter/limit_algorithm.rs
index 16c398f303..b74a9ab17e 100644
--- a/proxy/src/rate_limiter/limit_algorithm.rs
+++ b/proxy/src/rate_limiter/limit_algorithm.rs
@@ -195,7 +195,11 @@ impl DynamicLimiter {
     ///
     /// Set the outcome to `None` to ignore the job.
     fn release_inner(&self, start: Instant, outcome: Option<Outcome>) {
-        tracing::info!("outcome is {:?}", outcome);
+        if outcome.is_none() {
+            tracing::warn!("outcome is {:?}", outcome);
+        } else {
+            tracing::debug!("outcome is {:?}", outcome);
+        }
         if self.config.initial_limit == 0 {
             return;
         }
diff --git a/proxy/src/rate_limiter/limit_algorithm/aimd.rs b/proxy/src/rate_limiter/limit_algorithm/aimd.rs
index 5332a5184f..3000cc4c2a 100644
--- a/proxy/src/rate_limiter/limit_algorithm/aimd.rs
+++ b/proxy/src/rate_limiter/limit_algorithm/aimd.rs
@@ -31,26 +31,32 @@ impl LimitAlgorithm for Aimd {
 
                 if utilisation > self.utilisation {
                     let limit = old_limit + self.inc;
-                    let increased_limit = limit.clamp(self.min, self.max);
-                    if increased_limit > old_limit {
-                        tracing::info!(increased_limit, "limit increased");
+                    let new_limit = limit.clamp(self.min, self.max);
+                    if new_limit > old_limit {
+                        tracing::info!(old_limit, new_limit, "limit increased");
+                    } else {
+                        tracing::debug!(old_limit, new_limit, "limit clamped at max");
                     }
 
-                    increased_limit
+                    new_limit
                 } else {
                     old_limit
                 }
             }
             Outcome::Overload => {
-                let limit = old_limit as f32 * self.dec;
+                let new_limit = old_limit as f32 * self.dec;
 
                 // Floor instead of round, so the limit reduces even with small numbers.
                 // E.g. round(2 * 0.9) = 2, but floor(2 * 0.9) = 1
-                let limit = limit.floor() as usize;
+                let new_limit = new_limit.floor() as usize;
 
-                let limit = limit.clamp(self.min, self.max);
-                tracing::info!(limit, "limit decreased");
-                limit
+                let new_limit = new_limit.clamp(self.min, self.max);
+                if new_limit < old_limit {
+                    tracing::info!(old_limit, new_limit, "limit decreased");
+                } else {
+                    tracing::debug!(old_limit, new_limit, "limit clamped at min");
+                }
+                new_limit
             }
         }
     }
diff --git a/proxy/src/redis/cancellation_publisher.rs b/proxy/src/redis/cancellation_publisher.rs
index 0000246971..7392b0d316 100644
--- a/proxy/src/redis/cancellation_publisher.rs
+++ b/proxy/src/redis/cancellation_publisher.rs
@@ -121,6 +121,7 @@ impl RedisPublisherClient {
         cancel_key_data: CancelKeyData,
         session_id: Uuid,
     ) -> anyhow::Result<()> {
+        // TODO: review redundant error duplication logs.
         if !self.limiter.check() {
             tracing::info!("Rate limit exceeded. Skipping cancellation message");
             return Err(anyhow::anyhow!("Rate limit exceeded"));
@@ -146,7 +147,7 @@ impl CancellationPublisherMut for RedisPublisherClient {
         tracing::info!("publishing cancellation key to Redis");
         match self.try_publish_internal(cancel_key_data, session_id).await {
             Ok(()) => {
-                tracing::info!("cancellation key successfuly published to Redis");
+                tracing::debug!("cancellation key successfuly published to Redis");
                 Ok(())
             }
             Err(e) => {
diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs
index d9dcf6fbb7..7df978f84c 100644
--- a/proxy/src/serverless/backend.rs
+++ b/proxy/src/serverless/backend.rs
@@ -167,10 +167,10 @@ impl PoolingBackend {
         force_new: bool,
     ) -> Result<Client<tokio_postgres::Client>, HttpConnError> {
         let maybe_client = if force_new {
-            info!("pool: pool is disabled");
+            debug!("pool: pool is disabled");
             None
         } else {
-            info!("pool: looking for an existing connection");
+            debug!("pool: looking for an existing connection");
             self.pool.get(ctx, &conn_info)?
         };
 
@@ -204,14 +204,14 @@ impl PoolingBackend {
         ctx: &RequestContext,
         conn_info: ConnInfo,
     ) -> Result<http_conn_pool::Client<Send>, HttpConnError> {
-        info!("pool: looking for an existing connection");
+        debug!("pool: looking for an existing connection");
         if let Ok(Some(client)) = self.http_conn_pool.get(ctx, &conn_info) {
             return Ok(client);
         }
 
         let conn_id = uuid::Uuid::new_v4();
         tracing::Span::current().record("conn_id", display(conn_id));
-        info!(%conn_id, "pool: opening a new connection '{conn_info}'");
+        debug!(%conn_id, "pool: opening a new connection '{conn_info}'");
         let backend = self.auth_backend.as_ref().map(|()| ComputeCredentials {
             info: ComputeUserInfo {
                 user: conn_info.user_info.user.clone(),
diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs
index 07ba1ae9af..f716326a68 100644
--- a/proxy/src/serverless/conn_pool.rs
+++ b/proxy/src/serverless/conn_pool.rs
@@ -243,7 +243,7 @@ mod tests {
             },
             cancel_set: CancelSet::new(0),
             client_conn_threshold: u64::MAX,
-            max_request_size_bytes: u64::MAX,
+            max_request_size_bytes: usize::MAX,
             max_response_size_bytes: usize::MAX,
         }));
         let pool = GlobalConnPool::new(config);
diff --git a/proxy/src/serverless/conn_pool_lib.rs b/proxy/src/serverless/conn_pool_lib.rs
index fe3c422c3b..c5db025870 100644
--- a/proxy/src/serverless/conn_pool_lib.rs
+++ b/proxy/src/serverless/conn_pool_lib.rs
@@ -232,7 +232,7 @@ impl<C: ClientInnerExt> EndpointConnPool<C> {
 
         // do logging outside of the mutex
         if returned {
-            info!(%conn_id, "{pool_name}: returning connection '{conn_info}' back to the pool, total_conns={total_conns}, for this (db, user)={per_db_size}");
+            debug!(%conn_id, "{pool_name}: returning connection '{conn_info}' back to the pool, total_conns={total_conns}, for this (db, user)={per_db_size}");
         } else {
             info!(%conn_id, "{pool_name}: throwing away connection '{conn_info}' because pool is full, total_conns={total_conns}");
         }
@@ -409,7 +409,7 @@ impl<C: ClientInnerExt> GlobalConnPool<C> {
                 "pid",
                 tracing::field::display(client.inner.get_process_id()),
             );
-            info!(
+            debug!(
                 cold_start_info = ColdStartInfo::HttpPoolHit.as_str(),
                 "pool: reusing connection '{conn_info}'"
             );
diff --git a/proxy/src/serverless/http_conn_pool.rs b/proxy/src/serverless/http_conn_pool.rs
index bc86c4b1cd..e9455420c0 100644
--- a/proxy/src/serverless/http_conn_pool.rs
+++ b/proxy/src/serverless/http_conn_pool.rs
@@ -227,7 +227,7 @@ impl<C: ClientInnerExt + Clone> GlobalConnPool<C> {
         };
 
         tracing::Span::current().record("conn_id", tracing::field::display(client.conn_id));
-        info!(
+        debug!(
             cold_start_info = ColdStartInfo::HttpPoolHit.as_str(),
             "pool: reusing connection '{conn_info}'"
         );
diff --git a/proxy/src/serverless/local_conn_pool.rs b/proxy/src/serverless/local_conn_pool.rs
index cadcbd7530..310af08221 100644
--- a/proxy/src/serverless/local_conn_pool.rs
+++ b/proxy/src/serverless/local_conn_pool.rs
@@ -29,7 +29,7 @@ use tokio_postgres::tls::NoTlsStream;
 use tokio_postgres::types::ToSql;
 use tokio_postgres::{AsyncMessage, Socket};
 use tokio_util::sync::CancellationToken;
-use tracing::{error, info, info_span, warn, Instrument};
+use tracing::{debug, error, info, info_span, warn, Instrument};
 
 use super::backend::HttpConnError;
 use super::conn_pool_lib::{
@@ -110,7 +110,7 @@ impl<C: ClientInnerExt> LocalConnPool<C> {
                 "pid",
                 tracing::field::display(client.inner.get_process_id()),
             );
-            info!(
+            debug!(
                 cold_start_info = ColdStartInfo::HttpPoolHit.as_str(),
                 "local_pool: reusing connection '{conn_info}'"
             );
diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs
index 36d8595902..ab75086884 100644
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -8,7 +8,7 @@ use http::header::AUTHORIZATION;
 use http::Method;
 use http_body_util::combinators::BoxBody;
 use http_body_util::{BodyExt, Full};
-use hyper::body::{Body, Incoming};
+use hyper::body::Incoming;
 use hyper::http::{HeaderName, HeaderValue};
 use hyper::{header, HeaderMap, Request, Response, StatusCode};
 use pq_proto::StartupMessageParamsBuilder;
@@ -18,7 +18,7 @@ use tokio::time;
 use tokio_postgres::error::{DbError, ErrorPosition, SqlState};
 use tokio_postgres::{GenericClient, IsolationLevel, NoTls, ReadyForQueryStatus, Transaction};
 use tokio_util::sync::CancellationToken;
-use tracing::{error, info};
+use tracing::{debug, error, info};
 use typed_json::json;
 use url::Url;
 use urlencoding;
@@ -36,6 +36,7 @@ use crate::auth::{endpoint_sni, ComputeUserInfoParseError};
 use crate::config::{AuthenticationConfig, HttpConfig, ProxyConfig, TlsConfig};
 use crate::context::RequestContext;
 use crate::error::{ErrorKind, ReportableError, UserFacingError};
+use crate::http::{read_body_with_limit, ReadBodyError};
 use crate::metrics::{HttpDirection, Metrics};
 use crate::proxy::{run_until_cancelled, NeonOptions};
 use crate::serverless::backend::HttpConnError;
@@ -357,8 +358,6 @@ pub(crate) enum SqlOverHttpError {
     ConnectCompute(#[from] HttpConnError),
     #[error("{0}")]
     ConnInfo(#[from] ConnInfoError),
-    #[error("request is too large (max is {0} bytes)")]
-    RequestTooLarge(u64),
     #[error("response is too large (max is {0} bytes)")]
     ResponseTooLarge(usize),
     #[error("invalid isolation level")]
@@ -377,7 +376,6 @@ impl ReportableError for SqlOverHttpError {
             SqlOverHttpError::ReadPayload(e) => e.get_error_kind(),
             SqlOverHttpError::ConnectCompute(e) => e.get_error_kind(),
             SqlOverHttpError::ConnInfo(e) => e.get_error_kind(),
-            SqlOverHttpError::RequestTooLarge(_) => ErrorKind::User,
             SqlOverHttpError::ResponseTooLarge(_) => ErrorKind::User,
             SqlOverHttpError::InvalidIsolationLevel => ErrorKind::User,
             SqlOverHttpError::Postgres(p) => p.get_error_kind(),
@@ -393,7 +391,6 @@ impl UserFacingError for SqlOverHttpError {
             SqlOverHttpError::ReadPayload(p) => p.to_string(),
             SqlOverHttpError::ConnectCompute(c) => c.to_string_client(),
             SqlOverHttpError::ConnInfo(c) => c.to_string_client(),
-            SqlOverHttpError::RequestTooLarge(_) => self.to_string(),
             SqlOverHttpError::ResponseTooLarge(_) => self.to_string(),
             SqlOverHttpError::InvalidIsolationLevel => self.to_string(),
             SqlOverHttpError::Postgres(p) => p.to_string(),
@@ -406,13 +403,12 @@ impl UserFacingError for SqlOverHttpError {
 impl HttpCodeError for SqlOverHttpError {
     fn get_http_status_code(&self) -> StatusCode {
         match self {
-            SqlOverHttpError::ReadPayload(_) => StatusCode::BAD_REQUEST,
+            SqlOverHttpError::ReadPayload(e) => e.get_http_status_code(),
             SqlOverHttpError::ConnectCompute(h) => match h.get_error_kind() {
                 ErrorKind::User => StatusCode::BAD_REQUEST,
                 _ => StatusCode::INTERNAL_SERVER_ERROR,
             },
             SqlOverHttpError::ConnInfo(_) => StatusCode::BAD_REQUEST,
-            SqlOverHttpError::RequestTooLarge(_) => StatusCode::PAYLOAD_TOO_LARGE,
             SqlOverHttpError::ResponseTooLarge(_) => StatusCode::INSUFFICIENT_STORAGE,
             SqlOverHttpError::InvalidIsolationLevel => StatusCode::BAD_REQUEST,
             SqlOverHttpError::Postgres(_) => StatusCode::BAD_REQUEST,
@@ -426,19 +422,41 @@ impl HttpCodeError for SqlOverHttpError {
 pub(crate) enum ReadPayloadError {
     #[error("could not read the HTTP request body: {0}")]
     Read(#[from] hyper::Error),
+    #[error("request is too large (max is {limit} bytes)")]
+    BodyTooLarge { limit: usize },
     #[error("could not parse the HTTP request body: {0}")]
     Parse(#[from] serde_json::Error),
 }
 
+impl From<ReadBodyError<hyper::Error>> for ReadPayloadError {
+    fn from(value: ReadBodyError<hyper::Error>) -> Self {
+        match value {
+            ReadBodyError::BodyTooLarge { limit } => Self::BodyTooLarge { limit },
+            ReadBodyError::Read(e) => Self::Read(e),
+        }
+    }
+}
+
 impl ReportableError for ReadPayloadError {
     fn get_error_kind(&self) -> ErrorKind {
         match self {
             ReadPayloadError::Read(_) => ErrorKind::ClientDisconnect,
+            ReadPayloadError::BodyTooLarge { .. } => ErrorKind::User,
             ReadPayloadError::Parse(_) => ErrorKind::User,
         }
     }
 }
 
+impl HttpCodeError for ReadPayloadError {
+    fn get_http_status_code(&self) -> StatusCode {
+        match self {
+            ReadPayloadError::Read(_) => StatusCode::BAD_REQUEST,
+            ReadPayloadError::BodyTooLarge { .. } => StatusCode::PAYLOAD_TOO_LARGE,
+            ReadPayloadError::Parse(_) => StatusCode::BAD_REQUEST,
+        }
+    }
+}
+
 #[derive(Debug, thiserror::Error)]
 pub(crate) enum SqlOverHttpCancel {
     #[error("query was cancelled")]
@@ -580,28 +598,20 @@ async fn handle_db_inner(
 
     let parsed_headers = HttpHeaders::try_parse(headers)?;
 
-    let request_content_length = match request.body().size_hint().upper() {
-        Some(v) => v,
-        None => config.http_config.max_request_size_bytes + 1,
-    };
-    info!(request_content_length, "request size in bytes");
-    Metrics::get()
-        .proxy
-        .http_conn_content_length_bytes
-        .observe(HttpDirection::Request, request_content_length as f64);
-
-    // we don't have a streaming request support yet so this is to prevent OOM
-    // from a malicious user sending an extremely large request body
-    if request_content_length > config.http_config.max_request_size_bytes {
-        return Err(SqlOverHttpError::RequestTooLarge(
-            config.http_config.max_request_size_bytes,
-        ));
-    }
-
     let fetch_and_process_request = Box::pin(
         async {
-            let body = request.into_body().collect().await?.to_bytes();
-            info!(length = body.len(), "request payload read");
+            let body = read_body_with_limit(
+                request.into_body(),
+                config.http_config.max_request_size_bytes,
+            )
+            .await?;
+
+            Metrics::get()
+                .proxy
+                .http_conn_content_length_bytes
+                .observe(HttpDirection::Request, body.len() as f64);
+
+            debug!(length = body.len(), "request payload read");
             let payload: Payload = serde_json::from_slice(&body)?;
             Ok::<Payload, ReadPayloadError>(payload) // Adjust error type accordingly
         }

From ee26f09e45e72eab940e7721ba9f0a674e84b827 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Wed, 20 Nov 2024 18:33:05 +0000
Subject: [PATCH 21/24] pageserver: remove shard split hard link assertion
 (#9829)

## Problem

We were hitting this assertion in debug mode tests sometimes.

This case was being hit when the parent shard has no resident layers.
For instance, this is the case on split retry where the previous attempt
shut down the parent and deleted local state for it. If the logical size
calculation does not download some layers before we get to the
hardlinking, then the assertion is hit.

## Summary of Changes

Remove the assertion. It's fine for the ancestor to not have any
resident layers at the time of the split.

Closes https://github.com/neondatabase/neon/issues/9412
---
 pageserver/src/tenant/mgr.rs | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index 4fc9d740c8..92b2200542 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -1719,10 +1719,11 @@ impl TenantManager {
                     parent_layers.push(relative_path.to_owned());
                 }
             }
-            debug_assert!(
-                !parent_layers.is_empty(),
-                "shutdown cannot empty the layermap"
-            );
+
+            if parent_layers.is_empty() {
+                tracing::info!("Ancestor shard has no resident layer to hard link");
+            }
+
             (parent_timelines, parent_layers)
         };
 

From 811fab136fc82aa8b5c85f93dc00d19851c07387 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Wed, 20 Nov 2024 20:31:02 +0100
Subject: [PATCH 22/24] scrubber: allow restricting find_garbage to a partial
 tenant id prefix (#9814)

Adds support to the `find_garbage` command to restrict itself to a
partial tenant ID prefix, say `a`, and then it only traverses tenants
with IDs starting with `a`. One can now pass the `--tenant-id-prefix`
parameter.

That way, one can shard the `find_garbage` command and make it run in
parallel.

The PR also changes how `remote_storage` handles trailing `/`s: it used to
remove them first, only to then add them back in the listing function. It
turns out that this isn't necessary, and it prevents the prefix
functionality from working. S3 doesn't do this either.
---
 libs/remote_storage/src/azure_blob.rs   | 24 ++++++++----------------
 storage_scrubber/src/garbage.rs         | 15 ++++++++++++---
 storage_scrubber/src/main.rs            | 13 ++++++++++++-
 storage_scrubber/src/metadata_stream.rs | 13 ++++++++++++-
 4 files changed, 44 insertions(+), 21 deletions(-)

diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs
index f98d16789c..1c0d43d479 100644
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -97,10 +97,7 @@ impl AzureBlobStorage {
 
     pub fn relative_path_to_name(&self, path: &RemotePath) -> String {
         assert_eq!(std::path::MAIN_SEPARATOR, REMOTE_STORAGE_PREFIX_SEPARATOR);
-        let path_string = path
-            .get_path()
-            .as_str()
-            .trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR);
+        let path_string = path.get_path().as_str();
         match &self.prefix_in_container {
             Some(prefix) => {
                 if prefix.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
@@ -277,19 +274,14 @@ impl RemoteStorage for AzureBlobStorage {
         cancel: &CancellationToken,
     ) -> impl Stream<Item = Result<Listing, DownloadError>> {
         // get the passed prefix or if it is not set use prefix_in_bucket value
-        let list_prefix = prefix
-            .map(|p| self.relative_path_to_name(p))
-            .or_else(|| self.prefix_in_container.clone())
-            .map(|mut p| {
-                // required to end with a separator
-                // otherwise request will return only the entry of a prefix
-                if matches!(mode, ListingMode::WithDelimiter)
-                    && !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR)
-                {
-                    p.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
+        let list_prefix = prefix.map(|p| self.relative_path_to_name(p)).or_else(|| {
+            self.prefix_in_container.clone().map(|mut s| {
+                if !s.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
+                    s.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
                 }
-                p
-            });
+                s
+            })
+        });
 
         async_stream::stream! {
             let _permit = self.permit(RequestKind::List, cancel).await?;
diff --git a/storage_scrubber/src/garbage.rs b/storage_scrubber/src/garbage.rs
index 91668a42a7..b026efbc3b 100644
--- a/storage_scrubber/src/garbage.rs
+++ b/storage_scrubber/src/garbage.rs
@@ -21,7 +21,7 @@ use utils::{backoff, id::TenantId};
 use crate::{
     cloud_admin_api::{CloudAdminApiClient, MaybeDeleted, ProjectData},
     init_remote, list_objects_with_retries,
-    metadata_stream::{stream_tenant_timelines, stream_tenants},
+    metadata_stream::{stream_tenant_timelines, stream_tenants_maybe_prefix},
     BucketConfig, ConsoleConfig, NodeKind, TenantShardTimelineId, TraversingDepth, MAX_RETRIES,
 };
 
@@ -118,9 +118,17 @@ pub async fn find_garbage(
     console_config: ConsoleConfig,
     depth: TraversingDepth,
     node_kind: NodeKind,
+    tenant_id_prefix: Option<String>,
     output_path: String,
 ) -> anyhow::Result<()> {
-    let garbage = find_garbage_inner(bucket_config, console_config, depth, node_kind).await?;
+    let garbage = find_garbage_inner(
+        bucket_config,
+        console_config,
+        depth,
+        node_kind,
+        tenant_id_prefix,
+    )
+    .await?;
     let serialized = serde_json::to_vec_pretty(&garbage)?;
 
     tokio::fs::write(&output_path, &serialized).await?;
@@ -152,6 +160,7 @@ async fn find_garbage_inner(
     console_config: ConsoleConfig,
     depth: TraversingDepth,
     node_kind: NodeKind,
+    tenant_id_prefix: Option<String>,
 ) -> anyhow::Result<GarbageList> {
     // Construct clients for S3 and for Console API
     let (remote_client, target) = init_remote(bucket_config.clone(), node_kind).await?;
@@ -178,7 +187,7 @@ async fn find_garbage_inner(
 
     // Enumerate Tenants in S3, and check if each one exists in Console
     tracing::info!("Finding all tenants in {}...", bucket_config.desc_str());
-    let tenants = stream_tenants(&remote_client, &target);
+    let tenants = stream_tenants_maybe_prefix(&remote_client, &target, tenant_id_prefix);
     let tenants_checked = tenants.map_ok(|t| {
         let api_client = cloud_admin_api_client.clone();
         let console_cache = console_cache.clone();
diff --git a/storage_scrubber/src/main.rs b/storage_scrubber/src/main.rs
index 0ffb570984..92979d609e 100644
--- a/storage_scrubber/src/main.rs
+++ b/storage_scrubber/src/main.rs
@@ -54,6 +54,8 @@ enum Command {
         node_kind: NodeKind,
         #[arg(short, long, default_value_t=TraversingDepth::Tenant)]
         depth: TraversingDepth,
+        #[arg(short, long, default_value=None)]
+        tenant_id_prefix: Option<String>,
         #[arg(short, long, default_value_t = String::from("garbage.json"))]
         output_path: String,
     },
@@ -209,10 +211,19 @@ async fn main() -> anyhow::Result<()> {
         Command::FindGarbage {
             node_kind,
             depth,
+            tenant_id_prefix,
             output_path,
         } => {
             let console_config = ConsoleConfig::from_env()?;
-            find_garbage(bucket_config, console_config, depth, node_kind, output_path).await
+            find_garbage(
+                bucket_config,
+                console_config,
+                depth,
+                node_kind,
+                tenant_id_prefix,
+                output_path,
+            )
+            .await
         }
         Command::PurgeGarbage {
             input_path,
diff --git a/storage_scrubber/src/metadata_stream.rs b/storage_scrubber/src/metadata_stream.rs
index efda7c213d..47447d681c 100644
--- a/storage_scrubber/src/metadata_stream.rs
+++ b/storage_scrubber/src/metadata_stream.rs
@@ -17,9 +17,20 @@ use utils::id::{TenantId, TimelineId};
 pub fn stream_tenants<'a>(
     remote_client: &'a GenericRemoteStorage,
     target: &'a RootTarget,
+) -> impl Stream<Item = anyhow::Result<TenantShardId>> + 'a {
+    stream_tenants_maybe_prefix(remote_client, target, None)
+}
+/// Given a remote storage and a target, output a stream of TenantIds discovered via listing prefixes
+pub fn stream_tenants_maybe_prefix<'a>(
+    remote_client: &'a GenericRemoteStorage,
+    target: &'a RootTarget,
+    tenant_id_prefix: Option<String>,
 ) -> impl Stream<Item = anyhow::Result<TenantShardId>> + 'a {
     try_stream! {
-        let tenants_target = target.tenants_root();
+        let mut tenants_target = target.tenants_root();
+        if let Some(tenant_id_prefix) = tenant_id_prefix {
+            tenants_target.prefix_in_bucket += &tenant_id_prefix;
+        }
         let mut tenants_stream =
             std::pin::pin!(stream_objects_with_retries(remote_client, ListingMode::WithDelimiter, &tenants_target));
         while let Some(chunk) = tenants_stream.next().await {

From 313ebfdb88b7ef5d2f75d4d4c3ccacd7250fe861 Mon Sep 17 00:00:00 2001
From: Vadim Kharitonov <vadim2404@users.noreply.github.com>
Date: Wed, 20 Nov 2024 20:36:23 +0100
Subject: [PATCH 23/24] [proxy] chore: allow bypassing empty `params` to `/sql`
 endpoint (#9827)

## Problem

```
curl -H "Neon-Connection-String: postgresql://neondb_owner:PASSWORD@ep-autumn-rain-a58lubg0.us-east-2.aws.neon.tech/neondb?sslmode=require" https://ep-autumn-rain-a58lubg0.us-east-2.aws.neon.tech/sql -d '{"query":"SELECT 1","params":[]}'
```

For such a query, I also need to send `params`. Do I really need it?

## Summary of changes
I've marked `params` as optional
---
 proxy/src/serverless/sql_over_http.rs | 61 +++++++++++++++++++++++++++
 1 file changed, 61 insertions(+)

diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs
index ab75086884..1b17495c5d 100644
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -48,6 +48,7 @@ use crate::usage_metrics::{MetricCounter, MetricCounterRecorder};
 struct QueryData {
     query: String,
     #[serde(deserialize_with = "bytes_to_pg_text")]
+    #[serde(default)]
     params: Vec<Option<String>>,
     #[serde(default)]
     array_mode: Option<bool>,
@@ -1105,3 +1106,63 @@ impl Discard<'_> {
         }
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_payload() {
+        let payload = "{\"query\":\"SELECT * FROM users WHERE name = ?\",\"params\":[\"test\"],\"arrayMode\":true}";
+        let deserialized_payload: Payload = serde_json::from_str(payload).unwrap();
+
+        match deserialized_payload {
+            Payload::Single(QueryData {
+                query,
+                params,
+                array_mode,
+            }) => {
+                assert_eq!(query, "SELECT * FROM users WHERE name = ?");
+                assert_eq!(params, vec![Some(String::from("test"))]);
+                assert!(array_mode.unwrap());
+            }
+            Payload::Batch(_) => {
+                panic!("deserialization failed: case with single query, one param, and array mode")
+            }
+        }
+
+        let payload = "{\"queries\":[{\"query\":\"SELECT * FROM users0 WHERE name = ?\",\"params\":[\"test0\"], \"arrayMode\":false},{\"query\":\"SELECT * FROM users1 WHERE name = ?\",\"params\":[\"test1\"],\"arrayMode\":true}]}";
+        let deserialized_payload: Payload = serde_json::from_str(payload).unwrap();
+
+        match deserialized_payload {
+            Payload::Batch(BatchQueryData { queries }) => {
+                assert_eq!(queries.len(), 2);
+                for (i, query) in queries.into_iter().enumerate() {
+                    assert_eq!(
+                        query.query,
+                        format!("SELECT * FROM users{i} WHERE name = ?")
+                    );
+                    assert_eq!(query.params, vec![Some(format!("test{i}"))]);
+                    assert_eq!(query.array_mode.unwrap(), i > 0);
+                }
+            }
+            Payload::Single(_) => panic!("deserialization failed: case with multiple queries"),
+        }
+
+        let payload = "{\"query\":\"SELECT 1\"}";
+        let deserialized_payload: Payload = serde_json::from_str(payload).unwrap();
+
+        match deserialized_payload {
+            Payload::Single(QueryData {
+                query,
+                params,
+                array_mode,
+            }) => {
+                assert_eq!(query, "SELECT 1");
+                assert_eq!(params, vec![]);
+                assert!(array_mode.is_none());
+            }
+            Payload::Batch(_) => panic!("deserialization failed: case with only one query"),
+        }
+    }
+}

From 2d6bf176a0698258d17def3011aedcc836a5427f Mon Sep 17 00:00:00 2001
From: Ivan Efremov <ivan@neon.tech>
Date: Wed, 20 Nov 2024 21:36:29 +0200
Subject: [PATCH 24/24] proxy: Refactor http conn pool (#9785)

- Use the same ConnPoolEntry for http connection pool.
- Rename EndpointConnPool to HttpConnPool.
- Narrow clone bound for client

Fixes #9284
---
 proxy/src/serverless/backend.rs         |  13 +-
 proxy/src/serverless/conn_pool.rs       |   6 +-
 proxy/src/serverless/conn_pool_lib.rs   | 201 ++++++++++++++----------
 proxy/src/serverless/http_conn_pool.rs  | 201 +++++++-----------------
 proxy/src/serverless/local_conn_pool.rs |   1 +
 proxy/src/serverless/mod.rs             |   2 +-
 proxy/src/serverless/sql_over_http.rs   |   1 +
 7 files changed, 187 insertions(+), 238 deletions(-)

diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs
index 7df978f84c..3037e20888 100644
--- a/proxy/src/serverless/backend.rs
+++ b/proxy/src/serverless/backend.rs
@@ -12,8 +12,8 @@ use tracing::field::display;
 use tracing::{debug, info};
 
 use super::conn_pool::poll_client;
-use super::conn_pool_lib::{Client, ConnInfo, GlobalConnPool};
-use super::http_conn_pool::{self, poll_http2_client, Send};
+use super::conn_pool_lib::{Client, ConnInfo, EndpointConnPool, GlobalConnPool};
+use super::http_conn_pool::{self, poll_http2_client, HttpConnPool, Send};
 use super::local_conn_pool::{self, LocalConnPool, EXT_NAME, EXT_SCHEMA, EXT_VERSION};
 use crate::auth::backend::local::StaticAuthRules;
 use crate::auth::backend::{ComputeCredentials, ComputeUserInfo};
@@ -36,9 +36,10 @@ use crate::rate_limiter::EndpointRateLimiter;
 use crate::types::{EndpointId, Host, LOCAL_PROXY_SUFFIX};
 
 pub(crate) struct PoolingBackend {
-    pub(crate) http_conn_pool: Arc<super::http_conn_pool::GlobalConnPool<Send>>,
+    pub(crate) http_conn_pool: Arc<GlobalConnPool<Send, HttpConnPool<Send>>>,
     pub(crate) local_pool: Arc<LocalConnPool<tokio_postgres::Client>>,
-    pub(crate) pool: Arc<GlobalConnPool<tokio_postgres::Client>>,
+    pub(crate) pool:
+        Arc<GlobalConnPool<tokio_postgres::Client, EndpointConnPool<tokio_postgres::Client>>>,
 
     pub(crate) config: &'static ProxyConfig,
     pub(crate) auth_backend: &'static crate::auth::Backend<'static, ()>,
@@ -474,7 +475,7 @@ impl ShouldRetryWakeCompute for LocalProxyConnError {
 }
 
 struct TokioMechanism {
-    pool: Arc<GlobalConnPool<tokio_postgres::Client>>,
+    pool: Arc<GlobalConnPool<tokio_postgres::Client, EndpointConnPool<tokio_postgres::Client>>>,
     conn_info: ConnInfo,
     conn_id: uuid::Uuid,
 
@@ -524,7 +525,7 @@ impl ConnectMechanism for TokioMechanism {
 }
 
 struct HyperMechanism {
-    pool: Arc<http_conn_pool::GlobalConnPool<Send>>,
+    pool: Arc<GlobalConnPool<Send, HttpConnPool<Send>>>,
     conn_info: ConnInfo,
     conn_id: uuid::Uuid,
 
diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs
index f716326a68..bd262f45ed 100644
--- a/proxy/src/serverless/conn_pool.rs
+++ b/proxy/src/serverless/conn_pool.rs
@@ -19,7 +19,8 @@ use {
 };
 
 use super::conn_pool_lib::{
-    Client, ClientDataEnum, ClientInnerCommon, ClientInnerExt, ConnInfo, GlobalConnPool,
+    Client, ClientDataEnum, ClientInnerCommon, ClientInnerExt, ConnInfo, EndpointConnPool,
+    GlobalConnPool,
 };
 use crate::context::RequestContext;
 use crate::control_plane::messages::MetricsAuxInfo;
@@ -52,7 +53,7 @@ impl fmt::Display for ConnInfo {
 }
 
 pub(crate) fn poll_client<C: ClientInnerExt>(
-    global_pool: Arc<GlobalConnPool<C>>,
+    global_pool: Arc<GlobalConnPool<C, EndpointConnPool<C>>>,
     ctx: &RequestContext,
     conn_info: ConnInfo,
     client: C,
@@ -167,6 +168,7 @@ pub(crate) fn poll_client<C: ClientInnerExt>(
     Client::new(inner, conn_info, pool_clone)
 }
 
+#[derive(Clone)]
 pub(crate) struct ClientDataRemote {
     session: tokio::sync::watch::Sender<uuid::Uuid>,
     cancel: CancellationToken,
diff --git a/proxy/src/serverless/conn_pool_lib.rs b/proxy/src/serverless/conn_pool_lib.rs
index c5db025870..fe1d2563bc 100644
--- a/proxy/src/serverless/conn_pool_lib.rs
+++ b/proxy/src/serverless/conn_pool_lib.rs
@@ -1,4 +1,5 @@
 use std::collections::HashMap;
+use std::marker::PhantomData;
 use std::ops::Deref;
 use std::sync::atomic::{self, AtomicUsize};
 use std::sync::{Arc, Weak};
@@ -43,13 +44,14 @@ impl ConnInfo {
     }
 }
 
+#[derive(Clone)]
 pub(crate) enum ClientDataEnum {
     Remote(ClientDataRemote),
     Local(ClientDataLocal),
-    #[allow(dead_code)]
     Http(ClientDataHttp),
 }
 
+#[derive(Clone)]
 pub(crate) struct ClientInnerCommon<C: ClientInnerExt> {
     pub(crate) inner: C,
     pub(crate) aux: MetricsAuxInfo,
@@ -91,6 +93,7 @@ pub(crate) struct ConnPoolEntry<C: ClientInnerExt> {
 pub(crate) struct EndpointConnPool<C: ClientInnerExt> {
     pools: HashMap<(DbName, RoleName), DbUserConnPool<C>>,
     total_conns: usize,
+    /// max # connections per endpoint
     max_conns: usize,
     _guard: HttpEndpointPoolsGuard<'static>,
     global_connections_count: Arc<AtomicUsize>,
@@ -317,24 +320,49 @@ impl<C: ClientInnerExt> DbUserConn<C> for DbUserConnPool<C> {
     }
 }
 
-pub(crate) struct GlobalConnPool<C: ClientInnerExt> {
+pub(crate) trait EndpointConnPoolExt<C: ClientInnerExt> {
+    fn clear_closed(&mut self) -> usize;
+    fn total_conns(&self) -> usize;
+}
+
+impl<C: ClientInnerExt> EndpointConnPoolExt<C> for EndpointConnPool<C> {
+    fn clear_closed(&mut self) -> usize {
+        let mut clients_removed: usize = 0;
+        for db_pool in self.pools.values_mut() {
+            clients_removed += db_pool.clear_closed_clients(&mut self.total_conns);
+        }
+        clients_removed
+    }
+
+    fn total_conns(&self) -> usize {
+        self.total_conns
+    }
+}
+
+pub(crate) struct GlobalConnPool<C, P>
+where
+    C: ClientInnerExt,
+    P: EndpointConnPoolExt<C>,
+{
     // endpoint -> per-endpoint connection pool
     //
     // That should be a fairly conteded map, so return reference to the per-endpoint
     // pool as early as possible and release the lock.
-    global_pool: DashMap<EndpointCacheKey, Arc<RwLock<EndpointConnPool<C>>>>,
+    pub(crate) global_pool: DashMap<EndpointCacheKey, Arc<RwLock<P>>>,
 
     /// Number of endpoint-connection pools
     ///
     /// [`DashMap::len`] iterates over all inner pools and acquires a read lock on each.
     /// That seems like far too much effort, so we're using a relaxed increment counter instead.
     /// It's only used for diagnostics.
-    global_pool_size: AtomicUsize,
+    pub(crate) global_pool_size: AtomicUsize,
 
     /// Total number of connections in the pool
-    global_connections_count: Arc<AtomicUsize>,
+    pub(crate) global_connections_count: Arc<AtomicUsize>,
 
-    config: &'static crate::config::HttpConfig,
+    pub(crate) config: &'static crate::config::HttpConfig,
+
+    _marker: PhantomData<C>,
 }
 
 #[derive(Debug, Clone, Copy)]
@@ -357,7 +385,11 @@ pub struct GlobalConnPoolOptions {
     pub max_total_conns: usize,
 }
 
-impl<C: ClientInnerExt> GlobalConnPool<C> {
+impl<C, P> GlobalConnPool<C, P>
+where
+    C: ClientInnerExt,
+    P: EndpointConnPoolExt<C>,
+{
     pub(crate) fn new(config: &'static crate::config::HttpConfig) -> Arc<Self> {
         let shards = config.pool_options.pool_shards;
         Arc::new(Self {
@@ -365,6 +397,7 @@ impl<C: ClientInnerExt> GlobalConnPool<C> {
             global_pool_size: AtomicUsize::new(0),
             config,
             global_connections_count: Arc::new(AtomicUsize::new(0)),
+            _marker: PhantomData,
         })
     }
 
@@ -378,6 +411,80 @@ impl<C: ClientInnerExt> GlobalConnPool<C> {
         self.config.pool_options.idle_timeout
     }
 
+    pub(crate) fn shutdown(&self) {
+        // drops all strong references to endpoint-pools
+        self.global_pool.clear();
+    }
+
+    pub(crate) async fn gc_worker(&self, mut rng: impl Rng) {
+        let epoch = self.config.pool_options.gc_epoch;
+        let mut interval = tokio::time::interval(epoch / (self.global_pool.shards().len()) as u32);
+        loop {
+            interval.tick().await;
+
+            let shard = rng.gen_range(0..self.global_pool.shards().len());
+            self.gc(shard);
+        }
+    }
+
+    pub(crate) fn gc(&self, shard: usize) {
+        debug!(shard, "pool: performing epoch reclamation");
+
+        // acquire a random shard lock
+        let mut shard = self.global_pool.shards()[shard].write();
+
+        let timer = Metrics::get()
+            .proxy
+            .http_pool_reclaimation_lag_seconds
+            .start_timer();
+        let current_len = shard.len();
+        let mut clients_removed = 0;
+        shard.retain(|endpoint, x| {
+            // if the current endpoint pool is unique (no other strong or weak references)
+            // then it is currently not in use by any connections.
+            if let Some(pool) = Arc::get_mut(x.get_mut()) {
+                let endpoints = pool.get_mut();
+                clients_removed += endpoints.clear_closed(); // sum over every pool in the shard
+
+                if endpoints.total_conns() == 0 {
+                    info!("pool: discarding pool for endpoint {endpoint}");
+                    return false;
+                }
+            }
+
+            true
+        });
+
+        let new_len = shard.len();
+        drop(shard);
+        timer.observe();
+
+        // Do logging outside of the lock.
+        if clients_removed > 0 {
+            let size = self
+                .global_connections_count
+                .fetch_sub(clients_removed, atomic::Ordering::Relaxed)
+                - clients_removed;
+            Metrics::get()
+                .proxy
+                .http_pool_opened_connections
+                .get_metric()
+                .dec_by(clients_removed as i64);
+            info!("pool: performed global pool gc. removed {clients_removed} clients, total number of clients in pool is {size}");
+        }
+        let removed = current_len - new_len;
+
+        if removed > 0 {
+            let global_pool_size = self
+                .global_pool_size
+                .fetch_sub(removed, atomic::Ordering::Relaxed)
+                - removed;
+            info!("pool: performed global pool gc. size now {global_pool_size}");
+        }
+    }
+}
+
+impl<C: ClientInnerExt> GlobalConnPool<C, EndpointConnPool<C>> {
     pub(crate) fn get(
         self: &Arc<Self>,
         ctx: &RequestContext,
@@ -432,85 +539,6 @@ impl<C: ClientInnerExt> GlobalConnPool<C> {
         Ok(None)
     }
 
-    pub(crate) fn shutdown(&self) {
-        // drops all strong references to endpoint-pools
-        self.global_pool.clear();
-    }
-
-    pub(crate) async fn gc_worker(&self, mut rng: impl Rng) {
-        let epoch = self.config.pool_options.gc_epoch;
-        let mut interval = tokio::time::interval(epoch / (self.global_pool.shards().len()) as u32);
-        loop {
-            interval.tick().await;
-
-            let shard = rng.gen_range(0..self.global_pool.shards().len());
-            self.gc(shard);
-        }
-    }
-
-    pub(crate) fn gc(&self, shard: usize) {
-        debug!(shard, "pool: performing epoch reclamation");
-
-        // acquire a random shard lock
-        let mut shard = self.global_pool.shards()[shard].write();
-
-        let timer = Metrics::get()
-            .proxy
-            .http_pool_reclaimation_lag_seconds
-            .start_timer();
-        let current_len = shard.len();
-        let mut clients_removed = 0;
-        shard.retain(|endpoint, x| {
-            // if the current endpoint pool is unique (no other strong or weak references)
-            // then it is currently not in use by any connections.
-            if let Some(pool) = Arc::get_mut(x.get_mut()) {
-                let EndpointConnPool {
-                    pools, total_conns, ..
-                } = pool.get_mut();
-
-                // ensure that closed clients are removed
-                for db_pool in pools.values_mut() {
-                    clients_removed += db_pool.clear_closed_clients(total_conns);
-                }
-
-                // we only remove this pool if it has no active connections
-                if *total_conns == 0 {
-                    info!("pool: discarding pool for endpoint {endpoint}");
-                    return false;
-                }
-            }
-
-            true
-        });
-
-        let new_len = shard.len();
-        drop(shard);
-        timer.observe();
-
-        // Do logging outside of the lock.
-        if clients_removed > 0 {
-            let size = self
-                .global_connections_count
-                .fetch_sub(clients_removed, atomic::Ordering::Relaxed)
-                - clients_removed;
-            Metrics::get()
-                .proxy
-                .http_pool_opened_connections
-                .get_metric()
-                .dec_by(clients_removed as i64);
-            info!("pool: performed global pool gc. removed {clients_removed} clients, total number of clients in pool is {size}");
-        }
-        let removed = current_len - new_len;
-
-        if removed > 0 {
-            let global_pool_size = self
-                .global_pool_size
-                .fetch_sub(removed, atomic::Ordering::Relaxed)
-                - removed;
-            info!("pool: performed global pool gc. size now {global_pool_size}");
-        }
-    }
-
     pub(crate) fn get_or_create_endpoint_pool(
         self: &Arc<Self>,
         endpoint: &EndpointCacheKey,
@@ -556,7 +584,6 @@ impl<C: ClientInnerExt> GlobalConnPool<C> {
         pool
     }
 }
-
 pub(crate) struct Client<C: ClientInnerExt> {
     span: Span,
     inner: Option<ClientInnerCommon<C>>,
diff --git a/proxy/src/serverless/http_conn_pool.rs b/proxy/src/serverless/http_conn_pool.rs
index e9455420c0..fde38d0de3 100644
--- a/proxy/src/serverless/http_conn_pool.rs
+++ b/proxy/src/serverless/http_conn_pool.rs
@@ -2,16 +2,17 @@ use std::collections::VecDeque;
 use std::sync::atomic::{self, AtomicUsize};
 use std::sync::{Arc, Weak};
 
-use dashmap::DashMap;
 use hyper::client::conn::http2;
 use hyper_util::rt::{TokioExecutor, TokioIo};
 use parking_lot::RwLock;
-use rand::Rng;
 use tokio::net::TcpStream;
 use tracing::{debug, error, info, info_span, Instrument};
 
 use super::backend::HttpConnError;
-use super::conn_pool_lib::{ClientInnerExt, ConnInfo};
+use super::conn_pool_lib::{
+    ClientDataEnum, ClientInnerCommon, ClientInnerExt, ConnInfo, ConnPoolEntry,
+    EndpointConnPoolExt, GlobalConnPool,
+};
 use crate::context::RequestContext;
 use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo};
 use crate::metrics::{HttpEndpointPoolsGuard, Metrics};
@@ -23,17 +24,11 @@ pub(crate) type Connect =
     http2::Connection<TokioIo<TcpStream>, hyper::body::Incoming, TokioExecutor>;
 
 #[derive(Clone)]
-pub(crate) struct ConnPoolEntry<C: ClientInnerExt + Clone> {
-    conn: C,
-    conn_id: uuid::Uuid,
-    aux: MetricsAuxInfo,
-}
-
 pub(crate) struct ClientDataHttp();
 
 // Per-endpoint connection pool
 // Number of open connections is limited by the `max_conns_per_endpoint`.
-pub(crate) struct EndpointConnPool<C: ClientInnerExt + Clone> {
+pub(crate) struct HttpConnPool<C: ClientInnerExt + Clone> {
     // TODO(conrad):
     // either we should open more connections depending on stream count
     // (not exposed by hyper, need our own counter)
@@ -48,14 +43,19 @@ pub(crate) struct EndpointConnPool<C: ClientInnerExt + Clone> {
     global_connections_count: Arc<AtomicUsize>,
 }
 
-impl<C: ClientInnerExt + Clone> EndpointConnPool<C> {
+impl<C: ClientInnerExt + Clone> HttpConnPool<C> {
     fn get_conn_entry(&mut self) -> Option<ConnPoolEntry<C>> {
         let Self { conns, .. } = self;
 
         loop {
             let conn = conns.pop_front()?;
-            if !conn.conn.is_closed() {
-                conns.push_back(conn.clone());
+            if !conn.conn.inner.is_closed() {
+                let new_conn = ConnPoolEntry {
+                    conn: conn.conn.clone(),
+                    _last_access: std::time::Instant::now(),
+                };
+
+                conns.push_back(new_conn);
                 return Some(conn);
             }
         }
@@ -69,7 +69,7 @@ impl<C: ClientInnerExt + Clone> EndpointConnPool<C> {
         } = self;
 
         let old_len = conns.len();
-        conns.retain(|conn| conn.conn_id != conn_id);
+        conns.retain(|entry| entry.conn.conn_id != conn_id);
         let new_len = conns.len();
         let removed = old_len - new_len;
         if removed > 0 {
@@ -84,7 +84,22 @@ impl<C: ClientInnerExt + Clone> EndpointConnPool<C> {
     }
 }
 
-impl<C: ClientInnerExt + Clone> Drop for EndpointConnPool<C> {
+impl<C: ClientInnerExt + Clone> EndpointConnPoolExt<C> for HttpConnPool<C> {
+    fn clear_closed(&mut self) -> usize {
+        let Self { conns, .. } = self;
+        let old_len = conns.len();
+        conns.retain(|entry| !entry.conn.inner.is_closed());
+
+        let new_len = conns.len();
+        old_len - new_len
+    }
+
+    fn total_conns(&self) -> usize {
+        self.conns.len()
+    }
+}
+
+impl<C: ClientInnerExt + Clone> Drop for HttpConnPool<C> {
     fn drop(&mut self) {
         if !self.conns.is_empty() {
             self.global_connections_count
@@ -98,117 +113,7 @@ impl<C: ClientInnerExt + Clone> Drop for EndpointConnPool<C> {
     }
 }
 
-pub(crate) struct GlobalConnPool<C: ClientInnerExt + Clone> {
-    // endpoint -> per-endpoint connection pool
-    //
-    // That should be a fairly conteded map, so return reference to the per-endpoint
-    // pool as early as possible and release the lock.
-    global_pool: DashMap<EndpointCacheKey, Arc<RwLock<EndpointConnPool<C>>>>,
-
-    /// Number of endpoint-connection pools
-    ///
-    /// [`DashMap::len`] iterates over all inner pools and acquires a read lock on each.
-    /// That seems like far too much effort, so we're using a relaxed increment counter instead.
-    /// It's only used for diagnostics.
-    global_pool_size: AtomicUsize,
-
-    /// Total number of connections in the pool
-    global_connections_count: Arc<AtomicUsize>,
-
-    config: &'static crate::config::HttpConfig,
-}
-
-impl<C: ClientInnerExt + Clone> GlobalConnPool<C> {
-    pub(crate) fn new(config: &'static crate::config::HttpConfig) -> Arc<Self> {
-        let shards = config.pool_options.pool_shards;
-        Arc::new(Self {
-            global_pool: DashMap::with_shard_amount(shards),
-            global_pool_size: AtomicUsize::new(0),
-            config,
-            global_connections_count: Arc::new(AtomicUsize::new(0)),
-        })
-    }
-
-    pub(crate) fn shutdown(&self) {
-        // drops all strong references to endpoint-pools
-        self.global_pool.clear();
-    }
-
-    pub(crate) async fn gc_worker(&self, mut rng: impl Rng) {
-        let epoch = self.config.pool_options.gc_epoch;
-        let mut interval = tokio::time::interval(epoch / (self.global_pool.shards().len()) as u32);
-        loop {
-            interval.tick().await;
-
-            let shard = rng.gen_range(0..self.global_pool.shards().len());
-            self.gc(shard);
-        }
-    }
-
-    fn gc(&self, shard: usize) {
-        debug!(shard, "pool: performing epoch reclamation");
-
-        // acquire a random shard lock
-        let mut shard = self.global_pool.shards()[shard].write();
-
-        let timer = Metrics::get()
-            .proxy
-            .http_pool_reclaimation_lag_seconds
-            .start_timer();
-        let current_len = shard.len();
-        let mut clients_removed = 0;
-        shard.retain(|endpoint, x| {
-            // if the current endpoint pool is unique (no other strong or weak references)
-            // then it is currently not in use by any connections.
-            if let Some(pool) = Arc::get_mut(x.get_mut()) {
-                let EndpointConnPool { conns, .. } = pool.get_mut();
-
-                let old_len = conns.len();
-
-                conns.retain(|conn| !conn.conn.is_closed());
-
-                let new_len = conns.len();
-                let removed = old_len - new_len;
-                clients_removed += removed;
-
-                // we only remove this pool if it has no active connections
-                if conns.is_empty() {
-                    info!("pool: discarding pool for endpoint {endpoint}");
-                    return false;
-                }
-            }
-
-            true
-        });
-
-        let new_len = shard.len();
-        drop(shard);
-        timer.observe();
-
-        // Do logging outside of the lock.
-        if clients_removed > 0 {
-            let size = self
-                .global_connections_count
-                .fetch_sub(clients_removed, atomic::Ordering::Relaxed)
-                - clients_removed;
-            Metrics::get()
-                .proxy
-                .http_pool_opened_connections
-                .get_metric()
-                .dec_by(clients_removed as i64);
-            info!("pool: performed global pool gc. removed {clients_removed} clients, total number of clients in pool is {size}");
-        }
-        let removed = current_len - new_len;
-
-        if removed > 0 {
-            let global_pool_size = self
-                .global_pool_size
-                .fetch_sub(removed, atomic::Ordering::Relaxed)
-                - removed;
-            info!("pool: performed global pool gc. size now {global_pool_size}");
-        }
-    }
-
+impl<C: ClientInnerExt + Clone> GlobalConnPool<C, HttpConnPool<C>> {
     #[expect(unused_results)]
     pub(crate) fn get(
         self: &Arc<Self>,
@@ -226,27 +131,28 @@ impl<C: ClientInnerExt + Clone> GlobalConnPool<C> {
             return result;
         };
 
-        tracing::Span::current().record("conn_id", tracing::field::display(client.conn_id));
+        tracing::Span::current().record("conn_id", tracing::field::display(client.conn.conn_id));
         debug!(
             cold_start_info = ColdStartInfo::HttpPoolHit.as_str(),
             "pool: reusing connection '{conn_info}'"
         );
         ctx.set_cold_start_info(ColdStartInfo::HttpPoolHit);
         ctx.success();
-        Ok(Some(Client::new(client.conn, client.aux)))
+
+        Ok(Some(Client::new(client.conn.clone())))
     }
 
     fn get_or_create_endpoint_pool(
         self: &Arc<Self>,
         endpoint: &EndpointCacheKey,
-    ) -> Arc<RwLock<EndpointConnPool<C>>> {
+    ) -> Arc<RwLock<HttpConnPool<C>>> {
         // fast path
         if let Some(pool) = self.global_pool.get(endpoint) {
             return pool.clone();
         }
 
         // slow path
-        let new_pool = Arc::new(RwLock::new(EndpointConnPool {
+        let new_pool = Arc::new(RwLock::new(HttpConnPool {
             conns: VecDeque::new(),
             _guard: Metrics::get().proxy.http_endpoint_pools.guard(),
             global_connections_count: self.global_connections_count.clone(),
@@ -279,7 +185,7 @@ impl<C: ClientInnerExt + Clone> GlobalConnPool<C> {
 }
 
 pub(crate) fn poll_http2_client(
-    global_pool: Arc<GlobalConnPool<Send>>,
+    global_pool: Arc<GlobalConnPool<Send, HttpConnPool<Send>>>,
     ctx: &RequestContext,
     conn_info: &ConnInfo,
     client: Send,
@@ -299,11 +205,15 @@ pub(crate) fn poll_http2_client(
     let pool = match conn_info.endpoint_cache_key() {
         Some(endpoint) => {
             let pool = global_pool.get_or_create_endpoint_pool(&endpoint);
-
-            pool.write().conns.push_back(ConnPoolEntry {
-                conn: client.clone(),
-                conn_id,
+            let client = ClientInnerCommon {
+                inner: client.clone(),
                 aux: aux.clone(),
+                conn_id,
+                data: ClientDataEnum::Http(ClientDataHttp()),
+            };
+            pool.write().conns.push_back(ConnPoolEntry {
+                conn: client,
+                _last_access: std::time::Instant::now(),
             });
             Metrics::get()
                 .proxy
@@ -335,23 +245,30 @@ pub(crate) fn poll_http2_client(
         .instrument(span),
     );
 
-    Client::new(client, aux)
+    let client = ClientInnerCommon {
+        inner: client,
+        aux,
+        conn_id,
+        data: ClientDataEnum::Http(ClientDataHttp()),
+    };
+
+    Client::new(client)
 }
 
 pub(crate) struct Client<C: ClientInnerExt + Clone> {
-    pub(crate) inner: C,
-    aux: MetricsAuxInfo,
+    pub(crate) inner: ClientInnerCommon<C>,
 }
 
 impl<C: ClientInnerExt + Clone> Client<C> {
-    pub(self) fn new(inner: C, aux: MetricsAuxInfo) -> Self {
-        Self { inner, aux }
+    pub(self) fn new(inner: ClientInnerCommon<C>) -> Self {
+        Self { inner }
     }
 
     pub(crate) fn metrics(&self) -> Arc<MetricCounter> {
+        let aux = &self.inner.aux;
         USAGE_METRICS.register(Ids {
-            endpoint_id: self.aux.endpoint_id,
-            branch_id: self.aux.branch_id,
+            endpoint_id: aux.endpoint_id,
+            branch_id: aux.branch_id,
         })
     }
 }
diff --git a/proxy/src/serverless/local_conn_pool.rs b/proxy/src/serverless/local_conn_pool.rs
index 310af08221..9abe35db08 100644
--- a/proxy/src/serverless/local_conn_pool.rs
+++ b/proxy/src/serverless/local_conn_pool.rs
@@ -44,6 +44,7 @@ pub(crate) const EXT_NAME: &str = "pg_session_jwt";
 pub(crate) const EXT_VERSION: &str = "0.1.2";
 pub(crate) const EXT_SCHEMA: &str = "auth";
 
+#[derive(Clone)]
 pub(crate) struct ClientDataLocal {
     session: tokio::sync::watch::Sender<uuid::Uuid>,
     cancel: CancellationToken,
diff --git a/proxy/src/serverless/mod.rs b/proxy/src/serverless/mod.rs
index 59247f03bf..77025f419d 100644
--- a/proxy/src/serverless/mod.rs
+++ b/proxy/src/serverless/mod.rs
@@ -88,7 +88,7 @@ pub async fn task_main(
         }
     });
 
-    let http_conn_pool = http_conn_pool::GlobalConnPool::new(&config.http_config);
+    let http_conn_pool = conn_pool_lib::GlobalConnPool::new(&config.http_config);
     {
         let http_conn_pool = Arc::clone(&http_conn_pool);
         tokio::spawn(async move {
diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs
index 1b17495c5d..03b37bccd5 100644
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -779,6 +779,7 @@ async fn handle_auth_broker_inner(
     let _metrics = client.metrics();
 
     Ok(client
+        .inner
         .inner
         .send_request(req)
         .await