review fix

remove todo
add v3 to one more test
2026-05-21 23:20:40 +00:00 · 2025-02-25 10:47:40 +01:00 · 2025-02-07 11:51:37 +01:00 · 2025-02-04 12:08:26 +01:00 · 2025-02-04 12:00:46 +01:00 · 2025-02-04 11:22:35 +01:00
128 changed files with 4762 additions and 1781 deletions
--- a/.github/ISSUE_TEMPLATE/bug-template.md
+++ b/.github/ISSUE_TEMPLATE/bug-template.md
@@ -3,6 +3,7 @@ name: Bug Template
 about: Used for describing bugs
 title: ''
 labels: t/bug
+type: Bug
 assignees: ''

 ---
--- a/.github/ISSUE_TEMPLATE/epic-template.md
+++ b/.github/ISSUE_TEMPLATE/epic-template.md
@@ -4,6 +4,7 @@ about: A set of related tasks contributing towards specific outcome, comprising
  more than 1 week of work.
 title: 'Epic: '
 labels: t/Epic
+type: Epic
 assignees: ''

 ---
--- a/.github/actionlint.yml
+++ b/.github/actionlint.yml
@@ -27,3 +27,4 @@ config-variables:
  - SLACK_ON_CALL_QA_STAGING_STREAM
  - DEV_AWS_OIDC_ROLE_MANAGE_BENCHMARK_EC2_VMS_ARN
  - SLACK_ON_CALL_STORAGE_STAGING_STREAM
+  - SLACK_CICD_CHANNEL_ID
--- a/.github/actions/neon-project-create/action.yml
+++ b/.github/actions/neon-project-create/action.yml
@@ -41,7 +41,10 @@ inputs:
    description: 'Path to directory containing libpq library - it is caller responsibility to provision the libpq library'
    required: false
    default: '/tmp/neon/pg_install/v16/lib'
-  
+  project_settings:  # exported as PROJECT_SETTINGS and spliced unquoted into the request body's "settings" field, so it must be valid JSON
+    description: 'A JSON object with project settings'
+    required: false
+    default: '{}'

 outputs:
  dsn:
@@ -73,7 +76,7 @@ runs:
              \"provisioner\": \"k8s-neonvm\",
              \"autoscaling_limit_min_cu\": ${MIN_CU},
              \"autoscaling_limit_max_cu\": ${MAX_CU},
-              \"settings\": { }
+              \"settings\": ${PROJECT_SETTINGS}
            }
          }")

@@ -92,12 +95,12 @@ runs:
        if [ "${SHARD_SPLIT_PROJECT}" = "true" ]; then
          # determine tenant ID
          TENANT_ID=`${PSQL} ${dsn} -t -A -c "SHOW neon.tenant_id"`
-          
+
          echo "Splitting project ${project_id} with tenant_id ${TENANT_ID} into $((SHARD_COUNT)) shards with stripe size $((STRIPE_SIZE))"

          echo "Sending PUT request to https://${API_HOST}/regions/${REGION_ID}/api/v1/admin/storage/proxy/control/v1/tenant/${TENANT_ID}/shard_split"
          echo "with body {\"new_shard_count\": $((SHARD_COUNT)), \"new_stripe_size\": $((STRIPE_SIZE))}"
-          
+
          # we need an ADMIN API KEY to invoke storage controller API for shard splitting (bash -u above checks that the variable is set)
          curl -X PUT \
            "https://${API_HOST}/regions/${REGION_ID}/api/v1/admin/storage/proxy/control/v1/tenant/${TENANT_ID}/shard_split" \
@@ -118,3 +121,4 @@ runs:
        STRIPE_SIZE: ${{ inputs.stripe_size }}
        PSQL: ${{ inputs.psql_path }}
        LD_LIBRARY_PATH: ${{ inputs.libpq_lib_path }}
+        PROJECT_SETTINGS: ${{ inputs.project_settings }}
--- a/.github/file-filters.yaml
+++ b/.github/file-filters.yaml
@@ -1,4 +1,5 @@
 rust_code: ['**/*.rs', '**/Cargo.toml', '**/Cargo.lock']
+rust_dependencies: ['**/Cargo.lock']

 v14: ['vendor/postgres-v14/**', 'Makefile', 'pgxn/**']
 v15: ['vendor/postgres-v15/**', 'Makefile', 'pgxn/**']
--- a/.github/workflows/_benchmarking_preparation.yml
+++ b/.github/workflows/_benchmarking_preparation.yml
@@ -17,7 +17,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        platform: [ aws-rds-postgres, aws-aurora-serverless-v2-postgres, neon ]
+        platform: [ aws-rds-postgres, aws-aurora-serverless-v2-postgres, neon, neon_pg17 ]
        database: [ clickbench, tpch, userexample ]

    env:
@@ -41,6 +41,9 @@ jobs:
          neon)
            CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }}
            ;;
+          neon_pg17)
+            CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR_PG17 }}
+            ;;
          aws-rds-postgres)
            CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CONNSTR }}
            ;;
--- a/.github/workflows/_build-and-test-locally.yml
+++ b/.github/workflows/_build-and-test-locally.yml
@@ -267,6 +267,26 @@ jobs:
          path: /tmp/neon
          aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}

+      # Regenerate schema.rs after starting a local storage controller; fail if it differs from the committed file.
+      - name: Check diesel schema
+        if: inputs.build-type == 'release' && inputs.arch == 'x64'
+        env:
+          DATABASE_URL: postgresql://localhost:1235/storage_controller
+          POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
+        run: |
+          /tmp/neon/bin/neon_local init
+          /tmp/neon/bin/neon_local storage_controller start
+          # Stop the controller on every exit path, including a failed schema check.
+          trap '/tmp/neon/bin/neon_local storage_controller stop' EXIT
+
+          diesel print-schema > storage_controller/src/schema.rs
+
+          if [ -n "$(git diff storage_controller/src/schema.rs)" ]; then
+            echo >&2 "Uncommitted changes in diesel schema"
+            git diff .
+            exit 1
+          fi
+
      # XXX: keep this after the binaries.list is formed, so the coverage can properly work later
      - name: Merge and upload coverage data
        if: inputs.build-type == 'debug'
--- a/.github/workflows/_check-codestyle-rust.yml
+++ b/.github/workflows/_check-codestyle-rust.yml
@@ -16,6 +16,9 @@ defaults:
  run:
    shell: bash -euxo pipefail {0}

+# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
+permissions: {}
+
 jobs:
  check-codestyle-rust:
    strategy:
@@ -84,8 +87,3 @@ jobs:
        run: |
          cargo hakari generate --diff  # workspace-hack Cargo.toml is up-to-date
          cargo hakari manage-deps --dry-run  # all workspace crates depend on workspace-hack
-
-      # https://github.com/EmbarkStudios/cargo-deny
-      - name: Check rust licenses/bans/advisories/sources
-        if: ${{ !cancelled() }}
-        run: cargo deny check --hide-inclusion-graph
--- a/.github/workflows/approved-for-ci-run.yml
+++ b/.github/workflows/approved-for-ci-run.yml
@@ -94,7 +94,9 @@ jobs:
          echo "LABELS_TO_ADD=${LABELS_TO_ADD}" >> ${GITHUB_OUTPUT}
          echo "LABELS_TO_REMOVE=${LABELS_TO_REMOVE}" >> ${GITHUB_OUTPUT}

-      - run: gh pr checkout "${PR_NUMBER}"
+      - uses: actions/checkout@v4
+        with:
+          ref: ${{ github.event.pull_request.head.sha }}

      - run: git checkout -b "${BRANCH}"

--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -63,11 +63,15 @@ jobs:
      fail-fast: false
      matrix:
        include:
-          - DEFAULT_PG_VERSION: 16
+          - PG_VERSION: 16
            PLATFORM: "neon-staging"
            region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
            RUNNER: [ self-hosted, us-east-2, x64 ]
-          - DEFAULT_PG_VERSION: 16
+          - PG_VERSION: 17
+            PLATFORM: "neon-staging"
+            region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
+            RUNNER: [ self-hosted, us-east-2, x64 ]
+          - PG_VERSION: 16
            PLATFORM: "azure-staging"
            region_id: 'azure-eastus2'
            RUNNER: [ self-hosted, eastus2, x64 ]
@@ -75,7 +79,7 @@ jobs:
      TEST_PG_BENCH_DURATIONS_MATRIX: "300"
      TEST_PG_BENCH_SCALES_MATRIX: "10,100"
      POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
-      DEFAULT_PG_VERSION: ${{ matrix.DEFAULT_PG_VERSION }}
+      PG_VERSION: ${{ matrix.PG_VERSION }}
      TEST_OUTPUT: /tmp/test_output
      BUILD_TYPE: remote
      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
@@ -112,7 +116,7 @@ jobs:
      uses: ./.github/actions/neon-project-create
      with:
        region_id: ${{ matrix.region_id }}
-        postgres_version: ${{ env.DEFAULT_PG_VERSION }}
+        postgres_version: ${{ env.PG_VERSION }}
        api_key: ${{ secrets.NEON_STAGING_API_KEY }}

    - name: Run benchmark
@@ -122,7 +126,7 @@ jobs:
        test_selection: performance
        run_in_parallel: false
        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
-        pg_version: ${{ env.DEFAULT_PG_VERSION }}
+        pg_version: ${{ env.PG_VERSION }}
        aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
        # Set --sparse-ordering option of pytest-order plugin
        # to ensure tests are running in order of appears in the file.
@@ -313,7 +317,11 @@ jobs:
                      { "pg_version": 16, "region_id": "azure-eastus2",          "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" ,"runner": '"$runner_azure"',   "image": "neondatabase/build-tools:pinned-bookworm" },
                      { "pg_version": 16, "region_id": "azure-eastus2",          "platform": "neonvm-azure-captest-new",      "db_size": "10gb","runner": '"$runner_azure"',   "image": "neondatabase/build-tools:pinned-bookworm" },
                      { "pg_version": 16, "region_id": "azure-eastus2",          "platform": "neonvm-azure-captest-new",      "db_size": "50gb","runner": '"$runner_azure"',   "image": "neondatabase/build-tools:pinned-bookworm" },
-                      { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }]
+                      { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" },
+                      { "pg_version": 17, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier",       "db_size": "3gb" ,"runner": '"$runner_default"', "image": "'"$image_default"'" },
+                      { "pg_version": 17, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new",            "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" },
+                      { "pg_version": 17, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new-many-tables","db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" },
+                      { "pg_version": 17, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new",            "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }]
        }'

        if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then
@@ -329,12 +337,15 @@ jobs:
        matrix='{
          "platform": [
            "neonvm-captest-reuse"
+          ],
+          "pg_version" : [
+            16,17
          ]
        }'

        if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then
-          matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres" },
-                                                     { "platform": "rds-aurora"   }]')
+          matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 16, "platform": "rds-postgres" },
+                                                     { "pg_version": 16, "platform": "rds-aurora"   }]')
        fi

        echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
@@ -346,14 +357,14 @@ jobs:
          "platform": [
            "neonvm-captest-reuse"
          ],
-          "scale": [
-            "10"
+          "pg_version" : [
+            16,17
          ]
        }'

        if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then
-          matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "scale": "10" },
-                                                     { "platform": "rds-aurora",   "scale": "10" }]')
+          matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 16, "platform": "rds-postgres" },
+                                                     { "pg_version": 16, "platform": "rds-aurora"   }]')
        fi

        echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
@@ -378,7 +389,7 @@ jobs:
      TEST_PG_BENCH_DURATIONS_MATRIX: "60m"
      TEST_PG_BENCH_SCALES_MATRIX: ${{ matrix.db_size }}
      POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
-      DEFAULT_PG_VERSION: ${{ matrix.pg_version }}
+      PG_VERSION: ${{ matrix.pg_version }}
      TEST_OUTPUT: /tmp/test_output
      BUILD_TYPE: remote
      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
@@ -416,7 +427,7 @@ jobs:
      uses: ./.github/actions/neon-project-create
      with:
        region_id: ${{ matrix.region_id }}
-        postgres_version: ${{ env.DEFAULT_PG_VERSION }}
+        postgres_version: ${{ env.PG_VERSION }}
        api_key: ${{ secrets.NEON_STAGING_API_KEY }}
        compute_units: ${{ (contains(matrix.platform, 'captest-freetier') && '[0.25, 0.25]') || '[1, 1]' }}

@@ -447,7 +458,7 @@ jobs:

        echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

-    # we want to compare Neon project OLTP throughput and latency at scale factor 10 GB 
+    # we want to compare Neon project OLTP throughput and latency at scale factor 10 GB
    # without (neonvm-captest-new)
    # and with (neonvm-captest-new-many-tables) many relations in the database
    - name: Create many relations before the run
@@ -459,7 +470,7 @@ jobs:
        run_in_parallel: false
        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
        extra_params: -m remote_cluster --timeout 21600 -k test_perf_many_relations
-        pg_version: ${{ env.DEFAULT_PG_VERSION }}
+        pg_version: ${{ env.PG_VERSION }}
        aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
      env:
        BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
@@ -475,7 +486,7 @@ jobs:
        run_in_parallel: false
        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
        extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_init
-        pg_version: ${{ env.DEFAULT_PG_VERSION }}
+        pg_version: ${{ env.PG_VERSION }}
        aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
      env:
        BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
@@ -490,7 +501,7 @@ jobs:
        run_in_parallel: false
        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
        extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_simple_update
-        pg_version: ${{ env.DEFAULT_PG_VERSION }}
+        pg_version: ${{ env.PG_VERSION }}
        aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
      env:
        BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
@@ -505,7 +516,7 @@ jobs:
        run_in_parallel: false
        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
        extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_select_only
-        pg_version: ${{ env.DEFAULT_PG_VERSION }}
+        pg_version: ${{ env.PG_VERSION }}
        aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
      env:
        BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
@@ -549,14 +560,19 @@ jobs:
        include:
          - PLATFORM: "neonvm-captest-pgvector"
            RUNNER: [ self-hosted, us-east-2, x64 ]
+            postgres_version: 16
+          - PLATFORM: "neonvm-captest-pgvector-pg17"
+            RUNNER: [ self-hosted, us-east-2, x64 ]
+            postgres_version: 17
          - PLATFORM: "azure-captest-pgvector"
            RUNNER: [ self-hosted, eastus2, x64 ]
+            postgres_version: 16

    env:
      TEST_PG_BENCH_DURATIONS_MATRIX: "15m"
      TEST_PG_BENCH_SCALES_MATRIX: "1"
      POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
-      DEFAULT_PG_VERSION: 16
+      PG_VERSION: ${{ matrix.postgres_version }}
      TEST_OUTPUT: /tmp/test_output
      BUILD_TYPE: remote

@@ -574,32 +590,20 @@ jobs:
    steps:
    - uses: actions/checkout@v4

-    # until https://github.com/neondatabase/neon/issues/8275 is fixed we temporarily install postgresql-16
-    # instead of using Neon artifacts containing pgbench
-    - name: Install postgresql-16 where pytest expects it
-      run: |
-        # Just to make it easier to test things locally on macOS (with arm64)
-        arch=$(uname -m | sed 's/x86_64/amd64/g' | sed 's/aarch64/arm64/g')
+    - name: Configure AWS credentials
+      uses: aws-actions/configure-aws-credentials@v4
+      with:
+        aws-region: eu-central-1
+        role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+        role-duration-seconds: 18000 # 5 hours

-        cd /home/nonroot
-        wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-17/libpq5_17.2-1.pgdg120+1_${arch}.deb"
-        wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.6-1.pgdg120+1_${arch}.deb"
-        wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.6-1.pgdg120+1_${arch}.deb"
-        dpkg -x libpq5_17.2-1.pgdg120+1_${arch}.deb pg
-        dpkg -x postgresql-16_16.6-1.pgdg120+1_${arch}.deb pg
-        dpkg -x postgresql-client-16_16.6-1.pgdg120+1_${arch}.deb pg
-
-        mkdir -p /tmp/neon/pg_install/v16/bin
-        ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/pgbench /tmp/neon/pg_install/v16/bin/pgbench
-        ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/psql    /tmp/neon/pg_install/v16/bin/psql
-        ln -s /home/nonroot/pg/usr/lib/$(uname -m)-linux-gnu     /tmp/neon/pg_install/v16/lib
-
-        LD_LIBRARY_PATH="/home/nonroot/pg/usr/lib/$(uname -m)-linux-gnu:${LD_LIBRARY_PATH:-}"
-        export LD_LIBRARY_PATH
-        echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" >> ${GITHUB_ENV}
-
-        /tmp/neon/pg_install/v16/bin/pgbench --version
-        /tmp/neon/pg_install/v16/bin/psql --version
+    - name: Download Neon artifact
+      uses: ./.github/actions/download
+      with:
+        name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
+        path: /tmp/neon/
+        prefix: latest
+        aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}

    - name: Set up Connection String
      id: set-up-connstr
@@ -608,6 +612,9 @@ jobs:
          neonvm-captest-pgvector)
            CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }}
            ;;
+          neonvm-captest-pgvector-pg17)
+            CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR_PG17 }}
+            ;;
          azure-captest-pgvector)
            CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR_AZURE }}
            ;;
@@ -619,13 +626,6 @@ jobs:

        echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

-    - name: Configure AWS credentials
-      uses: aws-actions/configure-aws-credentials@v4
-      with:
-        aws-region: eu-central-1
-        role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
-        role-duration-seconds: 18000 # 5 hours
-
    - name: Benchmark pgvector hnsw indexing
      uses: ./.github/actions/run-python-test-set
      with:
@@ -634,7 +634,7 @@ jobs:
        run_in_parallel: false
        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
        extra_params: -m remote_cluster --timeout 21600 -k test_pgvector_indexing
-        pg_version: ${{ env.DEFAULT_PG_VERSION }}
+        pg_version: ${{ env.PG_VERSION }}
        aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
      env:
        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
@@ -649,7 +649,7 @@ jobs:
        run_in_parallel: false
        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
        extra_params: -m remote_cluster --timeout 21600
-        pg_version: ${{ env.DEFAULT_PG_VERSION }}
+        pg_version: ${{ env.PG_VERSION }}
        aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
      env:
        BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
@@ -696,7 +696,7 @@ jobs:

    env:
      POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
-      DEFAULT_PG_VERSION: 16
+      PG_VERSION: ${{ matrix.pg_version }}
      TEST_OUTPUT: /tmp/test_output
      TEST_OLAP_COLLECT_EXPLAIN: ${{ github.event.inputs.collect_olap_explain }}
      TEST_OLAP_COLLECT_PG_STAT_STATEMENTS: ${{ github.event.inputs.collect_pg_stat_statements }}
@@ -739,7 +739,18 @@ jobs:
      run: |
        case "${PLATFORM}" in
          neonvm-captest-reuse)
-            CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CLICKBENCH_10M_CONNSTR }}
+            case "${PG_VERSION}" in
+              16)
+                CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CLICKBENCH_10M_CONNSTR }}
+                ;;
+              17)
+                CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CLICKBENCH_CONNSTR_PG17 }}
+                ;;
+              *)
+                echo >&2 "Unsupported PG_VERSION=${PG_VERSION} for PLATFORM=${PLATFORM}"
+                exit 1
+                ;;
+            esac
            ;;
          rds-aurora)
            CONNSTR=${{ secrets.BENCHMARK_RDS_AURORA_CLICKBENCH_10M_CONNSTR }}
@@ -763,7 +774,7 @@ jobs:
        run_in_parallel: false
        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
        extra_params: -m remote_cluster --timeout 43200 -k test_clickbench
-        pg_version: ${{ env.DEFAULT_PG_VERSION }}
+        pg_version: ${{ env.PG_VERSION }}
        aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
      env:
        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
@@ -799,7 +810,7 @@ jobs:
    # We might change it after https://github.com/neondatabase/neon/issues/2900.
    #
    # *_TPCH_S10_CONNSTR: DB generated with scale factor 10 (~10 GB)
-    if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
+    # NOTE(review): job condition disabled in this change - confirm this is intentional. Was: if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
    permissions:
      contents: write
      statuses: write
@@ -812,12 +823,11 @@ jobs:

    env:
      POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
-      DEFAULT_PG_VERSION: 16
+      PG_VERSION: ${{ matrix.pg_version }}
      TEST_OUTPUT: /tmp/test_output
      BUILD_TYPE: remote
      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
      PLATFORM: ${{ matrix.platform }}
-      TEST_OLAP_SCALE: ${{ matrix.scale }}

    runs-on: [ self-hosted, us-east-2, x64 ]
    container:
@@ -849,13 +859,24 @@ jobs:
      run: |
        case "${PLATFORM}" in
          neonvm-captest-reuse)
-            ENV_PLATFORM=CAPTEST_TPCH
+            case "${PG_VERSION}" in
+              16)
+                CONNSTR_SECRET_NAME="BENCHMARK_CAPTEST_TPCH_S10_CONNSTR"
+                ;;
+              17)
+                CONNSTR_SECRET_NAME="BENCHMARK_CAPTEST_TPCH_CONNSTR_PG17"
+                ;;
+              *)
+                echo >&2 "Unsupported PG_VERSION=${PG_VERSION} for PLATFORM=${PLATFORM}"
+                exit 1
+                ;;
+            esac
            ;;
          rds-aurora)
-            ENV_PLATFORM=RDS_AURORA_TPCH
+            CONNSTR_SECRET_NAME="BENCHMARK_RDS_AURORA_TPCH_S10_CONNSTR"
            ;;
          rds-postgres)
-            ENV_PLATFORM=RDS_POSTGRES_TPCH
+            CONNSTR_SECRET_NAME="BENCHMARK_RDS_POSTGRES_TPCH_S10_CONNSTR"
            ;;
          *)
            echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neonvm-captest-reuse', 'rds-aurora', or 'rds-postgres'"
@@ -863,7 +884,6 @@ jobs:
            ;;
        esac

-        CONNSTR_SECRET_NAME="BENCHMARK_${ENV_PLATFORM}_S${TEST_OLAP_SCALE}_CONNSTR"
        echo "CONNSTR_SECRET_NAME=${CONNSTR_SECRET_NAME}" >> $GITHUB_ENV

    - name: Set up Connection String
@@ -881,13 +901,13 @@ jobs:
        run_in_parallel: false
        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
        extra_params: -m remote_cluster --timeout 21600 -k test_tpch
-        pg_version: ${{ env.DEFAULT_PG_VERSION }}
+        pg_version: ${{ env.PG_VERSION }}
        aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
      env:
        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
        BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
-        TEST_OLAP_SCALE: ${{ matrix.scale }}
+        TEST_OLAP_SCALE: 10

    - name: Create Allure report
      id: create-allure-report
@@ -909,7 +929,7 @@ jobs:
        SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

  user-examples-compare:
-    if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
+    # NOTE(review): job condition disabled in this change - confirm this is intentional. Was: if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
    permissions:
      contents: write
      statuses: write
@@ -922,7 +942,7 @@ jobs:

    env:
      POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
-      DEFAULT_PG_VERSION: 16
+      PG_VERSION: ${{ matrix.pg_version }}
      TEST_OUTPUT: /tmp/test_output
      BUILD_TYPE: remote
      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
@@ -959,7 +979,18 @@ jobs:
      run: |
        case "${PLATFORM}" in
          neonvm-captest-reuse)
-            CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_CAPTEST_CONNSTR }}
+            case "${PG_VERSION}" in
+              16)
+                CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_CAPTEST_CONNSTR }}
+                ;;
+              17)
+                CONNSTR=${{ secrets.BENCHMARK_CAPTEST_USER_EXAMPLE_CONNSTR_PG17 }}
+                ;;
+              *)
+                echo >&2 "Unsupported PG_VERSION=${PG_VERSION} for PLATFORM=${PLATFORM}"
+                exit 1
+                ;;
+            esac
            ;;
          rds-aurora)
            CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_RDS_AURORA_CONNSTR }}
@@ -983,7 +1014,7 @@ jobs:
        run_in_parallel: false
        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
        extra_params: -m remote_cluster --timeout 21600 -k test_user_examples
-        pg_version: ${{ env.DEFAULT_PG_VERSION }}
+        pg_version: ${{ env.PG_VERSION }}
        aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
      env:
        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -45,6 +45,26 @@ jobs:
            run cancel-previous-in-concurrency-group.yml \
              --field concurrency_group="${{ env.E2E_CONCURRENCY_GROUP }}"

+  files-changed:  # evaluates the path filters in .github/file-filters.yaml for downstream jobs
+    needs: [ check-permissions ]
+    timeout-minutes: 3
+    runs-on: [ self-hosted, small ]
+    outputs:
+      check-rust-dependencies: ${{ steps.files-changed.outputs.rust_dependencies }}  # compared against 'true' by check-dependencies-rust
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          submodules: true
+
+      - name: Check for file changes
+        id: files-changed
+        uses: dorny/paths-filter@de90cc6fb38fc0963ad72b210f1f284cd68cea36  # v3.0.2
+        with:
+          filters: .github/file-filters.yaml
+          token: ${{ secrets.GITHUB_TOKEN }}
+
  tag:
    needs: [ check-permissions ]
    runs-on: [ self-hosted, small ]
@@ -170,6 +190,14 @@ jobs:
      archs: '["x64", "arm64"]'
    secrets: inherit

+  check-dependencies-rust:  # calls the reusable cargo-deny workflow; runs only when files-changed reports check-rust-dependencies == 'true'
+    needs: [ files-changed, build-build-tools-image ]
+    if: ${{ needs.files-changed.outputs.check-rust-dependencies == 'true' }}
+    uses: ./.github/workflows/cargo-deny.yml
+    with:
+      build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
+    secrets: inherit
+
  build-and-test-locally:
    needs: [ tag, build-build-tools-image ]
    strategy:
@@ -1332,6 +1360,8 @@ jobs:
      - build-and-test-locally
      - check-codestyle-python
      - check-codestyle-rust
+      - check-dependencies-rust
+      - files-changed
      - promote-images-dev
      - test-images
      - trigger-custom-extensions-build-and-wait
@@ -1344,4 +1374,11 @@ jobs:
        if: |
          contains(needs.*.result, 'failure')
          || contains(needs.*.result, 'cancelled')
-          || contains(needs.*.result, 'skipped')
+          || (needs.check-dependencies-rust.result == 'skipped' && needs.files-changed.outputs.check-rust-dependencies == 'true')
+          || needs.build-and-test-locally.result == 'skipped'
+          || needs.check-codestyle-python.result == 'skipped'
+          || needs.check-codestyle-rust.result == 'skipped'
+          || needs.files-changed.result == 'skipped'
+          || needs.promote-images-dev.result == 'skipped'
+          || needs.test-images.result == 'skipped'
+          || needs.trigger-custom-extensions-build-and-wait.result == 'skipped'
--- a/.github/workflows/cargo-deny.yml
+++ b/.github/workflows/cargo-deny.yml
@@ -0,0 +1,57 @@
+name: cargo deny checks
+
+on:
+  workflow_call:  # reusable entry point; callers may pass the build-tools image to run in
+    inputs:
+      build-tools-image:
+        required: false
+        type: string
+  schedule:
+    - cron: '0 0 * * *'  # every day at 00:00
+
+jobs:
+  cargo-deny:
+    strategy:
+      fail-fast: false  # a failure on one ref must not cancel the checks of the remaining refs
+      matrix:  # scheduled runs check the long-lived branches; other triggers check only the current commit
+        ref: >-
+          ${{
+            fromJSON(
+              github.event_name == 'schedule'
+                && '["main","release","release-proxy","release-compute"]'
+                || format('["{0}"]', github.sha)
+            )
+          }}
+
+    runs-on: [self-hosted, small]
+
+    container:
+      image: ${{ inputs.build-tools-image || 'neondatabase/build-tools:pinned' }}  # falls back to the pinned image when no input is given
+      credentials:
+        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
+      options: --init
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ matrix.ref }}
+
+      - name: Check rust licenses/bans/advisories/sources
+        env:  # scheduled runs check advisories only; every other trigger runs all checks
+          CARGO_DENY_TARGET: ${{ github.event_name == 'schedule' && 'advisories' || 'all' }}
+        run: cargo deny check --hide-inclusion-graph "${CARGO_DENY_TARGET}"
+
+      - name: Post to a Slack channel  # only for failed scheduled runs
+        if: ${{ github.event_name == 'schedule' && failure() }}
+        uses: slackapi/slack-github-action@v2
+        with:
+          method: chat.postMessage
+          token: ${{ secrets.SLACK_BOT_TOKEN }}
+          payload: |
+            channel: ${{ vars.SLACK_CICD_CHANNEL_ID }}
+            text: |
+              Periodic cargo-deny on ${{ matrix.ref }}: ${{ job.status }}
+              <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
+              Pinging @oncall-devprod.
--- a/.github/workflows/pg-clients.yml
+++ b/.github/workflows/pg-clients.yml
@@ -12,8 +12,8 @@ on:
  pull_request:
    paths:
      - '.github/workflows/pg-clients.yml'
-      - 'test_runner/pg_clients/**'
-      - 'test_runner/logical_repl/**'
+      - 'test_runner/pg_clients/**/*.py'
+      - 'test_runner/logical_repl/**/*.py'
      - 'poetry.lock'
  workflow_dispatch:

@@ -104,6 +104,8 @@ jobs:
        with:
          api_key: ${{ secrets.NEON_STAGING_API_KEY }}
          postgres_version: ${{ env.DEFAULT_PG_VERSION }}
+          project_settings: >-
+            {"enable_logical_replication": true}

      - name: Run tests
        uses: ./.github/actions/run-python-test-set
--- a/.github/workflows/pre-merge-checks.yml
+++ b/.github/workflows/pre-merge-checks.yml
@@ -59,7 +59,10 @@ jobs:
          echo "${RUST_CHANGED_FILES}"

  build-build-tools-image:
-    if: needs.get-changed-files.outputs.python-changed == 'true'
+    if: |
+      false
+      || needs.get-changed-files.outputs.python-changed == 'true'
+      || needs.get-changed-files.outputs.rust-changed == 'true'
    needs: [ get-changed-files ]
    uses: ./.github/workflows/build-build-tools-image.yml
    with:
@@ -92,7 +95,8 @@ jobs:
  # - conclusion
  # - neon-cloud-e2e
  conclusion:
-    if: always()
+    # Do not run job on Pull Requests as it interferes with the `conclusion` job from the `build_and_test` workflow
+    if: always() && github.event_name == 'merge_group'
    permissions:
      statuses: write # for `github.repos.createCommitStatus(...)`
      contents: write
@@ -124,6 +128,8 @@ jobs:
      - name: Fail the job if any of the dependencies do not succeed or skipped
        run: exit 1
        if: |
-          (contains(needs.check-codestyle-python.result, 'skipped') && needs.get-changed-files.outputs.python-changed == 'true')
+          false
+          || (needs.check-codestyle-python.result == 'skipped' && needs.get-changed-files.outputs.python-changed == 'true')
+          || (needs.check-codestyle-rust.result   == 'skipped' && needs.get-changed-files.outputs.rust-changed   == 'true')
          || contains(needs.*.result, 'failure')
          || contains(needs.*.result, 'cancelled')
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -290,9 +290,9 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"

 [[package]]
 name = "aws-config"
-version = "1.5.10"
+version = "1.5.15"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9b49afaa341e8dd8577e1a2200468f98956d6eda50bcf4a53246cc00174ba924"
+checksum = "dc47e70fc35d054c8fcd296d47a61711f043ac80534a10b4f741904f81e73a90"
 dependencies = [
 "aws-credential-types",
 "aws-runtime",
@@ -301,7 +301,7 @@ dependencies = [
 "aws-sdk-sts",
 "aws-smithy-async",
 "aws-smithy-http",
- "aws-smithy-json 0.60.7",
+ "aws-smithy-json",
 "aws-smithy-runtime",
 "aws-smithy-runtime-api",
 "aws-smithy-types",
@@ -332,9 +332,9 @@ dependencies = [

 [[package]]
 name = "aws-runtime"
-version = "1.4.4"
+version = "1.5.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b5ac934720fbb46206292d2c75b57e67acfc56fe7dfd34fb9a02334af08409ea"
+checksum = "bee7643696e7fdd74c10f9eb42848a87fe469d35eae9c3323f80aa98f350baac"
 dependencies = [
 "aws-credential-types",
 "aws-sigv4",
@@ -366,7 +366,7 @@ dependencies = [
 "aws-runtime",
 "aws-smithy-async",
 "aws-smithy-http",
- "aws-smithy-json 0.61.1",
+ "aws-smithy-json",
 "aws-smithy-query",
 "aws-smithy-runtime",
 "aws-smithy-runtime-api",
@@ -389,7 +389,7 @@ dependencies = [
 "aws-runtime",
 "aws-smithy-async",
 "aws-smithy-http",
- "aws-smithy-json 0.61.1",
+ "aws-smithy-json",
 "aws-smithy-runtime",
 "aws-smithy-runtime-api",
 "aws-smithy-types",
@@ -414,7 +414,7 @@ dependencies = [
 "aws-smithy-checksums",
 "aws-smithy-eventstream",
 "aws-smithy-http",
- "aws-smithy-json 0.61.1",
+ "aws-smithy-json",
 "aws-smithy-runtime",
 "aws-smithy-runtime-api",
 "aws-smithy-types",
@@ -437,15 +437,15 @@ dependencies = [

 [[package]]
 name = "aws-sdk-sso"
-version = "1.50.0"
+version = "1.57.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "05ca43a4ef210894f93096039ef1d6fa4ad3edfabb3be92b80908b9f2e4b4eab"
+checksum = "c54bab121fe1881a74c338c5f723d1592bf3b53167f80268a1274f404e1acc38"
 dependencies = [
 "aws-credential-types",
 "aws-runtime",
 "aws-smithy-async",
 "aws-smithy-http",
- "aws-smithy-json 0.61.1",
+ "aws-smithy-json",
 "aws-smithy-runtime",
 "aws-smithy-runtime-api",
 "aws-smithy-types",
@@ -459,15 +459,15 @@ dependencies = [

 [[package]]
 name = "aws-sdk-ssooidc"
-version = "1.51.0"
+version = "1.58.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "abaf490c2e48eed0bb8e2da2fb08405647bd7f253996e0f93b981958ea0f73b0"
+checksum = "8c8234fd024f7ac61c4e44ea008029bde934250f371efe7d4a39708397b1080c"
 dependencies = [
 "aws-credential-types",
 "aws-runtime",
 "aws-smithy-async",
 "aws-smithy-http",
- "aws-smithy-json 0.61.1",
+ "aws-smithy-json",
 "aws-smithy-runtime",
 "aws-smithy-runtime-api",
 "aws-smithy-types",
@@ -481,15 +481,15 @@ dependencies = [

 [[package]]
 name = "aws-sdk-sts"
-version = "1.51.0"
+version = "1.58.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b68fde0d69c8bfdc1060ea7da21df3e39f6014da316783336deff0a9ec28f4bf"
+checksum = "ba60e1d519d6f23a9df712c04fdeadd7872ac911c84b2f62a8bda92e129b7962"
 dependencies = [
 "aws-credential-types",
 "aws-runtime",
 "aws-smithy-async",
 "aws-smithy-http",
- "aws-smithy-json 0.61.1",
+ "aws-smithy-json",
 "aws-smithy-query",
 "aws-smithy-runtime",
 "aws-smithy-runtime-api",
@@ -504,9 +504,9 @@ dependencies = [

 [[package]]
 name = "aws-sigv4"
-version = "1.2.6"
+version = "1.2.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7d3820e0c08d0737872ff3c7c1f21ebbb6693d832312d6152bf18ef50a5471c2"
+checksum = "690118821e46967b3c4501d67d7d52dd75106a9c54cf36cefa1985cedbe94e05"
 dependencies = [
 "aws-credential-types",
 "aws-smithy-eventstream",
@@ -533,9 +533,9 @@ dependencies = [

 [[package]]
 name = "aws-smithy-async"
-version = "1.2.1"
+version = "1.2.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "62220bc6e97f946ddd51b5f1361f78996e704677afc518a4ff66b7a72ea1378c"
+checksum = "fa59d1327d8b5053c54bf2eaae63bf629ba9e904434d0835a28ed3c0ed0a614e"
 dependencies = [
 "futures-util",
 "pin-project-lite",
@@ -565,9 +565,9 @@ dependencies = [

 [[package]]
 name = "aws-smithy-eventstream"
-version = "0.60.5"
+version = "0.60.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cef7d0a272725f87e51ba2bf89f8c21e4df61b9e49ae1ac367a6d69916ef7c90"
+checksum = "8b18559a41e0c909b77625adf2b8c50de480a8041e5e4a3f5f7d177db70abc5a"
 dependencies = [
 "aws-smithy-types",
 "bytes",
@@ -576,9 +576,9 @@ dependencies = [

 [[package]]
 name = "aws-smithy-http"
-version = "0.60.11"
+version = "0.60.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5c8bc3e8fdc6b8d07d976e301c02fe553f72a39b7a9fea820e023268467d7ab6"
+checksum = "7809c27ad8da6a6a68c454e651d4962479e81472aa19ae99e59f9aba1f9713cc"
 dependencies = [
 "aws-smithy-eventstream",
 "aws-smithy-runtime-api",
@@ -597,18 +597,9 @@ dependencies = [

 [[package]]
 name = "aws-smithy-json"
-version = "0.60.7"
+version = "0.61.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4683df9469ef09468dad3473d129960119a0d3593617542b7d52086c8486f2d6"
-dependencies = [
- "aws-smithy-types",
-]
-
-[[package]]
-name = "aws-smithy-json"
-version = "0.61.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ee4e69cc50921eb913c6b662f8d909131bb3e6ad6cb6090d3a39b66fc5c52095"
+checksum = "623a51127f24c30776c8b374295f2df78d92517386f77ba30773f15a30ce1422"
 dependencies = [
 "aws-smithy-types",
 ]
@@ -625,9 +616,9 @@ dependencies = [

 [[package]]
 name = "aws-smithy-runtime"
-version = "1.7.4"
+version = "1.7.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9f20685047ca9d6f17b994a07f629c813f08b5bce65523e47124879e60103d45"
+checksum = "865f7050bbc7107a6c98a397a9fcd9413690c27fa718446967cf03b2d3ac517e"
 dependencies = [
 "aws-smithy-async",
 "aws-smithy-http",
@@ -669,9 +660,9 @@ dependencies = [

 [[package]]
 name = "aws-smithy-types"
-version = "1.2.9"
+version = "1.2.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4fbd94a32b3a7d55d3806fe27d98d3ad393050439dd05eb53ece36ec5e3d3510"
+checksum = "a28f6feb647fb5e0d5b50f0472c19a7db9462b74e2fec01bb0b44eedcc834e97"
 dependencies = [
 "base64-simd",
 "bytes",
@@ -704,9 +695,9 @@ dependencies = [

 [[package]]
 name = "aws-types"
-version = "1.3.3"
+version = "1.3.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5221b91b3e441e6675310829fd8984801b772cb1546ef6c0e54dec9f1ac13fef"
+checksum = "b0df5a18c4f951c645300d365fec53a61418bcf4650f604f85fe2a665bfaa0c2"
 dependencies = [
 "aws-credential-types",
 "aws-smithy-async",
@@ -978,7 +969,7 @@ version = "0.70.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f49d8fed880d473ea71efb9bf597651e77201bdd4893efe54c9e5d65ae04ce6f"
 dependencies = [
- "bitflags 2.4.1",
+ "bitflags 2.8.0",
 "cexpr",
 "clang-sys",
 "itertools 0.12.1",
@@ -1006,9 +997,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"

 [[package]]
 name = "bitflags"
-version = "2.4.1"
+version = "2.8.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07"
+checksum = "8f68f53c83ab957f72c32642f3868eec03eb974d1fb82e453128456482613d36"

 [[package]]
 name = "block-buffer"
@@ -1225,6 +1216,20 @@ version = "0.7.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "afb84c814227b90d6895e01398aee0d8033c00e7466aca416fb6a8e0eb19d8a7"

+[[package]]
+name = "clashmap"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "93bd59c81e2bd87a775ae2de75f070f7e2bfe97363a6ad652f46824564c23e4d"
+dependencies = [
+ "crossbeam-utils",
+ "hashbrown 0.15.2",
+ "lock_api",
+ "parking_lot_core 0.9.8",
+ "polonius-the-crab",
+ "replace_with",
+]
+
 [[package]]
 name = "colorchoice"
 version = "1.0.0"
@@ -1312,7 +1317,7 @@ dependencies = [
 "tar",
 "thiserror 1.0.69",
 "tokio",
- "tokio-postgres 0.7.9",
+ "tokio-postgres",
 "tokio-stream",
 "tokio-util",
 "tower 0.5.2",
@@ -1421,7 +1426,7 @@ dependencies = [
 "storage_broker",
 "thiserror 1.0.69",
 "tokio",
- "tokio-postgres 0.7.9",
+ "tokio-postgres",
 "tokio-util",
 "toml",
 "toml_edit",
@@ -1561,7 +1566,7 @@ version = "0.27.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f476fe445d41c9e991fd07515a6f463074b782242ccf4a5b7b1d1012e70824df"
 dependencies = [
- "bitflags 2.4.1",
+ "bitflags 2.8.0",
 "crossterm_winapi",
 "libc",
 "parking_lot 0.12.1",
@@ -1792,7 +1797,7 @@ version = "2.2.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ccf1bedf64cdb9643204a36dd15b19a6ce8e7aa7f7b105868e9f1fad5ffa7d12"
 dependencies = [
- "bitflags 2.4.1",
+ "bitflags 2.8.0",
 "byteorder",
 "chrono",
 "diesel_derives",
@@ -1812,7 +1817,7 @@ dependencies = [
 "futures-util",
 "scoped-futures",
 "tokio",
- "tokio-postgres 0.7.12",
+ "tokio-postgres",
 ]

 [[package]]
@@ -2556,6 +2561,12 @@ dependencies = [
 "allocator-api2",
 ]

+[[package]]
+name = "hashbrown"
+version = "0.15.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289"
+
 [[package]]
 name = "hashlink"
 version = "0.9.1"
@@ -2606,6 +2617,15 @@ version = "0.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6fe2267d4ed49bc07b63801559be28c718ea06c4738b7a03c94df7386d2cde46"

+[[package]]
+name = "higher-kinded-types"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "561985554c8b8d4808605c90a5f1979cc6c31a5d20b78465cd59501233c6678e"
+dependencies = [
+ "never-say-never",
+]
+
 [[package]]
 name = "hmac"
 version = "0.12.1"
@@ -3084,11 +3104,11 @@ dependencies = [

 [[package]]
 name = "inotify"
-version = "0.9.6"
+version = "0.11.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f8069d3ec154eb856955c1c0fbffefbf5f3c40a104ec912d4797314c1801abff"
+checksum = "f37dccff2791ab604f9babef0ba14fbe0be30bd368dc541e2b08d07c8aa908f3"
 dependencies = [
- "bitflags 1.3.2",
+ "bitflags 2.8.0",
 "inotify-sys",
 "libc",
 ]
@@ -3265,9 +3285,9 @@ dependencies = [

 [[package]]
 name = "kqueue"
-version = "1.0.7"
+version = "1.0.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2c8fc60ba15bf51257aa9807a48a61013db043fcf3a78cb0d916e8e396dcad98"
+checksum = "7447f1ca1b7b563588a205fe93dea8df60fd981423a768bc1c0ded35ed147d0c"
 dependencies = [
 "kqueue-sys",
 "libc",
@@ -3275,9 +3295,9 @@ dependencies = [

 [[package]]
 name = "kqueue-sys"
-version = "1.0.3"
+version = "1.0.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8367585489f01bc55dd27404dcf56b95e6da061a256a666ab23be9ba96a2e587"
+checksum = "ed9625ffda8729b85e45cf04090035ac368927b8cebc34898e7c120f52e4838b"
 dependencies = [
 "bitflags 1.3.2",
 "libc",
@@ -3304,9 +3324,9 @@ dependencies = [

 [[package]]
 name = "libc"
-version = "0.2.167"
+version = "0.2.169"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "09d6582e104315a817dff97f75133544b2e094ee22447d2acf4a74e189ba06fc"
+checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a"

 [[package]]
 name = "libloading"
@@ -3553,14 +3573,14 @@ dependencies = [

 [[package]]
 name = "mio"
-version = "0.8.11"
+version = "1.0.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c"
+checksum = "2886843bf800fba2e3377cff24abf6379b4c4d5c6681eaf9ea5b0d15090450bd"
 dependencies = [
 "libc",
 "log",
 "wasi 0.11.0+wasi-snapshot-preview1",
- "windows-sys 0.48.0",
+ "windows-sys 0.52.0",
 ]

 [[package]]
@@ -3569,6 +3589,12 @@ version = "0.8.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a"

+[[package]]
+name = "never-say-never"
+version = "6.6.666"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cf5a574dadd7941adeaa71823ecba5e28331b8313fb2e1c6a5c7e5981ea53ad6"
+
 [[package]]
 name = "nix"
 version = "0.25.1"
@@ -3600,7 +3626,7 @@ version = "0.27.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2eb04e9c688eff1c89d72b407f168cf79bb9e867a9d3323ed6c01519eb9cc053"
 dependencies = [
- "bitflags 2.4.1",
+ "bitflags 2.8.0",
 "cfg-if",
 "libc",
 "memoffset 0.9.0",
@@ -3618,12 +3644,11 @@ dependencies = [

 [[package]]
 name = "notify"
-version = "6.1.1"
+version = "8.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6205bd8bb1e454ad2e27422015fb5e4f2bcc7e08fa8f27058670d208324a4d2d"
+checksum = "2fee8403b3d66ac7b26aee6e40a897d85dc5ce26f44da36b8b73e987cc52e943"
 dependencies = [
- "bitflags 2.4.1",
- "crossbeam-channel",
+ "bitflags 2.8.0",
 "filetime",
 "fsevent-sys",
 "inotify",
@@ -3631,10 +3656,17 @@ dependencies = [
 "libc",
 "log",
 "mio",
+ "notify-types",
 "walkdir",
- "windows-sys 0.48.0",
+ "windows-sys 0.59.0",
 ]

+[[package]]
+name = "notify-types"
+version = "2.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5e0826a989adedc2a244799e823aece04662b66609d96af8dff7ac6df9a8925d"
+
 [[package]]
 name = "ntapi"
 version = "0.4.1"
@@ -4060,8 +4092,8 @@ dependencies = [
 "pageserver_compaction",
 "pin-project-lite",
 "postgres",
- "postgres-protocol 0.6.6",
- "postgres-types 0.2.6",
+ "postgres-protocol",
+ "postgres-types",
 "postgres_backend",
 "postgres_connection",
 "postgres_ffi",
@@ -4092,7 +4124,7 @@ dependencies = [
 "tokio",
 "tokio-epoll-uring",
 "tokio-io-timeout",
- "tokio-postgres 0.7.9",
+ "tokio-postgres",
 "tokio-stream",
 "tokio-tar",
 "tokio-util",
@@ -4150,7 +4182,7 @@ dependencies = [
 "serde",
 "thiserror 1.0.69",
 "tokio",
- "tokio-postgres 0.7.9",
+ "tokio-postgres",
 "tokio-stream",
 "tokio-util",
 "utils",
@@ -4446,48 +4478,40 @@ dependencies = [
 "plotters-backend",
 ]

+[[package]]
+name = "polonius-the-crab"
+version = "0.4.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e97ca2c89572ae41bbec1c99498251f87dd5a94e500c5ec19c382dd593dd5ce9"
+dependencies = [
+ "higher-kinded-types",
+ "never-say-never",
+]
+
 [[package]]
 name = "postgres"
-version = "0.19.6"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#8b44892f7851e705810b2cb54504325699966070"
+version = "0.19.7"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#1f21e7959a96a34dcfbfce1b14b73286cdadffe9"
 dependencies = [
 "bytes",
 "fallible-iterator",
 "futures-util",
 "log",
 "tokio",
- "tokio-postgres 0.7.9",
+ "tokio-postgres",
 ]

 [[package]]
 name = "postgres-protocol"
 version = "0.6.6"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#8b44892f7851e705810b2cb54504325699966070"
-dependencies = [
- "base64 0.21.1",
- "byteorder",
- "bytes",
- "fallible-iterator",
- "hmac",
- "lazy_static",
- "md-5",
- "memchr",
- "rand 0.8.5",
- "sha2",
- "stringprep",
-]
-
-[[package]]
-name = "postgres-protocol"
-version = "0.6.7"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "acda0ebdebc28befa84bee35e651e4c5f09073d668c7aed4cf7e23c3cda84b23"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#1f21e7959a96a34dcfbfce1b14b73286cdadffe9"
 dependencies = [
 "base64 0.22.1",
 "byteorder",
 "bytes",
 "fallible-iterator",
 "hmac",
+ "lazy_static",
 "md-5",
 "memchr",
 "rand 0.8.5",
@@ -4514,23 +4538,12 @@ dependencies = [
 [[package]]
 name = "postgres-types"
 version = "0.2.6"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#8b44892f7851e705810b2cb54504325699966070"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#1f21e7959a96a34dcfbfce1b14b73286cdadffe9"
 dependencies = [
 "bytes",
 "chrono",
 "fallible-iterator",
- "postgres-protocol 0.6.6",
-]
-
-[[package]]
-name = "postgres-types"
-version = "0.2.8"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f66ea23a2d0e5734297357705193335e0a957696f34bed2f2faefacb2fec336f"
-dependencies = [
- "bytes",
- "fallible-iterator",
- "postgres-protocol 0.6.7",
+ "postgres-protocol",
 ]

 [[package]]
@@ -4555,7 +4568,7 @@ dependencies = [
 "serde",
 "thiserror 1.0.69",
 "tokio",
- "tokio-postgres 0.7.9",
+ "tokio-postgres",
 "tokio-postgres-rustls",
 "tokio-rustls 0.26.0",
 "tokio-util",
@@ -4570,7 +4583,7 @@ dependencies = [
 "itertools 0.10.5",
 "once_cell",
 "postgres",
- "tokio-postgres 0.7.9",
+ "tokio-postgres",
 "url",
 ]

@@ -4664,7 +4677,7 @@ dependencies = [
 "byteorder",
 "bytes",
 "itertools 0.10.5",
- "postgres-protocol 0.6.6",
+ "postgres-protocol",
 "rand 0.8.5",
 "serde",
 "thiserror 1.0.69",
@@ -4705,7 +4718,7 @@ version = "0.16.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "731e0d9356b0c25f16f33b5be79b1c57b562f141ebfcdb0ad8ac2c13a24293b4"
 dependencies = [
- "bitflags 2.4.1",
+ "bitflags 2.8.0",
 "chrono",
 "flate2",
 "hex",
@@ -4720,7 +4733,7 @@ version = "0.16.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2d3554923a69f4ce04c4a754260c338f505ce22642d3830e049a399fc2059a29"
 dependencies = [
- "bitflags 2.4.1",
+ "bitflags 2.8.0",
 "chrono",
 "hex",
 ]
@@ -4839,9 +4852,9 @@ dependencies = [
 "camino-tempfile",
 "chrono",
 "clap",
+ "clashmap",
 "compute_api",
 "consumption_metrics",
- "dashmap 5.5.0",
 "ecdsa 0.16.9",
 "ed25519-dalek",
 "env_logger 0.10.2",
@@ -4912,7 +4925,7 @@ dependencies = [
 "tikv-jemalloc-ctl",
 "tikv-jemallocator",
 "tokio",
- "tokio-postgres 0.7.9",
+ "tokio-postgres",
 "tokio-postgres2",
 "tokio-rustls 0.26.0",
 "tokio-tungstenite 0.21.0",
@@ -5249,6 +5262,12 @@ dependencies = [
 "utils",
 ]

+[[package]]
+name = "replace_with"
+version = "0.1.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e3a8614ee435691de62bcffcf4a66d91b3594bf1428a5722e79103249a095690"
+
 [[package]]
 name = "reqwest"
 version = "0.12.4"
@@ -5528,7 +5547,7 @@ version = "0.38.41"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d7f649912bc1495e167a6edee79151c84b1bad49748cb4f1f1167f459f6224f6"
 dependencies = [
- "bitflags 2.4.1",
+ "bitflags 2.8.0",
 "errno",
 "libc",
 "linux-raw-sys 0.4.14",
@@ -5700,7 +5719,7 @@ dependencies = [
 "pageserver_api",
 "parking_lot 0.12.1",
 "postgres",
- "postgres-protocol 0.6.6",
+ "postgres-protocol",
 "postgres_backend",
 "postgres_ffi",
 "pprof",
@@ -5724,7 +5743,7 @@ dependencies = [
 "tikv-jemallocator",
 "tokio",
 "tokio-io-timeout",
- "tokio-postgres 0.7.9",
+ "tokio-postgres",
 "tokio-stream",
 "tokio-tar",
 "tokio-util",
@@ -6341,6 +6360,8 @@ dependencies = [
 "rand 0.8.5",
 "reqwest",
 "routerify",
+ "rustls 0.23.18",
+ "rustls-native-certs 0.8.0",
 "scoped-futures",
 "scopeguard",
 "serde",
@@ -6349,6 +6370,8 @@ dependencies = [
 "strum_macros",
 "thiserror 1.0.69",
 "tokio",
+ "tokio-postgres",
+ "tokio-postgres-rustls",
 "tokio-util",
 "tracing",
 "utils",
@@ -6394,7 +6417,7 @@ dependencies = [
 "serde_json",
 "storage_controller_client",
 "tokio",
- "tokio-postgres 0.7.9",
+ "tokio-postgres",
 "tokio-postgres-rustls",
 "tokio-stream",
 "tokio-util",
@@ -6591,7 +6614,7 @@ dependencies = [
 "fastrand 2.2.0",
 "once_cell",
 "rustix",
- "windows-sys 0.52.0",
+ "windows-sys 0.59.0",
 ]

 [[package]]
@@ -6803,21 +6826,20 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"

 [[package]]
 name = "tokio"
-version = "1.38.1"
+version = "1.43.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "eb2caba9f80616f438e09748d5acda951967e1ea58508ef53d9c6402485a46df"
+checksum = "3d61fa4ffa3de412bfea335c6ecff681de2b609ba3c77ef3e00e521813a9ed9e"
 dependencies = [
 "backtrace",
 "bytes",
 "libc",
 "mio",
- "num_cpus",
 "parking_lot 0.12.1",
 "pin-project-lite",
 "signal-hook-registry",
 "socket2",
 "tokio-macros",
- "windows-sys 0.48.0",
+ "windows-sys 0.52.0",
 ]

 [[package]]
@@ -6848,9 +6870,9 @@ dependencies = [

 [[package]]
 name = "tokio-macros"
-version = "2.3.0"
+version = "2.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5f5ae998a069d4b5aba8ee9dad856af7d520c3699e6159b185c2acd48155d39a"
+checksum = "6e06d43f1345a3bcd39f6a56dbb7dcab2ba47e68e8ac134855e7e2bdbaf8cab8"
 dependencies = [
 "proc-macro2",
 "quote",
@@ -6859,8 +6881,8 @@ dependencies = [

 [[package]]
 name = "tokio-postgres"
-version = "0.7.9"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#8b44892f7851e705810b2cb54504325699966070"
+version = "0.7.10"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#1f21e7959a96a34dcfbfce1b14b73286cdadffe9"
 dependencies = [
 "async-trait",
 "byteorder",
@@ -6873,34 +6895,8 @@ dependencies = [
 "percent-encoding",
 "phf",
 "pin-project-lite",
- "postgres-protocol 0.6.6",
- "postgres-types 0.2.6",
- "rand 0.8.5",
- "socket2",
- "tokio",
- "tokio-util",
- "whoami",
-]
-
-[[package]]
-name = "tokio-postgres"
-version = "0.7.12"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3b5d3742945bc7d7f210693b0c58ae542c6fd47b17adbbda0885f3dcb34a6bdb"
-dependencies = [
- "async-trait",
- "byteorder",
- "bytes",
- "fallible-iterator",
- "futures-channel",
- "futures-util",
- "log",
- "parking_lot 0.12.1",
- "percent-encoding",
- "phf",
- "pin-project-lite",
- "postgres-protocol 0.6.7",
- "postgres-types 0.2.8",
+ "postgres-protocol",
+ "postgres-types",
 "rand 0.8.5",
 "socket2",
 "tokio",
@@ -6917,7 +6913,7 @@ dependencies = [
 "ring",
 "rustls 0.23.18",
 "tokio",
- "tokio-postgres 0.7.9",
+ "tokio-postgres",
 "tokio-rustls 0.26.0",
 "x509-certificate",
 ]
@@ -7161,7 +7157,7 @@ version = "0.6.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "403fa3b783d4b626a8ad51d766ab03cb6d2dbfc46b1c5d4448395e6628dc9697"
 dependencies = [
- "bitflags 2.4.1",
+ "bitflags 2.8.0",
 "bytes",
 "http 1.1.0",
 "http-body 1.0.0",
@@ -7595,7 +7591,7 @@ dependencies = [
 "serde_json",
 "sysinfo",
 "tokio",
- "tokio-postgres 0.7.9",
+ "tokio-postgres",
 "tokio-util",
 "tracing",
 "tracing-subscriber",
@@ -7658,9 +7654,9 @@ dependencies = [

 [[package]]
 name = "walkdir"
-version = "2.3.3"
+version = "2.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "36df944cda56c7d8d8b7496af378e6b16de9284591917d307c9b4d313c44e698"
+checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b"
 dependencies = [
 "same-file",
 "winapi-util",
@@ -7912,6 +7908,15 @@ dependencies = [
 "windows-targets 0.52.6",
 ]

+[[package]]
+name = "windows-sys"
+version = "0.59.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b"
+dependencies = [
+ "windows-targets 0.52.6",
+]
+
 [[package]]
 name = "windows-targets"
 version = "0.48.0"
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -77,10 +77,10 @@ camino = "1.1.6"
 cfg-if = "1.0.0"
 chrono = { version = "0.4", default-features = false, features = ["clock"] }
 clap = { version = "4.0", features = ["derive", "env"] }
+clashmap = { version = "1.0", features = ["raw-api"] }
 comfy-table = "7.1"
 const_format = "0.2"
 crc32c = "0.6"
-dashmap = { version = "5.5.0", features = ["raw-api"] }
 diatomic-waker = { version = "0.2.3" }
 either = "1.8"
 enum-map = "2.4.2"
@@ -123,7 +123,7 @@ measured = { version = "0.0.22", features=["lasso"] }
 measured-process = { version = "0.0.22" }
 memoffset = "0.9"
 nix = { version = "0.27", features = ["dir", "fs", "process", "socket", "signal", "poll"] }
-notify = "6.0.0"
+notify = "8.0.0"
 num_cpus = "1.15"
 num-traits = "0.2.15"
 once_cell = "1.13"
@@ -177,7 +177,7 @@ test-context = "0.3"
 thiserror = "1.0"
 tikv-jemallocator = { version = "0.6", features = ["profiling", "stats", "unprefixed_malloc_on_supported_platforms"] }
 tikv-jemalloc-ctl = { version = "0.6", features = ["stats"] }
-tokio = { version = "1.17", features = ["macros"] }
+tokio = { version = "1.41", features = ["macros"] }
 tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
 tokio-io-timeout = "1.2.0"
 tokio-postgres-rustls = "0.12.0"
--- a/2
+++ b/2
@@ -64,6 +64,7 @@ ARG DEFAULT_PG_VERSION
 WORKDIR /data

 RUN set -e \
+    && echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries \
    && apt update \
    && apt install -y \
        libreadline-dev \
@@ -72,6 +73,7 @@ RUN set -e \
 	# System postgres for use with client libraries (e.g. in storage controller)
        postgresql-15 \
        openssl \
+    && rm -f /etc/apt/apt.conf.d/80-retries \
    && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \
    && useradd -d /data neon \
    && chown -R neon:neon /data
--- a/build-tools.Dockerfile
+++ b/build-tools.Dockerfile
@@ -3,6 +3,10 @@ ARG DEBIAN_VERSION=bookworm
 FROM debian:bookworm-slim AS pgcopydb_builder
 ARG DEBIAN_VERSION

+RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \
+    printf '%s\n' 'retry_connrefused = on' 'timeout=15' 'tries=5' > /root/.wgetrc && \
+    printf '%s\n' '--retry-connrefused' '--connect-timeout 15' '--retry 5' '--max-time 300' > /root/.curlrc
+
 RUN if [ "${DEBIAN_VERSION}" = "bookworm" ]; then \
        set -e && \
        apt update && \
@@ -61,6 +65,10 @@ RUN mkdir -p /pgcopydb/bin && \
 COPY --from=pgcopydb_builder /usr/lib/postgresql/16/bin/pgcopydb /pgcopydb/bin/pgcopydb
 COPY --from=pgcopydb_builder /pgcopydb/lib/libpq.so.5 /pgcopydb/lib/libpq.so.5

+RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \
+    printf '%s\n' 'retry_connrefused = on' 'timeout=15' 'tries=5' > /root/.wgetrc && \
+    printf '%s\n' '--retry-connrefused' '--connect-timeout 15' '--retry 5' '--max-time 300' > /root/.curlrc
+
 # System deps
 #
 # 'gdb' is included so that we get backtraces of core dumps produced in
@@ -218,6 +226,8 @@ RUN wget -O /tmp/libicu-${ICU_VERSION}.tgz https://github.com/unicode-org/icu/re
 USER nonroot:nonroot
 WORKDIR /home/nonroot

+RUN printf '%s\n' '--retry-connrefused' '--connect-timeout 15' '--retry 5' '--max-time 300' > /home/nonroot/.curlrc
+
 # Python
 ENV PYTHON_VERSION=3.11.10 \
    PYENV_ROOT=/home/nonroot/.pyenv \
@@ -243,7 +253,7 @@ WORKDIR /home/nonroot

 # Rust
 # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
-ENV RUSTC_VERSION=1.84.0
+ENV RUSTC_VERSION=1.84.1
 ENV RUSTUP_HOME="/home/nonroot/.rustup"
 ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
 ARG RUSTFILT_VERSION=0.2.1
@@ -251,6 +261,7 @@ ARG CARGO_HAKARI_VERSION=0.9.33
 ARG CARGO_DENY_VERSION=0.16.2
 ARG CARGO_HACK_VERSION=0.6.33
 ARG CARGO_NEXTEST_VERSION=0.9.85
+ARG CARGO_DIESEL_CLI_VERSION=2.2.6
 RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \
 	chmod +x rustup-init && \
 	./rustup-init -y --default-toolchain ${RUSTC_VERSION} && \
@@ -264,6 +275,8 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux
    cargo install cargo-deny --locked --version ${CARGO_DENY_VERSION} && \
    cargo install cargo-hack          --version ${CARGO_HACK_VERSION} && \
    cargo install cargo-nextest       --version ${CARGO_NEXTEST_VERSION} && \
+    cargo install diesel_cli          --version ${CARGO_DIESEL_CLI_VERSION} \
+                                      --features postgres-bundled --no-default-features && \
    rm -rf /home/nonroot/.cargo/registry && \
    rm -rf /home/nonroot/.cargo/git

--- a/compute/compute-node.Dockerfile
+++ b/compute/compute-node.Dockerfile
@@ -18,6 +18,10 @@ ARG DEBIAN_VERSION
 # Use strict mode for bash to catch errors early
 SHELL ["/bin/bash", "-euo", "pipefail", "-c"]

+RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \
+    echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc && \
+    echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc
+
 RUN case $DEBIAN_VERSION in \
      # Version-specific installs for Bullseye (PG14-PG16):
      # The h3_pg extension needs a cmake 3.20+, but Debian bullseye has 3.18.
@@ -821,11 +825,11 @@ RUN case "${PG_VERSION}" in "v17") \

 #########################################################################################
 #
-# Layer "rust extensions"
-# This layer is used to build `pgrx` deps
+# Layer "pg build with nonroot user and cargo installed"
+# This layer is the common base for the layers that build with `pgrx`
 #
 #########################################################################################
-FROM pg-build AS rust-extensions-build
+FROM pg-build AS pg-build-nonroot-with-cargo
 ARG PG_VERSION

 RUN apt update && \
@@ -838,11 +842,23 @@ ENV PATH="/home/nonroot/.cargo/bin:$PATH"
 USER nonroot
 WORKDIR /home/nonroot

+RUN echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /home/nonroot/.curlrc
+
 RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \
    chmod +x rustup-init && \
    ./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \
-    rm rustup-init && \
-    case "${PG_VERSION}" in \
+    rm rustup-init
+
+#########################################################################################
+#
+# Layer "rust extensions"
+# This layer is used to build `pgrx` deps
+#
+#########################################################################################
+FROM pg-build-nonroot-with-cargo AS rust-extensions-build
+ARG PG_VERSION
+
+RUN case "${PG_VERSION}" in \
        'v17') \
            echo 'v17 is not supported yet by pgrx. Quit' && exit 0;; \
    esac && \
@@ -861,24 +877,10 @@ USER root
 # and eventually get merged with `rust-extensions-build`
 #
 #########################################################################################
-FROM pg-build AS rust-extensions-build-pgrx12
+FROM pg-build-nonroot-with-cargo AS rust-extensions-build-pgrx12
 ARG PG_VERSION

-RUN apt update && \
-    apt install --no-install-recommends --no-install-suggests -y curl libclang-dev && \
-    apt clean && rm -rf /var/lib/apt/lists/* && \
-    useradd -ms /bin/bash nonroot -b /home
-
-ENV HOME=/home/nonroot
-ENV PATH="/home/nonroot/.cargo/bin:$PATH"
-USER nonroot
-WORKDIR /home/nonroot
-
-RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \
-    chmod +x rustup-init && \
-    ./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \
-    rm rustup-init && \
-    cargo install --locked --version 0.12.9 cargo-pgrx && \
+RUN cargo install --locked --version 0.12.9 cargo-pgrx && \
    /bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config'

 USER root
@@ -1132,8 +1134,8 @@ RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.1.0.tar.gz
 FROM rust-extensions-build AS pg-mooncake-build
 ARG PG_VERSION

-RUN wget https://github.com/Mooncake-Labs/pg_mooncake/releases/download/v0.1.0/pg_mooncake-0.1.0.tar.gz -O pg_mooncake.tar.gz && \
-    echo "eafd059b77f541f11525eb8affcd66a176968cbd8fe7c0d436e733f2aa4da59f pg_mooncake.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/Mooncake-Labs/pg_mooncake/releases/download/v0.1.1/pg_mooncake-0.1.1.tar.gz -O pg_mooncake.tar.gz && \
+    echo "a2d16eff7948dde64f072609ca5d2962d6b4d07cb89d45952add473529c55f55 pg_mooncake.tar.gz" | sha256sum --check && \
    mkdir pg_mooncake-src && cd pg_mooncake-src && tar xzf ../pg_mooncake.tar.gz --strip-components=1 -C . && \
    make release -j $(getconf _NPROCESSORS_ONLN) && \
    make install -j $(getconf _NPROCESSORS_ONLN) && \
@@ -1243,6 +1245,7 @@ RUN mold -run cargo build --locked --profile release-line-debug-size-lto --bin c

 FROM debian:$DEBIAN_FLAVOR AS pgbouncer
 RUN set -e \
+    && echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries \
    && apt update \
    && apt install --no-install-suggests --no-install-recommends -y \
        build-essential \
@@ -1274,7 +1277,8 @@ FROM alpine/curl:${ALPINE_CURL_VERSION} AS exporters
 ARG TARGETARCH
 # Keep sql_exporter version same as in build-tools.Dockerfile and
 # test_runner/regress/test_compute_metrics.py
-RUN if [ "$TARGETARCH" = "amd64" ]; then\
+RUN echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc; \
+    if [ "$TARGETARCH" = "amd64" ]; then\
        postgres_exporter_sha256='027e75dda7af621237ff8f5ac66b78a40b0093595f06768612b92b1374bd3105';\
        pgbouncer_exporter_sha256='c9f7cf8dcff44f0472057e9bf52613d93f3ffbc381ad7547a959daa63c5e84ac';\
        sql_exporter_sha256='38e439732bbf6e28ca4a94d7bc3686d3fa1abdb0050773d5617a9efdb9e64d08';\
@@ -1336,6 +1340,7 @@ FROM neon-pg-ext-build AS neon-pg-ext-test
 ARG PG_VERSION
 RUN mkdir /ext-src

+COPY --from=pg-build /postgres /postgres
 #COPY --from=postgis-build /postgis.tar.gz /ext-src/
 #COPY --from=postgis-build /sfcgal/* /usr
 COPY --from=plv8-build /plv8.tar.gz /ext-src/
@@ -1444,6 +1449,8 @@ RUN mkdir /usr/local/download_extensions && chown -R postgres:postgres /usr/loca
 # libboost* for rdkit
 # ca-certificates for communicating with s3 by compute_ctl

+RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \
+    echo -e "retry_connrefused = on\ntimeout=15\ntries=5\n" > /root/.wgetrc

 RUN apt update && \
    case $DEBIAN_VERSION in \
@@ -1500,7 +1507,7 @@ RUN set -ex; \
    else \
        echo "Unsupported architecture: ${TARGETARCH}"; exit 1; \
    fi; \
-    curl -L "https://awscli.amazonaws.com/awscli-exe-linux-${TARGETARCH_ALT}-2.17.5.zip" -o /tmp/awscliv2.zip; \
+    curl --retry 5 -L "https://awscli.amazonaws.com/awscli-exe-linux-${TARGETARCH_ALT}-2.17.5.zip" -o /tmp/awscliv2.zip; \
    echo "${CHECKSUM}  /tmp/awscliv2.zip" | sha256sum -c -; \
    unzip /tmp/awscliv2.zip -d /tmp/awscliv2; \
    /tmp/awscliv2/aws/install; \
--- a/compute/patches/contrib_pg16.patch
+++ b/compute/patches/contrib_pg16.patch
@@ -0,0 +1,242 @@
+diff --git a/contrib/amcheck/expected/check_heap.out b/contrib/amcheck/expected/check_heap.out
+index 979e5e8..2375b45 100644
+--- a/contrib/amcheck/expected/check_heap.out
+++ b/contrib/amcheck/expected/check_heap.out
+@@ -80,12 +80,9 @@ INSERT INTO heaptest (a, b)
+ -- same transaction.  The heaptest table is smaller than the default
+ -- wal_skip_threshold, so a wal_level=minimal commit reads the table into
+ -- shared_buffers.  A transaction delays that and excludes any autovacuum.
+-SET allow_in_place_tablespaces = true;
+-CREATE TABLESPACE regress_test_stats_tblspc LOCATION '';
+ SELECT sum(reads) AS stats_bulkreads_before
+   FROM pg_stat_io WHERE context = 'bulkread' \gset
+ BEGIN;
+-ALTER TABLE heaptest SET TABLESPACE regress_test_stats_tblspc;
+ -- Check that valid options are not rejected nor corruption reported
+ -- for a non-empty table
+ SELECT * FROM verify_heapam(relation := 'heaptest', skip := 'none');
+@@ -118,14 +115,6 @@ SELECT pg_stat_force_next_flush();
+  
+ (1 row)
+ 
+-SELECT sum(reads) AS stats_bulkreads_after
+-  FROM pg_stat_io WHERE context = 'bulkread' \gset
+-SELECT :stats_bulkreads_after > :stats_bulkreads_before;
+- ?column? 
+-----------
+- t
+-(1 row)
+-
+ CREATE ROLE regress_heaptest_role;
+ -- verify permissions are checked (error due to function not callable)
+ SET ROLE regress_heaptest_role;
+@@ -233,7 +222,6 @@ ERROR:  cannot check relation "test_foreign_table"
+ DETAIL:  This operation is not supported for foreign tables.
+ -- cleanup
+ DROP TABLE heaptest;
+-DROP TABLESPACE regress_test_stats_tblspc;
+ DROP TABLE test_partition;
+ DROP TABLE test_partitioned;
+ DROP OWNED BY regress_heaptest_role; -- permissions
+diff --git a/contrib/amcheck/sql/check_heap.sql b/contrib/amcheck/sql/check_heap.sql
+index 1745bae..3b429c3 100644
+--- a/contrib/amcheck/sql/check_heap.sql
+++ b/contrib/amcheck/sql/check_heap.sql
+@@ -40,12 +40,9 @@ INSERT INTO heaptest (a, b)
+ -- same transaction.  The heaptest table is smaller than the default
+ -- wal_skip_threshold, so a wal_level=minimal commit reads the table into
+ -- shared_buffers.  A transaction delays that and excludes any autovacuum.
+-SET allow_in_place_tablespaces = true;
+-CREATE TABLESPACE regress_test_stats_tblspc LOCATION '';
+ SELECT sum(reads) AS stats_bulkreads_before
+   FROM pg_stat_io WHERE context = 'bulkread' \gset
+ BEGIN;
+-ALTER TABLE heaptest SET TABLESPACE regress_test_stats_tblspc;
+ -- Check that valid options are not rejected nor corruption reported
+ -- for a non-empty table
+ SELECT * FROM verify_heapam(relation := 'heaptest', skip := 'none');
+@@ -58,9 +55,6 @@ COMMIT;
+ --   ALTER TABLE ... SET TABLESPACE ...
+ -- causing an additional bulkread, which should be reflected in pg_stat_io.
+ SELECT pg_stat_force_next_flush();
+-SELECT sum(reads) AS stats_bulkreads_after
+-  FROM pg_stat_io WHERE context = 'bulkread' \gset
+-SELECT :stats_bulkreads_after > :stats_bulkreads_before;
+ 
+ CREATE ROLE regress_heaptest_role;
+ 
+@@ -140,7 +134,6 @@ SELECT * FROM verify_heapam('test_foreign_table',
+ 
+ -- cleanup
+ DROP TABLE heaptest;
+-DROP TABLESPACE regress_test_stats_tblspc;
+ DROP TABLE test_partition;
+ DROP TABLE test_partitioned;
+ DROP OWNED BY regress_heaptest_role; -- permissions
+diff --git a/contrib/citext/expected/create_index_acl.out b/contrib/citext/expected/create_index_acl.out
+index 33be13a..70a406c 100644
+--- a/contrib/citext/expected/create_index_acl.out
+++ b/contrib/citext/expected/create_index_acl.out
+@@ -5,9 +5,6 @@
+ -- owner having as few applicable privileges as possible.  (The privileges.sql
+ -- regress_sro_user tests look for the opposite defect; they confirm that
+ -- DefineIndex() uses the table owner userid where necessary.)
+-SET allow_in_place_tablespaces = true;
+-CREATE TABLESPACE regress_create_idx_tblspace LOCATION '';
+-RESET allow_in_place_tablespaces;
+ BEGIN;
+ CREATE ROLE regress_minimal;
+ CREATE SCHEMA s;
+@@ -49,11 +46,9 @@ ALTER TABLE s.x OWNER TO regress_minimal;
+ -- Empty-table DefineIndex()
+ CREATE UNIQUE INDEX u0rows ON s.x USING btree
+   ((s.index_this_expr(y, s.const())) COLLATE s.coll s.citext_pattern_ops)
+-  TABLESPACE regress_create_idx_tblspace
+   WHERE s.index_row_if(y);
+ ALTER TABLE s.x ADD CONSTRAINT e0rows EXCLUDE USING btree
+   ((s.index_this_expr(y, s.const())) COLLATE s.coll WITH s.=)
+-  USING INDEX TABLESPACE regress_create_idx_tblspace
+   WHERE (s.index_row_if(y));
+ -- Make the table nonempty.
+ INSERT INTO s.x VALUES ('foo'), ('bar');
+@@ -66,11 +61,9 @@ RESET search_path;
+ GRANT EXECUTE ON FUNCTION s.index_this_expr TO regress_minimal;
+ CREATE UNIQUE INDEX u2rows ON s.x USING btree
+   ((s.index_this_expr(y, s.const())) COLLATE s.coll s.citext_pattern_ops)
+-  TABLESPACE regress_create_idx_tblspace
+   WHERE s.index_row_if(y);
+ ALTER TABLE s.x ADD CONSTRAINT e2rows EXCLUDE USING btree
+   ((s.index_this_expr(y, s.const())) COLLATE s.coll WITH s.=)
+-  USING INDEX TABLESPACE regress_create_idx_tblspace
+   WHERE (s.index_row_if(y));
+ -- Shall not find s.coll via search_path, despite the s.const->public.setter
+ -- call having set search_path=s during expression planning.  Suppress the
+@@ -78,9 +71,7 @@ ALTER TABLE s.x ADD CONSTRAINT e2rows EXCLUDE USING btree
+ \set VERBOSITY sqlstate
+ ALTER TABLE s.x ADD CONSTRAINT underqualified EXCLUDE USING btree
+   ((s.index_this_expr(y, s.const())) COLLATE coll WITH s.=)
+-  USING INDEX TABLESPACE regress_create_idx_tblspace
+   WHERE (s.index_row_if(y));
+ ERROR:  42704
+ \set VERBOSITY default
+ ROLLBACK;
+-DROP TABLESPACE regress_create_idx_tblspace;
+diff --git a/contrib/citext/sql/create_index_acl.sql b/contrib/citext/sql/create_index_acl.sql
+index 10b5225..ae442e1 100644
+--- a/contrib/citext/sql/create_index_acl.sql
+++ b/contrib/citext/sql/create_index_acl.sql
+@@ -6,10 +6,6 @@
+ -- regress_sro_user tests look for the opposite defect; they confirm that
+ -- DefineIndex() uses the table owner userid where necessary.)
+ 
+-SET allow_in_place_tablespaces = true;
+-CREATE TABLESPACE regress_create_idx_tblspace LOCATION '';
+-RESET allow_in_place_tablespaces;
+-
+ BEGIN;
+ CREATE ROLE regress_minimal;
+ CREATE SCHEMA s;
+@@ -51,11 +47,9 @@ ALTER TABLE s.x OWNER TO regress_minimal;
+ -- Empty-table DefineIndex()
+ CREATE UNIQUE INDEX u0rows ON s.x USING btree
+   ((s.index_this_expr(y, s.const())) COLLATE s.coll s.citext_pattern_ops)
+-  TABLESPACE regress_create_idx_tblspace
+   WHERE s.index_row_if(y);
+ ALTER TABLE s.x ADD CONSTRAINT e0rows EXCLUDE USING btree
+   ((s.index_this_expr(y, s.const())) COLLATE s.coll WITH s.=)
+-  USING INDEX TABLESPACE regress_create_idx_tblspace
+   WHERE (s.index_row_if(y));
+ -- Make the table nonempty.
+ INSERT INTO s.x VALUES ('foo'), ('bar');
+@@ -68,11 +62,9 @@ RESET search_path;
+ GRANT EXECUTE ON FUNCTION s.index_this_expr TO regress_minimal;
+ CREATE UNIQUE INDEX u2rows ON s.x USING btree
+   ((s.index_this_expr(y, s.const())) COLLATE s.coll s.citext_pattern_ops)
+-  TABLESPACE regress_create_idx_tblspace
+   WHERE s.index_row_if(y);
+ ALTER TABLE s.x ADD CONSTRAINT e2rows EXCLUDE USING btree
+   ((s.index_this_expr(y, s.const())) COLLATE s.coll WITH s.=)
+-  USING INDEX TABLESPACE regress_create_idx_tblspace
+   WHERE (s.index_row_if(y));
+ -- Shall not find s.coll via search_path, despite the s.const->public.setter
+ -- call having set search_path=s during expression planning.  Suppress the
+@@ -80,9 +72,7 @@ ALTER TABLE s.x ADD CONSTRAINT e2rows EXCLUDE USING btree
+ \set VERBOSITY sqlstate
+ ALTER TABLE s.x ADD CONSTRAINT underqualified EXCLUDE USING btree
+   ((s.index_this_expr(y, s.const())) COLLATE coll WITH s.=)
+-  USING INDEX TABLESPACE regress_create_idx_tblspace
+   WHERE (s.index_row_if(y));
+ \set VERBOSITY default
+ ROLLBACK;
+ 
+-DROP TABLESPACE regress_create_idx_tblspace;
+diff --git a/contrib/file_fdw/expected/file_fdw.out b/contrib/file_fdw/expected/file_fdw.out
+index 72304e0..ebe131b 100644
+--- a/contrib/file_fdw/expected/file_fdw.out
+++ b/contrib/file_fdw/expected/file_fdw.out
+@@ -4,6 +4,7 @@
+ -- directory paths are passed to us in environment variables
+ \getenv abs_srcdir PG_ABS_SRCDIR
+ -- Clean up in case a prior regression run failed
++SET compute_query_id TO 'off';
+ SET client_min_messages TO 'warning';
+ DROP ROLE IF EXISTS regress_file_fdw_superuser, regress_file_fdw_user, regress_no_priv_user;
+ RESET client_min_messages;
+diff --git a/contrib/file_fdw/sql/file_fdw.sql b/contrib/file_fdw/sql/file_fdw.sql
+index f0548e1..848a08c 100644
+--- a/contrib/file_fdw/sql/file_fdw.sql
+++ b/contrib/file_fdw/sql/file_fdw.sql
+@@ -6,6 +6,7 @@
+ \getenv abs_srcdir PG_ABS_SRCDIR
+ 
+ -- Clean up in case a prior regression run failed
++SET compute_query_id TO 'off';
+ SET client_min_messages TO 'warning';
+ DROP ROLE IF EXISTS regress_file_fdw_superuser, regress_file_fdw_user, regress_no_priv_user;
+ RESET client_min_messages;
+diff --git a/contrib/pageinspect/expected/gist.out b/contrib/pageinspect/expected/gist.out
+index d1adbab..38b52ac 100644
+--- a/contrib/pageinspect/expected/gist.out
+++ b/contrib/pageinspect/expected/gist.out
+@@ -10,25 +10,6 @@ BEGIN;
+ CREATE TABLE test_gist AS SELECT point(i,i) p, i::text t FROM
+     generate_series(1,1000) i;
+ CREATE INDEX test_gist_idx ON test_gist USING gist (p);
+--- Page 0 is the root, the rest are leaf pages
+-SELECT * FROM gist_page_opaque_info(get_raw_page('test_gist_idx', 0));
+- lsn | nsn | rightlink  | flags 
+------+-----+------------+-------
+- 0/1 | 0/0 | 4294967295 | {}
+-(1 row)
+-
+-SELECT * FROM gist_page_opaque_info(get_raw_page('test_gist_idx', 1));
+- lsn | nsn | rightlink  | flags  
+------+-----+------------+--------
+- 0/1 | 0/0 | 4294967295 | {leaf}
+-(1 row)
+-
+-SELECT * FROM gist_page_opaque_info(get_raw_page('test_gist_idx', 2));
+- lsn | nsn | rightlink | flags  
+------+-----+-----------+--------
+- 0/1 | 0/0 |         1 | {leaf}
+-(1 row)
+-
+ COMMIT;
+ SELECT * FROM gist_page_items(get_raw_page('test_gist_idx', 0), 'test_gist_idx');
+  itemoffset |   ctid    | itemlen | dead |             keys              
+diff --git a/contrib/pageinspect/sql/gist.sql b/contrib/pageinspect/sql/gist.sql
+index d263542..607992f 100644
+--- a/contrib/pageinspect/sql/gist.sql
+++ b/contrib/pageinspect/sql/gist.sql
+@@ -12,11 +12,6 @@ CREATE TABLE test_gist AS SELECT point(i,i) p, i::text t FROM
+     generate_series(1,1000) i;
+ CREATE INDEX test_gist_idx ON test_gist USING gist (p);
+ 
+--- Page 0 is the root, the rest are leaf pages
+-SELECT * FROM gist_page_opaque_info(get_raw_page('test_gist_idx', 0));
+-SELECT * FROM gist_page_opaque_info(get_raw_page('test_gist_idx', 1));
+-SELECT * FROM gist_page_opaque_info(get_raw_page('test_gist_idx', 2));
+-
+ COMMIT;
+ 
+ SELECT * FROM gist_page_items(get_raw_page('test_gist_idx', 0), 'test_gist_idx');
--- a/compute/patches/contrib_pg17.patch
+++ b/compute/patches/contrib_pg17.patch
@@ -0,0 +1,196 @@
+diff --git a/contrib/amcheck/expected/check_heap.out b/contrib/amcheck/expected/check_heap.out
+index 979e5e8..2375b45 100644
+--- a/contrib/amcheck/expected/check_heap.out
+++ b/contrib/amcheck/expected/check_heap.out
+@@ -80,12 +80,9 @@ INSERT INTO heaptest (a, b)
+ -- same transaction.  The heaptest table is smaller than the default
+ -- wal_skip_threshold, so a wal_level=minimal commit reads the table into
+ -- shared_buffers.  A transaction delays that and excludes any autovacuum.
+-SET allow_in_place_tablespaces = true;
+-CREATE TABLESPACE regress_test_stats_tblspc LOCATION '';
+ SELECT sum(reads) AS stats_bulkreads_before
+   FROM pg_stat_io WHERE context = 'bulkread' \gset
+ BEGIN;
+-ALTER TABLE heaptest SET TABLESPACE regress_test_stats_tblspc;
+ -- Check that valid options are not rejected nor corruption reported
+ -- for a non-empty table
+ SELECT * FROM verify_heapam(relation := 'heaptest', skip := 'none');
+@@ -118,14 +115,6 @@ SELECT pg_stat_force_next_flush();
+  
+ (1 row)
+ 
+-SELECT sum(reads) AS stats_bulkreads_after
+-  FROM pg_stat_io WHERE context = 'bulkread' \gset
+-SELECT :stats_bulkreads_after > :stats_bulkreads_before;
+- ?column? 
+-----------
+- t
+-(1 row)
+-
+ CREATE ROLE regress_heaptest_role;
+ -- verify permissions are checked (error due to function not callable)
+ SET ROLE regress_heaptest_role;
+@@ -233,7 +222,6 @@ ERROR:  cannot check relation "test_foreign_table"
+ DETAIL:  This operation is not supported for foreign tables.
+ -- cleanup
+ DROP TABLE heaptest;
+-DROP TABLESPACE regress_test_stats_tblspc;
+ DROP TABLE test_partition;
+ DROP TABLE test_partitioned;
+ DROP OWNED BY regress_heaptest_role; -- permissions
+diff --git a/contrib/amcheck/sql/check_heap.sql b/contrib/amcheck/sql/check_heap.sql
+index 1745bae..3b429c3 100644
+--- a/contrib/amcheck/sql/check_heap.sql
+++ b/contrib/amcheck/sql/check_heap.sql
+@@ -40,12 +40,9 @@ INSERT INTO heaptest (a, b)
+ -- same transaction.  The heaptest table is smaller than the default
+ -- wal_skip_threshold, so a wal_level=minimal commit reads the table into
+ -- shared_buffers.  A transaction delays that and excludes any autovacuum.
+-SET allow_in_place_tablespaces = true;
+-CREATE TABLESPACE regress_test_stats_tblspc LOCATION '';
+ SELECT sum(reads) AS stats_bulkreads_before
+   FROM pg_stat_io WHERE context = 'bulkread' \gset
+ BEGIN;
+-ALTER TABLE heaptest SET TABLESPACE regress_test_stats_tblspc;
+ -- Check that valid options are not rejected nor corruption reported
+ -- for a non-empty table
+ SELECT * FROM verify_heapam(relation := 'heaptest', skip := 'none');
+@@ -58,9 +55,6 @@ COMMIT;
+ --   ALTER TABLE ... SET TABLESPACE ...
+ -- causing an additional bulkread, which should be reflected in pg_stat_io.
+ SELECT pg_stat_force_next_flush();
+-SELECT sum(reads) AS stats_bulkreads_after
+-  FROM pg_stat_io WHERE context = 'bulkread' \gset
+-SELECT :stats_bulkreads_after > :stats_bulkreads_before;
+ 
+ CREATE ROLE regress_heaptest_role;
+ 
+@@ -140,7 +134,6 @@ SELECT * FROM verify_heapam('test_foreign_table',
+ 
+ -- cleanup
+ DROP TABLE heaptest;
+-DROP TABLESPACE regress_test_stats_tblspc;
+ DROP TABLE test_partition;
+ DROP TABLE test_partitioned;
+ DROP OWNED BY regress_heaptest_role; -- permissions
+diff --git a/contrib/citext/expected/create_index_acl.out b/contrib/citext/expected/create_index_acl.out
+index 33be13a..70a406c 100644
+--- a/contrib/citext/expected/create_index_acl.out
+++ b/contrib/citext/expected/create_index_acl.out
+@@ -5,9 +5,6 @@
+ -- owner having as few applicable privileges as possible.  (The privileges.sql
+ -- regress_sro_user tests look for the opposite defect; they confirm that
+ -- DefineIndex() uses the table owner userid where necessary.)
+-SET allow_in_place_tablespaces = true;
+-CREATE TABLESPACE regress_create_idx_tblspace LOCATION '';
+-RESET allow_in_place_tablespaces;
+ BEGIN;
+ CREATE ROLE regress_minimal;
+ CREATE SCHEMA s;
+@@ -49,11 +46,9 @@ ALTER TABLE s.x OWNER TO regress_minimal;
+ -- Empty-table DefineIndex()
+ CREATE UNIQUE INDEX u0rows ON s.x USING btree
+   ((s.index_this_expr(y, s.const())) COLLATE s.coll s.citext_pattern_ops)
+-  TABLESPACE regress_create_idx_tblspace
+   WHERE s.index_row_if(y);
+ ALTER TABLE s.x ADD CONSTRAINT e0rows EXCLUDE USING btree
+   ((s.index_this_expr(y, s.const())) COLLATE s.coll WITH s.=)
+-  USING INDEX TABLESPACE regress_create_idx_tblspace
+   WHERE (s.index_row_if(y));
+ -- Make the table nonempty.
+ INSERT INTO s.x VALUES ('foo'), ('bar');
+@@ -66,11 +61,9 @@ RESET search_path;
+ GRANT EXECUTE ON FUNCTION s.index_this_expr TO regress_minimal;
+ CREATE UNIQUE INDEX u2rows ON s.x USING btree
+   ((s.index_this_expr(y, s.const())) COLLATE s.coll s.citext_pattern_ops)
+-  TABLESPACE regress_create_idx_tblspace
+   WHERE s.index_row_if(y);
+ ALTER TABLE s.x ADD CONSTRAINT e2rows EXCLUDE USING btree
+   ((s.index_this_expr(y, s.const())) COLLATE s.coll WITH s.=)
+-  USING INDEX TABLESPACE regress_create_idx_tblspace
+   WHERE (s.index_row_if(y));
+ -- Shall not find s.coll via search_path, despite the s.const->public.setter
+ -- call having set search_path=s during expression planning.  Suppress the
+@@ -78,9 +71,7 @@ ALTER TABLE s.x ADD CONSTRAINT e2rows EXCLUDE USING btree
+ \set VERBOSITY sqlstate
+ ALTER TABLE s.x ADD CONSTRAINT underqualified EXCLUDE USING btree
+   ((s.index_this_expr(y, s.const())) COLLATE coll WITH s.=)
+-  USING INDEX TABLESPACE regress_create_idx_tblspace
+   WHERE (s.index_row_if(y));
+ ERROR:  42704
+ \set VERBOSITY default
+ ROLLBACK;
+-DROP TABLESPACE regress_create_idx_tblspace;
+diff --git a/contrib/citext/sql/create_index_acl.sql b/contrib/citext/sql/create_index_acl.sql
+index 10b5225..ae442e1 100644
+--- a/contrib/citext/sql/create_index_acl.sql
+++ b/contrib/citext/sql/create_index_acl.sql
+@@ -6,10 +6,6 @@
+ -- regress_sro_user tests look for the opposite defect; they confirm that
+ -- DefineIndex() uses the table owner userid where necessary.)
+ 
+-SET allow_in_place_tablespaces = true;
+-CREATE TABLESPACE regress_create_idx_tblspace LOCATION '';
+-RESET allow_in_place_tablespaces;
+-
+ BEGIN;
+ CREATE ROLE regress_minimal;
+ CREATE SCHEMA s;
+@@ -51,11 +47,9 @@ ALTER TABLE s.x OWNER TO regress_minimal;
+ -- Empty-table DefineIndex()
+ CREATE UNIQUE INDEX u0rows ON s.x USING btree
+   ((s.index_this_expr(y, s.const())) COLLATE s.coll s.citext_pattern_ops)
+-  TABLESPACE regress_create_idx_tblspace
+   WHERE s.index_row_if(y);
+ ALTER TABLE s.x ADD CONSTRAINT e0rows EXCLUDE USING btree
+   ((s.index_this_expr(y, s.const())) COLLATE s.coll WITH s.=)
+-  USING INDEX TABLESPACE regress_create_idx_tblspace
+   WHERE (s.index_row_if(y));
+ -- Make the table nonempty.
+ INSERT INTO s.x VALUES ('foo'), ('bar');
+@@ -68,11 +62,9 @@ RESET search_path;
+ GRANT EXECUTE ON FUNCTION s.index_this_expr TO regress_minimal;
+ CREATE UNIQUE INDEX u2rows ON s.x USING btree
+   ((s.index_this_expr(y, s.const())) COLLATE s.coll s.citext_pattern_ops)
+-  TABLESPACE regress_create_idx_tblspace
+   WHERE s.index_row_if(y);
+ ALTER TABLE s.x ADD CONSTRAINT e2rows EXCLUDE USING btree
+   ((s.index_this_expr(y, s.const())) COLLATE s.coll WITH s.=)
+-  USING INDEX TABLESPACE regress_create_idx_tblspace
+   WHERE (s.index_row_if(y));
+ -- Shall not find s.coll via search_path, despite the s.const->public.setter
+ -- call having set search_path=s during expression planning.  Suppress the
+@@ -80,9 +72,7 @@ ALTER TABLE s.x ADD CONSTRAINT e2rows EXCLUDE USING btree
+ \set VERBOSITY sqlstate
+ ALTER TABLE s.x ADD CONSTRAINT underqualified EXCLUDE USING btree
+   ((s.index_this_expr(y, s.const())) COLLATE coll WITH s.=)
+-  USING INDEX TABLESPACE regress_create_idx_tblspace
+   WHERE (s.index_row_if(y));
+ \set VERBOSITY default
+ ROLLBACK;
+ 
+-DROP TABLESPACE regress_create_idx_tblspace;
+diff --git a/contrib/file_fdw/expected/file_fdw.out b/contrib/file_fdw/expected/file_fdw.out
+index 86c148a..81bdb2c 100644
+--- a/contrib/file_fdw/expected/file_fdw.out
+++ b/contrib/file_fdw/expected/file_fdw.out
+@@ -4,6 +4,7 @@
+ -- directory paths are passed to us in environment variables
+ \getenv abs_srcdir PG_ABS_SRCDIR
+ -- Clean up in case a prior regression run failed
++SET compute_query_id TO 'off';
+ SET client_min_messages TO 'warning';
+ DROP ROLE IF EXISTS regress_file_fdw_superuser, regress_file_fdw_user, regress_no_priv_user;
+ RESET client_min_messages;
+diff --git a/contrib/file_fdw/sql/file_fdw.sql b/contrib/file_fdw/sql/file_fdw.sql
+index f0548e1..848a08c 100644
+--- a/contrib/file_fdw/sql/file_fdw.sql
+++ b/contrib/file_fdw/sql/file_fdw.sql
+@@ -6,6 +6,7 @@
+ \getenv abs_srcdir PG_ABS_SRCDIR
+ 
+ -- Clean up in case a prior regression run failed
++SET compute_query_id TO 'off';
+ SET client_min_messages TO 'warning';
+ DROP ROLE IF EXISTS regress_file_fdw_superuser, regress_file_fdw_user, regress_no_priv_user;
+ RESET client_min_messages;
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -34,6 +34,7 @@
 //!             -r http://pg-ext-s3-gateway \
 //! ```
 use std::collections::HashMap;
+use std::ffi::OsString;
 use std::fs::File;
 use std::path::Path;
 use std::process::exit;
@@ -44,7 +45,7 @@ use std::{thread, time::Duration};

 use anyhow::{Context, Result};
 use chrono::Utc;
-use clap::Arg;
+use clap::Parser;
 use compute_tools::disk_quota::set_disk_quota;
 use compute_tools::lsn_lease::launch_lsn_lease_bg_task_for_static;
 use signal_hook::consts::{SIGQUIT, SIGTERM};
@@ -73,10 +74,75 @@ use utils::failpoint_support;
 // in-case of not-set environment var
 const BUILD_TAG_DEFAULT: &str = "latest";

-fn main() -> Result<()> {
-    let scenario = failpoint_support::init();
+// Compatibility hack: if the control plane passed a remote-ext-config value that
+// does not start with "http", use the default extension storage proxy gateway.
+// Remove this once the control plane is updated to pass the gateway URL.
+fn parse_remote_ext_config(arg: &str) -> Result<String> {
+    if arg.starts_with("http") {
+        Ok(arg.trim_end_matches('/').to_string())
+    } else {
+        Ok("http://pg-ext-s3-gateway".to_string())
+    }
+}

-    let (build_tag, clap_args) = init()?;
+#[derive(Parser)]
+#[command(rename_all = "kebab-case")]
+struct Cli {
+    #[arg(short = 'b', long, default_value = "postgres", env = "POSTGRES_PATH")]
+    pub pgbin: String,
+
+    #[arg(short = 'r', long, value_parser = parse_remote_ext_config)]
+    pub remote_ext_config: Option<String>,
+
+    #[arg(long, default_value_t = 3080)]
+    pub http_port: u16,
+
+    #[arg(short = 'D', long, value_name = "DATADIR")]
+    pub pgdata: String,
+
+    #[arg(short = 'C', long, value_name = "DATABASE_URL")]
+    pub connstr: String,
+
+    #[cfg(target_os = "linux")]
+    #[arg(long, default_value = "neon-postgres")]
+    pub cgroup: String,
+
+    #[cfg(target_os = "linux")]
+    #[arg(
+        long,
+        default_value = "host=localhost port=5432 dbname=postgres user=cloud_admin sslmode=disable application_name=vm-monitor"
+    )]
+    pub filecache_connstr: String,
+
+    #[cfg(target_os = "linux")]
+    #[arg(long, default_value = "0.0.0.0:10301")]
+    pub vm_monitor_addr: String,
+
+    #[arg(long, action = clap::ArgAction::SetTrue)]
+    pub resize_swap_on_bind: bool,
+
+    #[arg(long)]
+    pub set_disk_quota_for_fs: Option<String>,
+
+    #[arg(short = 's', long = "spec", group = "spec")]
+    pub spec_json: Option<String>,
+
+    #[arg(short = 'S', long, group = "spec-path")]
+    pub spec_path: Option<OsString>,
+
+    #[arg(short = 'i', long, group = "compute-id", conflicts_with_all = ["spec", "spec-path"])]
+    pub compute_id: Option<String>,
+
+    #[arg(short = 'p', long, conflicts_with_all = ["spec", "spec-path"], requires = "compute-id", value_name = "CONTROL_PLANE_API_BASE_URL")]
+    pub control_plane_uri: Option<String>,
+}
+
+fn main() -> Result<()> {
+    let cli = Cli::parse();
+
+    let build_tag = init()?;
+
+    let scenario = failpoint_support::init();

    // enable core dumping for all child processes
    setrlimit(Resource::CORE, rlimit::INFINITY, rlimit::INFINITY)?;
@@ -85,13 +151,11 @@ fn main() -> Result<()> {
        // Enter startup tracing context
        let _startup_context_guard = startup_context_from_env();

-        let cli_args = process_cli(&clap_args)?;
+        let cli_spec = try_spec_from_cli(&cli)?;

-        let cli_spec = try_spec_from_cli(&clap_args, &cli_args)?;
+        let compute = wait_spec(build_tag, &cli, cli_spec)?;

-        let wait_spec_result = wait_spec(build_tag, cli_args, cli_spec)?;
-
-        start_postgres(&clap_args, wait_spec_result)?
+        start_postgres(&cli, compute)?

        // Startup is finished, exit the startup tracing span
    };
@@ -108,7 +172,7 @@ fn main() -> Result<()> {
    deinit_and_exit(wait_pg_result);
 }

-fn init() -> Result<(String, clap::ArgMatches)> {
+fn init() -> Result<String> {
    init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;

    let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?;
@@ -123,66 +187,7 @@ fn init() -> Result<(String, clap::ArgMatches)> {
        .to_string();
    info!("build_tag: {build_tag}");

-    Ok((build_tag, cli().get_matches()))
-}
-
-fn process_cli(matches: &clap::ArgMatches) -> Result<ProcessCliResult> {
-    let pgbin_default = "postgres";
-    let pgbin = matches
-        .get_one::<String>("pgbin")
-        .map(|s| s.as_str())
-        .unwrap_or(pgbin_default);
-
-    let ext_remote_storage = matches
-        .get_one::<String>("remote-ext-config")
-        // Compatibility hack: if the control plane specified any remote-ext-config
-        // use the default value for extension storage proxy gateway.
-        // Remove this once the control plane is updated to pass the gateway URL
-        .map(|conf| {
-            if conf.starts_with("http") {
-                conf.trim_end_matches('/')
-            } else {
-                "http://pg-ext-s3-gateway"
-            }
-        });
-
-    let http_port = *matches
-        .get_one::<u16>("http-port")
-        .expect("http-port is required");
-    let pgdata = matches
-        .get_one::<String>("pgdata")
-        .expect("PGDATA path is required");
-    let connstr = matches
-        .get_one::<String>("connstr")
-        .expect("Postgres connection string is required");
-    let spec_json = matches.get_one::<String>("spec");
-    let spec_path = matches.get_one::<String>("spec-path");
-    let resize_swap_on_bind = matches.get_flag("resize-swap-on-bind");
-    let set_disk_quota_for_fs = matches.get_one::<String>("set-disk-quota-for-fs");
-
-    Ok(ProcessCliResult {
-        connstr,
-        pgdata,
-        pgbin,
-        ext_remote_storage,
-        http_port,
-        spec_json,
-        spec_path,
-        resize_swap_on_bind,
-        set_disk_quota_for_fs,
-    })
-}
-
-struct ProcessCliResult<'clap> {
-    connstr: &'clap str,
-    pgdata: &'clap str,
-    pgbin: &'clap str,
-    ext_remote_storage: Option<&'clap str>,
-    http_port: u16,
-    spec_json: Option<&'clap String>,
-    spec_path: Option<&'clap String>,
-    resize_swap_on_bind: bool,
-    set_disk_quota_for_fs: Option<&'clap String>,
+    Ok(build_tag)
 }

 fn startup_context_from_env() -> Option<opentelemetry::ContextGuard> {
@@ -235,19 +240,9 @@ fn startup_context_from_env() -> Option<opentelemetry::ContextGuard> {
    }
 }

-fn try_spec_from_cli(
-    matches: &clap::ArgMatches,
-    ProcessCliResult {
-        spec_json,
-        spec_path,
-        ..
-    }: &ProcessCliResult,
-) -> Result<CliSpecParams> {
-    let compute_id = matches.get_one::<String>("compute-id");
-    let control_plane_uri = matches.get_one::<String>("control-plane-uri");
-
+fn try_spec_from_cli(cli: &Cli) -> Result<CliSpecParams> {
    // First, try to get cluster spec from the cli argument
-    if let Some(spec_json) = spec_json {
+    if let Some(ref spec_json) = cli.spec_json {
        info!("got spec from cli argument {}", spec_json);
        return Ok(CliSpecParams {
            spec: Some(serde_json::from_str(spec_json)?),
@@ -256,7 +251,7 @@ fn try_spec_from_cli(
    }

    // Second, try to read it from the file if path is provided
-    if let Some(spec_path) = spec_path {
+    if let Some(ref spec_path) = cli.spec_path {
        let file = File::open(Path::new(spec_path))?;
        return Ok(CliSpecParams {
            spec: Some(serde_json::from_reader(file)?),
@@ -264,17 +259,20 @@ fn try_spec_from_cli(
        });
    }

-    let Some(compute_id) = compute_id else {
+    if cli.compute_id.is_none() {
        panic!(
            "compute spec should be provided by one of the following ways: \
                --spec OR --spec-path OR --control-plane-uri and --compute-id"
        );
    };
-    let Some(control_plane_uri) = control_plane_uri else {
+    if cli.control_plane_uri.is_none() {
        panic!("must specify both --control-plane-uri and --compute-id or none");
    };

-    match get_spec_from_control_plane(control_plane_uri, compute_id) {
+    match get_spec_from_control_plane(
+        cli.control_plane_uri.as_ref().unwrap(),
+        cli.compute_id.as_ref().unwrap(),
+    ) {
        Ok(spec) => Ok(CliSpecParams {
            spec,
            live_config_allowed: true,
@@ -298,21 +296,12 @@ struct CliSpecParams {

 fn wait_spec(
    build_tag: String,
-    ProcessCliResult {
-        connstr,
-        pgdata,
-        pgbin,
-        ext_remote_storage,
-        resize_swap_on_bind,
-        set_disk_quota_for_fs,
-        http_port,
-        ..
-    }: ProcessCliResult,
+    cli: &Cli,
    CliSpecParams {
        spec,
        live_config_allowed,
    }: CliSpecParams,
-) -> Result<WaitSpecResult> {
+) -> Result<Arc<ComputeNode>> {
    let mut new_state = ComputeState::new();
    let spec_set;

@@ -324,7 +313,7 @@ fn wait_spec(
    } else {
        spec_set = false;
    }
-    let connstr = Url::parse(connstr).context("cannot parse connstr as a URL")?;
+    let connstr = Url::parse(&cli.connstr).context("cannot parse connstr as a URL")?;
    let conn_conf = postgres::config::Config::from_str(connstr.as_str())
        .context("cannot build postgres config from connstr")?;
    let tokio_conn_conf = tokio_postgres::config::Config::from_str(connstr.as_str())
@@ -333,14 +322,14 @@ fn wait_spec(
        connstr,
        conn_conf,
        tokio_conn_conf,
-        pgdata: pgdata.to_string(),
-        pgbin: pgbin.to_string(),
-        pgversion: get_pg_version_string(pgbin),
-        http_port,
+        pgdata: cli.pgdata.clone(),
+        pgbin: cli.pgbin.clone(),
+        pgversion: get_pg_version_string(&cli.pgbin),
+        http_port: cli.http_port,
        live_config_allowed,
        state: Mutex::new(new_state),
        state_changed: Condvar::new(),
-        ext_remote_storage: ext_remote_storage.map(|s| s.to_string()),
+        ext_remote_storage: cli.remote_ext_config.clone(),
        ext_download_progress: RwLock::new(HashMap::new()),
        build_tag,
    };
@@ -357,7 +346,7 @@ fn wait_spec(
    // Launch http service first, so that we can serve control-plane requests
    // while configuration is still in progress.
    let _http_handle =
-        launch_http_server(http_port, &compute).expect("cannot launch http endpoint thread");
+        launch_http_server(cli.http_port, &compute).expect("cannot launch http endpoint thread");

    if !spec_set {
        // No spec provided, hang waiting for it.
@@ -389,27 +378,12 @@ fn wait_spec(

    launch_lsn_lease_bg_task_for_static(&compute);

-    Ok(WaitSpecResult {
-        compute,
-        resize_swap_on_bind,
-        set_disk_quota_for_fs: set_disk_quota_for_fs.cloned(),
-    })
-}
-
-struct WaitSpecResult {
-    compute: Arc<ComputeNode>,
-    resize_swap_on_bind: bool,
-    set_disk_quota_for_fs: Option<String>,
+    Ok(compute)
 }

 fn start_postgres(
-    // need to allow unused because `matches` is only used if target_os = "linux"
-    #[allow(unused_variables)] matches: &clap::ArgMatches,
-    WaitSpecResult {
-        compute,
-        resize_swap_on_bind,
-        set_disk_quota_for_fs,
-    }: WaitSpecResult,
+    cli: &Cli,
+    compute: Arc<ComputeNode>,
 ) -> Result<(Option<PostgresHandle>, StartPostgresResult)> {
    // We got all we need, update the state.
    let mut state = compute.state.lock().unwrap();
@@ -437,7 +411,7 @@ fn start_postgres(
    let mut delay_exit = false;

    // Resize swap to the desired size if the compute spec says so
-    if let (Some(size_bytes), true) = (swap_size_bytes, resize_swap_on_bind) {
+    if let (Some(size_bytes), true) = (swap_size_bytes, cli.resize_swap_on_bind) {
        // To avoid 'swapoff' hitting postgres startup, we need to run resize-swap to completion
        // *before* starting postgres.
        //
@@ -464,9 +438,9 @@ fn start_postgres(

    // Set disk quota if the compute spec says so
    if let (Some(disk_quota_bytes), Some(disk_quota_fs_mountpoint)) =
-        (disk_quota_bytes, set_disk_quota_for_fs)
+        (disk_quota_bytes, cli.set_disk_quota_for_fs.as_ref())
    {
-        match set_disk_quota(disk_quota_bytes, &disk_quota_fs_mountpoint) {
+        match set_disk_quota(disk_quota_bytes, disk_quota_fs_mountpoint) {
            Ok(()) => {
                let size_mib = disk_quota_bytes as f32 / (1 << 20) as f32; // just for more coherent display.
                info!(%disk_quota_bytes, %size_mib, "set disk quota");
@@ -509,13 +483,7 @@ fn start_postgres(
        if #[cfg(target_os = "linux")] {
            use std::env;
            use tokio_util::sync::CancellationToken;
-            let vm_monitor_addr = matches
-                .get_one::<String>("vm-monitor-addr")
-                .expect("--vm-monitor-addr should always be set because it has a default arg");
-            let file_cache_connstr = matches.get_one::<String>("filecache-connstr");
-            let cgroup = matches.get_one::<String>("cgroup");

-            // Only make a runtime if we need to.
            // Note: it seems like you can make a runtime in an inner scope and
            // if you start a task in it it won't be dropped. However, make it
            // in the outermost scope just to be safe.
@@ -538,15 +506,15 @@ fn start_postgres(
            let pgconnstr = if disable_lfc_resizing.unwrap_or(false) {
                None
            } else {
-                file_cache_connstr.cloned()
+                Some(cli.filecache_connstr.clone())
            };

            let vm_monitor = rt.as_ref().map(|rt| {
                rt.spawn(vm_monitor::start(
                    Box::leak(Box::new(vm_monitor::Args {
-                        cgroup: cgroup.cloned(),
+                        cgroup: Some(cli.cgroup.clone()),
                        pgconnstr,
-                        addr: vm_monitor_addr.clone(),
+                        addr: cli.vm_monitor_addr.clone(),
                    })),
                    token.clone(),
                ))
@@ -702,105 +670,6 @@ fn deinit_and_exit(WaitPostgresResult { exit_code }: WaitPostgresResult) -> ! {
    exit(exit_code.unwrap_or(1))
 }

-fn cli() -> clap::Command {
-    // Env variable is set by `cargo`
-    let version = option_env!("CARGO_PKG_VERSION").unwrap_or("unknown");
-    clap::Command::new("compute_ctl")
-        .version(version)
-        .arg(
-            Arg::new("http-port")
-                .long("http-port")
-                .value_name("HTTP_PORT")
-                .default_value("3080")
-                .value_parser(clap::value_parser!(u16))
-                .required(false),
-        )
-        .arg(
-            Arg::new("connstr")
-                .short('C')
-                .long("connstr")
-                .value_name("DATABASE_URL")
-                .required(true),
-        )
-        .arg(
-            Arg::new("pgdata")
-                .short('D')
-                .long("pgdata")
-                .value_name("DATADIR")
-                .required(true),
-        )
-        .arg(
-            Arg::new("pgbin")
-                .short('b')
-                .long("pgbin")
-                .default_value("postgres")
-                .value_name("POSTGRES_PATH"),
-        )
-        .arg(
-            Arg::new("spec")
-                .short('s')
-                .long("spec")
-                .value_name("SPEC_JSON"),
-        )
-        .arg(
-            Arg::new("spec-path")
-                .short('S')
-                .long("spec-path")
-                .value_name("SPEC_PATH"),
-        )
-        .arg(
-            Arg::new("compute-id")
-                .short('i')
-                .long("compute-id")
-                .value_name("COMPUTE_ID"),
-        )
-        .arg(
-            Arg::new("control-plane-uri")
-                .short('p')
-                .long("control-plane-uri")
-                .value_name("CONTROL_PLANE_API_BASE_URI"),
-        )
-        .arg(
-            Arg::new("remote-ext-config")
-                .short('r')
-                .long("remote-ext-config")
-                .value_name("REMOTE_EXT_CONFIG"),
-        )
-        // TODO(fprasx): we currently have default arguments because the cloud PR
-        // to pass them in hasn't been merged yet. We should get rid of them once
-        // the PR is merged.
-        .arg(
-            Arg::new("vm-monitor-addr")
-                .long("vm-monitor-addr")
-                .default_value("0.0.0.0:10301")
-                .value_name("VM_MONITOR_ADDR"),
-        )
-        .arg(
-            Arg::new("cgroup")
-                .long("cgroup")
-                .default_value("neon-postgres")
-                .value_name("CGROUP"),
-        )
-        .arg(
-            Arg::new("filecache-connstr")
-                .long("filecache-connstr")
-                .default_value(
-                    "host=localhost port=5432 dbname=postgres user=cloud_admin sslmode=disable application_name=vm-monitor",
-                )
-                .value_name("FILECACHE_CONNSTR"),
-        )
-        .arg(
-            Arg::new("resize-swap-on-bind")
-                .long("resize-swap-on-bind")
-                .action(clap::ArgAction::SetTrue),
-        )
-        .arg(
-            Arg::new("set-disk-quota-for-fs")
-                .long("set-disk-quota-for-fs")
-                .value_name("SET_DISK_QUOTA_FOR_FS")
-        )
-}
-
 /// When compute_ctl is killed, send also termination signal to sync-safekeepers
 /// to prevent leakage. TODO: it is better to convert compute_ctl to async and
 /// wait for termination which would be easy then.
@@ -810,7 +679,14 @@ fn handle_exit_signal(sig: i32) {
    exit(1);
 }

-#[test]
-fn verify_cli() {
-    cli().debug_assert()
+#[cfg(test)]
+mod test {
+    use clap::CommandFactory;
+
+    use super::Cli;
+
+    #[test]
+    fn verify_cli() {
+        Cli::command().debug_assert()
+    }
 }
--- a/compute_tools/src/extension_server.rs
+++ b/compute_tools/src/extension_server.rs
@@ -258,14 +258,11 @@ pub fn create_control_files(remote_extensions: &RemoteExtSpec, pgbin: &str) {
 async fn download_extension_tar(ext_remote_storage: &str, ext_path: &str) -> Result<Bytes> {
    let uri = format!("{}/{}", ext_remote_storage, ext_path);

-    info!("Download extension {:?} from uri {:?}", ext_path, uri);
+    info!("Download extension {} from uri {}", ext_path, uri);

    match do_extension_server_request(&uri).await {
        Ok(resp) => {
-            info!(
-                "Successfully downloaded remote extension data {:?}",
-                ext_path
-            );
+            info!("Successfully downloaded remote extension data {}", ext_path);
            REMOTE_EXT_REQUESTS_TOTAL
                .with_label_values(&[&StatusCode::OK.to_string()])
                .inc();
@@ -285,7 +282,10 @@ async fn download_extension_tar(ext_remote_storage: &str, ext_path: &str) -> Res
 async fn do_extension_server_request(uri: &str) -> Result<Bytes, (String, String)> {
    let resp = reqwest::get(uri).await.map_err(|e| {
        (
-            format!("could not perform remote extensions server request: {}", e),
+            format!(
+                "could not perform remote extensions server request: {:?}",
+                e
+            ),
            UNKNOWN_HTTP_STATUS.to_string(),
        )
    })?;
@@ -295,7 +295,7 @@ async fn do_extension_server_request(uri: &str) -> Result<Bytes, (String, String
        StatusCode::OK => match resp.bytes().await {
            Ok(resp) => Ok(resp),
            Err(e) => Err((
-                format!("could not read remote extensions server response: {}", e),
+                format!("could not read remote extensions server response: {:?}", e),
                // It's fine to return and report error with status as 200 OK,
                // because we still failed to read the response.
                status.to_string(),
--- a/compute_tools/src/migration.rs
+++ b/compute_tools/src/migration.rs
@@ -125,7 +125,7 @@ impl<'m> MigrationRunner<'m> {
                    info!("Finished migration id={}", migration_id);
                }
                Err(e) => {
-                    error!("Failed to run migration id={}: {}", migration_id, e);
+                    error!("Failed to run migration id={}: {:?}", migration_id, e);
                    DB_MIGRATION_FAILED
                        .with_label_values(&[migration_id.to_string().as_str()])
                        .inc();
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -28,7 +28,7 @@ fn do_control_plane_request(
        .map_err(|e| {
            (
                true,
-                format!("could not perform spec request to control plane: {}", e),
+                format!("could not perform spec request to control plane: {:?}", e),
                UNKNOWN_HTTP_STATUS.to_string(),
            )
        })?;
@@ -39,7 +39,7 @@ fn do_control_plane_request(
            Ok(spec_resp) => Ok(spec_resp),
            Err(e) => Err((
                true,
-                format!("could not deserialize control plane response: {}", e),
+                format!("could not deserialize control plane response: {:?}", e),
                status.to_string(),
            )),
        },
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -388,6 +388,11 @@ impl PageServerNode {
                .map(|x| x.parse::<u8>())
                .transpose()
                .context("Failed to parse 'image_creation_check_threshold' as integer")?,
+            image_creation_preempt_threshold: settings
+                .remove("image_creation_preempt_threshold")
+                .map(|x| x.parse::<usize>())
+                .transpose()
+                .context("Failed to parse 'image_creation_preempt_threshold' as integer")?,
            pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()),
            walreceiver_connect_timeout: settings
                .remove("walreceiver_connect_timeout")
--- a/control_plane/storcon_cli/src/main.rs
+++ b/control_plane/storcon_cli/src/main.rs
@@ -10,8 +10,8 @@ use pageserver_api::{
    controller_api::{
        AvailabilityZone, NodeAvailabilityWrapper, NodeDescribeResponse, NodeShardResponse,
        SafekeeperDescribeResponse, SafekeeperSchedulingPolicyRequest, ShardSchedulingPolicy,
-        ShardsPreferredAzsRequest, SkSchedulingPolicy, TenantCreateRequest, TenantDescribeResponse,
-        TenantPolicyRequest,
+        ShardsPreferredAzsRequest, ShardsPreferredAzsResponse, SkSchedulingPolicy,
+        TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest,
    },
    models::{
        EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary,
@@ -800,7 +800,7 @@ async fn main() -> anyhow::Result<()> {
                    .collect(),
            };
            storcon_client
-                .dispatch::<ShardsPreferredAzsRequest, ()>(
+                .dispatch::<ShardsPreferredAzsRequest, ShardsPreferredAzsResponse>(
                    Method::PUT,
                    "control/v1/preferred_azs".to_string(),
                    Some(req),
--- a/docker-compose/compute_wrapper/Dockerfile
+++ b/docker-compose/compute_wrapper/Dockerfile
@@ -7,11 +7,12 @@ FROM $REPOSITORY/${COMPUTE_IMAGE}:$TAG
 ARG COMPUTE_IMAGE

 USER root
-RUN apt-get update &&       \
+RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \
+    apt-get update &&       \
    apt-get install -y curl \
                       jq   \
                       netcat-openbsd
 #This is required for the pg_hintplan test
-RUN mkdir -p /ext-src/pg_hint_plan-src && chown postgres /ext-src/pg_hint_plan-src 
+RUN mkdir -p /ext-src/pg_hint_plan-src /postgres/contrib/file_fdw && chown postgres /ext-src/pg_hint_plan-src /postgres/contrib/file_fdw

-USER postgres
+USER postgres
--- a/docker-compose/docker_compose_test.sh
+++ b/docker-compose/docker_compose_test.sh
@@ -61,17 +61,32 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do
        docker cp $TEST_CONTAINER_NAME:/ext-src/pg_hint_plan-src/data $TMPDIR/data
        docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/ext-src/pg_hint_plan-src/
        rm -rf $TMPDIR
+        # The following block does the same for the contrib/file_fdw test
+        TMPDIR=$(mktemp -d)
+        docker cp $TEST_CONTAINER_NAME:/postgres/contrib/file_fdw/data $TMPDIR/data
+        docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/postgres/contrib/file_fdw/data
+        rm -rf $TMPDIR
+        # Apply patches
+        cat ../compute/patches/contrib_pg${pg_version}.patch | docker exec -i $TEST_CONTAINER_NAME bash -c "(cd /postgres && patch -p1)"
        # We are running tests now
-        if ! docker exec -e SKIP=timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,kq_imcx-src,wal2json_2_5-src \
-            $TEST_CONTAINER_NAME /run-tests.sh | tee testout.txt
-        then
-            FAILED=$(tail -1 testout.txt)
-            for d in $FAILED
-            do
-                mkdir $d
-                docker cp $TEST_CONTAINER_NAME:/ext-src/$d/regression.diffs $d || true
-                docker cp $TEST_CONTAINER_NAME:/ext-src/$d/regression.out $d || true
-                cat $d/regression.out $d/regression.diffs || true
+        rm -f testout.txt testout_contrib.txt
+        docker exec -e USE_PGXS=1 -e SKIP=timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,kq_imcx-src,wal2json_2_5-src \
+        $TEST_CONTAINER_NAME /run-tests.sh /ext-src | tee testout.txt && EXT_SUCCESS=1 || EXT_SUCCESS=0
+        docker exec -e SKIP=start-scripts,postgres_fdw,ltree_plpython,jsonb_plpython,jsonb_plperl,hstore_plpython,hstore_plperl,dblink,bool_plperl \
+        $TEST_CONTAINER_NAME /run-tests.sh /postgres/contrib | tee testout_contrib.txt && CONTRIB_SUCCESS=1 || CONTRIB_SUCCESS=0
+        if [ $EXT_SUCCESS -eq 0 ] || [ $CONTRIB_SUCCESS -eq 0 ]; then
+            CONTRIB_FAILED=
+            FAILED=
+            [ $EXT_SUCCESS -eq 0 ] && FAILED=$(tail -1 testout.txt | awk '{for(i=1;i<=NF;i++){print "/ext-src/"$i;}}')
+            [ $CONTRIB_SUCCESS -eq 0 ] && CONTRIB_FAILED=$(tail -1 testout_contrib.txt | awk '{for(i=1;i<=NF;i++){print "/postgres/contrib/"$i;}}')
+            for d in $FAILED $CONTRIB_FAILED; do
+                dn="$(basename $d)"
+                rm -rf $dn
+                mkdir $dn
+                docker cp $TEST_CONTAINER_NAME:$d/regression.diffs $dn || [ $? -eq 1 ]
+                docker cp $TEST_CONTAINER_NAME:$d/regression.out $dn || [ $? -eq 1 ]
+                cat $dn/regression.out $dn/regression.diffs || true
+                rm -rf $dn
            done
        rm -rf $FAILED
        exit 1
--- a/docker-compose/run-tests.sh
+++ b/docker-compose/run-tests.sh
@@ -1,9 +1,11 @@
 #!/bin/bash
 set -x

-cd /ext-src || exit 2
+extdir=${1:-/ext-src}
+
+cd "${extdir}" || exit 2
 FAILED=
-LIST=$( (echo -e "${SKIP//","/"\n"}"; ls -d -- *-src) | sort | uniq -u)
+LIST=$( (echo -e "${SKIP//","/"\n"}"; ls) | sort | uniq -u)
 for d in ${LIST}; do
    [ -d "${d}" ] || continue
    if ! psql -w -c "select 1" >/dev/null; then
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -323,6 +323,10 @@ pub struct TenantConfigToml {
    // Expresed in multiples of checkpoint distance.
    pub image_layer_creation_check_threshold: u8,

+    // How many multiples of L0 `compaction_threshold` will preempt image layer creation and do L0 compaction.
+    // Set to 0 to disable preemption.
+    pub image_creation_preempt_threshold: usize,
+
    /// The length for an explicit LSN lease request.
    /// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval.
    #[serde(with = "humantime_serde")]
@@ -547,6 +551,10 @@ pub mod tenant_conf_defaults {
    // Relevant: https://github.com/neondatabase/neon/issues/3394
    pub const DEFAULT_GC_PERIOD: &str = "1 hr";
    pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3;
+    // If there are more than threshold * compaction_threshold (that is 3 * 10 in the default config) L0 layers, image
+    // layer creation will end immediately. Set to 0 to disable. The target default will be 3 once we
+    // want to enable this feature.
+    pub const DEFAULT_IMAGE_CREATION_PREEMPT_THRESHOLD: usize = 0;
    pub const DEFAULT_PITR_INTERVAL: &str = "7 days";
    pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "10 seconds";
    pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds";
@@ -605,6 +613,7 @@ impl Default for TenantConfigToml {
            lazy_slru_download: false,
            timeline_get_throttle: crate::models::ThrottleConfig::disabled(),
            image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD,
+            image_creation_preempt_threshold: DEFAULT_IMAGE_CREATION_PREEMPT_THRESHOLD,
            lsn_lease_length: LsnLease::DEFAULT_LENGTH,
            lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS,
            timeline_offloading: false,
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -498,6 +498,8 @@ pub struct TenantConfigPatch {
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub image_layer_creation_check_threshold: FieldPatch<u8>,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub image_creation_preempt_threshold: FieldPatch<usize>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub lsn_lease_length: FieldPatch<String>,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub lsn_lease_length_for_ts: FieldPatch<String>,
@@ -544,6 +546,7 @@ pub struct TenantConfig {
    pub lazy_slru_download: Option<bool>,
    pub timeline_get_throttle: Option<ThrottleConfig>,
    pub image_layer_creation_check_threshold: Option<u8>,
+    pub image_creation_preempt_threshold: Option<usize>,
    pub lsn_lease_length: Option<String>,
    pub lsn_lease_length_for_ts: Option<String>,
    pub timeline_offloading: Option<bool>,
@@ -581,6 +584,7 @@ impl TenantConfig {
            mut lazy_slru_download,
            mut timeline_get_throttle,
            mut image_layer_creation_check_threshold,
+            mut image_creation_preempt_threshold,
            mut lsn_lease_length,
            mut lsn_lease_length_for_ts,
            mut timeline_offloading,
@@ -635,6 +639,9 @@ impl TenantConfig {
        patch
            .image_layer_creation_check_threshold
            .apply(&mut image_layer_creation_check_threshold);
+        patch
+            .image_creation_preempt_threshold
+            .apply(&mut image_creation_preempt_threshold);
        patch.lsn_lease_length.apply(&mut lsn_lease_length);
        patch
            .lsn_lease_length_for_ts
@@ -679,6 +686,7 @@ impl TenantConfig {
            lazy_slru_download,
            timeline_get_throttle,
            image_layer_creation_check_threshold,
+            image_creation_preempt_threshold,
            lsn_lease_length,
            lsn_lease_length_for_ts,
            timeline_offloading,
--- a/libs/safekeeper_api/src/membership.rs
+++ b/libs/safekeeper_api/src/membership.rs
@@ -38,14 +38,12 @@ impl Display for SafekeeperId {
 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
 #[serde(transparent)]
 pub struct MemberSet {
-    pub members: Vec<SafekeeperId>,
+    pub m: Vec<SafekeeperId>,
 }

 impl MemberSet {
    pub fn empty() -> Self {
-        MemberSet {
-            members: Vec::new(),
-        }
+        MemberSet { m: Vec::new() }
    }

    pub fn new(members: Vec<SafekeeperId>) -> anyhow::Result<Self> {
@@ -53,11 +51,11 @@ impl MemberSet {
        if hs.len() != members.len() {
            bail!("duplicate safekeeper id in the set {:?}", members);
        }
-        Ok(MemberSet { members })
+        Ok(MemberSet { m: members })
    }

    pub fn contains(&self, sk: &SafekeeperId) -> bool {
-        self.members.iter().any(|m| m.id == sk.id)
+        self.m.iter().any(|m| m.id == sk.id)
    }

    pub fn add(&mut self, sk: SafekeeperId) -> anyhow::Result<()> {
@@ -67,7 +65,7 @@ impl MemberSet {
                sk.id, self
            ));
        }
-        self.members.push(sk);
+        self.m.push(sk);
        Ok(())
    }
 }
@@ -75,11 +73,7 @@ impl MemberSet {
 impl Display for MemberSet {
    /// Display as a comma separated list of members.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        let sks_str = self
-            .members
-            .iter()
-            .map(|m| m.to_string())
-            .collect::<Vec<_>>();
+        let sks_str = self.m.iter().map(|sk| sk.to_string()).collect::<Vec<_>>();
        write!(f, "({})", sks_str.join(", "))
    }
 }
--- a/libs/utils/src/logging.rs
+++ b/libs/utils/src/logging.rs
@@ -5,6 +5,24 @@ use metrics::{IntCounter, IntCounterVec};
 use once_cell::sync::Lazy;
 use strum_macros::{EnumString, VariantNames};

+/// Logs a critical error, similarly to `tracing::error!`. This will:
+///
+/// * Emit an ERROR log message with prefix "CRITICAL:" and a backtrace (if RUST_BACKTRACE is set).
+/// * Increment libmetrics_tracing_event_count{level="critical"}, and indirectly level="error".
+/// * Trigger a pageable alert (via the metric above).
+/// * In debug builds, panic the process.
+#[macro_export]
+macro_rules! critical {
+    ($($arg:tt)*) => {{
+        if cfg!(debug_assertions) {
+            panic!($($arg)*);
+        }
+        $crate::logging::TRACING_EVENT_COUNT_METRIC.inc_critical();
+        let backtrace = std::backtrace::Backtrace::capture();
+        tracing::error!("CRITICAL: {}\n{backtrace}", format!($($arg)*));
+    }};
+}
+
 #[derive(EnumString, strum_macros::Display, VariantNames, Eq, PartialEq, Debug, Clone, Copy)]
 #[strum(serialize_all = "snake_case")]
 pub enum LogFormat {
@@ -25,7 +43,10 @@ impl LogFormat {
    }
 }

-struct TracingEventCountMetric {
+pub struct TracingEventCountMetric {
+    /// CRITICAL is not a `tracing` log level. Instead, we increment it in the `critical!` macro,
+    /// and also emit it as a regular error. These are thus double-counted, but that seems fine.
+    critical: IntCounter,
    error: IntCounter,
    warn: IntCounter,
    info: IntCounter,
@@ -33,7 +54,7 @@ struct TracingEventCountMetric {
    trace: IntCounter,
 }

-static TRACING_EVENT_COUNT_METRIC: Lazy<TracingEventCountMetric> = Lazy::new(|| {
+pub static TRACING_EVENT_COUNT_METRIC: Lazy<TracingEventCountMetric> = Lazy::new(|| {
    let vec = metrics::register_int_counter_vec!(
        "libmetrics_tracing_event_count",
        "Number of tracing events, by level",
@@ -46,6 +67,7 @@ static TRACING_EVENT_COUNT_METRIC: Lazy<TracingEventCountMetric> = Lazy::new(||
 impl TracingEventCountMetric {
    fn new(vec: IntCounterVec) -> Self {
        Self {
+            critical: vec.with_label_values(&["critical"]),
            error: vec.with_label_values(&["error"]),
            warn: vec.with_label_values(&["warn"]),
            info: vec.with_label_values(&["info"]),
@@ -54,6 +76,11 @@ impl TracingEventCountMetric {
        }
    }

+    // Allow public access from `critical!` macro.
+    pub fn inc_critical(&self) {
+        self.critical.inc();
+    }
+
    fn inc_for_level(&self, level: tracing::Level) {
        let counter = match level {
            tracing::Level::ERROR => &self.error,
--- a/libs/vm_monitor/src/filecache.rs
+++ b/libs/vm_monitor/src/filecache.rs
@@ -177,8 +177,8 @@ impl FileCacheState {
        crate::spawn_with_cancel(
            token,
            |res| {
-                if let Err(error) = res {
-                    error!(%error, "postgres error")
+                if let Err(e) = res {
+                    error!(error = format_args!("{e:#}"), "postgres error");
                }
            },
            conn,
@@ -205,7 +205,7 @@ impl FileCacheState {
        {
            Ok(rows) => Ok(rows),
            Err(e) => {
-                error!(error = ?e, "postgres error: {e} -> retrying");
+                error!(error = format_args!("{e:#}"), "postgres error -> retrying");

                let client = FileCacheState::connect(&self.conn_str, self.token.clone())
                    .await
--- a/libs/vm_monitor/src/lib.rs
+++ b/libs/vm_monitor/src/lib.rs
@@ -191,15 +191,12 @@ async fn start_monitor(
    .await;
    let mut monitor = match monitor {
        Ok(Ok(monitor)) => monitor,
-        Ok(Err(error)) => {
-            error!(?error, "failed to create monitor");
+        Ok(Err(e)) => {
+            error!(error = format_args!("{e:#}"), "failed to create monitor");
            return;
        }
        Err(_) => {
-            error!(
-                ?timeout,
-                "creating monitor timed out (probably waiting to receive protocol range)"
-            );
+            error!(?timeout, "creating monitor timed out");
            return;
        }
    };
@@ -207,6 +204,9 @@ async fn start_monitor(

    match monitor.run().await {
        Ok(()) => info!("monitor was killed due to new connection"),
-        Err(e) => error!(error = ?e, "monitor terminated unexpectedly"),
+        Err(e) => error!(
+            error = format_args!("{e:#}"),
+            "monitor terminated unexpectedly"
+        ),
    }
 }
--- a/libs/vm_monitor/src/runner.rs
+++ b/libs/vm_monitor/src/runner.rs
@@ -370,12 +370,16 @@ impl Runner {
                }),
            InboundMsgKind::InvalidMessage { error } => {
                warn!(
-                    %error, id, "received notification of an invalid message we sent"
+                    error = format_args!("{error:#}"),
+                    id, "received notification of an invalid message we sent"
                );
                Ok(None)
            }
            InboundMsgKind::InternalError { error } => {
-                warn!(error, id, "agent experienced an internal error");
+                warn!(
+                    error = format_args!("{error:#}"),
+                    id, "agent experienced an internal error"
+                );
                Ok(None)
            }
            InboundMsgKind::HealthCheck {} => {
@@ -476,7 +480,7 @@ impl Runner {
                                        // gives the outermost cause, and the debug impl
                                        // pretty-prints the error, whereas {:#} contains all the
                                        // causes, but is compact (no newlines).
-                                        warn!(error = format!("{e:#}"), "error handling message");
+                                        warn!(error = format_args!("{e:#}"), "error handling message");
                                        OutboundMsg::new(
                                            OutboundMsgKind::InternalError {
                                                error: e.to_string(),
@@ -492,7 +496,7 @@ impl Runner {
                                    .context("failed to send message")?;
                            }
                            Err(e) => warn!(
-                                error = format!("{e}"),
+                                error = format_args!("{e:#}"),
                                msg = ?msg,
                                "received error message"
                            ),
--- a/libs/walproposer/src/walproposer.rs
+++ b/libs/walproposer/src/walproposer.rs
@@ -215,6 +215,7 @@ impl Wrapper {
            syncSafekeepers: config.sync_safekeepers,
            systemId: 0,
            pgTimeline: 1,
+            proto_version: 3,
            callback_data,
        };
        let c_config = Box::into_raw(Box::new(c_config));
@@ -276,6 +277,7 @@ mod tests {
    use core::panic;
    use std::{
        cell::Cell,
+        ffi::CString,
        sync::{atomic::AtomicUsize, mpsc::sync_channel},
    };

@@ -496,57 +498,64 @@ mod tests {
        // Messages definitions are at walproposer.h
        // xxx: it would be better to extract them from safekeeper crate and
        // use serialization/deserialization here.
-        let greeting_tag = (b'g' as u64).to_ne_bytes();
-        let proto_version = 2_u32.to_ne_bytes();
-        let pg_version: [u8; 4] = PG_VERSION_NUM.to_ne_bytes();
-        let proposer_id = [0; 16];
-        let system_id = 0_u64.to_ne_bytes();
-        let tenant_id = ttid.tenant_id.as_arr();
-        let timeline_id = ttid.timeline_id.as_arr();
-        let pg_tli = 1_u32.to_ne_bytes();
-        let wal_seg_size = 16777216_u32.to_ne_bytes();
+        let greeting_tag = (b'g').to_be_bytes();
+        let tenant_id = CString::new(ttid.tenant_id.to_string())
+            .unwrap()
+            .into_bytes_with_nul();
+        let timeline_id = CString::new(ttid.timeline_id.to_string())
+            .unwrap()
+            .into_bytes_with_nul();
+        let mconf_gen = 0_u32.to_be_bytes();
+        let mconf_members_len = 0_u32.to_be_bytes();
+        let mconf_members_new_len = 0_u32.to_be_bytes();
+        let pg_version: [u8; 4] = PG_VERSION_NUM.to_be_bytes();
+        let system_id = 0_u64.to_be_bytes();
+        let wal_seg_size = 16777216_u32.to_be_bytes();
+
        let proposer_greeting = [
            greeting_tag.as_slice(),
-            proto_version.as_slice(),
-            pg_version.as_slice(),
-            proposer_id.as_slice(),
-            system_id.as_slice(),
            tenant_id.as_slice(),
            timeline_id.as_slice(),
-            pg_tli.as_slice(),
+            mconf_gen.as_slice(),
+            mconf_members_len.as_slice(),
+            mconf_members_new_len.as_slice(),
+            pg_version.as_slice(),
+            system_id.as_slice(),
            wal_seg_size.as_slice(),
        ]
        .concat();

-        let voting_tag = (b'v' as u64).to_ne_bytes();
-        let vote_request_term = 3_u64.to_ne_bytes();
-        let proposer_id = [0; 16];
+        let voting_tag = (b'v').to_be_bytes();
+        let vote_request_term = 3_u64.to_be_bytes();
        let vote_request = [
            voting_tag.as_slice(),
+            mconf_gen.as_slice(),
            vote_request_term.as_slice(),
-            proposer_id.as_slice(),
        ]
        .concat();

-        let acceptor_greeting_term = 2_u64.to_ne_bytes();
-        let acceptor_greeting_node_id = 1_u64.to_ne_bytes();
+        let acceptor_greeting_term = 2_u64.to_be_bytes();
+        let acceptor_greeting_node_id = 1_u64.to_be_bytes();
        let acceptor_greeting = [
            greeting_tag.as_slice(),
-            acceptor_greeting_term.as_slice(),
            acceptor_greeting_node_id.as_slice(),
+            mconf_gen.as_slice(),
+            mconf_members_len.as_slice(),
+            mconf_members_new_len.as_slice(),
+            acceptor_greeting_term.as_slice(),
        ]
        .concat();

-        let vote_response_term = 3_u64.to_ne_bytes();
-        let vote_given = 1_u64.to_ne_bytes();
-        let flush_lsn = 0x539_u64.to_ne_bytes();
-        let truncate_lsn = 0x539_u64.to_ne_bytes();
-        let th_len = 1_u32.to_ne_bytes();
-        let th_term = 2_u64.to_ne_bytes();
-        let th_lsn = 0x539_u64.to_ne_bytes();
-        let timeline_start_lsn = 0x539_u64.to_ne_bytes();
+        let vote_response_term = 3_u64.to_be_bytes();
+        let vote_given = 1_u8.to_be_bytes();
+        let flush_lsn = 0x539_u64.to_be_bytes();
+        let truncate_lsn = 0x539_u64.to_be_bytes();
+        let th_len = 1_u32.to_be_bytes();
+        let th_term = 2_u64.to_be_bytes();
+        let th_lsn = 0x539_u64.to_be_bytes();
        let vote_response = [
            voting_tag.as_slice(),
+            mconf_gen.as_slice(),
            vote_response_term.as_slice(),
            vote_given.as_slice(),
            flush_lsn.as_slice(),
@@ -554,7 +563,6 @@ mod tests {
            th_len.as_slice(),
            th_term.as_slice(),
            th_lsn.as_slice(),
-            timeline_start_lsn.as_slice(),
        ]
        .concat();

--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -1472,7 +1472,13 @@ async fn layer_download_handler(
    let downloaded = timeline
        .download_layer(&layer_name)
        .await
-        .map_err(ApiError::InternalServerError)?;
+        .map_err(|e| match e {
+            tenant::storage_layer::layer::DownloadError::TimelineShutdown
+            | tenant::storage_layer::layer::DownloadError::DownloadCancelled => {
+                ApiError::ShuttingDown
+            }
+            other => ApiError::InternalServerError(other.into()),
+        })?;

    match downloaded {
        Some(true) => json_response(StatusCode::OK, ()),
@@ -3169,12 +3175,16 @@ async fn put_tenant_timeline_import_basebackup(

    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);

-    let span = info_span!("import_basebackup", tenant_id=%tenant_id, timeline_id=%timeline_id, base_lsn=%base_lsn, end_lsn=%end_lsn, pg_version=%pg_version);
+    let tenant_shard_id = TenantShardId::unsharded(tenant_id);
+
+    let span = info_span!("import_basebackup",
+        tenant_id=%tenant_id, timeline_id=%timeline_id, shard_id=%tenant_shard_id.shard_slug(),
+        base_lsn=%base_lsn, end_lsn=%end_lsn, pg_version=%pg_version);
    async move {
        let state = get_state(&request);
        let tenant = state
            .tenant_manager
-            .get_attached_tenant_shard(TenantShardId::unsharded(tenant_id))?;
+            .get_attached_tenant_shard(tenant_shard_id)?;

        let broker_client = state.broker_client.clone();

@@ -3383,7 +3393,17 @@ where
                            let status = response.status();
                            info!(%status, "Cancelled request finished successfully")
                        }
-                        Err(e) => error!("Cancelled request finished with an error: {e:?}"),
+                        Err(e) => match e {
+                            ApiError::ShuttingDown | ApiError::ResourceUnavailable(_) => {
+                                // Don't log these at error severity: they are normal during the lifecycle of tenants/process
+                                info!("Cancelled request aborted for shutdown")
+                            }
+                            _ => {
+                                // Log these in a highly visible way, because we have no client to send the response to, but
+                                // would like to know that something went wrong.
+                                error!("Cancelled request finished with an error: {e:?}")
+                            }
+                        },
                    }
                }
                // only logging for cancelled panicked request handlers is the tracing_panic_hook,
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -263,14 +263,6 @@ pub(crate) const TENANT_HEATMAP_BASENAME: &str = "heatmap-v1.json";
 /// data directory at pageserver startup can be automatically removed.
 pub(crate) const TEMP_FILE_SUFFIX: &str = "___temp";

-/// A marker file to mark that a timeline directory was not fully initialized.
-/// If a timeline directory with this marker is encountered at pageserver startup,
-/// the timeline directory and the marker file are both removed.
-/// Full path: `tenants/<tenant_id>/timelines/<timeline_id>___uninit`.
-pub(crate) const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit";
-
-pub(crate) const TIMELINE_DELETE_MARK_SUFFIX: &str = "___delete";
-
 pub fn is_temporary(path: &Utf8Path) -> bool {
    match path.file_name() {
        Some(name) => name.ends_with(TEMP_FILE_SUFFIX),
@@ -278,25 +270,6 @@ pub fn is_temporary(path: &Utf8Path) -> bool {
    }
 }

-fn ends_with_suffix(path: &Utf8Path, suffix: &str) -> bool {
-    match path.file_name() {
-        Some(name) => name.ends_with(suffix),
-        None => false,
-    }
-}
-
-// FIXME: DO NOT ADD new query methods like this, which will have a next step of parsing timelineid
-// from the directory name. Instead create type "UninitMark(TimelineId)" and only parse it once
-// from the name.
-
-pub(crate) fn is_uninit_mark(path: &Utf8Path) -> bool {
-    ends_with_suffix(path, TIMELINE_UNINIT_MARK_SUFFIX)
-}
-
-pub(crate) fn is_delete_mark(path: &Utf8Path) -> bool {
-    ends_with_suffix(path, TIMELINE_DELETE_MARK_SUFFIX)
-}
-
 /// During pageserver startup, we need to order operations not to exhaust tokio worker threads by
 /// blocking.
 ///
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1,4 +1,13 @@
+use std::collections::HashMap;
+use std::num::NonZeroUsize;
+use std::pin::Pin;
+use std::sync::atomic::AtomicU64;
+use std::sync::{Arc, Mutex};
+use std::task::{Context, Poll};
+use std::time::{Duration, Instant};
+
 use enum_map::EnumMap;
+use futures::Future;
 use metrics::{
    register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec,
    register_int_counter, register_int_counter_pair_vec, register_int_counter_vec,
@@ -11,13 +20,26 @@ use pageserver_api::config::{
    PageServicePipeliningConfig, PageServicePipeliningConfigPipelined,
    PageServiceProtocolPipelinedExecutionStrategy,
 };
+use pageserver_api::models::InMemoryLayerInfo;
 use pageserver_api::shard::TenantShardId;
+use pin_project_lite::pin_project;
 use postgres_backend::{is_expected_io_error, QueryError};
 use pq_proto::framed::ConnectionError;
-use strum::{EnumCount, VariantNames};
+
+use strum::{EnumCount, IntoEnumIterator as _, VariantNames};
 use strum_macros::{IntoStaticStr, VariantNames};
 use utils::id::TimelineId;

+use crate::config::PageServerConf;
+use crate::context::{PageContentKind, RequestContext};
+use crate::task_mgr::TaskKind;
+use crate::tenant::layer_map::LayerMap;
+use crate::tenant::mgr::TenantSlot;
+use crate::tenant::storage_layer::{InMemoryLayer, PersistentLayerDesc};
+use crate::tenant::tasks::BackgroundLoopKind;
+use crate::tenant::throttle::ThrottleResult;
+use crate::tenant::Timeline;
+
 /// Prometheus histogram buckets (in seconds) for operations in the critical
 /// path. In other words, operations that directly affect that latency of user
 /// queries.
@@ -94,11 +116,38 @@ pub(crate) static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub(crate) static VEC_READ_NUM_LAYERS_VISITED: Lazy<Histogram> = Lazy::new(|| {
+/// Measures layers visited per read (i.e. read amplification).
+///
+/// NB: for a batch, we count all visited layers towards each read. While the cost of layer visits
+/// is amortized across the batch, and some layers may not intersect with a given key, each visited
+/// layer contributes directly to the observed latency for every read in the batch, which is what we
+/// care about.
+pub(crate) static LAYERS_PER_READ: Lazy<HistogramVec> = Lazy::new(|| {
+    register_histogram_vec!(
+        "pageserver_layers_per_read",
+        "Layers visited to serve a single read (read amplification). In a batch, all visited layers count towards every read.",
+        &["tenant_id", "shard_id", "timeline_id"],
+        // Low resolution to reduce cardinality.
+        vec![1.0, 5.0, 10.0, 25.0, 50.0, 100.0],
+    )
+    .expect("failed to define a metric")
+});
+
+pub(crate) static LAYERS_PER_READ_GLOBAL: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
-        "pageserver_layers_visited_per_vectored_read_global",
-        "Average number of layers visited to reconstruct one key",
-        vec![1.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0],
+        "pageserver_layers_per_read_global",
+        "Layers visited to serve a single read (read amplification). In a batch, all visited layers count towards every read.",
+        vec![1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0],
+    )
+    .expect("failed to define a metric")
+});
+
+pub(crate) static DELTAS_PER_READ_GLOBAL: Lazy<Histogram> = Lazy::new(|| {
+    // We expect this to be low because of Postgres checkpoints. Let's see if that holds.
+    register_histogram!(
+        "pageserver_deltas_per_read_global",
+        "Number of delta pages applied to image page per read",
+        vec![0.0, 1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0],
    )
    .expect("failed to define a metric")
 });
@@ -443,18 +492,38 @@ static PITR_HISTORY_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-#[derive(strum_macros::EnumString, strum_macros::Display, strum_macros::IntoStaticStr)]
+#[derive(
+    strum_macros::EnumIter,
+    strum_macros::EnumString,
+    strum_macros::Display,
+    strum_macros::IntoStaticStr,
+)]
 #[strum(serialize_all = "kebab_case")]
-pub(crate) enum MetricLayerKind {
+pub(crate) enum LayerKind {
    Delta,
    Image,
 }

+#[derive(
+    strum_macros::EnumIter,
+    strum_macros::EnumString,
+    strum_macros::Display,
+    strum_macros::IntoStaticStr,
+)]
+#[strum(serialize_all = "kebab_case")]
+pub(crate) enum LayerLevel {
+    // We don't track the currently open ephemeral layer, since there's always exactly 1 and its
+    // size changes. See `TIMELINE_EPHEMERAL_BYTES`.
+    Frozen,
+    L0,
+    L1,
+}
+
 static TIMELINE_LAYER_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
    register_uint_gauge_vec!(
        "pageserver_layer_bytes",
-        "Sum of layer physical sizes in bytes",
-        &["tenant_id", "shard_id", "timeline_id", "kind"]
+        "Sum of frozen, L0, and L1 layer physical sizes in bytes (excluding the open ephemeral layer)",
+        &["tenant_id", "shard_id", "timeline_id", "level", "kind"]
    )
    .expect("failed to define a metric")
 });
@@ -462,8 +531,8 @@ static TIMELINE_LAYER_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
 static TIMELINE_LAYER_COUNT: Lazy<UIntGaugeVec> = Lazy::new(|| {
    register_uint_gauge_vec!(
        "pageserver_layer_count",
-        "Number of layers that exist",
-        &["tenant_id", "shard_id", "timeline_id", "kind"]
+        "Number of frozen, L0, and L1 layers (excluding the open ephemeral layer)",
+        &["tenant_id", "shard_id", "timeline_id", "level", "kind"]
    )
    .expect("failed to define a metric")
 });
@@ -2590,10 +2659,7 @@ pub(crate) struct TimelineMetrics {
    pub disk_consistent_lsn_gauge: IntGauge,
    pub pitr_history_size: UIntGauge,
    pub archival_size: UIntGauge,
-    pub(crate) layer_size_image: UIntGauge,
-    pub(crate) layer_count_image: UIntGauge,
-    pub(crate) layer_size_delta: UIntGauge,
-    pub(crate) layer_count_delta: UIntGauge,
+    pub layers_per_read: Histogram,
    pub standby_horizon_gauge: IntGauge,
    pub resident_physical_size_gauge: UIntGauge,
    pub visible_physical_size_gauge: UIntGauge,
@@ -2691,40 +2757,8 @@ impl TimelineMetrics {
            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
            .unwrap();

-        let layer_size_image = TIMELINE_LAYER_SIZE
-            .get_metric_with_label_values(&[
-                &tenant_id,
-                &shard_id,
-                &timeline_id,
-                MetricLayerKind::Image.into(),
-            ])
-            .unwrap();
-
-        let layer_count_image = TIMELINE_LAYER_COUNT
-            .get_metric_with_label_values(&[
-                &tenant_id,
-                &shard_id,
-                &timeline_id,
-                MetricLayerKind::Image.into(),
-            ])
-            .unwrap();
-
-        let layer_size_delta = TIMELINE_LAYER_SIZE
-            .get_metric_with_label_values(&[
-                &tenant_id,
-                &shard_id,
-                &timeline_id,
-                MetricLayerKind::Delta.into(),
-            ])
-            .unwrap();
-
-        let layer_count_delta = TIMELINE_LAYER_COUNT
-            .get_metric_with_label_values(&[
-                &tenant_id,
-                &shard_id,
-                &timeline_id,
-                MetricLayerKind::Delta.into(),
-            ])
+        let layers_per_read = LAYERS_PER_READ
+            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
            .unwrap();

        let standby_horizon_gauge = STANDBY_HORIZON
@@ -2791,10 +2825,7 @@ impl TimelineMetrics {
            disk_consistent_lsn_gauge,
            pitr_history_size,
            archival_size,
-            layer_size_image,
-            layer_count_image,
-            layer_size_delta,
-            layer_count_delta,
+            layers_per_read,
            standby_horizon_gauge,
            resident_physical_size_gauge,
            visible_physical_size_gauge,
@@ -2837,6 +2868,92 @@ impl TimelineMetrics {
            .add(duration);
    }

+    /// Generates TIMELINE_LAYER labels for a persistent layer.
+    fn make_layer_labels(&self, layer_desc: &PersistentLayerDesc) -> [&str; 5] {
+        let level = match LayerMap::is_l0(&layer_desc.key_range, layer_desc.is_delta()) {
+            true => LayerLevel::L0,
+            false => LayerLevel::L1,
+        };
+        let kind = match layer_desc.is_delta() {
+            true => LayerKind::Delta,
+            false => LayerKind::Image,
+        };
+        [
+            &self.tenant_id,
+            &self.shard_id,
+            &self.timeline_id,
+            level.into(),
+            kind.into(),
+        ]
+    }
+
+    /// Generates TIMELINE_LAYER labels for a frozen ephemeral layer.
+    fn make_frozen_layer_labels(&self, _layer: &InMemoryLayer) -> [&str; 5] {
+        [
+            &self.tenant_id,
+            &self.shard_id,
+            &self.timeline_id,
+            LayerLevel::Frozen.into(),
+            LayerKind::Delta.into(), // by definition
+        ]
+    }
+
+    /// Removes a frozen ephemeral layer from TIMELINE_LAYER metrics.
+    pub fn dec_frozen_layer(&self, layer: &InMemoryLayer) {
+        assert!(matches!(layer.info(), InMemoryLayerInfo::Frozen { .. }));
+        let labels = self.make_frozen_layer_labels(layer);
+        let size = layer.try_len().expect("frozen layer should have no writer");
+        TIMELINE_LAYER_COUNT
+            .get_metric_with_label_values(&labels)
+            .unwrap()
+            .dec();
+        TIMELINE_LAYER_SIZE
+            .get_metric_with_label_values(&labels)
+            .unwrap()
+            .sub(size);
+    }
+
+    /// Adds a frozen ephemeral layer to TIMELINE_LAYER metrics.
+    pub fn inc_frozen_layer(&self, layer: &InMemoryLayer) {
+        assert!(matches!(layer.info(), InMemoryLayerInfo::Frozen { .. }));
+        let labels = self.make_frozen_layer_labels(layer);
+        let size = layer.try_len().expect("frozen layer should have no writer");
+        TIMELINE_LAYER_COUNT
+            .get_metric_with_label_values(&labels)
+            .unwrap()
+            .inc();
+        TIMELINE_LAYER_SIZE
+            .get_metric_with_label_values(&labels)
+            .unwrap()
+            .add(size);
+    }
+
+    /// Removes a persistent layer from TIMELINE_LAYER metrics.
+    pub fn dec_layer(&self, layer_desc: &PersistentLayerDesc) {
+        let labels = self.make_layer_labels(layer_desc);
+        TIMELINE_LAYER_COUNT
+            .get_metric_with_label_values(&labels)
+            .unwrap()
+            .dec();
+        TIMELINE_LAYER_SIZE
+            .get_metric_with_label_values(&labels)
+            .unwrap()
+            .sub(layer_desc.file_size);
+    }
+
+    /// Adds a persistent layer to TIMELINE_LAYER metrics.
+    pub fn inc_layer(&self, layer_desc: &PersistentLayerDesc) {
+        let labels = self.make_layer_labels(layer_desc);
+        TIMELINE_LAYER_COUNT
+            .get_metric_with_label_values(&labels)
+            .unwrap()
+            .inc();
+        TIMELINE_LAYER_SIZE
+            .get_metric_with_label_values(&labels)
+            .unwrap()
+            .add(layer_desc.file_size);
+    }
+
    pub(crate) fn shutdown(&self) {
        let was_shutdown = self
            .shutdown
@@ -2869,30 +2986,16 @@ impl TimelineMetrics {
        let _ = TIMELINE_ARCHIVE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
        let _ = PITR_HISTORY_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);

-        let _ = TIMELINE_LAYER_SIZE.remove_label_values(&[
-            tenant_id,
-            shard_id,
-            timeline_id,
-            MetricLayerKind::Image.into(),
-        ]);
-        let _ = TIMELINE_LAYER_COUNT.remove_label_values(&[
-            tenant_id,
-            shard_id,
-            timeline_id,
-            MetricLayerKind::Image.into(),
-        ]);
-        let _ = TIMELINE_LAYER_SIZE.remove_label_values(&[
-            tenant_id,
-            shard_id,
-            timeline_id,
-            MetricLayerKind::Delta.into(),
-        ]);
-        let _ = TIMELINE_LAYER_COUNT.remove_label_values(&[
-            tenant_id,
-            shard_id,
-            timeline_id,
-            MetricLayerKind::Delta.into(),
-        ]);
+        for ref level in LayerLevel::iter() {
+            for ref kind in LayerKind::iter() {
+                let labels: [&str; 5] =
+                    [tenant_id, shard_id, timeline_id, level.into(), kind.into()];
+                let _ = TIMELINE_LAYER_SIZE.remove_label_values(&labels);
+                let _ = TIMELINE_LAYER_COUNT.remove_label_values(&labels);
+            }
+        }
+
+        let _ = LAYERS_PER_READ.remove_label_values(&[tenant_id, shard_id, timeline_id]);

        let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]);
        let _ = AUX_FILE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
@@ -2974,24 +3077,6 @@ pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) {
    // we leave the BROKEN_TENANTS_SET entry if any
 }

-use futures::Future;
-use pin_project_lite::pin_project;
-use std::collections::HashMap;
-use std::num::NonZeroUsize;
-use std::pin::Pin;
-use std::sync::atomic::AtomicU64;
-use std::sync::{Arc, Mutex};
-use std::task::{Context, Poll};
-use std::time::{Duration, Instant};
-
-use crate::config::PageServerConf;
-use crate::context::{PageContentKind, RequestContext};
-use crate::task_mgr::TaskKind;
-use crate::tenant::mgr::TenantSlot;
-use crate::tenant::tasks::BackgroundLoopKind;
-use crate::tenant::throttle::ThrottleResult;
-use crate::tenant::Timeline;
-
 /// Maintain a per timeline gauge in addition to the global gauge.
 pub(crate) struct PerTimelineRemotePhysicalSizeGauge {
    last_set: AtomicU64,
@@ -3862,7 +3947,8 @@ pub fn preinitialize_metrics(conf: &'static PageServerConf) {

    // histograms
    [
-        &VEC_READ_NUM_LAYERS_VISITED,
+        &LAYERS_PER_READ_GLOBAL,
+        &DELTAS_PER_READ_GLOBAL,
        &WAIT_LSN_TIME,
        &WAL_REDO_TIME,
        &WAL_REDO_RECORDS_HISTOGRAM,
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -46,6 +46,7 @@ use std::sync::atomic::AtomicBool;
 use std::sync::Weak;
 use std::time::SystemTime;
 use storage_broker::BrokerClientChannel;
+use timeline::compaction::CompactionOutcome;
 use timeline::compaction::GcCompactionQueue;
 use timeline::import_pgdata;
 use timeline::offload::offload_timeline;
@@ -95,7 +96,6 @@ use crate::context::{DownloadBehavior, RequestContext};
 use crate::deletion_queue::DeletionQueueClient;
 use crate::deletion_queue::DeletionQueueError;
 use crate::import_datadir;
-use crate::is_uninit_mark;
 use crate::l0_flush::L0FlushGlobalState;
 use crate::metrics::CONCURRENT_INITDBS;
 use crate::metrics::INITDB_RUN_TIME;
@@ -1793,11 +1793,7 @@ impl Tenant {
            let entry = entry.context("read timeline dir entry")?;
            let entry_path = entry.path();

-            let purge = if crate::is_temporary(entry_path)
-                // TODO: remove uninit mark code (https://github.com/neondatabase/neon/issues/5718)
-                || is_uninit_mark(entry_path)
-                || crate::is_delete_mark(entry_path)
-            {
+            let purge = if crate::is_temporary(entry_path) {
                true
            } else {
                match TimelineId::try_from(entry_path.file_name()) {
@@ -2426,7 +2422,7 @@ impl Tenant {
        // Make sure the freeze_and_flush reaches remote storage.
        tline.remote_client.wait_completion().await.unwrap();

-        let tl = uninit_tl.finish_creation()?;
+        let tl = uninit_tl.finish_creation().await?;
        // The non-test code would call tl.activate() here.
        tl.set_state(TimelineState::Active);
        Ok(tl)
@@ -2912,10 +2908,10 @@ impl Tenant {
        self: &Arc<Self>,
        cancel: &CancellationToken,
        ctx: &RequestContext,
-    ) -> Result<bool, timeline::CompactionError> {
+    ) -> Result<CompactionOutcome, timeline::CompactionError> {
        // Don't start doing work during shutdown, or when broken, we do not need those in the logs
        if !self.is_active() {
-            return Ok(false);
+            return Ok(CompactionOutcome::Done);
        }

        {
@@ -2929,7 +2925,7 @@ impl Tenant {
            // to AttachedSingle state.
            if !conf.location.may_upload_layers_hint() {
                info!("Skipping compaction in location state {:?}", conf.location);
-                return Ok(false);
+                return Ok(CompactionOutcome::Done);
            }
        }

@@ -2972,7 +2968,7 @@ impl Tenant {
        // Before doing any I/O work, check our circuit breaker
        if self.compaction_circuit_breaker.lock().unwrap().is_broken() {
            info!("Skipping compaction due to previous failures");
-            return Ok(false);
+            return Ok(CompactionOutcome::Done);
        }

        let mut has_pending_task = false;
@@ -2980,10 +2976,10 @@ impl Tenant {
        for (timeline_id, timeline, (can_compact, can_offload)) in &timelines_to_compact_or_offload
        {
            // pending_task_left == None: cannot compact, maybe still pending tasks
-            // pending_task_left == Some(true): compaction task left
-            // pending_task_left == Some(false): no compaction task left
+            // pending_task_left == Some(Pending): compaction task left
+            // pending_task_left == Some(Done): no compaction task left
            let pending_task_left = if *can_compact {
-                let has_pending_l0_compaction_task = timeline
+                let compaction_outcome = timeline
                    .compact(cancel, EnumSet::empty(), ctx)
                    .instrument(info_span!("compact_timeline", %timeline_id))
                    .await
@@ -3001,27 +2997,27 @@ impl Tenant {
                                .fail(&CIRCUIT_BREAKERS_BROKEN, e);
                        }
                    })?;
-                if has_pending_l0_compaction_task {
-                    Some(true)
+                if let CompactionOutcome::Pending = compaction_outcome {
+                    Some(CompactionOutcome::Pending)
                } else {
                    let queue = {
                        let guard = self.scheduled_compaction_tasks.lock().unwrap();
                        guard.get(timeline_id).cloned()
                    };
                    if let Some(queue) = queue {
-                        let has_pending_tasks = queue
+                        let outcome = queue
                            .iteration(cancel, ctx, &self.gc_block, timeline)
                            .await?;
-                        Some(has_pending_tasks)
+                        Some(outcome)
                    } else {
-                        Some(false)
+                        Some(CompactionOutcome::Done)
                    }
                }
            } else {
                None
            };
-            has_pending_task |= pending_task_left.unwrap_or(false);
-            if pending_task_left == Some(false) && *can_offload {
+            has_pending_task |= pending_task_left == Some(CompactionOutcome::Pending);
+            if pending_task_left == Some(CompactionOutcome::Done) && *can_offload {
                pausable_failpoint!("before-timeline-auto-offload");
                match offload_timeline(self, timeline)
                    .instrument(info_span!("offload_timeline", %timeline_id))
@@ -3041,7 +3037,11 @@ impl Tenant {
            .unwrap()
            .success(&CIRCUIT_BREAKERS_UNBROKEN);

-        Ok(has_pending_task)
+        Ok(if has_pending_task {
+            CompactionOutcome::Pending
+        } else {
+            CompactionOutcome::Done
+        })
    }

    /// Cancel scheduled compaction tasks
@@ -4702,7 +4702,7 @@ impl Tenant {
            )
            .await?;

-        let new_timeline = uninitialized_timeline.finish_creation()?;
+        let new_timeline = uninitialized_timeline.finish_creation().await?;

        // Root timeline gets its layers during creation and uploads them along with the metadata.
        // A branch timeline though, when created, can get no writes for some time, hence won't get any layers created.
@@ -4892,10 +4892,11 @@ impl Tenant {
        }

        // this new directory is very temporary, set to remove it immediately after bootstrap, we don't need it
+        let pgdata_path_deferred = pgdata_path.clone();
        scopeguard::defer! {
-            if let Err(e) = fs::remove_dir_all(&pgdata_path) {
+            if let Err(e) = fs::remove_dir_all(&pgdata_path_deferred) {
                // this is unlikely, but we will remove the directory on pageserver restart or another bootstrap call
-                error!("Failed to remove temporary initdb directory '{pgdata_path}': {e}");
+                error!("Failed to remove temporary initdb directory '{pgdata_path_deferred}': {e}");
            }
        }
        if let Some(existing_initdb_timeline_id) = load_existing_initdb {
@@ -4962,7 +4963,7 @@ impl Tenant {
            pgdata_lsn,
            pg_version,
        );
-        let raw_timeline = self
+        let mut raw_timeline = self
            .prepare_new_timeline(
                timeline_id,
                &new_metadata,
@@ -4973,42 +4974,33 @@ impl Tenant {
            .await?;

        let tenant_shard_id = raw_timeline.owning_tenant.tenant_shard_id;
-        let unfinished_timeline = raw_timeline.raw_timeline()?;
-
-        // Flush the new layer files to disk, before we make the timeline as available to
-        // the outside world.
-        //
-        // Flush loop needs to be spawned in order to be able to flush.
-        unfinished_timeline.maybe_spawn_flush_loop();
-
-        import_datadir::import_timeline_from_postgres_datadir(
-            unfinished_timeline,
-            &pgdata_path,
-            pgdata_lsn,
-            ctx,
-        )
-        .await
-        .with_context(|| {
-            format!("Failed to import pgdatadir for timeline {tenant_shard_id}/{timeline_id}")
-        })?;
-
-        fail::fail_point!("before-checkpoint-new-timeline", |_| {
-            Err(CreateTimelineError::Other(anyhow::anyhow!(
-                "failpoint before-checkpoint-new-timeline"
-            )))
-        });
-
-        unfinished_timeline
-            .freeze_and_flush()
-            .await
-            .with_context(|| {
-                format!(
-                    "Failed to flush after pgdatadir import for timeline {tenant_shard_id}/{timeline_id}"
+        raw_timeline
+            .write(|unfinished_timeline| async move {
+                import_datadir::import_timeline_from_postgres_datadir(
+                    &unfinished_timeline,
+                    &pgdata_path,
+                    pgdata_lsn,
+                    ctx,
                )
-            })?;
+                .await
+                .with_context(|| {
+                    format!(
+                        "Failed to import pgdatadir for timeline {tenant_shard_id}/{timeline_id}"
+                    )
+                })?;
+
+                fail::fail_point!("before-checkpoint-new-timeline", |_| {
+                    Err(CreateTimelineError::Other(anyhow::anyhow!(
+                        "failpoint before-checkpoint-new-timeline"
+                    )))
+                });
+
+                Ok(())
+            })
+            .await?;

        // All done!
-        let timeline = raw_timeline.finish_creation()?;
+        let timeline = raw_timeline.finish_creation().await?;

        // Callers are responsible to wait for uploads to complete and for activating the timeline.

@@ -5499,6 +5491,9 @@ pub(crate) mod harness {
                image_layer_creation_check_threshold: Some(
                    tenant_conf.image_layer_creation_check_threshold,
                ),
+                image_creation_preempt_threshold: Some(
+                    tenant_conf.image_creation_preempt_threshold,
+                ),
                lsn_lease_length: Some(tenant_conf.lsn_lease_length),
                lsn_lease_length_for_ts: Some(tenant_conf.lsn_lease_length_for_ts),
                timeline_offloading: Some(tenant_conf.timeline_offloading),
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -357,6 +357,9 @@ pub struct TenantConfOpt {
    #[serde(skip_serializing_if = "Option::is_none")]
    pub image_layer_creation_check_threshold: Option<u8>,

+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub image_creation_preempt_threshold: Option<usize>,
+
    #[serde(skip_serializing_if = "Option::is_none")]
    #[serde(with = "humantime_serde")]
    #[serde(default)]
@@ -453,6 +456,9 @@ impl TenantConfOpt {
            image_layer_creation_check_threshold: self
                .image_layer_creation_check_threshold
                .unwrap_or(global_conf.image_layer_creation_check_threshold),
+            image_creation_preempt_threshold: self
+                .image_creation_preempt_threshold
+                .unwrap_or(global_conf.image_creation_preempt_threshold),
            lsn_lease_length: self
                .lsn_lease_length
                .unwrap_or(global_conf.lsn_lease_length),
@@ -504,6 +510,7 @@ impl TenantConfOpt {
            mut lazy_slru_download,
            mut timeline_get_throttle,
            mut image_layer_creation_check_threshold,
+            mut image_creation_preempt_threshold,
            mut lsn_lease_length,
            mut lsn_lease_length_for_ts,
            mut timeline_offloading,
@@ -578,6 +585,9 @@ impl TenantConfOpt {
        patch
            .image_layer_creation_check_threshold
            .apply(&mut image_layer_creation_check_threshold);
+        patch
+            .image_creation_preempt_threshold
+            .apply(&mut image_creation_preempt_threshold);
        patch
            .lsn_lease_length
            .map(|v| humantime::parse_duration(&v))?
@@ -626,6 +636,7 @@ impl TenantConfOpt {
            lazy_slru_download,
            timeline_get_throttle,
            image_layer_creation_check_threshold,
+            image_creation_preempt_threshold,
            lsn_lease_length,
            lsn_lease_length_for_ts,
            timeline_offloading,
@@ -689,6 +700,7 @@ impl From<TenantConfOpt> for models::TenantConfig {
            lazy_slru_download: value.lazy_slru_download,
            timeline_get_throttle: value.timeline_get_throttle,
            image_layer_creation_check_threshold: value.image_layer_creation_check_threshold,
+            image_creation_preempt_threshold: value.image_creation_preempt_threshold,
            lsn_lease_length: value.lsn_lease_length.map(humantime),
            lsn_lease_length_for_ts: value.lsn_lease_length_for_ts.map(humantime),
            timeline_offloading: value.timeline_offloading,
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -673,12 +673,30 @@ impl<'a> TenantDownloader<'a> {
            HeatMapDownload::Modified(m) => m,
        };

-        let heatmap = serde_json::from_slice::<HeatMapTenant>(&heatmap_bytes)?;
-
-        // Save the heatmap: this will be useful on restart, allowing us to reconstruct
-        // layer metadata without having to re-download it.
+        // Heatmap storage location
        let heatmap_path = self.conf.tenant_heatmap_path(tenant_shard_id);

+        let last_heatmap = if last_download.is_none() {
+            match load_heatmap(&heatmap_path, ctx).await {
+                Ok(htm) => htm,
+                Err(e) => {
+                    tracing::warn!("Couldn't load heatmap from {heatmap_path}: {e:?}");
+                    None
+                }
+            }
+        } else {
+            None
+        };
+
+        let last_heatmap_timelines = last_heatmap.as_ref().map(|htm| {
+            htm.timelines
+                .iter()
+                .map(|tl| (tl.timeline_id, tl))
+                .collect::<HashMap<_, _>>()
+        });
+
+        let heatmap = serde_json::from_slice::<HeatMapTenant>(&heatmap_bytes)?;
+
        let temp_path = path_with_suffix_extension(&heatmap_path, TEMP_FILE_SUFFIX);
        let context_msg = format!("write tenant {tenant_shard_id} heatmap to {heatmap_path}");
        let heatmap_path_bg = heatmap_path.clone();
@@ -707,10 +725,17 @@ impl<'a> TenantDownloader<'a> {
            let timeline_state = match timeline_state {
                Some(t) => t,
                None => {
+                    let last_heatmap =
+                        last_heatmap_timelines
+                            .as_ref()
+                            .and_then(|last_heatmap_timelines| {
+                                last_heatmap_timelines.get(&timeline.timeline_id).copied()
+                            });
                    // We have no existing state: need to scan local disk for layers first.
                    let timeline_state = init_timeline_state(
                        self.conf,
                        tenant_shard_id,
+                        last_heatmap,
                        timeline,
                        &self.secondary_state.resident_size_metric,
                    )
@@ -1079,12 +1104,12 @@ impl<'a> TenantDownloader<'a> {
                }
            }

-            if on_disk.metadata.generation_file_size() != on_disk.metadata.generation_file_size() {
+            if on_disk.metadata.generation_file_size() != layer.metadata.generation_file_size() {
                tracing::info!(
                    "Re-downloading layer {} with changed size or generation: {:?}->{:?}",
                    layer.name,
                    on_disk.metadata.generation_file_size(),
-                    on_disk.metadata.generation_file_size()
+                    layer.metadata.generation_file_size()
                );
                return LayerAction::Download;
            }
@@ -1277,6 +1302,7 @@ impl<'a> TenantDownloader<'a> {
 async fn init_timeline_state(
    conf: &'static PageServerConf,
    tenant_shard_id: &TenantShardId,
+    last_heatmap: Option<&HeatMapTimeline>,
    heatmap: &HeatMapTimeline,
    resident_metric: &UIntGauge,
 ) -> SecondaryDetailTimeline {
@@ -1306,6 +1332,13 @@ async fn init_timeline_state(
    let heatmap_metadata: HashMap<&LayerName, &HeatMapLayer> =
        heatmap.layers.iter().map(|l| (&l.name, l)).collect();

+    let last_heatmap_metadata: HashMap<&LayerName, &HeatMapLayer> =
+        if let Some(last_heatmap) = last_heatmap {
+            last_heatmap.layers.iter().map(|l| (&l.name, l)).collect()
+        } else {
+            HashMap::new()
+        };
+
    while let Some(dentry) = dir
        .next_entry()
        .await
@@ -1339,18 +1372,32 @@ async fn init_timeline_state(
        match LayerName::from_str(file_name) {
            Ok(name) => {
                let remote_meta = heatmap_metadata.get(&name);
+                let last_meta = last_heatmap_metadata.get(&name);
+                let mut remove = false;
                match remote_meta {
                    Some(remote_meta) => {
+                        let last_meta_generation_file_size = last_meta
+                            .map(|m| m.metadata.generation_file_size())
+                            .unwrap_or(remote_meta.metadata.generation_file_size());
                        // TODO: checksums for layers (https://github.com/neondatabase/neon/issues/2784)
-                        if local_meta.len() != remote_meta.metadata.file_size {
-                            // This should not happen, because we do crashsafe write-then-rename when downloading
-                            // layers, and layers in remote storage are immutable.  Remove the local file because
-                            // we cannot trust it.
-                            tracing::warn!(
+                        if remote_meta.metadata.generation_file_size()
+                            != last_meta_generation_file_size
+                        {
+                            tracing::info!(
+                                "Removing local layer {name} as on-disk json metadata has different generation or file size from remote: {:?} -> {:?}",
+                                last_meta_generation_file_size,
+                                remote_meta.metadata.generation_file_size()
+                            );
+                            remove = true;
+                        } else if local_meta.len() != remote_meta.metadata.file_size {
+                            // This can happen in the presence of race conditions: the remote and on-disk metadata have changed, but we haven't had
+                            // the chance yet to download the new layer to disk, before the process restarted.
+                            tracing::info!(
                                "Removing local layer {name} with unexpected local size {} != {}",
                                local_meta.len(),
                                remote_meta.metadata.file_size
                            );
+                            remove = true;
                        } else {
                            // We expect the access time to be initialized immediately afterwards, when
                            // the latest heatmap is applied to the state.
@@ -1372,15 +1419,18 @@ async fn init_timeline_state(
                            "Removing secondary local layer {} because it's absent in heatmap",
                            name
                        );
-                        tokio::fs::remove_file(&dentry.path())
-                            .await
-                            .or_else(fs_ext::ignore_not_found)
-                            .fatal_err(&format!(
-                                "Removing layer {}",
-                                dentry.path().to_string_lossy()
-                            ));
+                        remove = true;
                    }
                }
+                if remove {
+                    tokio::fs::remove_file(&dentry.path())
+                        .await
+                        .or_else(fs_ext::ignore_not_found)
+                        .fatal_err(&format!(
+                            "Removing layer {}",
+                            dentry.path().to_string_lossy()
+                        ));
+                }
            }
            Err(_) => {
                // Ignore it.
@@ -1391,3 +1441,18 @@ async fn init_timeline_state(

    detail
 }
+
+/// Reads the json-encoded heatmap stored at `path`; a missing file yields `Ok(None)`, not an error.
+async fn load_heatmap(
+    path: &Utf8PathBuf,
+    ctx: &RequestContext,
+) -> Result<Option<HeatMapTenant>, anyhow::Error> {
+    let mut heatmap_file = match VirtualFile::open(path, ctx).await {
+        Err(err) if err.kind() == std::io::ErrorKind::NotFound => return Ok(None),
+        Err(err) => return Err(err.into()),
+        Ok(opened) => opened,
+    };
+    // Any read or deserialization failure is propagated to the caller.
+    let contents = heatmap_file.read_to_string(ctx).await?;
+    Ok(Some(serde_json::from_str(&contents)?))
+}
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -80,6 +80,16 @@ pub(crate) struct ValueReconstructState {
    pub(crate) img: Option<(Lsn, Bytes)>,
 }

+impl ValueReconstructState {
+    /// Returns the number of page deltas applied to the page image; 0 when no records were collected.
+    pub fn num_deltas(&self) -> usize {
+        match self.img {
+            Some(_) => self.records.len(),
+            None => self.records.len().saturating_sub(1), // omit will_init record
+        }
+    }
+}
+
 #[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
 pub(crate) enum ValueReconstructSituation {
    Complete,
--- a/pageserver/src/tenant/storage_layer/batch_split_writer.rs
+++ b/pageserver/src/tenant/storage_layer/batch_split_writer.rs
@@ -166,6 +166,10 @@ impl BatchLayerWriter {
        // END: catch every error and do the recovery in the above section
        Ok(generated_layers)
    }
+
+    pub fn pending_layer_num(&self) -> usize {
+        self.generated_layer_writers.len() // number of entries currently held in `generated_layer_writers`
+    }
 }

 /// An image writer that takes images and produces multiple image layers.
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -340,7 +340,7 @@ impl Layer {
    /// Download the layer if evicted.
    ///
    /// Will not error when the layer is already downloaded.
-    pub(crate) async fn download(&self) -> anyhow::Result<()> {
+    pub(crate) async fn download(&self) -> Result<(), DownloadError> {
        self.0.get_or_maybe_download(true, None).await?;
        Ok(())
    }
@@ -701,13 +701,7 @@ impl Drop for LayerInner {
        if let Some(timeline) = timeline.as_ref() {
            // Only need to decrement metrics if the timeline still exists: otherwise
            // it will have already de-registered these metrics via TimelineMetrics::shutdown
-            if self.desc.is_delta() {
-                timeline.metrics.layer_count_delta.dec();
-                timeline.metrics.layer_size_delta.sub(self.desc.file_size);
-            } else {
-                timeline.metrics.layer_count_image.dec();
-                timeline.metrics.layer_size_image.sub(self.desc.file_size);
-            }
+            timeline.metrics.dec_layer(&self.desc);

            if matches!(self.access_stats.visibility(), LayerVisibilityHint::Visible) {
                debug_assert!(
@@ -817,13 +811,7 @@ impl LayerInner {
        };

        // This object acts as a RAII guard on these metrics: increment on construction
-        if desc.is_delta() {
-            timeline.metrics.layer_count_delta.inc();
-            timeline.metrics.layer_size_delta.add(desc.file_size);
-        } else {
-            timeline.metrics.layer_count_image.inc();
-            timeline.metrics.layer_size_image.add(desc.file_size);
-        }
+        timeline.metrics.inc_layer(&desc);

        // New layers are visible by default. This metric is later updated on drop or in set_visibility
        timeline
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -11,6 +11,7 @@ use crate::metrics::TENANT_TASK_EVENTS;
 use crate::task_mgr;
 use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME};
 use crate::tenant::throttle::Stats;
+use crate::tenant::timeline::compaction::CompactionOutcome;
 use crate::tenant::timeline::CompactionError;
 use crate::tenant::{Tenant, TenantState};
 use rand::Rng;
@@ -206,10 +207,10 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                    .run(tenant.compaction_iteration(&cancel, &ctx))
                    .await;
                match output {
-                    Ok(has_pending_task) => {
+                    Ok(outcome) => {
                        error_run_count = 0;
                        // schedule the next compaction immediately in case there is a pending compaction task
-                        sleep_duration = if has_pending_task {
+                        sleep_duration = if let CompactionOutcome::Pending = outcome {
                            Duration::ZERO
                        } else {
                            period
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -18,6 +18,7 @@ use arc_swap::{ArcSwap, ArcSwapOption};
 use bytes::Bytes;
 use camino::Utf8Path;
 use chrono::{DateTime, Utc};
+use compaction::CompactionOutcome;
 use enumset::EnumSet;
 use fail::fail_point;
 use futures::{stream::FuturesUnordered, StreamExt};
@@ -51,6 +52,7 @@ use tokio::{
 };
 use tokio_util::sync::CancellationToken;
 use tracing::*;
+use utils::rate_limit::RateLimit;
 use utils::{
    fs_ext,
    guard_arc_swap::GuardArcSwap,
@@ -115,7 +117,7 @@ use pageserver_api::config::tenant_conf_defaults::DEFAULT_PITR_INTERVAL;

 use crate::config::PageServerConf;
 use crate::keyspace::{KeyPartitioning, KeySpace};
-use crate::metrics::TimelineMetrics;
+use crate::metrics::{TimelineMetrics, DELTAS_PER_READ_GLOBAL, LAYERS_PER_READ_GLOBAL};
 use crate::pgdatadir_mapping::CalculateLogicalSizeError;
 use crate::tenant::config::TenantConfOpt;
 use pageserver_api::reltag::RelTag;
@@ -188,6 +190,14 @@ pub enum ImageLayerCreationMode {
    Initial,
 }

+#[derive(Clone, Debug, Default)]
+pub enum LastImageLayerCreationStatus {
+    Incomplete, // TODO: record the last key being processed
+    Complete, // e.g. required of initial-mode runs, which must fully cover the keyspace
+    #[default]
+    Initial, // default; the value a `Timeline` is constructed with
+}
+
 impl std::fmt::Display for ImageLayerCreationMode {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{:?}", self)
@@ -340,10 +350,14 @@ pub struct Timeline {
    // Needed to ensure that we can't create a branch at a point that was already garbage collected
    pub latest_gc_cutoff_lsn: Rcu<Lsn>,

+    pub(crate) gc_compaction_layer_update_lock: tokio::sync::RwLock<()>,
+
    // List of child timelines and their branch points. This is needed to avoid
    // garbage collecting data that is still needed by the child timelines.
    pub(crate) gc_info: std::sync::RwLock<GcInfo>,

+    pub(crate) last_image_layer_creation_status: ArcSwap<LastImageLayerCreationStatus>,
+
    // It may change across major versions so for simplicity
    // keep it after running initdb for a timeline.
    // It is needed in checks when we want to error on some operations
@@ -933,9 +947,16 @@ pub(crate) enum ShutdownMode {
    Hard,
 }

-struct ImageLayerCreationOutcome {
-    unfinished_image_layer: Option<ImageLayerWriter>,
-    next_start_key: Key,
+enum ImageLayerCreationOutcome {
+    /// We generated an image layer; its writer is returned in `unfinished_image_layer`
+    Generated {
+        unfinished_image_layer: ImageLayerWriter,
+    },
+    /// The key range is empty (no data was found in it)
+    Empty,
+    /// Only used in metadata image layer creation: after reading the metadata keys, we decided to
+    /// skip the image layer creation.
+    Skip,
 }

 /// Public interface functions
@@ -1044,7 +1065,7 @@ impl Timeline {
    }

    pub(crate) const MAX_GET_VECTORED_KEYS: u64 = 32;
-    pub(crate) const VEC_GET_LAYERS_VISITED_WARN_THRESH: f64 = 512.0;
+    pub(crate) const LAYERS_VISITED_WARN_THRESHOLD: u32 = 100;

    /// Look up multiple page versions at a given LSN
    ///
@@ -1194,6 +1215,7 @@ impl Timeline {
                            return (key, Err(err));
                        }
                    };
+                    DELTAS_PER_READ_GLOBAL.observe(converted.num_deltas() as f64);

                    // The walredo module expects the records to be descending in terms of Lsn.
                    // And we submit the IOs in that order, so, there shuold be no need to sort here.
@@ -1221,25 +1243,28 @@ impl Timeline {
        // (this is a requirement, not a bug). Skip updating the metric in these cases
        // to avoid infinite results.
        if !results.is_empty() {
-            let avg = layers_visited as f64 / results.len() as f64;
-            if avg >= Self::VEC_GET_LAYERS_VISITED_WARN_THRESH {
-                use utils::rate_limit::RateLimit;
-                static LOGGED: Lazy<Mutex<RateLimit>> =
+            // Record the total number of layers visited towards each key in the batch. While some
+            // layers may not intersect with a given read, and the cost of layer visits is
+            // amortized across the batch, each visited layer contributes directly to the observed
+            // latency for every read in the batch, which is what we care about.
+            if layers_visited >= Self::LAYERS_VISITED_WARN_THRESHOLD {
+                static LOG_PACER: Lazy<Mutex<RateLimit>> =
                    Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(60))));
-                let mut rate_limit = LOGGED.lock().unwrap();
-                rate_limit.call(|| {
+                LOG_PACER.lock().unwrap().call(|| {
+                    let num_keys = keyspace.total_raw_size();
+                    let num_pages = results.len();
                    tracing::info!(
                      shard_id = %self.tenant_shard_id.shard_slug(),
                      lsn = %lsn,
-                      "Vectored read for {} visited {} layers on average per key and {} in total. {}/{} pages were returned",
-                      keyspace, avg, layers_visited, results.len(), keyspace.total_raw_size());
+                      "Vectored read for {keyspace} visited {layers_visited} layers. Returned {num_pages}/{num_keys} pages.",
+                    );
                });
            }

-            // Note that this is an approximation. Tracking the exact number of layers visited
-            // per key requires virtually unbounded memory usage and is inefficient
-            // (i.e. segment tree tracking each range queried from a layer)
-            crate::metrics::VEC_READ_NUM_LAYERS_VISITED.observe(avg);
+            for _ in &results {
+                self.metrics.layers_per_read.observe(layers_visited as f64);
+                LAYERS_PER_READ_GLOBAL.observe(layers_visited as f64);
+            }
        }

        Ok(results)
@@ -1655,7 +1680,7 @@ impl Timeline {
        cancel: &CancellationToken,
        flags: EnumSet<CompactFlags>,
        ctx: &RequestContext,
-    ) -> Result<bool, CompactionError> {
+    ) -> Result<CompactionOutcome, CompactionError> {
        self.compact_with_options(
            cancel,
            CompactOptions {
@@ -1677,7 +1702,7 @@ impl Timeline {
        cancel: &CancellationToken,
        options: CompactOptions,
        ctx: &RequestContext,
-    ) -> Result<bool, CompactionError> {
+    ) -> Result<CompactionOutcome, CompactionError> {
        // most likely the cancellation token is from background task, but in tests it could be the
        // request task as well.

@@ -1697,8 +1722,8 @@ impl Timeline {
        // compaction task goes over it's period (20s) which is quite often in production.
        let (_guard, _permit) = tokio::select! {
            tuple = prepare => { tuple },
-            _ = self.cancel.cancelled() => return Ok(false),
-            _ = cancel.cancelled() => return Ok(false),
+            _ = self.cancel.cancelled() => return Ok(CompactionOutcome::Done),
+            _ = cancel.cancelled() => return Ok(CompactionOutcome::Done),
        };

        let last_record_lsn = self.get_last_record_lsn();
@@ -1706,13 +1731,13 @@ impl Timeline {
        // Last record Lsn could be zero in case the timeline was just created
        if !last_record_lsn.is_valid() {
            warn!("Skipping compaction for potentially just initialized timeline, it has invalid last record lsn: {last_record_lsn}");
-            return Ok(false);
+            return Ok(CompactionOutcome::Done);
        }

        let result = match self.get_compaction_algorithm_settings().kind {
            CompactionAlgorithm::Tiered => {
                self.compact_tiered(cancel, ctx).await?;
-                Ok(false)
+                Ok(CompactionOutcome::Done)
            }
            CompactionAlgorithm::Legacy => self.compact_legacy(cancel, options, ctx).await,
        };
@@ -1811,7 +1836,7 @@ impl Timeline {
        self.last_record_lsn.shutdown();

        if let ShutdownMode::FreezeAndFlush = mode {
-            if let Some((open, frozen)) = self
+            let do_flush = if let Some((open, frozen)) = self
                .layers
                .read()
                .await
@@ -1820,43 +1845,54 @@ impl Timeline {
                .ok()
                .filter(|(open, frozen)| *open || *frozen > 0)
            {
-                tracing::info!(?open, frozen, "flushing and freezing on shutdown");
+                if self.remote_client.is_archived() == Some(true) {
+                    // No point flushing on shutdown for an archived timeline: it is not important
+                    // to have it nice and fresh after our restart, and trying to flush here might
+                    // race with trying to offload it (which also stops the flush loop)
+                    false
+                } else {
+                    tracing::info!(?open, frozen, "flushing and freezing on shutdown");
+                    true
+                }
            } else {
-                // this is double-shutdown, ignore it
-            }
+                // this is double-shutdown, it'll be a no-op
+                true
+            };

            // we shut down walreceiver above, so, we won't add anything more
            // to the InMemoryLayer; freeze it and wait for all frozen layers
            // to reach the disk & upload queue, then shut the upload queue and
            // wait for it to drain.
-            match self.freeze_and_flush().await {
-                Ok(_) => {
-                    // drain the upload queue
-                    // if we did not wait for completion here, it might be our shutdown process
-                    // didn't wait for remote uploads to complete at all, as new tasks can forever
-                    // be spawned.
-                    //
-                    // what is problematic is the shutting down of RemoteTimelineClient, because
-                    // obviously it does not make sense to stop while we wait for it, but what
-                    // about corner cases like s3 suddenly hanging up?
-                    self.remote_client.shutdown().await;
+            if do_flush {
+                match self.freeze_and_flush().await {
+                    Ok(_) => {
+                        // drain the upload queue
+                        // if we did not wait for completion here, it might be our shutdown process
+                        // didn't wait for remote uploads to complete at all, as new tasks can forever
+                        // be spawned.
+                        //
+                        // what is problematic is the shutting down of RemoteTimelineClient, because
+                        // obviously it does not make sense to stop while we wait for it, but what
+                        // about corner cases like s3 suddenly hanging up?
+                        self.remote_client.shutdown().await;
+                    }
+                    Err(FlushLayerError::Cancelled) => {
+                        // this is likely the second shutdown, ignore silently.
+                        // TODO: this can be removed once https://github.com/neondatabase/neon/issues/5080
+                        debug_assert!(self.cancel.is_cancelled());
+                    }
+                    Err(e) => {
+                        // Non-fatal.  Shutdown is infallible.  Failures to flush just mean that
+                        // we have some extra WAL replay to do next time the timeline starts.
+                        warn!("failed to freeze and flush: {e:#}");
+                    }
                }
-                Err(FlushLayerError::Cancelled) => {
-                    // this is likely the second shutdown, ignore silently.
-                    // TODO: this can be removed once https://github.com/neondatabase/neon/issues/5080
-                    debug_assert!(self.cancel.is_cancelled());
-                }
-                Err(e) => {
-                    // Non-fatal.  Shutdown is infallible.  Failures to flush just mean that
-                    // we have some extra WAL replay to do next time the timeline starts.
-                    warn!("failed to freeze and flush: {e:#}");
-                }
-            }

-            // `self.remote_client.shutdown().await` above should have already flushed everything from the queue, but
-            // we also do a final check here to ensure that the queue is empty.
-            if !self.remote_client.no_pending_work() {
-                warn!("still have pending work in remote upload queue, but continuing shutting down anyways");
+                // `self.remote_client.shutdown().await` above should have already flushed everything from the queue, but
+                // we also do a final check here to ensure that the queue is empty.
+                if !self.remote_client.no_pending_work() {
+                    warn!("still have pending work in remote upload queue, but continuing shutting down anyways");
+                }
            }
        }

@@ -2021,8 +2057,16 @@ impl Timeline {
    pub(crate) async fn download_layer(
        &self,
        layer_file_name: &LayerName,
-    ) -> anyhow::Result<Option<bool>> {
-        let Some(layer) = self.find_layer(layer_file_name).await? else {
+    ) -> Result<Option<bool>, super::storage_layer::layer::DownloadError> {
+        let Some(layer) = self
+            .find_layer(layer_file_name)
+            .await
+            .map_err(|e| match e {
+                layer_manager::Shutdown => {
+                    super::storage_layer::layer::DownloadError::TimelineShutdown
+                }
+            })?
+        else {
            return Ok(None);
        };

@@ -2323,6 +2367,18 @@ impl Timeline {
            )
    }

+    /// Resolve the image creation preempt threshold that applies to this timeline.
+    ///
+    /// A value set in the tenant config wins; otherwise `self.conf.default_tenant_conf` supplies it.
+    fn get_image_creation_preempt_threshold(&self) -> usize {
+        let global_default = self.conf.default_tenant_conf.image_creation_preempt_threshold;
+        let loaded = self.tenant_conf.load();
+        match loaded.tenant_conf.image_creation_preempt_threshold {
+            Some(tenant_override) => tenant_override,
+            None => global_default,
+        }
+    }
+
    /// Resolve the effective WAL receiver protocol to use for this tenant.
    ///
    /// Priority order is:
@@ -2432,6 +2488,7 @@ impl Timeline {
                shard_identity,
                pg_version,
                layers: Default::default(),
+                gc_compaction_layer_update_lock: tokio::sync::RwLock::new(()),

                walredo_mgr,
                walreceiver: Mutex::new(None),
@@ -2472,6 +2529,10 @@ impl Timeline {

                gc_info: std::sync::RwLock::new(GcInfo::default()),

+                last_image_layer_creation_status: ArcSwap::new(Arc::new(
+                    LastImageLayerCreationStatus::default(),
+                )),
+
                latest_gc_cutoff_lsn: Rcu::new(metadata.latest_gc_cutoff_lsn()),
                initdb_lsn: metadata.initdb_lsn(),

@@ -3475,6 +3536,9 @@ impl Timeline {
        // image layer).
        let _gc_cutoff_holder = timeline.get_latest_gc_cutoff_lsn();

+        // See `compaction::compact_with_gc` for why we need this.
+        let _guard = timeline.gc_compaction_layer_update_lock.read().await;
+
        loop {
            if cancel.is_cancelled() {
                return Err(GetVectoredError::Cancelled);
@@ -3703,7 +3767,7 @@ impl Timeline {
            let mut guard = self.layers.write().await;
            guard
                .open_mut()?
-                .try_freeze_in_memory_layer(at, &self.last_freeze_at, write_lock)
+                .try_freeze_in_memory_layer(at, &self.last_freeze_at, write_lock, &self.metrics)
                .await
        };

@@ -4012,15 +4076,20 @@ impl Timeline {
            }

            let mut layers_to_upload = Vec::new();
-            layers_to_upload.extend(
-                self.create_image_layers(
+            let (generated_image_layers, is_complete) = self
+                .create_image_layers(
                    &partitions,
                    self.initdb_lsn,
                    ImageLayerCreationMode::Initial,
                    ctx,
+                    LastImageLayerCreationStatus::Initial,
                )
-                .await?,
+                .await?;
+            debug_assert!(
+                matches!(is_complete, LastImageLayerCreationStatus::Complete),
+                "init image generation mode must fully cover the keyspace"
            );
+            layers_to_upload.extend(generated_image_layers);

            (layers_to_upload, None)
        } else {
@@ -4340,7 +4409,6 @@ impl Timeline {
        lsn: Lsn,
        ctx: &RequestContext,
        img_range: Range<Key>,
-        start: Key,
        io_concurrency: IoConcurrency,
    ) -> Result<ImageLayerCreationOutcome, CreateImageLayersError> {
        let mut wrote_keys = false;
@@ -4428,26 +4496,23 @@ impl Timeline {
                    lsn
                },
            );
-            Ok(ImageLayerCreationOutcome {
-                unfinished_image_layer: Some(image_layer_writer),
-                next_start_key: img_range.end,
+            Ok(ImageLayerCreationOutcome::Generated {
+                unfinished_image_layer: image_layer_writer,
            })
        } else {
-            // Special case: the image layer may be empty if this is a sharded tenant and the
-            // partition does not cover any keys owned by this shard.  In this case, to ensure
-            // we don't leave gaps between image layers, leave `start` where it is, so that the next
-            // layer we write will cover the key range that we just scanned.
            tracing::debug!("no data in range {}-{}", img_range.start, img_range.end);
-            Ok(ImageLayerCreationOutcome {
-                unfinished_image_layer: None,
-                next_start_key: start,
-            })
+            Ok(ImageLayerCreationOutcome::Empty)
        }
    }

    /// Create an image layer for metadata keys. This function produces one image layer for all metadata
    /// keys for now. Because metadata keys cannot exceed basebackup size limit, the image layer for it
    /// would not be too large to fit in a single image layer.
+    ///
+    /// Creating image layers for metadata keys is different from relational keys. Firstly, instead of
+    /// iterating over each key and getting an image for each of them, we do a `vectored_get` scan over the sparse
+    /// keyspace to get all images in one run. Secondly, we use a different image layer generation metric
+    /// for metadata keys than for relational keys: the number of delta files visited during the scan.
    #[allow(clippy::too_many_arguments)]
    async fn create_image_layer_for_metadata_keys(
        self: &Arc<Self>,
@@ -4457,12 +4522,13 @@ impl Timeline {
        ctx: &RequestContext,
        img_range: Range<Key>,
        mode: ImageLayerCreationMode,
-        start: Key,
        io_concurrency: IoConcurrency,
    ) -> Result<ImageLayerCreationOutcome, CreateImageLayersError> {
        // Metadata keys image layer creation.
        let mut reconstruct_state = ValuesReconstructState::new(io_concurrency);
        let begin = Instant::now();
+        // Directly use `get_vectored_impl` to skip the max_vectored_read_key limit check. Note that the keyspace should
+        // not contain too many keys, otherwise this takes a lot of memory.
        let data = self
            .get_vectored_impl(partition.clone(), lsn, &mut reconstruct_state, ctx)
            .await?;
@@ -4487,10 +4553,7 @@ impl Timeline {
        );

        if !trigger_generation && mode == ImageLayerCreationMode::Try {
-            return Ok(ImageLayerCreationOutcome {
-                unfinished_image_layer: None,
-                next_start_key: img_range.end,
-            });
+            return Ok(ImageLayerCreationOutcome::Skip);
        }
        if self.cancel.is_cancelled() {
            return Err(CreateImageLayersError::Cancelled);
@@ -4521,20 +4584,12 @@ impl Timeline {
                    lsn
                }
            );
-            Ok(ImageLayerCreationOutcome {
-                unfinished_image_layer: Some(image_layer_writer),
-                next_start_key: img_range.end,
+            Ok(ImageLayerCreationOutcome::Generated {
+                unfinished_image_layer: image_layer_writer,
            })
        } else {
-            // Special case: the image layer may be empty if this is a sharded tenant and the
-            // partition does not cover any keys owned by this shard. In this case, to ensure
-            // we don't leave gaps between image layers, leave `start` where it is, so that the next
-            // layer we write will cover the key range that we just scanned.
            tracing::debug!("no data in range {}-{}", img_range.start, img_range.end);
-            Ok(ImageLayerCreationOutcome {
-                unfinished_image_layer: None,
-                next_start_key: start,
-            })
+            Ok(ImageLayerCreationOutcome::Empty)
        }
    }

@@ -4590,6 +4645,8 @@ impl Timeline {
        decision
    }

+    /// Returns the image layers generated and an enum indicating whether the process is fully completed.
+    /// `Complete` = we have generated all image layers, `Incomplete` = we preempted the process for L0 compaction.
    #[tracing::instrument(skip_all, fields(%lsn, %mode))]
    async fn create_image_layers(
        self: &Arc<Timeline>,
@@ -4597,7 +4654,8 @@ impl Timeline {
        lsn: Lsn,
        mode: ImageLayerCreationMode,
        ctx: &RequestContext,
-    ) -> Result<Vec<ResidentLayer>, CreateImageLayersError> {
+        last_status: LastImageLayerCreationStatus,
+    ) -> Result<(Vec<ResidentLayer>, LastImageLayerCreationStatus), CreateImageLayersError> {
        let timer = self.metrics.create_images_time_histo.start_timer();

        // We need to avoid holes between generated image layers.
@@ -4611,10 +4669,23 @@ impl Timeline {
        // image layers  <100000000..100000099> and <200000000..200000199> are not completely covering it.
        let mut start = Key::MIN;

-        let check_for_image_layers = self.should_check_if_image_layers_required(lsn);
+        let check_for_image_layers = if let LastImageLayerCreationStatus::Incomplete = last_status {
+            info!(
+                "resuming image layer creation: last_status={:?}",
+                last_status
+            );
+            true
+        } else {
+            self.should_check_if_image_layers_required(lsn)
+        };

        let mut batch_image_writer = BatchLayerWriter::new(self.conf).await?;

+        let mut all_generated = true;
+
+        let mut partition_processed = 0;
+        let total_partitions = partitioning.parts.len();
+
        for partition in partitioning.parts.iter() {
            if self.cancel.is_cancelled() {
                return Err(CreateImageLayersError::Cancelled);
@@ -4687,17 +4758,13 @@ impl Timeline {
                    .map_err(|_| CreateImageLayersError::Cancelled)?,
            );

-            let ImageLayerCreationOutcome {
-                unfinished_image_layer,
-                next_start_key,
-            } = if !compact_metadata {
+            let outcome = if !compact_metadata {
                self.create_image_layer_for_rel_blocks(
                    partition,
                    image_layer_writer,
                    lsn,
                    ctx,
                    img_range.clone(),
-                    start,
                    io_concurrency,
                )
                .await?
@@ -4709,18 +4776,58 @@ impl Timeline {
                    ctx,
                    img_range.clone(),
                    mode,
-                    start,
                    io_concurrency,
                )
                .await?
            };
-            start = next_start_key;
-            if let Some(unfinished_image_layer) = unfinished_image_layer {
-                batch_image_writer.add_unfinished_image_writer(
+            match outcome {
+                ImageLayerCreationOutcome::Empty => {
+                    // No data in this partition, so we don't need to create an image layer (for now).
+                    // The next image layer should cover this key range, so we don't advance the `start`
+                    // key.
+                }
+                ImageLayerCreationOutcome::Generated {
                    unfinished_image_layer,
-                    img_range,
-                    lsn,
-                );
+                } => {
+                    batch_image_writer.add_unfinished_image_writer(
+                        unfinished_image_layer,
+                        img_range.clone(),
+                        lsn,
+                    );
+                    // The next image layer should be generated right after this one.
+                    start = img_range.end;
+                }
+                ImageLayerCreationOutcome::Skip => {
+                    // We don't need to create an image layer for this partition.
+                    // The next image layer should NOT cover this range, otherwise
+                    // the keyspace becomes empty (reads don't go past image layers).
+                    start = img_range.end;
+                }
+            }
+
+            partition_processed += 1;
+
+            if let ImageLayerCreationMode::Try = mode {
+                // We have at least made some progress
+                if batch_image_writer.pending_layer_num() >= 1 {
+                    // The `Try` mode is currently only used on the compaction path. We want to avoid
+                    // image layer generation taking too long time and blocking L0 compaction. So in this
+                    // mode, we also inspect the current number of L0 layers and skip image layer generation
+                    // if there are too many of them.
+                    let num_of_l0_layers = {
+                        let layers = self.layers.read().await;
+                        layers.layer_map()?.level0_deltas().len()
+                    };
+                    let image_preempt_threshold = self.get_image_creation_preempt_threshold()
+                        * self.get_compaction_threshold();
+                    if image_preempt_threshold != 0 && num_of_l0_layers >= image_preempt_threshold {
+                        tracing::info!(
+                        "preempt image layer generation at {start} at {lsn}: too many L0 layers {num_of_l0_layers}",
+                    );
+                        all_generated = false;
+                        break;
+                    }
+                }
            }
        }

@@ -4735,14 +4842,35 @@ impl Timeline {
            .open_mut()?
            .track_new_image_layers(&image_layers, &self.metrics);
        drop_wlock(guard);
-        timer.stop_and_record();
+        let duration = timer.stop_and_record();

        // Creating image layers may have caused some previously visible layers to be covered
        if !image_layers.is_empty() {
            self.update_layer_visibility().await?;
        }

-        Ok(image_layers)
+        let total_layer_size = image_layers
+            .iter()
+            .map(|l| l.metadata().file_size)
+            .sum::<u64>();
+
+        info!(
+            "created {} image layers ({} bytes) in {}s, processed {} out of {} partitions",
+            image_layers.len(),
+            total_layer_size,
+            duration.as_secs_f64(),
+            partition_processed,
+            total_partitions
+        );
+
+        Ok((
+            image_layers,
+            if all_generated {
+                LastImageLayerCreationStatus::Complete
+            } else {
+                LastImageLayerCreationStatus::Incomplete
+            },
+        ))
    }

    /// Wait until the background initial logical size calculation is complete, or
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -11,7 +11,7 @@ use std::sync::Arc;
 use super::layer_manager::LayerManager;
 use super::{
    CompactFlags, CompactOptions, CreateImageLayersError, DurationRecorder, ImageLayerCreationMode,
-    RecordedDuration, Timeline,
+    LastImageLayerCreationStatus, RecordedDuration, Timeline,
 };

 use anyhow::{anyhow, bail, Context};
@@ -262,13 +262,13 @@ impl GcCompactionQueue {
        ctx: &RequestContext,
        gc_block: &GcBlock,
        timeline: &Arc<Timeline>,
-    ) -> Result<bool, CompactionError> {
+    ) -> Result<CompactionOutcome, CompactionError> {
        let _one_op_at_a_time_guard = self.consumer_lock.lock().await;
        let has_pending_tasks;
        let (id, item) = {
            let mut guard = self.inner.lock().unwrap();
            let Some((id, item)) = guard.queued.pop_front() else {
-                return Ok(false);
+                return Ok(CompactionOutcome::Done);
            };
            guard.running = Some((id, item.clone()));
            has_pending_tasks = !guard.queued.is_empty();
@@ -323,7 +323,11 @@ impl GcCompactionQueue {
            let mut guard = self.inner.lock().unwrap();
            guard.running = None;
        }
-        Ok(has_pending_tasks)
+        Ok(if has_pending_tasks {
+            CompactionOutcome::Pending
+        } else {
+            CompactionOutcome::Done
+        })
    }

    #[allow(clippy::type_complexity)]
@@ -589,6 +593,17 @@ impl CompactionStatistics {
    }
 }

+#[derive(Default, Debug, Clone, Copy, PartialEq, Eq)]
+pub enum CompactionOutcome {
+    #[default]
+    /// No layers need to be compacted after this round. Compaction doesn't need
+    /// to be immediately scheduled.
+    Done,
+    /// Still has pending layers to be compacted after this round. Ideally, the scheduler
+    /// should immediately schedule another compaction.
+    Pending,
+}
+
 impl Timeline {
    /// TODO: cancellation
    ///
@@ -598,7 +613,7 @@ impl Timeline {
        cancel: &CancellationToken,
        options: CompactOptions,
        ctx: &RequestContext,
-    ) -> Result<bool, CompactionError> {
+    ) -> Result<CompactionOutcome, CompactionError> {
        if options
            .flags
            .contains(CompactFlags::EnhancedGcBottomMostCompaction)
@@ -606,7 +621,7 @@ impl Timeline {
            self.compact_with_gc(cancel, options, ctx)
                .await
                .map_err(CompactionError::Other)?;
-            return Ok(false);
+            return Ok(CompactionOutcome::Done);
        }

        if options.flags.contains(CompactFlags::DryRun) {
@@ -624,7 +639,13 @@ impl Timeline {

        // High level strategy for compaction / image creation:
        //
-        // 1. First, calculate the desired "partitioning" of the
+        // 1. First, do an L0 compaction to ensure we move the L0
+        // layers into the historic layer map and get flat levels of
+        // layers. If we did not compact all L0 layers, we will
+        // prioritize compacting the timeline again and not do
+        // any of the compactions below.
+        //
+        // 2. Then, calculate the desired "partitioning" of the
        // currently in-use key space. The goal is to partition the
        // key space into roughly fixed-size chunks, but also take into
        // account any existing image layers, and try to align the
@@ -638,7 +659,7 @@ impl Timeline {
        // identify a relation. This is just an optimization,
        // though.
        //
-        // 2. Once we know the partitioning, for each partition,
+        // 3. Once we know the partitioning, for each partition,
        // decide if it's time to create a new image layer. The
        // criteria is: there has been too much "churn" since the last
        // image layer? The "churn" is fuzzy concept, it's a
@@ -646,15 +667,8 @@ impl Timeline {
        // total in the delta file. Or perhaps: if creating an image
        // file would allow to delete some older files.
        //
-        // 3. After that, we compact all level0 delta files if there
-        // are too many of them.  While compacting, we also garbage
-        // collect any page versions that are no longer needed because
-        // of the new image layers we created in step 2.
-        //
-        // TODO: This high level strategy hasn't been implemented yet.
-        // Below are functions compact_level0() and create_image_layers()
-        // but they are a bit ad hoc and don't quite work like it's explained
-        // above. Rewrite it.
+        // 4. In the end, if the tenant gets auto-sharded, we will run
+        // a shard-ancestor compaction.

        // Is the timeline being deleted?
        if self.is_stopping() {
@@ -666,10 +680,32 @@ impl Timeline {

        // Define partitioning schema if needed

-        // FIXME: the match should only cover repartitioning, not the next steps
-        let (partition_count, has_pending_tasks) = match self
+        // 1. L0 Compact
+        let l0_compaction_outcome = {
+            let timer = self.metrics.compact_time_histo.start_timer();
+            let l0_compaction_outcome = self
+                .compact_level0(
+                    target_file_size,
+                    options.flags.contains(CompactFlags::ForceL0Compaction),
+                    ctx,
+                )
+                .await?;
+            timer.stop_and_record();
+            l0_compaction_outcome
+        };
+
+        if let CompactionOutcome::Pending = l0_compaction_outcome {
+            // Yield and do not do any other kind of compaction. `Pending` means
+            // that we have pending L0 compaction tasks and the compaction scheduler
+            // will prioritize compacting this tenant/timeline again.
+            info!("skipping image layer generation and shard ancestor compaction due to L0 compaction did not include all layers.");
+            return Ok(CompactionOutcome::Pending);
+        }
+
+        // 2. Repartition and create image layers if necessary
+        let partition_count = match self
            .repartition(
-                self.get_last_record_lsn(),
+                self.get_last_record_lsn(), // TODO: use L0-L1 boundary
                self.get_compaction_target_size(),
                options.flags,
                ctx,
@@ -682,46 +718,42 @@ impl Timeline {
                    .access_stats_behavior(AccessStatsBehavior::Skip)
                    .build();

-                // 2. Compact
-                let timer = self.metrics.compact_time_histo.start_timer();
-                let fully_compacted = self
-                    .compact_level0(
-                        target_file_size,
-                        options.flags.contains(CompactFlags::ForceL0Compaction),
-                        ctx,
-                    )
-                    .await?;
-                timer.stop_and_record();
-
                let mut partitioning = dense_partitioning;
                partitioning
                    .parts
                    .extend(sparse_partitioning.into_dense().parts);

-                // 3. Create new image layers for partitions that have been modified
-                // "enough". Skip image layer creation if L0 compaction cannot keep up.
-                if fully_compacted {
-                    let image_layers = self
-                        .create_image_layers(
-                            &partitioning,
-                            lsn,
-                            if options
-                                .flags
-                                .contains(CompactFlags::ForceImageLayerCreation)
-                            {
-                                ImageLayerCreationMode::Force
-                            } else {
-                                ImageLayerCreationMode::Try
-                            },
-                            &image_ctx,
-                        )
-                        .await?;
+                // 3. Create new image layers for partitions that have been modified "enough".
+                let (image_layers, outcome) = self
+                    .create_image_layers(
+                        &partitioning,
+                        lsn,
+                        if options
+                            .flags
+                            .contains(CompactFlags::ForceImageLayerCreation)
+                        {
+                            ImageLayerCreationMode::Force
+                        } else {
+                            ImageLayerCreationMode::Try
+                        },
+                        &image_ctx,
+                        self.last_image_layer_creation_status
+                            .load()
+                            .as_ref()
+                            .clone(),
+                    )
+                    .await?;

-                    self.upload_new_image_layers(image_layers)?;
-                } else {
-                    info!("skipping image layer generation due to L0 compaction did not include all layers.");
+                self.last_image_layer_creation_status
+                    .store(Arc::new(outcome.clone()));
+
+                self.upload_new_image_layers(image_layers)?;
+                if let LastImageLayerCreationStatus::Incomplete = outcome {
+                    // Yield and do not do any other kind of compaction.
+                    info!("skipping shard ancestor compaction due to pending image layer generation tasks (preempted by L0 compaction).");
+                    return Ok(CompactionOutcome::Pending);
                }
-                (partitioning.parts.len(), !fully_compacted)
+                partitioning.parts.len()
            }
            Err(err) => {
                // no partitioning? This is normal, if the timeline was just created
@@ -733,10 +765,12 @@ impl Timeline {
                if !self.cancel.is_cancelled() && !err.is_cancelled() {
                    tracing::error!("could not compact, repartitioning keyspace failed: {err:?}");
                }
-                (1, false)
+                1
            }
        };

+        // 4. Shard ancestor compaction
+
        if self.shard_identity.count >= ShardCount::new(2) {
            // Limit the number of layer rewrites to the number of partitions: this means its
            // runtime should be comparable to a full round of image layer creations, rather than
@@ -746,7 +780,7 @@ impl Timeline {
            self.compact_shard_ancestors(rewrite_max, ctx).await?;
        }

-        Ok(has_pending_tasks)
+        Ok(CompactionOutcome::Done)
    }

    /// Check for layers that are elegible to be rewritten:
@@ -1003,11 +1037,11 @@ impl Timeline {
        target_file_size: u64,
        force_compaction_ignore_threshold: bool,
        ctx: &RequestContext,
-    ) -> Result<bool, CompactionError> {
+    ) -> Result<CompactionOutcome, CompactionError> {
        let CompactLevel0Phase1Result {
            new_layers,
            deltas_to_compact,
-            fully_compacted,
+            outcome,
        } = {
            let phase1_span = info_span!("compact_level0_phase1");
            let ctx = ctx.attached_child();
@@ -1036,12 +1070,12 @@ impl Timeline {

        if new_layers.is_empty() && deltas_to_compact.is_empty() {
            // nothing to do
-            return Ok(true);
+            return Ok(CompactionOutcome::Done);
        }

        self.finish_compact_batch(&new_layers, &Vec::new(), &deltas_to_compact)
            .await?;
-        Ok(fully_compacted)
+        Ok(outcome)
    }

    /// Level0 files first phase of compaction, explained in the [`Self::compact_legacy`] comment.
@@ -1115,7 +1149,13 @@ impl Timeline {
        // Under normal circumstances, we will accumulate up to compaction_upper_limit L0s of size
        // checkpoint_distance each.  To avoid edge cases using extra system resources, bound our
        // work in this function to only operate on this much delta data at once.
-        let delta_size_limit = self.get_compaction_upper_limit() as u64
+        //
+        // In general, compaction_threshold should be <= compaction_upper_limit, but in case that
+        // the constraint is not respected, we use the larger of the two.
+        let delta_size_limit = std::cmp::max(
+            self.get_compaction_upper_limit(),
+            self.get_compaction_threshold(),
+        ) as u64
            * std::cmp::max(self.get_checkpoint_distance(), DEFAULT_CHECKPOINT_DISTANCE);

        let mut fully_compacted = true;
@@ -1490,11 +1530,9 @@ impl Timeline {
                    .await
                    .map_err(CompactionError::Other)?;
            } else {
-                let shard = self.shard_identity.shard_index();
                let owner = self.shard_identity.get_shard_number(&key);
-                if cfg!(debug_assertions) {
-                    panic!("key {key} does not belong on shard {shard}, owned by {owner}");
-                }
+
+                // This happens after a shard split, when we're compacting an L0 created by our parent shard
                debug!("dropping key {key} during compaction (it belongs on shard {owner})");
            }

@@ -1579,7 +1617,11 @@ impl Timeline {
                .into_iter()
                .map(|x| x.drop_eviction_guard())
                .collect::<Vec<_>>(),
-            fully_compacted,
+            outcome: if fully_compacted {
+                CompactionOutcome::Done
+            } else {
+                CompactionOutcome::Pending
+            },
        })
    }
 }
@@ -1590,7 +1632,7 @@ struct CompactLevel0Phase1Result {
    deltas_to_compact: Vec<Layer>,
    // Whether we have included all L0 layers, or selected only part of them due to the
    // L0 compaction size limit.
-    fully_compacted: bool,
+    outcome: CompactionOutcome,
 }

 #[derive(Default)]
@@ -2906,10 +2948,45 @@ impl Timeline {
        // Between the sanity check and this compaction update, there could be new layers being flushed, but it should be fine because we only
        // operate on L1 layers.
        {
+            // Gc-compaction will rewrite the history of a key. This could happen in two ways:
+            //
+            // 1. We create an image layer to replace all the deltas below the compact LSN. In this case, assume
+            // we have 2 delta layers A and B, both below the compact LSN. We create an image layer I to replace
+            // A and B at the compact LSN. If the read path finishes reading A, yields, and now we update the layer
+            // map, the read path then cannot find any keys below A, reporting a missing key error, while the key
+            // now gets stored in I at the compact LSN.
+            //
+            // ---------------                                       ---------------
+            //   delta1@LSN20                                         image1@LSN20
+            // ---------------  (read path collects delta@LSN20,  => ---------------  (read path cannot find anything
+            //   delta1@LSN10    yields)                                               below LSN 20)
+            // ---------------
+            //
+            // 2. We create a delta layer to replace all the deltas below the compact LSN, and in the delta layers,
+            // we combine the history of a key into a single image. For example, we have deltas at LSN 1, 2, 3, 4.
+            // Assume one delta layer contains LSN 1, 2, 3 and the other contains LSN 4.
+            //
+            // We let gc-compaction combine delta 2, 3, 4 into an image at LSN 4, which produces a delta layer that
+            // contains the delta at LSN 1 and the image at LSN 4. If the read path finishes reading the original delta
+            // layer containing 4, yields, and we then update the layer map, it reads an invalid history (see below).
+            //
+            // ---------------                                      ---------------
+            //   delta1@LSN4                                          image1@LSN4
+            // ---------------  (read path collects delta@LSN4,  => ---------------  (read path collects LSN4 and LSN1,
+            //  delta1@LSN1-3    yields)                              delta1@LSN1     which is an invalid history)
+            // ---------------                                      ---------------
+            //
+            // Therefore, the gc-compaction layer update operation should wait for all ongoing reads, block all pending reads,
+            // and only allow reads to continue after the update is finished.
+
+            let update_guard = self.gc_compaction_layer_update_lock.write().await;
+            // Acquiring the update guard ensures current read operations end and new read operations are blocked.
+            // TODO: can we use `latest_gc_cutoff` Rcu to achieve the same effect?
            let mut guard = self.layers.write().await;
            guard
                .open_mut()?
-                .finish_gc_compaction(&layer_selection, &compact_to, &self.metrics)
+                .finish_gc_compaction(&layer_selection, &compact_to, &self.metrics);
+            drop(update_guard); // Allow new reads to start ONLY after we finished updating the layer map.
        };

        // Schedule an index-only upload to update the `latest_gc_cutoff` in the index_part.json.
@@ -3186,11 +3263,7 @@ impl TimelineAdaptor {
            ranges: self.get_keyspace(key_range, lsn, ctx).await?,
        };
        // TODO set proper (stateful) start. The create_image_layer_for_rel_blocks function mostly
-        let start = Key::MIN;
-        let ImageLayerCreationOutcome {
-            unfinished_image_layer,
-            next_start_key: _,
-        } = self
+        let outcome = self
            .timeline
            .create_image_layer_for_rel_blocks(
                &keyspace,
@@ -3198,13 +3271,15 @@ impl TimelineAdaptor {
                lsn,
                ctx,
                key_range.clone(),
-                start,
                IoConcurrency::sequential(),
            )
            .await?;

-        if let Some(image_layer_writer) = unfinished_image_layer {
-            let (desc, path) = image_layer_writer.finish(ctx).await?;
+        if let ImageLayerCreationOutcome::Generated {
+            unfinished_image_layer,
+        } = outcome
+        {
+            let (desc, path) = unfinished_image_layer.finish(ctx).await?;
            let image_layer =
                Layer::finish_creating(self.timeline.conf, &self.timeline, desc, &path)?;
            self.new_images.push(image_layer);
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -91,6 +91,7 @@ impl LayerManager {
                layer_map,
                layer_fmgr: LayerFileManager(hashmap),
            }) => {
+                // NB: no need to decrement layer metrics; metrics are removed on timeline shutdown.
                let open = layer_map.open_layer.take();
                let frozen = layer_map.frozen_layers.len();
                let taken_writer_state = writer_state.take();
@@ -234,6 +235,7 @@ impl OpenLayerManager {
        lsn: Lsn,
        last_freeze_at: &AtomicLsn,
        write_lock: &mut tokio::sync::MutexGuard<'_, Option<TimelineWriterState>>,
+        metrics: &TimelineMetrics,
    ) -> bool {
        let Lsn(last_record_lsn) = lsn;
        let end_lsn = Lsn(last_record_lsn + 1);
@@ -242,6 +244,11 @@ impl OpenLayerManager {
            let open_layer_rc = Arc::clone(open_layer);
            open_layer.freeze(end_lsn).await;

+            // Increment the frozen layer metrics. This is decremented in `finish_flush_l0_layer()`.
+            // TODO: It would be nicer to do this via `InMemoryLayer::drop()`, but it requires a
+            // reference to the timeline metrics. Other methods use a metrics borrow as well.
+            metrics.inc_frozen_layer(open_layer);
+
            // The layer is no longer open, update the layer map to reflect this.
            // We will replace it with on-disk historics below.
            self.layer_map.frozen_layers.push_back(open_layer_rc);
@@ -298,6 +305,7 @@ impl OpenLayerManager {
            .frozen_layers
            .pop_front()
            .expect("there must be a inmem layer to flush");
+        metrics.dec_frozen_layer(&inmem);

        // Only one task may call this function at a time (for this
        // timeline). If two tasks tried to flush the same frozen
--- a/pageserver/src/tenant/timeline/uninit.rs
+++ b/pageserver/src/tenant/timeline/uninit.rs
@@ -1,4 +1,4 @@
-use std::{collections::hash_map::Entry, fs, sync::Arc};
+use std::{collections::hash_map::Entry, fs, future::Future, sync::Arc};

 use anyhow::Context;
 use camino::Utf8PathBuf;
@@ -8,7 +8,8 @@ use utils::{fs_ext, id::TimelineId, lsn::Lsn, sync::gate::GateGuard};
 use crate::{
    context::RequestContext,
    import_datadir,
-    tenant::{CreateTimelineIdempotency, Tenant, TimelineOrOffloaded},
+    span::debug_assert_current_span_has_tenant_and_timeline_id,
+    tenant::{CreateTimelineError, CreateTimelineIdempotency, Tenant, TimelineOrOffloaded},
 };

 use super::Timeline;
@@ -24,6 +25,9 @@ pub struct UninitializedTimeline<'t> {
    pub(crate) owning_tenant: &'t Tenant,
    timeline_id: TimelineId,
    raw_timeline: Option<(Arc<Timeline>, TimelineCreateGuard)>,
+    /// Whether we spawned the inner Timeline's tasks such that we must later shut it down
+    /// if aborting the timeline creation
+    needs_shutdown: bool,
 }

 impl<'t> UninitializedTimeline<'t> {
@@ -36,6 +40,50 @@ impl<'t> UninitializedTimeline<'t> {
            owning_tenant,
            timeline_id,
            raw_timeline,
+            needs_shutdown: false,
+        }
+    }
+
+    /// When writing data to this timeline during creation, use this wrapper: it spawns the flush
+    /// loop required for I/O, runs `f`, and then flushes the written data to disk. If `f` or the
+    /// flush fails, the timeline is shut down via [`Self::abort`] before the error is returned.
+    pub(crate) async fn write<F, Fut>(&mut self, f: F) -> anyhow::Result<()>
+    where
+        F: FnOnce(Arc<Timeline>) -> Fut,
+        Fut: Future<Output = Result<(), CreateTimelineError>>,
+    {
+        debug_assert_current_span_has_tenant_and_timeline_id();
+
+        // Remember that we did I/O (spawned the flush loop), so that we can check we shut it down on drop
+        self.needs_shutdown = true;
+
+        let timeline = self.raw_timeline()?;
+
+        // Spawn flush loop so that the Timeline is ready to accept writes
+        timeline.maybe_spawn_flush_loop();
+
+        // Invoke the provided function, which will write some data into the new timeline
+        if let Err(e) = f(timeline.clone()).await {
+            self.abort().await;
+            return Err(e.into());
+        }
+
+        // Flush the underlying timeline's ephemeral layers to disk
+        if let Err(e) = timeline
+            .freeze_and_flush()
+            .await
+            .context("Failed to flush after timeline creation writes")
+        {
+            self.abort().await;
+            return Err(e);
+        }
+
+        Ok(())
+    }
+
+    pub(crate) async fn abort(&self) {
+        if let Some((raw_timeline, _)) = self.raw_timeline.as_ref() {
+            raw_timeline.shutdown(super::ShutdownMode::Hard).await;
        }
    }

@@ -44,11 +92,13 @@ impl<'t> UninitializedTimeline<'t> {
    /// This function launches the flush loop if not already done.
    ///
    /// The caller is responsible for activating the timeline (function `.activate()`).
-    pub(crate) fn finish_creation(mut self) -> anyhow::Result<Arc<Timeline>> {
+    pub(crate) async fn finish_creation(mut self) -> anyhow::Result<Arc<Timeline>> {
        let timeline_id = self.timeline_id;
        let tenant_shard_id = self.owning_tenant.tenant_shard_id;

        if self.raw_timeline.is_none() {
+            self.abort().await;
+
            return Err(anyhow::anyhow!(
                "No timeline for initialization found for {tenant_shard_id}/{timeline_id}"
            ));
@@ -62,16 +112,25 @@ impl<'t> UninitializedTimeline<'t> {
            .0
            .get_disk_consistent_lsn();

-        anyhow::ensure!(
-            new_disk_consistent_lsn.is_valid(),
-            "new timeline {tenant_shard_id}/{timeline_id} has invalid disk_consistent_lsn"
-        );
+        if !new_disk_consistent_lsn.is_valid() {
+            self.abort().await;
+
+            return Err(anyhow::anyhow!(
+                "new timeline {tenant_shard_id}/{timeline_id} has invalid disk_consistent_lsn"
+            ));
+        }

        let mut timelines = self.owning_tenant.timelines.lock().unwrap();
        match timelines.entry(timeline_id) {
-            Entry::Occupied(_) => anyhow::bail!(
+            Entry::Occupied(_) => {
+                // Unexpected, bug in the caller.  Tenant is responsible for preventing concurrent creation of the same timeline.
+                //
+                // We do not call Self::abort here.  Because we don't cleanly shut down our Timeline, [`Self::drop`] should
+                // skip trying to delete the timeline directory too.
+                anyhow::bail!(
                "Found freshly initialized timeline {tenant_shard_id}/{timeline_id} in the tenant map"
-            ),
+                )
+            }
            Entry::Vacant(v) => {
                // after taking here should be no fallible operations, because the drop guard will not
                // cleanup after and would block for example the tenant deletion
@@ -93,36 +152,31 @@ impl<'t> UninitializedTimeline<'t> {

    /// Prepares timeline data by loading it from the basebackup archive.
    pub(crate) async fn import_basebackup_from_tar(
-        self,
+        mut self,
        tenant: Arc<Tenant>,
        copyin_read: &mut (impl tokio::io::AsyncRead + Send + Sync + Unpin),
        base_lsn: Lsn,
        broker_client: storage_broker::BrokerClientChannel,
        ctx: &RequestContext,
    ) -> anyhow::Result<Arc<Timeline>> {
-        let raw_timeline = self.raw_timeline()?;
+        self.write(|raw_timeline| async move {
+            import_datadir::import_basebackup_from_tar(&raw_timeline, copyin_read, base_lsn, ctx)
+                .await
+                .context("Failed to import basebackup")
+                .map_err(CreateTimelineError::Other)?;

-        import_datadir::import_basebackup_from_tar(raw_timeline, copyin_read, base_lsn, ctx)
-            .await
-            .context("Failed to import basebackup")?;
+            fail::fail_point!("before-checkpoint-new-timeline", |_| {
+                Err(CreateTimelineError::Other(anyhow::anyhow!(
+                    "failpoint before-checkpoint-new-timeline"
+                )))
+            });

-        // Flush the new layer files to disk, before we make the timeline as available to
-        // the outside world.
-        //
-        // Flush loop needs to be spawned in order to be able to flush.
-        raw_timeline.maybe_spawn_flush_loop();
-
-        fail::fail_point!("before-checkpoint-new-timeline", |_| {
-            anyhow::bail!("failpoint before-checkpoint-new-timeline");
-        });
-
-        raw_timeline
-            .freeze_and_flush()
-            .await
-            .context("Failed to flush after basebackup import")?;
+            Ok(())
+        })
+        .await?;

        // All the data has been imported. Insert the Timeline into the tenant's timelines map
-        let tl = self.finish_creation()?;
+        let tl = self.finish_creation().await?;
        tl.activate(tenant, broker_client, None, ctx);
        Ok(tl)
    }
@@ -143,12 +197,19 @@ impl<'t> UninitializedTimeline<'t> {

 impl Drop for UninitializedTimeline<'_> {
    fn drop(&mut self) {
-        if let Some((_, create_guard)) = self.raw_timeline.take() {
+        if let Some((timeline, create_guard)) = self.raw_timeline.take() {
            let _entered = info_span!("drop_uninitialized_timeline", tenant_id = %self.owning_tenant.tenant_shard_id.tenant_id, shard_id = %self.owning_tenant.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id).entered();
-            // This is unusual, but can happen harmlessly if the pageserver is stopped while
-            // creating a timeline.
-            info!("Timeline got dropped without initializing, cleaning its files");
-            cleanup_timeline_directory(create_guard);
+            if self.needs_shutdown && !timeline.gate.close_complete() {
+                // This should not happen: caller should call [`Self::abort`] on failures
+                tracing::warn!(
+                    "Timeline not shut down after initialization failure, cannot clean up files"
+                );
+            } else {
+                // This is unusual, but can happen harmlessly if the pageserver is stopped while
+                // creating a timeline.
+                info!("Timeline got dropped without initializing, cleaning its files");
+                cleanup_timeline_directory(create_guard);
+            }
        }
    }
 }
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -234,6 +234,19 @@ impl VirtualFile {
    ) -> (FullSlice<Buf>, Result<usize, Error>) {
        self.inner.write_all(buf, ctx).await
    }
+
+    async fn read_to_end(&mut self, buf: &mut Vec<u8>, ctx: &RequestContext) -> Result<(), Error> {
+        self.inner.read_to_end(buf, ctx).await
+    }
+
+    pub(crate) async fn read_to_string(
+        &mut self,
+        ctx: &RequestContext,
+    ) -> Result<String, anyhow::Error> {
+        let mut buf = Vec::new();
+        self.read_to_end(&mut buf, ctx).await?;
+        Ok(String::from_utf8(buf)?)
+    }
 }

 /// Indicates whether to enable fsync, fdatasync, or O_SYNC/O_DSYNC when writing
@@ -993,6 +1006,24 @@ impl VirtualFileInner {
            (buf, result)
        })
    }
+
+    async fn read_to_end(&mut self, buf: &mut Vec<u8>, ctx: &RequestContext) -> Result<(), Error> {
+        let mut tmp = vec![0; 128]; // scratch buffer, reused across iterations
+        loop {
+            let slice = tmp.slice(..128);
+            let (slice, res) = self.read_at(slice, self.pos, ctx).await;
+            match res {
+                Ok(0) => return Ok(()), // a zero-byte read ends the loop
+                Ok(n) => {
+                    self.pos += n as u64;
+                    buf.extend_from_slice(&slice[..n]);
+                }
+                Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {} // retry
+                Err(e) => return Err(e),
+            }
+            tmp = slice.into_inner();
+        }
+    }
 }

 // Adapted from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#117-135
@@ -1237,10 +1268,6 @@ impl VirtualFile {
    ) -> Result<crate::tenant::block_io::BlockLease<'_>, std::io::Error> {
        self.inner.read_blk(blknum, ctx).await
    }
-
-    async fn read_to_end(&mut self, buf: &mut Vec<u8>, ctx: &RequestContext) -> Result<(), Error> {
-        self.inner.read_to_end(buf, ctx).await
-    }
 }

 #[cfg(test)]
@@ -1260,24 +1287,6 @@ impl VirtualFileInner {
            slice.into_inner(),
        ))
    }
-
-    async fn read_to_end(&mut self, buf: &mut Vec<u8>, ctx: &RequestContext) -> Result<(), Error> {
-        let mut tmp = vec![0; 128];
-        loop {
-            let slice = tmp.slice(..128);
-            let (slice, res) = self.read_at(slice, self.pos, ctx).await;
-            match res {
-                Ok(0) => return Ok(()),
-                Ok(n) => {
-                    self.pos += n as u64;
-                    buf.extend_from_slice(&slice[..n]);
-                }
-                Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {}
-                Err(e) => return Err(e),
-            }
-            tmp = slice.into_inner();
-        }
-    }
 }

 impl Drop for VirtualFileInner {
--- a/pgxn/neon/neon_utils.c
+++ b/pgxn/neon/neon_utils.c
@@ -51,6 +51,26 @@ HexDecodeString(uint8 *result, char *input, int nbytes)
 	return true;
 }

+/* --------------------------------
+ *		pq_getmsgint16	- get a binary 2-byte int from a message buffer
+ * --------------------------------
+ */
+uint16
+pq_getmsgint16(StringInfo msg)
+{
+	return pq_getmsgint(msg, 2);
+}
+
+/* --------------------------------
+ *		pq_getmsgint32	- get a binary 4-byte int from a message buffer
+ * --------------------------------
+ */
+uint32
+pq_getmsgint32(StringInfo msg)
+{
+	return pq_getmsgint(msg, 4);
+}
+
 /* --------------------------------
 *		pq_getmsgint32_le	- get a binary 4-byte int from a message buffer in native (LE) order
 * --------------------------------
--- a/pgxn/neon/neon_utils.h
+++ b/pgxn/neon/neon_utils.h
@@ -8,6 +8,8 @@
 #endif

 bool		HexDecodeString(uint8 *result, char *input, int nbytes);
+uint16      pq_getmsgint16(StringInfo msg);
+uint32      pq_getmsgint32(StringInfo msg);
 uint32		pq_getmsgint32_le(StringInfo msg);
 uint64		pq_getmsgint64_le(StringInfo msg);
 void		pq_sendint32_le(StringInfo buf, uint32 i);
--- a/pgxn/neon/walproposer.c
+++ b/pgxn/neon/walproposer.c
@@ -70,6 +70,7 @@ static bool SendAppendRequests(Safekeeper *sk);
 static bool RecvAppendResponses(Safekeeper *sk);
 static XLogRecPtr CalculateMinFlushLsn(WalProposer *wp);
 static XLogRecPtr GetAcknowledgedByQuorumWALPosition(WalProposer *wp);
+static void PAMessageSerialize(WalProposer *wp, ProposerAcceptorMessage *msg, StringInfo buf, int proto_version);
 static void HandleSafekeeperResponse(WalProposer *wp, Safekeeper *sk);
 static bool AsyncRead(Safekeeper *sk, char **buf, int *buf_size);
 static bool AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg);
@@ -81,6 +82,8 @@ static char *FormatSafekeeperState(Safekeeper *sk);
 static void AssertEventsOkForState(uint32 events, Safekeeper *sk);
 static char *FormatEvents(WalProposer *wp, uint32 events);
 static void UpdateDonorShmem(WalProposer *wp);
+static char *MembershipConfigurationToString(MembershipConfiguration *mconf);
+static void MembershipConfigurationFree(MembershipConfiguration *mconf);

 WalProposer *
 WalProposerCreate(WalProposerConfig *config, walproposer_api api)
@@ -137,25 +140,21 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
 	}
 	wp->quorum = wp->n_safekeepers / 2 + 1;

+	if (wp->config->proto_version != 2 && wp->config->proto_version != 3)
+		wp_log(FATAL, "unsupported safekeeper protocol version %d", wp->config->proto_version);
+	wp_log(LOG, "using safekeeper protocol version %d", wp->config->proto_version);
+
 	/* Fill the greeting package */
-	wp->greetRequest.tag = 'g';
-	wp->greetRequest.protocolVersion = SK_PROTOCOL_VERSION;
-	wp->greetRequest.pgVersion = PG_VERSION_NUM;
-	wp->api.strong_random(wp, &wp->greetRequest.proposerId, sizeof(wp->greetRequest.proposerId));
-	wp->greetRequest.systemId = wp->config->systemId;
-	if (!wp->config->neon_timeline)
-		wp_log(FATAL, "neon.timeline_id is not provided");
-	if (*wp->config->neon_timeline != '\0' &&
-		!HexDecodeString(wp->greetRequest.timeline_id, wp->config->neon_timeline, 16))
-		wp_log(FATAL, "could not parse neon.timeline_id, %s", wp->config->neon_timeline);
+	wp->greetRequest.pam.tag = 'g';
 	if (!wp->config->neon_tenant)
 		wp_log(FATAL, "neon.tenant_id is not provided");
-	if (*wp->config->neon_tenant != '\0' &&
-		!HexDecodeString(wp->greetRequest.tenant_id, wp->config->neon_tenant, 16))
-		wp_log(FATAL, "could not parse neon.tenant_id, %s", wp->config->neon_tenant);
-
-	wp->greetRequest.timeline = wp->config->pgTimeline;
-	wp->greetRequest.walSegSize = wp->config->wal_segment_size;
+	wp->greetRequest.tenant_id = wp->config->neon_tenant;
+	if (!wp->config->neon_timeline)
+		wp_log(FATAL, "neon.timeline_id is not provided");
+	wp->greetRequest.timeline_id = wp->config->neon_timeline;
+	wp->greetRequest.pg_version = PG_VERSION_NUM;
+	wp->greetRequest.system_id = wp->config->systemId;
+	wp->greetRequest.wal_seg_size = wp->config->wal_segment_size;

 	wp->api.init_event_set(wp);

@@ -165,12 +164,14 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
 void
 WalProposerFree(WalProposer *wp)
 {
+	MembershipConfigurationFree(&wp->mconf);
 	for (int i = 0; i < wp->n_safekeepers; i++)
 	{
 		Safekeeper *sk = &wp->safekeeper[i];

 		Assert(sk->outbuf.data != NULL);
 		pfree(sk->outbuf.data);
+		MembershipConfigurationFree(&sk->greetResponse.mconf);
 		if (sk->voteResponse.termHistory.entries)
 			pfree(sk->voteResponse.termHistory.entries);
 		sk->voteResponse.termHistory.entries = NULL;
@@ -308,6 +309,7 @@ ShutdownConnection(Safekeeper *sk)
 	sk->state = SS_OFFLINE;
 	sk->streamingAt = InvalidXLogRecPtr;

+	MembershipConfigurationFree(&sk->greetResponse.mconf);
 	if (sk->voteResponse.termHistory.entries)
 		pfree(sk->voteResponse.termHistory.entries);
 	sk->voteResponse.termHistory.entries = NULL;
@@ -598,11 +600,14 @@ static void
 SendStartWALPush(Safekeeper *sk)
 {
 	WalProposer *wp = sk->wp;
+#define CMD_LEN 512
+	char		cmd[CMD_LEN];

-	if (!wp->api.conn_send_query(sk, "START_WAL_PUSH"))
+	snprintf(cmd, CMD_LEN, "START_WAL_PUSH (proto_version '%d')", wp->config->proto_version);
+	if (!wp->api.conn_send_query(sk, cmd))
 	{
-		wp_log(WARNING, "failed to send 'START_WAL_PUSH' query to safekeeper %s:%s: %s",
-			   sk->host, sk->port, wp->api.conn_error_message(sk));
+		wp_log(WARNING, "failed to send %s query to safekeeper %s:%s: %s",
+			   cmd, sk->host, sk->port, wp->api.conn_error_message(sk));
 		ShutdownConnection(sk);
 		return;
 	}
@@ -658,23 +663,33 @@ RecvStartWALPushResult(Safekeeper *sk)

 /*
 * Start handshake: first of all send information about the
- * safekeeper. After sending, we wait on SS_HANDSHAKE_RECV for
+ * walproposer. After sending, we wait on SS_HANDSHAKE_RECV for
 * a response to finish the handshake.
 */
 static void
 SendProposerGreeting(Safekeeper *sk)
 {
+	WalProposer *wp = sk->wp;
+	char	   *mconf_toml = MembershipConfigurationToString(&wp->greetRequest.mconf);
+
+	wp_log(LOG, "sending ProposerGreeting to safekeeper %s:%s with mconf = %s", sk->host, sk->port, mconf_toml);
+	pfree(mconf_toml);
+
+	PAMessageSerialize(wp, (ProposerAcceptorMessage *) &wp->greetRequest,
+					   &sk->outbuf, wp->config->proto_version);
+
 	/*
 	 * On failure, logging & resetting the connection is handled. We just need
 	 * to handle the control flow.
 	 */
-	BlockingWrite(sk, &sk->wp->greetRequest, sizeof(sk->wp->greetRequest), SS_HANDSHAKE_RECV);
+	BlockingWrite(sk, sk->outbuf.data, sk->outbuf.len, SS_HANDSHAKE_RECV);
 }

 static void
 RecvAcceptorGreeting(Safekeeper *sk)
 {
 	WalProposer *wp = sk->wp;
+	char	   *mconf_toml;

 	/*
 	 * If our reading doesn't immediately succeed, any necessary error
@@ -685,7 +700,10 @@ RecvAcceptorGreeting(Safekeeper *sk)
 	if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->greetResponse))
 		return;

-	wp_log(LOG, "received AcceptorGreeting from safekeeper %s:%s, term=" INT64_FORMAT, sk->host, sk->port, sk->greetResponse.term);
+	mconf_toml = MembershipConfigurationToString(&sk->greetResponse.mconf);
+	wp_log(LOG, "received AcceptorGreeting from safekeeper %s:%s, node_id = " UINT64_FORMAT ", mconf = %s, term=" UINT64_FORMAT,
+		   sk->host, sk->port, sk->greetResponse.nodeId, mconf_toml, sk->greetResponse.term);
+	pfree(mconf_toml);

 	/* Protocol is all good, move to voting. */
 	sk->state = SS_VOTING;
@@ -707,12 +725,9 @@ RecvAcceptorGreeting(Safekeeper *sk)
 			wp->propTerm++;
 			wp_log(LOG, "proposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT, wp->quorum, wp->propTerm);

-			wp->voteRequest = (VoteRequest)
-			{
-				.tag = 'v',
-					.term = wp->propTerm
-			};
-			memcpy(wp->voteRequest.proposerId.data, wp->greetRequest.proposerId.data, UUID_LEN);
+			wp->voteRequest.pam.tag = 'v';
+			wp->voteRequest.generation = wp->mconf.generation;
+			wp->voteRequest.term = wp->propTerm;
 		}
 	}
 	else if (sk->greetResponse.term > wp->propTerm)
@@ -759,12 +774,14 @@ SendVoteRequest(Safekeeper *sk)
 {
 	WalProposer *wp = sk->wp;

-	/* We have quorum for voting, send our vote request */
-	wp_log(LOG, "requesting vote from %s:%s for term " UINT64_FORMAT, sk->host, sk->port, wp->voteRequest.term);
-	/* On failure, logging & resetting is handled */
-	if (!BlockingWrite(sk, &wp->voteRequest, sizeof(wp->voteRequest), SS_WAIT_VERDICT))
-		return;
+	PAMessageSerialize(wp, (ProposerAcceptorMessage *) &wp->voteRequest,
+					   &sk->outbuf, wp->config->proto_version);

+	/* We have quorum for voting, send our vote request */
+	wp_log(LOG, "requesting vote from %s:%s for generation %u term " UINT64_FORMAT, sk->host, sk->port,
+		   wp->voteRequest.generation, wp->voteRequest.term);
+	/* On failure, logging & resetting is handled */
+	BlockingWrite(sk, sk->outbuf.data, sk->outbuf.len, SS_WAIT_VERDICT);
 	/* If successful, wait for read-ready with SS_WAIT_VERDICT */
 }

@@ -778,11 +795,12 @@ RecvVoteResponse(Safekeeper *sk)
 		return;

 	wp_log(LOG,
-		   "got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X, timelineStartLsn=%X/%X",
-		   sk->host, sk->port, sk->voteResponse.voteGiven, GetHighestTerm(&sk->voteResponse.termHistory),
+		   "got VoteResponse from acceptor %s:%s, generation=%u, term=" UINT64_FORMAT ", voteGiven=%u, last_log_term=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X",
+		   sk->host, sk->port, sk->voteResponse.generation, sk->voteResponse.term,
+		   sk->voteResponse.voteGiven,
+		   GetHighestTerm(&sk->voteResponse.termHistory),
 		   LSN_FORMAT_ARGS(sk->voteResponse.flushLsn),
-		   LSN_FORMAT_ARGS(sk->voteResponse.truncateLsn),
-		   LSN_FORMAT_ARGS(sk->voteResponse.timelineStartLsn));
+		   LSN_FORMAT_ARGS(sk->voteResponse.truncateLsn));

 	/*
 	 * In case of acceptor rejecting our vote, bail out, but only if either it
@@ -847,9 +865,9 @@ HandleElectedProposer(WalProposer *wp)
 	 * otherwise we must be sync-safekeepers and we have nothing to do then.
 	 *
 	 * Proceeding is not only pointless but harmful, because we'd give
-	 * safekeepers term history starting with 0/0. These hacks will go away once
-	 * we disable implicit timeline creation on safekeepers and create it with
-	 * non zero LSN from the start.
+	 * safekeepers term history starting with 0/0. These hacks will go away
+	 * once we disable implicit timeline creation on safekeepers and create it
+	 * with non zero LSN from the start.
 	 */
 	if (wp->propEpochStartLsn == InvalidXLogRecPtr)
 	{
@@ -942,7 +960,6 @@ DetermineEpochStartLsn(WalProposer *wp)
 	wp->propEpochStartLsn = InvalidXLogRecPtr;
 	wp->donorEpoch = 0;
 	wp->truncateLsn = InvalidXLogRecPtr;
-	wp->timelineStartLsn = InvalidXLogRecPtr;

 	for (int i = 0; i < wp->n_safekeepers; i++)
 	{
@@ -959,20 +976,6 @@ DetermineEpochStartLsn(WalProposer *wp)
 				wp->donor = i;
 			}
 			wp->truncateLsn = Max(wp->safekeeper[i].voteResponse.truncateLsn, wp->truncateLsn);
-
-			if (wp->safekeeper[i].voteResponse.timelineStartLsn != InvalidXLogRecPtr)
-			{
-				/* timelineStartLsn should be the same everywhere or unknown */
-				if (wp->timelineStartLsn != InvalidXLogRecPtr &&
-					wp->timelineStartLsn != wp->safekeeper[i].voteResponse.timelineStartLsn)
-				{
-					wp_log(WARNING,
-						   "inconsistent timelineStartLsn: current %X/%X, received %X/%X",
-						   LSN_FORMAT_ARGS(wp->timelineStartLsn),
-						   LSN_FORMAT_ARGS(wp->safekeeper[i].voteResponse.timelineStartLsn));
-				}
-				wp->timelineStartLsn = wp->safekeeper[i].voteResponse.timelineStartLsn;
-			}
 		}
 	}

@@ -995,22 +998,11 @@ DetermineEpochStartLsn(WalProposer *wp)
 	if (wp->propEpochStartLsn == InvalidXLogRecPtr && !wp->config->syncSafekeepers)
 	{
 		wp->propEpochStartLsn = wp->truncateLsn = wp->api.get_redo_start_lsn(wp);
-		if (wp->timelineStartLsn == InvalidXLogRecPtr)
-		{
-			wp->timelineStartLsn = wp->api.get_redo_start_lsn(wp);
-		}
 		wp_log(LOG, "bumped epochStartLsn to the first record %X/%X", LSN_FORMAT_ARGS(wp->propEpochStartLsn));
 	}
 	pg_atomic_write_u64(&wp->api.get_shmem_state(wp)->propEpochStartLsn, wp->propEpochStartLsn);

-	/*
-	 * Safekeepers are setting truncateLsn after timelineStartLsn is known, so
-	 * it should never be zero at this point, if we know timelineStartLsn.
-	 *
-	 * timelineStartLsn can be zero only on the first syncSafekeepers run.
-	 */
-	Assert((wp->truncateLsn != InvalidXLogRecPtr) ||
-		   (wp->config->syncSafekeepers && wp->truncateLsn == wp->timelineStartLsn));
+	Assert(wp->truncateLsn != InvalidXLogRecPtr || wp->config->syncSafekeepers);

 	/*
 	 * We will be generating WAL since propEpochStartLsn, so we should set
@@ -1053,10 +1045,11 @@ DetermineEpochStartLsn(WalProposer *wp)
 		if (SkipXLogPageHeader(wp, wp->propEpochStartLsn) != wp->api.get_redo_start_lsn(wp))
 		{
 			/*
-			 * However, allow to proceed if last_log_term on the node which gave
-			 * the highest vote (i.e. point where we are going to start writing)
-			 * actually had been won by me; plain restart of walproposer not
-			 * intervened by concurrent compute which wrote WAL is ok.
+			 * However, allow to proceed if last_log_term on the node which
+			 * gave the highest vote (i.e. point where we are going to start
+			 * writing) actually had been won by me; plain restart of
+			 * walproposer not intervened by concurrent compute which wrote
+			 * WAL is ok.
 			 *
 			 * This avoids compute crash after manual term_bump.
 			 */
@@ -1126,14 +1119,8 @@ SendProposerElected(Safekeeper *sk)
 	{
 		/* safekeeper is empty or no common point, start from the beginning */
 		sk->startStreamingAt = wp->propTermHistory.entries[0].lsn;
-		wp_log(LOG, "no common point with sk %s:%s, streaming since first term at %X/%X, timelineStartLsn=%X/%X, termHistory.n_entries=%u",
-			   sk->host, sk->port, LSN_FORMAT_ARGS(sk->startStreamingAt), LSN_FORMAT_ARGS(wp->timelineStartLsn), wp->propTermHistory.n_entries);
-
-		/*
-		 * wp->timelineStartLsn == InvalidXLogRecPtr can be only when timeline
-		 * is created manually (test_s3_wal_replay)
-		 */
-		Assert(sk->startStreamingAt == wp->timelineStartLsn || wp->timelineStartLsn == InvalidXLogRecPtr);
+		wp_log(LOG, "no common point with sk %s:%s, streaming since first term at %X/%X, termHistory.n_entries=%u",
+			   sk->host, sk->port, LSN_FORMAT_ARGS(sk->startStreamingAt), wp->propTermHistory.n_entries);
 	}
 	else
 	{
@@ -1158,29 +1145,19 @@ SendProposerElected(Safekeeper *sk)

 	Assert(sk->startStreamingAt <= wp->availableLsn);

-	msg.tag = 'e';
+	msg.apm.tag = 'e';
+	msg.generation = wp->mconf.generation;
 	msg.term = wp->propTerm;
 	msg.startStreamingAt = sk->startStreamingAt;
 	msg.termHistory = &wp->propTermHistory;
-	msg.timelineStartLsn = wp->timelineStartLsn;

 	lastCommonTerm = idx >= 0 ? wp->propTermHistory.entries[idx].term : 0;
 	wp_log(LOG,
-		   "sending elected msg to node " UINT64_FORMAT " term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s, timelineStartLsn=%X/%X",
-		   sk->greetResponse.nodeId, msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt), lastCommonTerm, msg.termHistory->n_entries, sk->host, sk->port, LSN_FORMAT_ARGS(msg.timelineStartLsn));
-
-	resetStringInfo(&sk->outbuf);
-	pq_sendint64_le(&sk->outbuf, msg.tag);
-	pq_sendint64_le(&sk->outbuf, msg.term);
-	pq_sendint64_le(&sk->outbuf, msg.startStreamingAt);
-	pq_sendint32_le(&sk->outbuf, msg.termHistory->n_entries);
-	for (int i = 0; i < msg.termHistory->n_entries; i++)
-	{
-		pq_sendint64_le(&sk->outbuf, msg.termHistory->entries[i].term);
-		pq_sendint64_le(&sk->outbuf, msg.termHistory->entries[i].lsn);
-	}
-	pq_sendint64_le(&sk->outbuf, msg.timelineStartLsn);
+		   "sending elected msg to node " UINT64_FORMAT " generation=%u term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s",
+		   sk->greetResponse.nodeId, msg.generation, msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt),
+		   lastCommonTerm, msg.termHistory->n_entries, sk->host, sk->port);

+	PAMessageSerialize(wp, (ProposerAcceptorMessage *) &msg, &sk->outbuf, wp->config->proto_version);
 	if (!AsyncWrite(sk, sk->outbuf.data, sk->outbuf.len, SS_SEND_ELECTED_FLUSH))
 		return;

@@ -1246,14 +1223,13 @@ static void
 PrepareAppendRequest(WalProposer *wp, AppendRequestHeader *req, XLogRecPtr beginLsn, XLogRecPtr endLsn)
 {
 	Assert(endLsn >= beginLsn);
-	req->tag = 'a';
+	req->apm.tag = 'a';
+	req->generation = wp->mconf.generation;
 	req->term = wp->propTerm;
-	req->epochStartLsn = wp->propEpochStartLsn;
 	req->beginLsn = beginLsn;
 	req->endLsn = endLsn;
 	req->commitLsn = wp->commitLsn;
 	req->truncateLsn = wp->truncateLsn;
-	req->proposerId = wp->greetRequest.proposerId;
 }

 /*
@@ -1354,7 +1330,8 @@ SendAppendRequests(Safekeeper *sk)
 			resetStringInfo(&sk->outbuf);

 			/* write AppendRequest header */
-			appendBinaryStringInfo(&sk->outbuf, (char *) req, sizeof(AppendRequestHeader));
+			PAMessageSerialize(wp, (ProposerAcceptorMessage *) req, &sk->outbuf, wp->config->proto_version);
+			/* prepare for reading WAL into the outbuf */
 			enlargeStringInfo(&sk->outbuf, req->endLsn - req->beginLsn);
 			sk->active_state = SS_ACTIVE_READ_WAL;
 		}
@@ -1367,14 +1344,17 @@ SendAppendRequests(Safekeeper *sk)
 			req = &sk->appendRequest;
 			req_len = req->endLsn - req->beginLsn;

-			/* We send zero sized AppenRequests as heartbeats; don't wal_read for these. */
+			/*
+			 * We send zero sized AppendRequests as heartbeats; don't wal_read
+			 * for these.
+			 */
 			if (req_len > 0)
 			{
 				switch (wp->api.wal_read(sk,
-										&sk->outbuf.data[sk->outbuf.len],
-										req->beginLsn,
-										req_len,
-										&errmsg))
+										 &sk->outbuf.data[sk->outbuf.len],
+										 req->beginLsn,
+										 req_len,
+										 &errmsg))
 				{
 					case NEON_WALREAD_SUCCESS:
 						break;
@@ -1382,7 +1362,7 @@ SendAppendRequests(Safekeeper *sk)
 						return true;
 					case NEON_WALREAD_ERROR:
 						wp_log(WARNING, "WAL reading for node %s:%s failed: %s",
-							sk->host, sk->port, errmsg);
+							   sk->host, sk->port, errmsg);
 						ShutdownConnection(sk);
 						return false;
 					default:
@@ -1470,11 +1450,11 @@ RecvAppendResponses(Safekeeper *sk)
 			 * Term has changed to higher one, probably another compute is
 			 * running. If this is the case we could PANIC as well because
 			 * likely it inserted some data and our basebackup is unsuitable
-			 * anymore. However, we also bump term manually (term_bump endpoint)
-			 * on safekeepers for migration purposes, in this case we do want
-			 * compute to stay alive. So restart walproposer with FATAL instead
-			 * of panicking; if basebackup is spoiled next election will notice
-			 * this.
+			 * anymore. However, we also bump term manually (term_bump
+			 * endpoint) on safekeepers for migration purposes, in this case
+			 * we do want compute to stay alive. So restart walproposer with
+			 * FATAL instead of panicking; if basebackup is spoiled next
+			 * election will notice this.
 			 */
 			wp_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT ", meaning another compute is running at the same time, and it conflicts with us",
 				   sk->host, sk->port,
@@ -1509,7 +1489,7 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese

 	for (i = 0; i < nkeys; i++)
 	{
-		const char *key = pq_getmsgstring(reply_message);
+		const char *key = pq_getmsgrawstring(reply_message);
 		unsigned int value_len = pq_getmsgint(reply_message, sizeof(int32));

 		if (strcmp(key, "current_timeline_size") == 0)
@@ -1750,6 +1730,208 @@ HandleSafekeeperResponse(WalProposer *wp, Safekeeper *fromsk)
 	}
 }

+/* Serialize mconf into buf: generation, then members, then new_members. */
+static void
+MembershipConfigurationSerialize(MembershipConfiguration *mconf, StringInfo buf)
+{
+	uint32		i;
+
+	pq_sendint32(buf, mconf->generation);
+
+	pq_sendint32(buf, mconf->members.len);	/* count, then the entries */
+	for (i = 0; i < mconf->members.len; i++)
+	{
+		pq_sendint64(buf, mconf->members.m[i].node_id);
+		pq_send_ascii_string(buf, mconf->members.m[i].host);
+		pq_sendint16(buf, mconf->members.m[i].port);
+	}
+
+	/*
+	 * There is no special mark for absent new_members; zero members is
+	 * invalid, so zero len means absent.
+	 */
+	pq_sendint32(buf, mconf->new_members.len);
+	for (i = 0; i < mconf->new_members.len; i++)
+	{
+		pq_sendint64(buf, mconf->new_members.m[i].node_id);
+		pq_send_ascii_string(buf, mconf->new_members.m[i].host);
+		pq_sendint16(buf, mconf->new_members.m[i].port);
+	}
+}
+
+/*
+ * Serialize proposer -> acceptor message into buf using the given protocol
+ * version (2 or 3); buf is reset first. FATALs on an unknown tag or version.
+ */
+static void
+PAMessageSerialize(WalProposer *wp, ProposerAcceptorMessage *msg, StringInfo buf, int proto_version)
+{
+	/* both versions are supported until we fully migrate to 3 */
+	Assert(proto_version == 3 || proto_version == 2);
+
+	resetStringInfo(buf);
+
+	if (proto_version == 3)
+	{
+		/*
+		 * v2 sends structs for some messages as is, so commonly send tag only
+		 * for v3
+		 */
+		pq_sendint8(buf, msg->tag);
+
+		switch (msg->tag)
+		{
+			case 'g':
+				{
+					ProposerGreeting *m = (ProposerGreeting *) msg;
+
+					pq_send_ascii_string(buf, m->tenant_id);
+					pq_send_ascii_string(buf, m->timeline_id);
+					MembershipConfigurationSerialize(&m->mconf, buf);
+					pq_sendint32(buf, m->pg_version);
+					pq_sendint64(buf, m->system_id);
+					pq_sendint32(buf, m->wal_seg_size);
+					break;
+				}
+			case 'v':
+				{
+					VoteRequest *m = (VoteRequest *) msg;
+
+					pq_sendint32(buf, m->generation);
+					pq_sendint64(buf, m->term);
+					break;
+				}
+			case 'e':
+				{
+					ProposerElected *m = (ProposerElected *) msg;
+
+					pq_sendint32(buf, m->generation);
+					pq_sendint64(buf, m->term);
+					pq_sendint64(buf, m->startStreamingAt);
+					pq_sendint32(buf, m->termHistory->n_entries);
+					for (uint32 i = 0; i < m->termHistory->n_entries; i++)
+					{
+						pq_sendint64(buf, m->termHistory->entries[i].term);
+						pq_sendint64(buf, m->termHistory->entries[i].lsn);
+					}
+					break;
+				}
+			case 'a':
+				{
+					/*
+					 * Note: this serializes only AppendRequestHeader, caller
+					 * is expected to append WAL data later.
+					 */
+					AppendRequestHeader *m = (AppendRequestHeader *) msg;
+
+					pq_sendint32(buf, m->generation);
+					pq_sendint64(buf, m->term);
+					pq_sendint64(buf, m->beginLsn);
+					pq_sendint64(buf, m->endLsn);
+					pq_sendint64(buf, m->commitLsn);
+					pq_sendint64(buf, m->truncateLsn);
+					break;
+				}
+			default:
+				wp_log(FATAL, "unexpected message type %c to serialize", msg->tag);
+		}
+		return;
+	}
+
+	if (proto_version == 2)
+	{
+		switch (msg->tag)
+		{
+			case 'g':
+				{
+					/* v2 sent struct as is */
+					ProposerGreeting *m = (ProposerGreeting *) msg;
+					ProposerGreetingV2 greetRequestV2 = {0};
+
+					/* Fill v2 struct; ids left empty stay zeroed. */
+					greetRequestV2.tag = 'g';
+					greetRequestV2.protocolVersion = proto_version;
+					greetRequestV2.pgVersion = m->pg_version;
+
+					/*
+					 * v3 removed this field because it's easier to pass as
+					 * libpq or START_WAL_PUSH options
+					 */
+					memset(&greetRequestV2.proposerId, 0, sizeof(greetRequestV2.proposerId));
+					greetRequestV2.systemId = wp->config->systemId;
+					if (*m->timeline_id != '\0' &&
+						!HexDecodeString(greetRequestV2.timeline_id, m->timeline_id, 16))
+						wp_log(FATAL, "could not parse neon.timeline_id, %s", m->timeline_id);
+					if (*m->tenant_id != '\0' &&
+						!HexDecodeString(greetRequestV2.tenant_id, m->tenant_id, 16))
+						wp_log(FATAL, "could not parse neon.tenant_id, %s", m->tenant_id);
+
+					greetRequestV2.timeline = wp->config->pgTimeline;
+					greetRequestV2.walSegSize = wp->config->wal_segment_size;
+
+					pq_sendbytes(buf, (char *) &greetRequestV2, sizeof(greetRequestV2));
+					break;
+				}
+			case 'v':
+				{
+					/* v2 sent struct as is */
+					VoteRequest *m = (VoteRequest *) msg;
+					VoteRequestV2 voteRequestV2;
+
+					voteRequestV2.tag = m->pam.tag;
+					voteRequestV2.term = m->term;
+					/* removed field */
+					memset(&voteRequestV2.proposerId, 0, sizeof(voteRequestV2.proposerId));
+					pq_sendbytes(buf, (char *) &voteRequestV2, sizeof(voteRequestV2));
+					break;
+				}
+			case 'e':
+				{
+					ProposerElected *m = (ProposerElected *) msg;
+
+					pq_sendint64_le(buf, m->apm.tag);
+					pq_sendint64_le(buf, m->term);
+					pq_sendint64_le(buf, m->startStreamingAt);
+					pq_sendint32_le(buf, m->termHistory->n_entries);
+					for (uint32 i = 0; i < m->termHistory->n_entries; i++)
+					{
+						pq_sendint64_le(buf, m->termHistory->entries[i].term);
+						pq_sendint64_le(buf, m->termHistory->entries[i].lsn);
+					}
+					pq_sendint64_le(buf, 0);	/* removed timeline_start_lsn */
+					break;
+				}
+			case 'a':
+				{
+					/*
+					 * Note: this serializes only AppendRequestHeader, caller
+					 * is expected to append WAL data later.
+					 */
+					/* v2 sent struct as is */
+					AppendRequestHeader *m = (AppendRequestHeader *) msg;
+					AppendRequestHeaderV2 appendRequestHeaderV2;
+
+					appendRequestHeaderV2.tag = m->apm.tag;
+					appendRequestHeaderV2.term = m->term;
+					appendRequestHeaderV2.epochStartLsn = 0;	/* removed field */
+					appendRequestHeaderV2.beginLsn = m->beginLsn;
+					appendRequestHeaderV2.endLsn = m->endLsn;
+					appendRequestHeaderV2.commitLsn = m->commitLsn;
+					appendRequestHeaderV2.truncateLsn = m->truncateLsn;
+					/* removed field */
+					memset(&appendRequestHeaderV2.proposerId, 0, sizeof(appendRequestHeaderV2.proposerId));
+
+					pq_sendbytes(buf, (char *) &appendRequestHeaderV2, sizeof(appendRequestHeaderV2));
+					break;
+				}
+			default:
+				wp_log(FATAL, "unexpected message type %c to serialize", msg->tag);
+		}
+		return;
+	}
+	wp_log(FATAL, "unexpected proto_version %d", proto_version);
+}
+
 /*
 * Try to read CopyData message from i'th safekeeper, resetting connection on
 * failure.
@@ -1779,6 +1961,37 @@ AsyncRead(Safekeeper *sk, char **buf, int *buf_size)
 	return false;
 }

+/* Deserialize membership configuration from buf to mconf (pallocs arrays). */
+static void
+MembershipConfigurationDeserialize(MembershipConfiguration *mconf, StringInfo buf)
+{
+	uint32		i;
+
+	mconf->generation = pq_getmsgint32(buf);
+	mconf->members.len = pq_getmsgint32(buf);	/* taken from the wire as is */
+	mconf->members.m = palloc0(sizeof(SafekeeperId) * mconf->members.len);
+	for (i = 0; i < mconf->members.len; i++)
+	{
+		const char *buf_host;
+
+		mconf->members.m[i].node_id = pq_getmsgint64(buf);
+		buf_host = pq_getmsgrawstring(buf);	/* points into buf; copy it */
+		strlcpy(mconf->members.m[i].host, buf_host, sizeof(mconf->members.m[i].host));
+		mconf->members.m[i].port = pq_getmsgint16(buf);
+	}
+	mconf->new_members.len = pq_getmsgint32(buf);	/* taken from the wire as is */
+	mconf->new_members.m = palloc0(sizeof(SafekeeperId) * mconf->new_members.len);
+	for (i = 0; i < mconf->new_members.len; i++)
+	{
+		const char *buf_host;
+
+		mconf->new_members.m[i].node_id = pq_getmsgint64(buf);
+		buf_host = pq_getmsgrawstring(buf);	/* points into buf; copy it */
+		strlcpy(mconf->new_members.m[i].host, buf_host, sizeof(mconf->new_members.m[i].host));
+		mconf->new_members.m[i].port = pq_getmsgint16(buf);
+	}
+}
+
 /*
 * Read next message with known type into provided struct, by reading a CopyData
 * block from the safekeeper's postgres connection, returning whether the read
@@ -1787,6 +2000,8 @@ AsyncRead(Safekeeper *sk, char **buf, int *buf_size)
 * If the read needs more polling, we return 'false' and keep the state
 * unmodified, waiting until it becomes read-ready to try again. If it fully
 * failed, a warning is emitted and the connection is reset.
+ *
+ * Note: it pallocs if needed, i.e. for AcceptorGreeting and VoteResponse fields.
 */
 static bool
 AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg)
@@ -1795,82 +2010,154 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg)

 	char	   *buf;
 	int			buf_size;
-	uint64		tag;
+	uint8		tag;
 	StringInfoData s;

 	if (!(AsyncRead(sk, &buf, &buf_size)))
 		return false;
+	sk->latestMsgReceivedAt = wp->api.get_current_timestamp(wp);

 	/* parse it */
 	s.data = buf;
 	s.len = buf_size;
+	s.maxlen = buf_size;
 	s.cursor = 0;

-	tag = pq_getmsgint64_le(&s);
-	if (tag != anymsg->tag)
+	if (wp->config->proto_version == 3)
 	{
-		wp_log(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host,
-			   sk->port, FormatSafekeeperState(sk));
-		ResetConnection(sk);
-		return false;
-	}
-	sk->latestMsgReceivedAt = wp->api.get_current_timestamp(wp);
-	switch (tag)
-	{
-		case 'g':
-			{
-				AcceptorGreeting *msg = (AcceptorGreeting *) anymsg;
-
-				msg->term = pq_getmsgint64_le(&s);
-				msg->nodeId = pq_getmsgint64_le(&s);
-				pq_getmsgend(&s);
-				return true;
-			}
-
-		case 'v':
-			{
-				VoteResponse *msg = (VoteResponse *) anymsg;
-
-				msg->term = pq_getmsgint64_le(&s);
-				msg->voteGiven = pq_getmsgint64_le(&s);
-				msg->flushLsn = pq_getmsgint64_le(&s);
-				msg->truncateLsn = pq_getmsgint64_le(&s);
-				msg->termHistory.n_entries = pq_getmsgint32_le(&s);
-				msg->termHistory.entries = palloc(sizeof(TermSwitchEntry) * msg->termHistory.n_entries);
-				for (int i = 0; i < msg->termHistory.n_entries; i++)
+		tag = pq_getmsgbyte(&s);
+		if (tag != anymsg->tag)
+		{
+			wp_log(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host,
+				   sk->port, FormatSafekeeperState(sk));
+			ResetConnection(sk);
+			return false;
+		}
+		switch (tag)
+		{
+			case 'g':
 				{
-					msg->termHistory.entries[i].term = pq_getmsgint64_le(&s);
-					msg->termHistory.entries[i].lsn = pq_getmsgint64_le(&s);
+					AcceptorGreeting *msg = (AcceptorGreeting *) anymsg;
+
+					msg->nodeId = pq_getmsgint64(&s);
+					MembershipConfigurationDeserialize(&msg->mconf, &s);
+					msg->term = pq_getmsgint64(&s);
+					pq_getmsgend(&s);
+					return true;
 				}
-				msg->timelineStartLsn = pq_getmsgint64_le(&s);
-				pq_getmsgend(&s);
-				return true;
-			}
+			case 'v':
+				{
+					VoteResponse *msg = (VoteResponse *) anymsg;

-		case 'a':
-			{
-				AppendResponse *msg = (AppendResponse *) anymsg;
+					msg->generation = pq_getmsgint32(&s);
+					msg->term = pq_getmsgint64(&s);
+					msg->voteGiven = pq_getmsgbyte(&s);
+					msg->flushLsn = pq_getmsgint64(&s);
+					msg->truncateLsn = pq_getmsgint64(&s);
+					msg->termHistory.n_entries = pq_getmsgint32(&s);
+					msg->termHistory.entries = palloc(sizeof(TermSwitchEntry) * msg->termHistory.n_entries);
+					for (uint32 i = 0; i < msg->termHistory.n_entries; i++)
+					{
+						msg->termHistory.entries[i].term = pq_getmsgint64(&s);
+						msg->termHistory.entries[i].lsn = pq_getmsgint64(&s);
+					}
+					pq_getmsgend(&s);
+					return true;
+				}
+			case 'a':
+				{
+					AppendResponse *msg = (AppendResponse *) anymsg;

-				msg->term = pq_getmsgint64_le(&s);
-				msg->flushLsn = pq_getmsgint64_le(&s);
-				msg->commitLsn = pq_getmsgint64_le(&s);
-				msg->hs.ts = pq_getmsgint64_le(&s);
-				msg->hs.xmin.value = pq_getmsgint64_le(&s);
-				msg->hs.catalog_xmin.value = pq_getmsgint64_le(&s);
-				if (s.len > s.cursor)
-					ParsePageserverFeedbackMessage(wp, &s, &msg->ps_feedback);
-				else
-					msg->ps_feedback.present = false;
-				pq_getmsgend(&s);
-				return true;
-			}
-
-		default:
-			{
-				Assert(false);
-				return false;
-			}
+					msg->generation = pq_getmsgint32(&s);
+					msg->term = pq_getmsgint64(&s);
+					msg->flushLsn = pq_getmsgint64(&s);
+					msg->commitLsn = pq_getmsgint64(&s);
+					msg->hs.ts = pq_getmsgint64(&s);
+					msg->hs.xmin.value = pq_getmsgint64(&s);
+					msg->hs.catalog_xmin.value = pq_getmsgint64(&s);
+					if (s.len > s.cursor)
+						ParsePageserverFeedbackMessage(wp, &s, &msg->ps_feedback);
+					else
+						msg->ps_feedback.present = false;
+					pq_getmsgend(&s);
+					return true;
+				}
+			default:
+				{
+					wp_log(FATAL, "unexpected message tag %c to read", (char) tag);
+					return false;
+				}
+		}
 	}
+	else if (wp->config->proto_version == 2)
+	{
+		tag = pq_getmsgint64_le(&s);
+		if (tag != anymsg->tag)
+		{
+			wp_log(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host,
+				   sk->port, FormatSafekeeperState(sk));
+			ResetConnection(sk);
+			return false;
+		}
+		switch (tag)
+		{
+			case 'g':
+				{
+					AcceptorGreeting *msg = (AcceptorGreeting *) anymsg;
+
+					msg->term = pq_getmsgint64_le(&s);
+					msg->nodeId = pq_getmsgint64_le(&s);
+					pq_getmsgend(&s);
+					return true;
+				}
+
+			case 'v':
+				{
+					VoteResponse *msg = (VoteResponse *) anymsg;
+
+					msg->term = pq_getmsgint64_le(&s);
+					msg->voteGiven = pq_getmsgint64_le(&s);
+					msg->flushLsn = pq_getmsgint64_le(&s);
+					msg->truncateLsn = pq_getmsgint64_le(&s);
+					msg->termHistory.n_entries = pq_getmsgint32_le(&s);
+					msg->termHistory.entries = palloc(sizeof(TermSwitchEntry) * msg->termHistory.n_entries);
+					for (int i = 0; i < msg->termHistory.n_entries; i++)
+					{
+						msg->termHistory.entries[i].term = pq_getmsgint64_le(&s);
+						msg->termHistory.entries[i].lsn = pq_getmsgint64_le(&s);
+					}
+					pq_getmsgint64_le(&s);	/* timelineStartLsn */
+					pq_getmsgend(&s);
+					return true;
+				}
+
+			case 'a':
+				{
+					AppendResponse *msg = (AppendResponse *) anymsg;
+
+					msg->term = pq_getmsgint64_le(&s);
+					msg->flushLsn = pq_getmsgint64_le(&s);
+					msg->commitLsn = pq_getmsgint64_le(&s);
+					msg->hs.ts = pq_getmsgint64_le(&s);
+					msg->hs.xmin.value = pq_getmsgint64_le(&s);
+					msg->hs.catalog_xmin.value = pq_getmsgint64_le(&s);
+					if (s.len > s.cursor)
+						ParsePageserverFeedbackMessage(wp, &s, &msg->ps_feedback);
+					else
+						msg->ps_feedback.present = false;
+					pq_getmsgend(&s);
+					return true;
+				}
+
+			default:
+				{
+					wp_log(FATAL, "unexpected message tag %c to read", (char) tag);
+					return false;
+				}
+		}
+	}
+	wp_log(FATAL, "unsupported proto_version %d", wp->config->proto_version);
+	return false; /* keep the compiler quiet */
 }

 /*
@@ -2246,3 +2533,45 @@ FormatEvents(WalProposer *wp, uint32 events)

 	return (char *) &return_str;
 }
+
+/* Dump mconf as toml for observability / debugging. Result is palloc'ed. */
+static char *
+MembershipConfigurationToString(MembershipConfiguration *mconf)
+{
+	StringInfoData s;
+	uint32		i;
+
+	initStringInfo(&s);
+	appendStringInfo(&s, "{gen = %u", mconf->generation);
+	appendStringInfoString(&s, ", members = [");
+	for (i = 0; i < mconf->members.len; i++)
+	{
+		if (i > 0)
+			appendStringInfoString(&s, ", ");
+		appendStringInfo(&s, "{node_id = " UINT64_FORMAT, mconf->members.m[i].node_id);
+		appendStringInfo(&s, ", host = %s", mconf->members.m[i].host);
+		appendStringInfo(&s, ", port = %u }", mconf->members.m[i].port);
+	}
+	appendStringInfoString(&s, "], new_members = [");
+	for (i = 0; i < mconf->new_members.len; i++)
+	{
+		if (i > 0)
+			appendStringInfoString(&s, ", ");
+		appendStringInfo(&s, "{node_id = " UINT64_FORMAT, mconf->new_members.m[i].node_id);
+		appendStringInfo(&s, ", host = %s", mconf->new_members.m[i].host);
+		appendStringInfo(&s, ", port = %u }", mconf->new_members.m[i].port);
+	}
+	appendStringInfoString(&s, "]}");
+	return s.data;
+}
+
+static void
+MembershipConfigurationFree(MembershipConfiguration *mconf)
+{
+	/* Release both member arrays; pointers are reset so a repeat call is safe. */
+	if (mconf->members.m != NULL)
+		pfree(mconf->members.m);
+	if (mconf->new_members.m != NULL)
+		pfree(mconf->new_members.m);
+	mconf->members.m = mconf->new_members.m = NULL;
+}
--- a/pgxn/neon/walproposer.h
+++ b/pgxn/neon/walproposer.h
@@ -12,9 +12,6 @@
 #include "neon_walreader.h"
 #include "pagestore_client.h"

-#define SK_MAGIC 0xCafeCeefu
-#define SK_PROTOCOL_VERSION 2
-
 #define MAX_SAFEKEEPERS 32
 #define MAX_SEND_SIZE (XLOG_BLCKSZ * 16)	/* max size of a single* WAL
 											 * message */
@@ -143,12 +140,71 @@ typedef uint64 term_t;
 /* neon storage node id */
 typedef uint64 NNodeId;

+/*
+ * Number uniquely identifying safekeeper membership configuration.
+ * This and following structs pair ones in membership.rs.
+ */
+typedef uint32 Generation;
+
+typedef struct SafekeeperId
+{
+	NNodeId		node_id;		/* neon storage node id */
+	char		host[MAXCONNINFO];	/* NUL-terminated host string */
+	uint16		port;
+} SafekeeperId;
+
+/* Set of safekeepers: len entries stored in the array m. */
+typedef struct MemberSet
+{
+	uint32		len;			/* number of members */
+	SafekeeperId *m;			/* ids themselves; NULL or palloc'ed */
+} MemberSet;
+
+/* Timeline safekeeper membership configuration. */
+typedef struct MembershipConfiguration
+{
+	Generation	generation;
+	MemberSet	members;
+	/* Has zero len in non joint conf. */
+	MemberSet	new_members;
+} MembershipConfiguration;
+
 /*
 * Proposer <-> Acceptor messaging.
 */

+typedef struct ProposerAcceptorMessage
+{
+	uint8		tag;
+} ProposerAcceptorMessage;
+
 /* Initial Proposer -> Acceptor message */
 typedef struct ProposerGreeting
+{
+	ProposerAcceptorMessage pam;	/* message tag */
+
+	/*
+	 * tenant/timeline ids as C strings with standard hex notation for ease of
+	 * printing. In principle they are not strictly needed as ttid is also
+	 * passed as libpq options.
+	 */
+	char	   *tenant_id;
+	char	   *timeline_id;
+	/* Full conf is carried to allow safekeeper switch */
+	MembershipConfiguration mconf;
+
+	/*
+	 * pg_version and wal_seg_size are used for timeline creation until we
+	 * fully migrate to doing externally. systemId is only used as a sanity
+	 * cross check.
+	 */
+	uint32		pg_version;		/* in PG_VERSION_NUM format */
+	uint64		system_id;		/* Postgres system identifier. */
+	uint32		wal_seg_size;
+} ProposerGreeting;
+
+/* protocol v2 variant, kept while wp supports it */
+typedef struct ProposerGreetingV2
 {
 	uint64		tag;			/* message tag */
 	uint32		protocolVersion;	/* proposer-safekeeper protocol version */
@@ -159,32 +215,42 @@ typedef struct ProposerGreeting
 	uint8		tenant_id[16];
 	TimeLineID	timeline;
 	uint32		walSegSize;
-} ProposerGreeting;
+} ProposerGreetingV2;

 typedef struct AcceptorProposerMessage
 {
-	uint64		tag;
+	uint8		tag;
 } AcceptorProposerMessage;

 /*
- * Acceptor -> Proposer initial response: the highest term acceptor voted for.
+ * Acceptor -> Proposer initial response: the highest term acceptor voted for,
+ * its node id and configuration.
 */
 typedef struct AcceptorGreeting
 {
 	AcceptorProposerMessage apm;
-	term_t		term;
 	NNodeId		nodeId;
+	MembershipConfiguration mconf;
+	term_t		term;
 } AcceptorGreeting;

 /*
 * Proposer -> Acceptor vote request.
 */
 typedef struct VoteRequest
+{
+	ProposerAcceptorMessage pam;	/* message tag */
+	Generation	generation;		/* membership conf generation */
+	term_t		term;
+} VoteRequest;
+
+/* protocol v2 variant, kept while wp supports it */
+typedef struct VoteRequestV2
 {
 	uint64		tag;
 	term_t		term;
 	pg_uuid_t	proposerId;		/* for monitoring/debugging */
-} VoteRequest;
+} VoteRequestV2;

 /* Element of term switching chain. */
 typedef struct TermSwitchEntry
@@ -203,8 +269,15 @@ typedef struct TermHistory
 typedef struct VoteResponse
 {
 	AcceptorProposerMessage apm;
+
+	/*
+	 * Membership conf generation. It's not strictly required because on
+	 * mismatch safekeeper is expected to ERROR the connection, but let's
+	 * sanity check it.
+	 */
+	Generation	generation;
 	term_t		term;
-	uint64		voteGiven;
+	uint8		voteGiven;

 	/*
 	 * Safekeeper flush_lsn (end of WAL) + history of term switches allow
@@ -214,7 +287,6 @@ typedef struct VoteResponse
 	XLogRecPtr	truncateLsn;	/* minimal LSN which may be needed for*
 								 * recovery of some safekeeper */
 	TermHistory termHistory;
-	XLogRecPtr	timelineStartLsn;	/* timeline globally starts at this LSN */
 } VoteResponse;

 /*
@@ -223,20 +295,37 @@ typedef struct VoteResponse
 */
 typedef struct ProposerElected
 {
-	uint64		tag;
+	AcceptorProposerMessage apm;
+	Generation	generation;		/* membership conf generation */
 	term_t		term;
 	/* proposer will send since this point */
 	XLogRecPtr	startStreamingAt;
 	/* history of term switches up to this proposer */
 	TermHistory *termHistory;
-	/* timeline globally starts at this LSN */
-	XLogRecPtr	timelineStartLsn;
 } ProposerElected;

 /*
 * Header of request with WAL message sent from proposer to safekeeper.
 */
 typedef struct AppendRequestHeader
+{
+	AcceptorProposerMessage apm;	/* tag; NOTE(review): proposer -> acceptor message, ProposerAcceptorMessage intended? */
+	Generation	generation;		/* membership conf generation */
+	term_t		term;			/* term of the proposer */
+	XLogRecPtr	beginLsn;		/* start position of message in WAL */
+	XLogRecPtr	endLsn;			/* end position of message in WAL */
+	XLogRecPtr	commitLsn;		/* LSN committed by quorum of safekeepers */
+
+	/*
+	 * minimal LSN which may be needed for recovery of some safekeeper (end
+	 * lsn + 1 of last chunk streamed to everyone)
+	 */
+	XLogRecPtr	truncateLsn;
+	/* in the AppendRequest message, WAL data follows */
+} AppendRequestHeader;
+
+/* protocol v2 variant, kept while wp supports it */
+typedef struct AppendRequestHeaderV2
 {
 	uint64		tag;
 	term_t		term;			/* term of the proposer */
@@ -256,7 +345,8 @@ typedef struct AppendRequestHeader
 	 */
 	XLogRecPtr	truncateLsn;
 	pg_uuid_t	proposerId;		/* for monitoring/debugging */
-} AppendRequestHeader;
+	/* in the AppendRequest message, WAL data follows */
+} AppendRequestHeaderV2;

 /*
 * Hot standby feedback received from replica
@@ -309,6 +399,13 @@ typedef struct AppendResponse
 {
 	AcceptorProposerMessage apm;

+	/*
+	 * Membership conf generation. It's not strictly required because on
+	 * mismatch safekeeper is expected to ERROR the connection, but let's
+	 * sanity check it.
+	 */
+	Generation	generation;
+
 	/*
 	 * Current term of the safekeeper; if it is higher than proposer's, the
 	 * compute is out of date.
@@ -644,6 +741,8 @@ typedef struct WalProposerConfig
 	/* Will be passed to safekeepers in greet request. */
 	TimeLineID	pgTimeline;

+	int			proto_version;
+
 #ifdef WALPROPOSER_LIB
 	void	   *callback_data;
 #endif
@@ -656,11 +755,14 @@ typedef struct WalProposerConfig
 typedef struct WalProposer
 {
 	WalProposerConfig *config;
-	int			n_safekeepers;
+	/* Current walproposer membership configuration */
+	MembershipConfiguration mconf;

 	/* (n_safekeepers / 2) + 1 */
 	int			quorum;

+	/* Number of occupied slots in safekeeper[] */
+	int			n_safekeepers;
 	Safekeeper	safekeeper[MAX_SAFEKEEPERS];

 	/* WAL has been generated up to this point */
@@ -670,6 +772,7 @@ typedef struct WalProposer
 	XLogRecPtr	commitLsn;

 	ProposerGreeting greetRequest;
+	ProposerGreetingV2 greetRequestV2;

 	/* Vote request for safekeeper */
 	VoteRequest voteRequest;
--- a/pgxn/neon/walproposer_compat.c
+++ b/pgxn/neon/walproposer_compat.c
@@ -117,14 +117,13 @@ pq_getmsgbytes(StringInfo msg, int datalen)
 }

 /* --------------------------------
- *		pq_getmsgstring - get a null-terminated text string (with conversion)
+ *		pq_getmsgrawstring - get a null-terminated text string - NO conversion
 *
- *		May return a pointer directly into the message buffer, or a pointer
- *		to a palloc'd conversion result.
+ *		Returns a pointer directly into the message buffer.
 * --------------------------------
 */
 const char *
-pq_getmsgstring(StringInfo msg)
+pq_getmsgrawstring(StringInfo msg)
 {
 	char	   *str;
 	int			slen;
@@ -155,6 +154,45 @@ pq_getmsgend(StringInfo msg)
 		ExceptionalCondition("invalid msg format", __FILE__, __LINE__);
 }

+/* --------------------------------
+ *		pq_sendbytes	- append raw data to a StringInfo buffer
+ * --------------------------------
+ */
+void
+pq_sendbytes(StringInfo buf, const void *data, int datalen)
+{
+	/* use variant that maintains a trailing null-byte, out of caution */
+	appendBinaryStringInfo(buf, data, datalen);
+}
+
+/* --------------------------------
+ *		pq_send_ascii_string	- append a null-terminated text string (without conversion)
+ *
+ * This function intentionally bypasses encoding conversion, instead just
+ * silently replacing any non-7-bit-ASCII characters with question marks.
+ * In walproposer it serializes strings of protocol messages (tenant and
+ * timeline ids, safekeeper hosts) rather than error messages to a client.
+ * The caller should already have taken measures to ensure the string is
+ * just ASCII; the extra work here is just to make certain we never put a
+ * badly encoded string on the wire.
+ *
+ * NB: passed text string must be null-terminated, and so is the data
+ * appended to buf.
+ * --------------------------------
+ */
+void
+pq_send_ascii_string(StringInfo buf, const char *str)
+{
+	while (*str)
+	{
+		char		ch = *str++;
+
+		if (IS_HIGHBIT_SET(ch))
+			ch = '?';
+		appendStringInfoCharMacro(buf, ch);
+	}
+	appendStringInfoChar(buf, '\0');
+}

 /*
 * Produce a C-string representation of a TimestampTz.
--- a/pgxn/neon/walproposer_pg.c
+++ b/pgxn/neon/walproposer_pg.c
@@ -59,9 +59,11 @@

 #define WAL_PROPOSER_SLOT_NAME "wal_proposer_slot"

+/* GUCs */
 char	   *wal_acceptors_list = "";
 int			wal_acceptor_reconnect_timeout = 1000;
 int			wal_acceptor_connection_timeout = 10000;
+int			safekeeper_proto_version = 2;

 /* Set to true in the walproposer bgw. */
 static bool am_walproposer;
@@ -126,6 +128,7 @@ init_walprop_config(bool syncSafekeepers)
 	else
 		walprop_config.systemId = 0;
 	walprop_config.pgTimeline = walprop_pg_get_timeline_id();
+	walprop_config.proto_version = safekeeper_proto_version;
 }

 /*
@@ -219,25 +222,37 @@ nwp_register_gucs(void)
 							PGC_SIGHUP,
 							GUC_UNIT_MS,
 							NULL, NULL, NULL);
+
+	DefineCustomIntVariable(
+							"neon.safekeeper_proto_version",
+							"Version of compute <-> safekeeper protocol.",
+							"Used while migrating from 2 to 3.",
+							&safekeeper_proto_version,
+							2, 0, INT_MAX,
+							PGC_POSTMASTER,
+							0,
+							NULL, NULL, NULL);
 }


 static int
 split_safekeepers_list(char *safekeepers_list, char *safekeepers[])
 {
-	int n_safekeepers = 0;
-	char *curr_sk = safekeepers_list;
+	int			n_safekeepers = 0;
+	char	   *curr_sk = safekeepers_list;

 	for (char *coma = safekeepers_list; coma != NULL && *coma != '\0'; curr_sk = coma)
 	{
-		if (++n_safekeepers >= MAX_SAFEKEEPERS) {
+		if (++n_safekeepers >= MAX_SAFEKEEPERS)
+		{
 			wpg_log(FATAL, "too many safekeepers");
 		}

 		coma = strchr(coma, ',');
-		safekeepers[n_safekeepers-1] = curr_sk;
+		safekeepers[n_safekeepers - 1] = curr_sk;

-		if (coma != NULL) {
+		if (coma != NULL)
+		{
 			*coma++ = '\0';
 		}
 	}
@@ -252,10 +267,10 @@ split_safekeepers_list(char *safekeepers_list, char *safekeepers[])
 static bool
 safekeepers_cmp(char *old, char *new)
 {
-	char *safekeepers_old[MAX_SAFEKEEPERS];
-	char *safekeepers_new[MAX_SAFEKEEPERS];
-	int len_old = 0;
-	int len_new = 0;
+	char	   *safekeepers_old[MAX_SAFEKEEPERS];
+	char	   *safekeepers_new[MAX_SAFEKEEPERS];
+	int			len_old = 0;
+	int			len_new = 0;

 	len_old = split_safekeepers_list(old, safekeepers_old);
 	len_new = split_safekeepers_list(new, safekeepers_new);
@@ -292,7 +307,8 @@ assign_neon_safekeepers(const char *newval, void *extra)
 	if (!am_walproposer)
 		return;

-	if (!newval) {
+	if (!newval)
+	{
 		/* should never happen */
 		wpg_log(FATAL, "neon.safekeepers is empty");
 	}
@@ -301,11 +317,11 @@ assign_neon_safekeepers(const char *newval, void *extra)
 	newval_copy = pstrdup(newval);
 	oldval = pstrdup(wal_acceptors_list);

-	/* 
+	/*
 	 * TODO: restarting through FATAL is stupid and introduces 1s delay before
-	 * next bgw start. We should refactor walproposer to allow graceful exit and
-	 * thus remove this delay.
-	 * XXX: If you change anything here, sync with test_safekeepers_reconfigure_reorder.
+	 * next bgw start. We should refactor walproposer to allow graceful exit
+	 * and thus remove this delay. XXX: If you change anything here, sync with
+	 * test_safekeepers_reconfigure_reorder.
 	 */
 	if (!safekeepers_cmp(oldval, newval_copy))
 	{
@@ -454,7 +470,8 @@ backpressure_throttling_impl(void)
 	memcpy(new_status, old_status, len);
 	snprintf(new_status + len, 64, "backpressure throttling: lag %lu", lag);
 	set_ps_display(new_status);
-	new_status[len] = '\0'; /* truncate off " backpressure ..." to later reset the ps */
+	new_status[len] = '\0';		/* truncate off " backpressure ..." to later
+								 * reset the ps */

 	elog(DEBUG2, "backpressure throttling: lag %lu", lag);
 	start = GetCurrentTimestamp();
@@ -621,7 +638,7 @@ walprop_pg_start_streaming(WalProposer *wp, XLogRecPtr startpos)
 	wpg_log(LOG, "WAL proposer starts streaming at %X/%X",
 			LSN_FORMAT_ARGS(startpos));
 	cmd.slotname = WAL_PROPOSER_SLOT_NAME;
-	cmd.timeline = wp->greetRequest.timeline;
+	cmd.timeline = wp->config->pgTimeline;
 	cmd.startpoint = startpos;
 	StartProposerReplication(wp, &cmd);
 }
@@ -1963,10 +1980,11 @@ walprop_pg_process_safekeeper_feedback(WalProposer *wp, Safekeeper *sk)
 		FullTransactionId xmin = hsFeedback.xmin;
 		FullTransactionId catalog_xmin = hsFeedback.catalog_xmin;
 		FullTransactionId next_xid = ReadNextFullTransactionId();
+
 		/*
-		 * Page server is updating nextXid in checkpoint each 1024 transactions,
-		 * so feedback xmin can be actually larger then nextXid and
-		 * function TransactionIdInRecentPast return false in this case,
+		 * Page server is updating nextXid in checkpoint each 1024
+		 * transactions, so feedback xmin can be actually larger than nextXid
+		 * and function TransactionIdInRecentPast returns false in this case,
 		 * preventing update of slot's xmin.
 		 */
 		if (FullTransactionIdPrecedes(next_xid, xmin))
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -24,9 +24,9 @@ bytes = { workspace = true, features = ["serde"] }
 camino.workspace = true
 chrono.workspace = true
 clap = { workspace = true, features = ["derive", "env"] }
+clashmap.workspace = true
 compute_api.workspace = true
 consumption_metrics.workspace = true
-dashmap.workspace = true
 env_logger.workspace = true
 framed-websockets.workspace = true
 futures.workspace = true
--- a/proxy/README.md
+++ b/proxy/README.md
@@ -106,17 +106,7 @@ cases where it is hard to use rows represented as objects (e.g. when several fie

 Proxy determines project name from the subdomain, request to the `round-rice-566201.somedomain.tld` will be routed to the project named `round-rice-566201`. Unfortunately, `/etc/hosts` does not support domain wildcards, so we can use *.localtest.me` which resolves to `127.0.0.1`.

-Let's create self-signed certificate by running:
-```sh
-openssl req -new -x509 -days 365 -nodes -text -out server.crt -keyout server.key -subj "/CN=*.localtest.me"
-```
-
-Then we need to build proxy with 'testing' feature and run, e.g.:
-```sh
-RUST_LOG=proxy cargo run -p proxy --bin proxy --features testing -- --auth-backend postgres --auth-endpoint 'postgresql://proxy:password@endpoint.localtest.me:5432/postgres' --is-private-access-proxy true -c server.crt -k server.key
-```
-
-We will also need to have a postgres instance. Assuming that we have setted up docker we can set it up as follows:
+We will need to have a postgres instance. Assuming that we have set up docker we can set it up as follows:
 ```sh
 docker run \
  --detach \
@@ -133,8 +123,18 @@ docker exec -it proxy-postgres psql -U postgres -c "CREATE TABLE neon_control_pl
 docker exec -it proxy-postgres psql -U postgres -c "CREATE ROLE proxy WITH SUPERUSER LOGIN PASSWORD 'password';"
 ```

+Let's create a self-signed certificate by running:
+```sh
+openssl req -new -x509 -days 365 -nodes -text -out server.crt -keyout server.key -subj "/CN=*.localtest.me"
+```
+
+Then we need to build proxy with 'testing' feature and run, e.g.:
+```sh
+RUST_LOG=proxy cargo run -p proxy --bin proxy --features testing -- --auth-backend postgres --auth-endpoint 'postgresql://postgres:proxy-postgres@127.0.0.1:5432/postgres' -c server.crt -k server.key
+```
+
 Now from client you can start a new session:

 ```sh
 PGSSLROOTCERT=./server.crt psql  "postgresql://proxy:password@endpoint.localtest.me:4432/postgres?sslmode=verify-full"
-```
+```
--- a/proxy/src/auth/backend/console_redirect.rs
+++ b/proxy/src/auth/backend/console_redirect.rs
@@ -7,8 +7,8 @@ use thiserror::Error;
 use tokio::io::{AsyncRead, AsyncWrite};
 use tracing::{info, info_span};

-use super::{ComputeCredentialKeys, ControlPlaneApi};
-use crate::auth::backend::{BackendIpAllowlist, ComputeUserInfo};
+use super::ComputeCredentialKeys;
+use crate::auth::backend::ComputeUserInfo;
 use crate::auth::IpPattern;
 use crate::cache::Cached;
 use crate::config::AuthenticationConfig;
@@ -84,26 +84,15 @@ pub(crate) fn new_psql_session_id() -> String {
    hex::encode(rand::random::<[u8; 8]>())
 }

-#[async_trait]
-impl BackendIpAllowlist for ConsoleRedirectBackend {
-    async fn get_allowed_ips(
-        &self,
-        ctx: &RequestContext,
-        user_info: &ComputeUserInfo,
-    ) -> auth::Result<Vec<auth::IpPattern>> {
-        self.api
-            .get_allowed_ips_and_secret(ctx, user_info)
-            .await
-            .map(|(ips, _)| ips.as_ref().clone())
-            .map_err(|e| e.into())
-    }
-}
-
 impl ConsoleRedirectBackend {
    pub fn new(console_uri: reqwest::Url, api: cplane_proxy_v1::NeonControlPlaneClient) -> Self {
        Self { console_uri, api }
    }

+    pub(crate) fn get_api(&self) -> &cplane_proxy_v1::NeonControlPlaneClient {
+        &self.api
+    }
+
    pub(crate) async fn authenticate(
        &self,
        ctx: &RequestContext,
@@ -191,6 +180,15 @@ async fn authenticate(
        }
    }

+    // Check if the access over the public internet is allowed, otherwise block. Note that
+    // the console redirect is not behind the VPC service endpoint, so we don't need to check
+    // the VPC endpoint ID.
+    if let Some(public_access_allowed) = db_info.public_access_allowed {
+        if !public_access_allowed {
+            return Err(auth::AuthError::NetworkNotAllowed);
+        }
+    }
+
    client.write_message_noflush(&Be::NoticeResponse("Connecting to database."))?;

    // This config should be self-contained, because we won't
--- a/proxy/src/auth/backend/jwt.rs
+++ b/proxy/src/auth/backend/jwt.rs
@@ -4,7 +4,7 @@ use std::sync::Arc;
 use std::time::{Duration, SystemTime};

 use arc_swap::ArcSwapOption;
-use dashmap::DashMap;
+use clashmap::ClashMap;
 use jose_jwk::crypto::KeyInfo;
 use reqwest::{redirect, Client};
 use reqwest_retry::policies::ExponentialBackoff;
@@ -64,7 +64,7 @@ pub(crate) struct AuthRule {
 pub struct JwkCache {
    client: reqwest_middleware::ClientWithMiddleware,

-    map: DashMap<(EndpointId, RoleName), Arc<JwkCacheEntryLock>>,
+    map: ClashMap<(EndpointId, RoleName), Arc<JwkCacheEntryLock>>,
 }

 pub(crate) struct JwkCacheEntry {
@@ -469,7 +469,7 @@ impl Default for JwkCache {

        JwkCache {
            client,
-            map: DashMap::default(),
+            map: ClashMap::default(),
        }
    }
 }
--- a/proxy/src/auth/backend/mod.rs
+++ b/proxy/src/auth/backend/mod.rs
@@ -26,10 +26,12 @@ use crate::context::RequestContext;
 use crate::control_plane::client::ControlPlaneClient;
 use crate::control_plane::errors::GetAuthInfoError;
 use crate::control_plane::{
-    self, AuthSecret, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, ControlPlaneApi,
+    self, AccessBlockerFlags, AuthSecret, CachedAccessBlockerFlags, CachedAllowedIps,
+    CachedAllowedVpcEndpointIds, CachedNodeInfo, CachedRoleSecret, ControlPlaneApi,
 };
 use crate::intern::EndpointIdInt;
 use crate::metrics::Metrics;
+use crate::protocol2::ConnectionInfoExtra;
 use crate::proxy::connect_compute::ComputeConnectBackend;
 use crate::proxy::NeonOptions;
 use crate::rate_limiter::{BucketRateLimiter, EndpointRateLimiter};
@@ -99,6 +101,13 @@ impl<T> Backend<'_, T> {
            Self::Local(l) => Backend::Local(MaybeOwned::Borrowed(l)),
        }
    }
+
+    pub(crate) fn get_api(&self) -> &ControlPlaneClient {
+        match self {
+            Self::ControlPlane(api, _) => api,
+            Self::Local(_) => panic!("Local backend has no API"),
+        }
+    }
 }

 impl<'a, T> Backend<'a, T> {
@@ -247,15 +256,6 @@ impl AuthenticationConfig {
    }
 }

-#[async_trait::async_trait]
-pub(crate) trait BackendIpAllowlist {
-    async fn get_allowed_ips(
-        &self,
-        ctx: &RequestContext,
-        user_info: &ComputeUserInfo,
-    ) -> auth::Result<Vec<auth::IpPattern>>;
-}
-
 /// True to its name, this function encapsulates our current auth trade-offs.
 /// Here, we choose the appropriate auth flow based on circumstances.
 ///
@@ -282,23 +282,51 @@ async fn auth_quirks(
        Ok(info) => (info, None),
    };

-    debug!("fetching user's authentication info");
-    let (allowed_ips, maybe_secret) = api.get_allowed_ips_and_secret(ctx, &info).await?;
+    debug!("fetching authentication info and allowlists");

    // check allowed list
-    if config.ip_allowlist_check_enabled
-        && !check_peer_addr_is_in_list(&ctx.peer_addr(), &allowed_ips)
-    {
-        return Err(auth::AuthError::ip_address_not_allowed(ctx.peer_addr()));
+    let allowed_ips = if config.ip_allowlist_check_enabled {
+        let allowed_ips = api.get_allowed_ips(ctx, &info).await?;
+        if !check_peer_addr_is_in_list(&ctx.peer_addr(), &allowed_ips) {
+            return Err(auth::AuthError::ip_address_not_allowed(ctx.peer_addr()));
+        }
+        allowed_ips
+    } else {
+        Cached::new_uncached(Arc::new(vec![]))
+    };
+
+    // check if a VPC endpoint ID is coming in and if yes, if it's allowed
+    let access_blocks = api.get_block_public_or_vpc_access(ctx, &info).await?;
+    if config.is_vpc_acccess_proxy {
+        if access_blocks.vpc_access_blocked {
+            return Err(AuthError::NetworkNotAllowed);
+        }
+
+        let incoming_vpc_endpoint_id = match ctx.extra() {
+            None => return Err(AuthError::MissingVPCEndpointId),
+            Some(ConnectionInfoExtra::Aws { vpce_id }) => {
+                // Convert the vpce_id bytes to a string; invalid UTF-8 yields an empty ID
+                String::from_utf8(vpce_id.to_vec()).unwrap_or_default()
+            }
+            Some(ConnectionInfoExtra::Azure { link_id }) => link_id.to_string(),
+        };
+        let allowed_vpc_endpoint_ids = api.get_allowed_vpc_endpoint_ids(ctx, &info).await?;
+        // TODO: For now an empty VPC endpoint ID list means all are allowed. We should replace that.
+        if !allowed_vpc_endpoint_ids.is_empty()
+            && !allowed_vpc_endpoint_ids.contains(&incoming_vpc_endpoint_id)
+        {
+            return Err(AuthError::vpc_endpoint_id_not_allowed(
+                incoming_vpc_endpoint_id,
+            ));
+        }
+    } else if access_blocks.public_access_blocked {
+        return Err(AuthError::NetworkNotAllowed);
    }

    if !endpoint_rate_limiter.check(info.endpoint.clone().into(), 1) {
        return Err(AuthError::too_many_connections());
    }
-    let cached_secret = match maybe_secret {
-        Some(secret) => secret,
-        None => api.get_role_secret(ctx, &info).await?,
-    };
+    let cached_secret = api.get_role_secret(ctx, &info).await?;
    let (cached_entry, secret) = cached_secret.take_value();

    let secret = if let Some(secret) = secret {
@@ -440,34 +468,38 @@ impl Backend<'_, ComputeUserInfo> {
        }
    }

-    pub(crate) async fn get_allowed_ips_and_secret(
+    pub(crate) async fn get_allowed_ips(
        &self,
        ctx: &RequestContext,
-    ) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), GetAuthInfoError> {
+    ) -> Result<CachedAllowedIps, GetAuthInfoError> {
        match self {
-            Self::ControlPlane(api, user_info) => {
-                api.get_allowed_ips_and_secret(ctx, user_info).await
-            }
-            Self::Local(_) => Ok((Cached::new_uncached(Arc::new(vec![])), None)),
+            Self::ControlPlane(api, user_info) => api.get_allowed_ips(ctx, user_info).await,
+            Self::Local(_) => Ok(Cached::new_uncached(Arc::new(vec![]))),
        }
    }
-}

-#[async_trait::async_trait]
-impl BackendIpAllowlist for Backend<'_, ()> {
-    async fn get_allowed_ips(
+    pub(crate) async fn get_allowed_vpc_endpoint_ids(
        &self,
        ctx: &RequestContext,
-        user_info: &ComputeUserInfo,
-    ) -> auth::Result<Vec<auth::IpPattern>> {
-        let auth_data = match self {
-            Self::ControlPlane(api, ()) => api.get_allowed_ips_and_secret(ctx, user_info).await,
-            Self::Local(_) => Ok((Cached::new_uncached(Arc::new(vec![])), None)),
-        };
+    ) -> Result<CachedAllowedVpcEndpointIds, GetAuthInfoError> {
+        match self {
+            Self::ControlPlane(api, user_info) => {
+                api.get_allowed_vpc_endpoint_ids(ctx, user_info).await
+            }
+            Self::Local(_) => Ok(Cached::new_uncached(Arc::new(vec![]))),
+        }
+    }

-        auth_data
-            .map(|(ips, _)| ips.as_ref().clone())
-            .map_err(|e| e.into())
+    pub(crate) async fn get_block_public_or_vpc_access(
+        &self,
+        ctx: &RequestContext,
+    ) -> Result<CachedAccessBlockerFlags, GetAuthInfoError> {
+        match self {
+            Self::ControlPlane(api, user_info) => {
+                api.get_block_public_or_vpc_access(ctx, user_info).await
+            }
+            Self::Local(_) => Ok(Cached::new_uncached(AccessBlockerFlags::default())),
+        }
    }
 }

@@ -514,7 +546,10 @@ mod tests {
    use crate::auth::{ComputeUserInfoMaybeEndpoint, IpPattern};
    use crate::config::AuthenticationConfig;
    use crate::context::RequestContext;
-    use crate::control_plane::{self, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret};
+    use crate::control_plane::{
+        self, AccessBlockerFlags, CachedAccessBlockerFlags, CachedAllowedIps,
+        CachedAllowedVpcEndpointIds, CachedNodeInfo, CachedRoleSecret,
+    };
    use crate::proxy::NeonOptions;
    use crate::rate_limiter::{EndpointRateLimiter, RateBucketInfo};
    use crate::scram::threadpool::ThreadPool;
@@ -523,6 +558,8 @@ mod tests {

    struct Auth {
        ips: Vec<IpPattern>,
+        vpc_endpoint_ids: Vec<String>,
+        access_blocker_flags: AccessBlockerFlags,
        secret: AuthSecret,
    }

@@ -535,17 +572,31 @@ mod tests {
            Ok(CachedRoleSecret::new_uncached(Some(self.secret.clone())))
        }

-        async fn get_allowed_ips_and_secret(
+        async fn get_allowed_ips(
            &self,
            _ctx: &RequestContext,
            _user_info: &super::ComputeUserInfo,
-        ) -> Result<
-            (CachedAllowedIps, Option<CachedRoleSecret>),
-            control_plane::errors::GetAuthInfoError,
-        > {
-            Ok((
-                CachedAllowedIps::new_uncached(Arc::new(self.ips.clone())),
-                Some(CachedRoleSecret::new_uncached(Some(self.secret.clone()))),
+        ) -> Result<CachedAllowedIps, control_plane::errors::GetAuthInfoError> {
+            Ok(CachedAllowedIps::new_uncached(Arc::new(self.ips.clone())))
+        }
+
+        async fn get_allowed_vpc_endpoint_ids(
+            &self,
+            _ctx: &RequestContext,
+            _user_info: &super::ComputeUserInfo,
+        ) -> Result<CachedAllowedVpcEndpointIds, control_plane::errors::GetAuthInfoError> {
+            Ok(CachedAllowedVpcEndpointIds::new_uncached(Arc::new(
+                self.vpc_endpoint_ids.clone(),
+            )))
+        }
+
+        async fn get_block_public_or_vpc_access(
+            &self,
+            _ctx: &RequestContext,
+            _user_info: &super::ComputeUserInfo,
+        ) -> Result<CachedAccessBlockerFlags, control_plane::errors::GetAuthInfoError> {
+            Ok(CachedAccessBlockerFlags::new_uncached(
+                self.access_blocker_flags.clone(),
            ))
        }

@@ -575,6 +626,7 @@ mod tests {
        rate_limiter: AuthRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET),
        rate_limit_ip_subnet: 64,
        ip_allowlist_check_enabled: true,
+        is_vpc_acccess_proxy: false,
        is_auth_broker: false,
        accept_jwts: false,
        console_redirect_confirmation_timeout: std::time::Duration::from_secs(5),
@@ -642,6 +694,8 @@ mod tests {
        let ctx = RequestContext::test();
        let api = Auth {
            ips: vec![],
+            vpc_endpoint_ids: vec![],
+            access_blocker_flags: AccessBlockerFlags::default(),
            secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()),
        };

@@ -722,6 +776,8 @@ mod tests {
        let ctx = RequestContext::test();
        let api = Auth {
            ips: vec![],
+            vpc_endpoint_ids: vec![],
+            access_blocker_flags: AccessBlockerFlags::default(),
            secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()),
        };

@@ -774,6 +830,8 @@ mod tests {
        let ctx = RequestContext::test();
        let api = Auth {
            ips: vec![],
+            vpc_endpoint_ids: vec![],
+            access_blocker_flags: AccessBlockerFlags::default(),
            secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()),
        };

--- a/proxy/src/auth/mod.rs
+++ b/proxy/src/auth/mod.rs
@@ -55,6 +55,12 @@ pub(crate) enum AuthError {
    )]
    MissingEndpointName,

+    #[error(
+        "VPC endpoint ID is not specified. \
+        This endpoint requires a VPC endpoint ID to connect."
+    )]
+    MissingVPCEndpointId,
+
    #[error("password authentication failed for user '{0}'")]
    PasswordFailed(Box<str>),

@@ -69,6 +75,15 @@ pub(crate) enum AuthError {
    )]
    IpAddressNotAllowed(IpAddr),

+    #[error("This connection is trying to access this endpoint from a blocked network.")]
+    NetworkNotAllowed,
+
+    #[error(
+        "This VPC endpoint id {0} is not allowed to connect to this endpoint. \
+        Please add it to the allowed list in the Neon console."
+    )]
+    VpcEndpointIdNotAllowed(String),
+
    #[error("Too many connections to this endpoint. Please try again later.")]
    TooManyConnections,

@@ -95,6 +110,10 @@ impl AuthError {
        AuthError::IpAddressNotAllowed(ip)
    }

+    pub(crate) fn vpc_endpoint_id_not_allowed(id: String) -> Self {
+        AuthError::VpcEndpointIdNotAllowed(id)
+    }
+
    pub(crate) fn too_many_connections() -> Self {
        AuthError::TooManyConnections
    }
@@ -122,8 +141,11 @@ impl UserFacingError for AuthError {
            Self::BadAuthMethod(_) => self.to_string(),
            Self::MalformedPassword(_) => self.to_string(),
            Self::MissingEndpointName => self.to_string(),
+            Self::MissingVPCEndpointId => self.to_string(),
            Self::Io(_) => "Internal error".to_string(),
            Self::IpAddressNotAllowed(_) => self.to_string(),
+            Self::NetworkNotAllowed => self.to_string(),
+            Self::VpcEndpointIdNotAllowed(_) => self.to_string(),
            Self::TooManyConnections => self.to_string(),
            Self::UserTimeout(_) => self.to_string(),
            Self::ConfirmationTimeout(_) => self.to_string(),
@@ -142,8 +164,11 @@ impl ReportableError for AuthError {
            Self::BadAuthMethod(_) => crate::error::ErrorKind::User,
            Self::MalformedPassword(_) => crate::error::ErrorKind::User,
            Self::MissingEndpointName => crate::error::ErrorKind::User,
+            Self::MissingVPCEndpointId => crate::error::ErrorKind::User,
            Self::Io(_) => crate::error::ErrorKind::ClientDisconnect,
            Self::IpAddressNotAllowed(_) => crate::error::ErrorKind::User,
+            Self::NetworkNotAllowed => crate::error::ErrorKind::User,
+            Self::VpcEndpointIdNotAllowed(_) => crate::error::ErrorKind::User,
            Self::TooManyConnections => crate::error::ErrorKind::RateLimit,
            Self::UserTimeout(_) => crate::error::ErrorKind::User,
            Self::ConfirmationTimeout(_) => crate::error::ErrorKind::User,
--- a/proxy/src/bin/local_proxy.rs
+++ b/proxy/src/bin/local_proxy.rs
@@ -284,6 +284,7 @@ fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig
            rate_limiter: BucketRateLimiter::new(vec![]),
            rate_limit_ip_subnet: 64,
            ip_allowlist_check_enabled: true,
+            is_vpc_acccess_proxy: false,
            is_auth_broker: false,
            accept_jwts: true,
            console_redirect_confirmation_timeout: Duration::ZERO,
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -630,6 +630,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
        rate_limiter: AuthRateLimiter::new(args.auth_rate_limit.clone()),
        rate_limit_ip_subnet: args.auth_rate_limit_ip_subnet,
        ip_allowlist_check_enabled: !args.is_private_access_proxy,
+        is_vpc_acccess_proxy: args.is_private_access_proxy,
        is_auth_broker: args.is_auth_broker,
        accept_jwts: args.is_auth_broker,
        console_redirect_confirmation_timeout: args.webauth_confirmation_timeout,
--- a/proxy/src/cache/endpoints.rs
+++ b/proxy/src/cache/endpoints.rs
@@ -3,7 +3,7 @@ use std::future::pending;
 use std::sync::atomic::{AtomicBool, Ordering};
 use std::sync::{Arc, Mutex};

-use dashmap::DashSet;
+use clashmap::ClashSet;
 use redis::streams::{StreamReadOptions, StreamReadReply};
 use redis::{AsyncCommands, FromRedisValue, Value};
 use serde::Deserialize;
@@ -55,9 +55,9 @@ impl TryFrom<&Value> for ControlPlaneEvent {

 pub struct EndpointsCache {
    config: EndpointCacheConfig,
-    endpoints: DashSet<EndpointIdInt>,
-    branches: DashSet<BranchIdInt>,
-    projects: DashSet<ProjectIdInt>,
+    endpoints: ClashSet<EndpointIdInt>,
+    branches: ClashSet<BranchIdInt>,
+    projects: ClashSet<ProjectIdInt>,
    ready: AtomicBool,
    limiter: Arc<Mutex<GlobalRateLimiter>>,
 }
@@ -69,9 +69,9 @@ impl EndpointsCache {
                config.limiter_info.clone(),
            ))),
            config,
-            endpoints: DashSet::new(),
-            branches: DashSet::new(),
-            projects: DashSet::new(),
+            endpoints: ClashSet::new(),
+            branches: ClashSet::new(),
+            projects: ClashSet::new(),
            ready: AtomicBool::new(false),
        }
    }
--- a/proxy/src/cache/project_info.rs
+++ b/proxy/src/cache/project_info.rs
@@ -5,7 +5,7 @@ use std::sync::Arc;
 use std::time::Duration;

 use async_trait::async_trait;
-use dashmap::DashMap;
+use clashmap::ClashMap;
 use rand::{thread_rng, Rng};
 use smol_str::SmolStr;
 use tokio::sync::Mutex;
@@ -15,13 +15,16 @@ use tracing::{debug, info};
 use super::{Cache, Cached};
 use crate::auth::IpPattern;
 use crate::config::ProjectInfoCacheOptions;
-use crate::control_plane::AuthSecret;
-use crate::intern::{EndpointIdInt, ProjectIdInt, RoleNameInt};
+use crate::control_plane::{AccessBlockerFlags, AuthSecret};
+use crate::intern::{AccountIdInt, EndpointIdInt, ProjectIdInt, RoleNameInt};
 use crate::types::{EndpointId, RoleName};

 #[async_trait]
 pub(crate) trait ProjectInfoCache {
    fn invalidate_allowed_ips_for_project(&self, project_id: ProjectIdInt);
+    fn invalidate_allowed_vpc_endpoint_ids_for_projects(&self, project_ids: Vec<ProjectIdInt>);
+    fn invalidate_allowed_vpc_endpoint_ids_for_org(&self, account_id: AccountIdInt);
+    fn invalidate_block_public_or_vpc_access_for_project(&self, project_id: ProjectIdInt);
    fn invalidate_role_secret_for_project(&self, project_id: ProjectIdInt, role_name: RoleNameInt);
    async fn decrement_active_listeners(&self);
    async fn increment_active_listeners(&self);
@@ -51,6 +54,8 @@ impl<T> From<T> for Entry<T> {
 struct EndpointInfo {
    secret: std::collections::HashMap<RoleNameInt, Entry<Option<AuthSecret>>>,
    allowed_ips: Option<Entry<Arc<Vec<IpPattern>>>>,
+    block_public_or_vpc_access: Option<Entry<AccessBlockerFlags>>,
+    allowed_vpc_endpoint_ids: Option<Entry<Arc<Vec<String>>>>,
 }

 impl EndpointInfo {
@@ -92,9 +97,52 @@ impl EndpointInfo {
        }
        None
    }
+    pub(crate) fn get_allowed_vpc_endpoint_ids(
+        &self,
+        valid_since: Instant,
+        ignore_cache_since: Option<Instant>,
+    ) -> Option<(Arc<Vec<String>>, bool)> {
+        if let Some(allowed_vpc_endpoint_ids) = &self.allowed_vpc_endpoint_ids {
+            if valid_since < allowed_vpc_endpoint_ids.created_at {
+                return Some((
+                    allowed_vpc_endpoint_ids.value.clone(),
+                    Self::check_ignore_cache(
+                        ignore_cache_since,
+                        allowed_vpc_endpoint_ids.created_at,
+                    ),
+                ));
+            }
+        }
+        None
+    }
+    pub(crate) fn get_block_public_or_vpc_access(
+        &self,
+        valid_since: Instant,
+        ignore_cache_since: Option<Instant>,
+    ) -> Option<(AccessBlockerFlags, bool)> {
+        if let Some(block_public_or_vpc_access) = &self.block_public_or_vpc_access {
+            if valid_since < block_public_or_vpc_access.created_at {
+                return Some((
+                    block_public_or_vpc_access.value.clone(),
+                    Self::check_ignore_cache(
+                        ignore_cache_since,
+                        block_public_or_vpc_access.created_at,
+                    ),
+                ));
+            }
+        }
+        None
+    }
+
    pub(crate) fn invalidate_allowed_ips(&mut self) {
        self.allowed_ips = None;
    }
+    pub(crate) fn invalidate_allowed_vpc_endpoint_ids(&mut self) {
+        self.allowed_vpc_endpoint_ids = None;
+    }
+    pub(crate) fn invalidate_block_public_or_vpc_access(&mut self) {
+        self.block_public_or_vpc_access = None;
+    }
    pub(crate) fn invalidate_role_secret(&mut self, role_name: RoleNameInt) {
        self.secret.remove(&role_name);
    }
@@ -108,9 +156,11 @@ impl EndpointInfo {
 /// One may ask, why the data is stored per project, when on the user request there is only data about the endpoint available?
 /// On the cplane side updates are done per project (or per branch), so it's easier to invalidate the whole project cache.
 pub struct ProjectInfoCacheImpl {
-    cache: DashMap<EndpointIdInt, EndpointInfo>,
+    cache: ClashMap<EndpointIdInt, EndpointInfo>,

-    project2ep: DashMap<ProjectIdInt, HashSet<EndpointIdInt>>,
+    project2ep: ClashMap<ProjectIdInt, HashSet<EndpointIdInt>>,
+    // FIXME(stefan): we need a way to GC the account2ep map.
+    account2ep: ClashMap<AccountIdInt, HashSet<EndpointIdInt>>,
    config: ProjectInfoCacheOptions,

    start_time: Instant,
@@ -120,6 +170,63 @@ pub struct ProjectInfoCacheImpl {

 #[async_trait]
 impl ProjectInfoCache for ProjectInfoCacheImpl {
+    fn invalidate_allowed_vpc_endpoint_ids_for_projects(&self, project_ids: Vec<ProjectIdInt>) {
+        info!(
+            "invalidating allowed vpc endpoint ids for projects `{}`",
+            project_ids
+                .iter()
+                .map(|id| id.to_string())
+                .collect::<Vec<_>>()
+                .join(", ")
+        );
+        for project_id in project_ids {
+            let endpoints = self
+                .project2ep
+                .get(&project_id)
+                .map(|kv| kv.value().clone())
+                .unwrap_or_default();
+            for endpoint_id in endpoints {
+                if let Some(mut endpoint_info) = self.cache.get_mut(&endpoint_id) {
+                    endpoint_info.invalidate_allowed_vpc_endpoint_ids();
+                }
+            }
+        }
+    }
+
+    fn invalidate_allowed_vpc_endpoint_ids_for_org(&self, account_id: AccountIdInt) {
+        info!(
+            "invalidating allowed vpc endpoint ids for org `{}`",
+            account_id
+        );
+        let endpoints = self
+            .account2ep
+            .get(&account_id)
+            .map(|kv| kv.value().clone())
+            .unwrap_or_default();
+        for endpoint_id in endpoints {
+            if let Some(mut endpoint_info) = self.cache.get_mut(&endpoint_id) {
+                endpoint_info.invalidate_allowed_vpc_endpoint_ids();
+            }
+        }
+    }
+
+    fn invalidate_block_public_or_vpc_access_for_project(&self, project_id: ProjectIdInt) {
+        info!(
+            "invalidating block public or vpc access for project `{}`",
+            project_id
+        );
+        let endpoints = self
+            .project2ep
+            .get(&project_id)
+            .map(|kv| kv.value().clone())
+            .unwrap_or_default();
+        for endpoint_id in endpoints {
+            if let Some(mut endpoint_info) = self.cache.get_mut(&endpoint_id) {
+                endpoint_info.invalidate_block_public_or_vpc_access();
+            }
+        }
+    }
+
    fn invalidate_allowed_ips_for_project(&self, project_id: ProjectIdInt) {
        info!("invalidating allowed ips for project `{}`", project_id);
        let endpoints = self
@@ -176,8 +283,9 @@ impl ProjectInfoCache for ProjectInfoCacheImpl {
 impl ProjectInfoCacheImpl {
    pub(crate) fn new(config: ProjectInfoCacheOptions) -> Self {
        Self {
-            cache: DashMap::new(),
-            project2ep: DashMap::new(),
+            cache: ClashMap::new(),
+            project2ep: ClashMap::new(),
+            account2ep: ClashMap::new(),
            config,
            ttl_disabled_since_us: AtomicU64::new(u64::MAX),
            start_time: Instant::now(),
@@ -226,6 +334,49 @@ impl ProjectInfoCacheImpl {
        }
        Some(Cached::new_uncached(value))
    }
+    pub(crate) fn get_allowed_vpc_endpoint_ids(
+        &self,
+        endpoint_id: &EndpointId,
+    ) -> Option<Cached<&Self, Arc<Vec<String>>>> {
+        let endpoint_id = EndpointIdInt::get(endpoint_id)?;
+        let (valid_since, ignore_cache_since) = self.get_cache_times();
+        let endpoint_info = self.cache.get(&endpoint_id)?;
+        let value = endpoint_info.get_allowed_vpc_endpoint_ids(valid_since, ignore_cache_since);
+        let (value, ignore_cache) = value?;
+        if !ignore_cache {
+            let cached = Cached {
+                token: Some((
+                    self,
+                    CachedLookupInfo::new_allowed_vpc_endpoint_ids(endpoint_id),
+                )),
+                value,
+            };
+            return Some(cached);
+        }
+        Some(Cached::new_uncached(value))
+    }
+    pub(crate) fn get_block_public_or_vpc_access(
+        &self,
+        endpoint_id: &EndpointId,
+    ) -> Option<Cached<&Self, AccessBlockerFlags>> {
+        let endpoint_id = EndpointIdInt::get(endpoint_id)?;
+        let (valid_since, ignore_cache_since) = self.get_cache_times();
+        let endpoint_info = self.cache.get(&endpoint_id)?;
+        let value = endpoint_info.get_block_public_or_vpc_access(valid_since, ignore_cache_since);
+        let (value, ignore_cache) = value?;
+        if !ignore_cache {
+            let cached = Cached {
+                token: Some((
+                    self,
+                    CachedLookupInfo::new_block_public_or_vpc_access(endpoint_id),
+                )),
+                value,
+            };
+            return Some(cached);
+        }
+        Some(Cached::new_uncached(value))
+    }
+
    pub(crate) fn insert_role_secret(
        &self,
        project_id: ProjectIdInt,
@@ -256,6 +407,43 @@ impl ProjectInfoCacheImpl {
        self.insert_project2endpoint(project_id, endpoint_id);
        self.cache.entry(endpoint_id).or_default().allowed_ips = Some(allowed_ips.into());
    }
+    pub(crate) fn insert_allowed_vpc_endpoint_ids(
+        &self,
+        account_id: Option<AccountIdInt>,
+        project_id: ProjectIdInt,
+        endpoint_id: EndpointIdInt,
+        allowed_vpc_endpoint_ids: Arc<Vec<String>>,
+    ) {
+        if self.cache.len() >= self.config.size {
+            // If there are too many entries, wait until the next gc cycle.
+            return;
+        }
+        if let Some(account_id) = account_id {
+            self.insert_account2endpoint(account_id, endpoint_id);
+        }
+        self.insert_project2endpoint(project_id, endpoint_id);
+        self.cache
+            .entry(endpoint_id)
+            .or_default()
+            .allowed_vpc_endpoint_ids = Some(allowed_vpc_endpoint_ids.into());
+    }
+    pub(crate) fn insert_block_public_or_vpc_access(
+        &self,
+        project_id: ProjectIdInt,
+        endpoint_id: EndpointIdInt,
+        access_blockers: AccessBlockerFlags,
+    ) {
+        if self.cache.len() >= self.config.size {
+            // If there are too many entries, wait until the next gc cycle.
+            return;
+        }
+        self.insert_project2endpoint(project_id, endpoint_id);
+        self.cache
+            .entry(endpoint_id)
+            .or_default()
+            .block_public_or_vpc_access = Some(access_blockers.into());
+    }
+
    fn insert_project2endpoint(&self, project_id: ProjectIdInt, endpoint_id: EndpointIdInt) {
        if let Some(mut endpoints) = self.project2ep.get_mut(&project_id) {
            endpoints.insert(endpoint_id);
@@ -264,6 +452,14 @@ impl ProjectInfoCacheImpl {
                .insert(project_id, HashSet::from([endpoint_id]));
        }
    }
+    fn insert_account2endpoint(&self, account_id: AccountIdInt, endpoint_id: EndpointIdInt) {
+        if let Some(mut endpoints) = self.account2ep.get_mut(&account_id) {
+            endpoints.insert(endpoint_id);
+        } else {
+            self.account2ep
+                .insert(account_id, HashSet::from([endpoint_id]));
+        }
+    }
    fn get_cache_times(&self) -> (Instant, Option<Instant>) {
        let mut valid_since = Instant::now() - self.config.ttl;
        // Only ignore cache if ttl is disabled.
@@ -302,7 +498,7 @@ impl ProjectInfoCacheImpl {
        let mut removed = 0;
        let shard = self.project2ep.shards()[shard].write();
        for (_, endpoints) in shard.iter() {
-            for endpoint in endpoints.get() {
+            for endpoint in endpoints {
                self.cache.remove(endpoint);
                removed += 1;
            }
@@ -334,11 +530,25 @@ impl CachedLookupInfo {
            lookup_type: LookupType::AllowedIps,
        }
    }
+    pub(self) fn new_allowed_vpc_endpoint_ids(endpoint_id: EndpointIdInt) -> Self {
+        Self {
+            endpoint_id,
+            lookup_type: LookupType::AllowedVpcEndpointIds,
+        }
+    }
+    pub(self) fn new_block_public_or_vpc_access(endpoint_id: EndpointIdInt) -> Self {
+        Self {
+            endpoint_id,
+            lookup_type: LookupType::BlockPublicOrVpcAccess,
+        }
+    }
 }

 enum LookupType {
    RoleSecret(RoleNameInt),
    AllowedIps,
+    AllowedVpcEndpointIds,
+    BlockPublicOrVpcAccess,
 }

 impl Cache for ProjectInfoCacheImpl {
@@ -360,6 +570,16 @@ impl Cache for ProjectInfoCacheImpl {
                    endpoint_info.invalidate_allowed_ips();
                }
            }
+            LookupType::AllowedVpcEndpointIds => {
+                if let Some(mut endpoint_info) = self.cache.get_mut(&key.endpoint_id) {
+                    endpoint_info.invalidate_allowed_vpc_endpoint_ids();
+                }
+            }
+            LookupType::BlockPublicOrVpcAccess => {
+                if let Some(mut endpoint_info) = self.cache.get_mut(&key.endpoint_id) {
+                    endpoint_info.invalidate_block_public_or_vpc_access();
+                }
+            }
        }
    }
 }
--- a/proxy/src/cancellation.rs
+++ b/proxy/src/cancellation.rs
@@ -1,3 +1,4 @@
+use std::convert::Infallible;
 use std::net::{IpAddr, SocketAddr};
 use std::sync::Arc;

@@ -8,23 +9,22 @@ use pq_proto::CancelKeyData;
 use serde::{Deserialize, Serialize};
 use thiserror::Error;
 use tokio::net::TcpStream;
-use tokio::sync::mpsc;
+use tokio::sync::{mpsc, oneshot};
 use tracing::{debug, info};

-use crate::auth::backend::{BackendIpAllowlist, ComputeUserInfo};
+use crate::auth::backend::ComputeUserInfo;
 use crate::auth::{check_peer_addr_is_in_list, AuthError};
 use crate::config::ComputeConfig;
 use crate::context::RequestContext;
+use crate::control_plane::ControlPlaneApi;
 use crate::error::ReportableError;
 use crate::ext::LockExt;
-use crate::metrics::CancelChannelSizeGuard;
-use crate::metrics::{CancellationRequest, Metrics, RedisMsgKind};
+use crate::metrics::{CancelChannelSizeGuard, CancellationRequest, Metrics, RedisMsgKind};
+use crate::protocol2::ConnectionInfoExtra;
 use crate::rate_limiter::LeakyBucketRateLimiter;
 use crate::redis::keys::KeyPrefix;
 use crate::redis::kv_ops::RedisKVClient;
 use crate::tls::postgres_rustls::MakeRustlsConnect;
-use std::convert::Infallible;
-use tokio::sync::oneshot;

 type IpSubnetKey = IpNet;

@@ -135,6 +135,9 @@ pub(crate) enum CancelError {
    #[error("IP is not allowed")]
    IpNotAllowed,

+    #[error("VPC endpoint id is not allowed to connect")]
+    VpcEndpointIdNotAllowed,
+
    #[error("Authentication backend error")]
    AuthError(#[from] AuthError),

@@ -154,8 +157,9 @@ impl ReportableError for CancelError {
            }
            CancelError::Postgres(_) => crate::error::ErrorKind::Compute,
            CancelError::RateLimit => crate::error::ErrorKind::RateLimit,
-            CancelError::IpNotAllowed => crate::error::ErrorKind::User,
-            CancelError::NotFound => crate::error::ErrorKind::User,
+            CancelError::IpNotAllowed
+            | CancelError::VpcEndpointIdNotAllowed
+            | CancelError::NotFound => crate::error::ErrorKind::User,
            CancelError::AuthError(_) => crate::error::ErrorKind::ControlPlane,
            CancelError::InternalError => crate::error::ErrorKind::Service,
        }
@@ -267,11 +271,12 @@ impl CancellationHandler {
    /// Will fetch IP allowlist internally.
    ///
    /// return Result primarily for tests
-    pub(crate) async fn cancel_session<T: BackendIpAllowlist>(
+    pub(crate) async fn cancel_session<T: ControlPlaneApi>(
        &self,
        key: CancelKeyData,
        ctx: RequestContext,
-        check_allowed: bool,
+        check_ip_allowed: bool,
+        check_vpc_allowed: bool,
        auth_backend: &T,
    ) -> Result<(), CancelError> {
        let subnet_key = match ctx.peer_addr() {
@@ -306,11 +311,11 @@ impl CancellationHandler {
            return Err(CancelError::NotFound);
        };

-        if check_allowed {
+        if check_ip_allowed {
            let ip_allowlist = auth_backend
                .get_allowed_ips(&ctx, &cancel_closure.user_info)
                .await
-                .map_err(CancelError::AuthError)?;
+                .map_err(|e| CancelError::AuthError(e.into()))?;

            if !check_peer_addr_is_in_list(&ctx.peer_addr(), &ip_allowlist) {
                // log it here since cancel_session could be spawned in a task
@@ -322,6 +327,40 @@ impl CancellationHandler {
            }
        }

+        // check if a VPC endpoint ID is coming in and if yes, if it's allowed
+        let access_blocks = auth_backend
+            .get_block_public_or_vpc_access(&ctx, &cancel_closure.user_info)
+            .await
+            .map_err(|e| CancelError::AuthError(e.into()))?;
+
+        if check_vpc_allowed {
+            if access_blocks.vpc_access_blocked {
+                return Err(CancelError::AuthError(AuthError::NetworkNotAllowed));
+            }
+
+            let incoming_vpc_endpoint_id = match ctx.extra() {
+                None => return Err(CancelError::AuthError(AuthError::MissingVPCEndpointId)),
+                Some(ConnectionInfoExtra::Aws { vpce_id }) => {
+                    // Convert the vpce_id to a string
+                    String::from_utf8(vpce_id.to_vec()).unwrap_or_default()
+                }
+                Some(ConnectionInfoExtra::Azure { link_id }) => link_id.to_string(),
+            };
+
+            let allowed_vpc_endpoint_ids = auth_backend
+                .get_allowed_vpc_endpoint_ids(&ctx, &cancel_closure.user_info)
+                .await
+                .map_err(|e| CancelError::AuthError(e.into()))?;
+            // TODO: For now an empty VPC endpoint ID list means all are allowed. We should replace that.
+            if !allowed_vpc_endpoint_ids.is_empty()
+                && !allowed_vpc_endpoint_ids.contains(&incoming_vpc_endpoint_id)
+            {
+                return Err(CancelError::VpcEndpointIdNotAllowed);
+            }
+        } else if access_blocks.public_access_blocked {
+            return Err(CancelError::VpcEndpointIdNotAllowed);
+        }
+
        Metrics::get()
            .proxy
            .cancellation_requests_total
--- a/proxy/src/config.rs
+++ b/proxy/src/config.rs
@@ -68,6 +68,7 @@ pub struct AuthenticationConfig {
    pub rate_limiter: AuthRateLimiter,
    pub rate_limit_ip_subnet: u8,
    pub ip_allowlist_check_enabled: bool,
+    pub is_vpc_acccess_proxy: bool,
    pub jwks_cache: JwkCache,
    pub is_auth_broker: bool,
    pub accept_jwts: bool,
--- a/proxy/src/console_redirect_proxy.rs
+++ b/proxy/src/console_redirect_proxy.rs
@@ -182,7 +182,8 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
                            cancel_key_data,
                            ctx,
                            config.authentication_config.ip_allowlist_check_enabled,
-                            backend,
+                            config.authentication_config.is_vpc_acccess_proxy,
+                            backend.get_api(),
                        )
                        .await
                        .inspect_err(|e | debug!(error = ?e, "cancel_session failed")).ok();
--- a/proxy/src/context/mod.rs
+++ b/proxy/src/context/mod.rs
@@ -19,7 +19,7 @@ use crate::intern::{BranchIdInt, ProjectIdInt};
 use crate::metrics::{
    ConnectOutcome, InvalidEndpointsGroup, LatencyTimer, Metrics, Protocol, Waiting,
 };
-use crate::protocol2::ConnectionInfo;
+use crate::protocol2::{ConnectionInfo, ConnectionInfoExtra};
 use crate::types::{DbName, EndpointId, RoleName};

 pub mod parquet;
@@ -312,6 +312,15 @@ impl RequestContext {
            .ip()
    }

+    pub(crate) fn extra(&self) -> Option<ConnectionInfoExtra> {
+        self.0
+            .try_lock()
+            .expect("should not deadlock")
+            .conn_info
+            .extra
+            .clone()
+    }
+
    pub(crate) fn cold_start_info(&self) -> ColdStartInfo {
        self.0
            .try_lock()
--- a/proxy/src/control_plane/client/cplane_proxy_v1.rs
+++ b/proxy/src/control_plane/client/cplane_proxy_v1.rs
@@ -22,7 +22,8 @@ use crate::control_plane::errors::{
 use crate::control_plane::locks::ApiLocks;
 use crate::control_plane::messages::{ColdStartInfo, EndpointJwksResponse, Reason};
 use crate::control_plane::{
-    AuthInfo, AuthSecret, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, NodeInfo,
+    AccessBlockerFlags, AuthInfo, AuthSecret, CachedAccessBlockerFlags, CachedAllowedIps,
+    CachedAllowedVpcEndpointIds, CachedNodeInfo, CachedRoleSecret, NodeInfo,
 };
 use crate::metrics::{CacheOutcome, Metrics};
 use crate::rate_limiter::WakeComputeRateLimiter;
@@ -137,9 +138,6 @@ impl NeonControlPlaneClient {
                }
            };

-            // Ivan: don't know where it will be used, so I leave it here
-            let _endpoint_vpc_ids = body.allowed_vpc_endpoint_ids.unwrap_or_default();
-
            let secret = if body.role_secret.is_empty() {
                None
            } else {
@@ -153,10 +151,23 @@ impl NeonControlPlaneClient {
                .proxy
                .allowed_ips_number
                .observe(allowed_ips.len() as f64);
+            let allowed_vpc_endpoint_ids = body.allowed_vpc_endpoint_ids.unwrap_or_default();
+            Metrics::get()
+                .proxy
+                .allowed_vpc_endpoint_ids
+                .observe(allowed_vpc_endpoint_ids.len() as f64);
+            let block_public_connections = body.block_public_connections.unwrap_or_default();
+            let block_vpc_connections = body.block_vpc_connections.unwrap_or_default();
            Ok(AuthInfo {
                secret,
                allowed_ips,
+                allowed_vpc_endpoint_ids,
                project_id: body.project_id,
+                account_id: body.account_id,
+                access_blocker_flags: AccessBlockerFlags {
+                    public_access_blocked: block_public_connections,
+                    vpc_access_blocked: block_vpc_connections,
+                },
            })
        }
        .inspect_err(|e| tracing::debug!(error = ?e))
@@ -299,6 +310,7 @@ impl super::ControlPlaneApi for NeonControlPlaneClient {
            return Ok(role_secret);
        }
        let auth_info = self.do_get_auth_info(ctx, user_info).await?;
+        let account_id = auth_info.account_id;
        if let Some(project_id) = auth_info.project_id {
            let normalized_ep_int = normalized_ep.into();
            self.caches.project_info.insert_role_secret(
@@ -312,24 +324,35 @@ impl super::ControlPlaneApi for NeonControlPlaneClient {
                normalized_ep_int,
                Arc::new(auth_info.allowed_ips),
            );
+            self.caches.project_info.insert_allowed_vpc_endpoint_ids(
+                account_id,
+                project_id,
+                normalized_ep_int,
+                Arc::new(auth_info.allowed_vpc_endpoint_ids),
+            );
+            self.caches.project_info.insert_block_public_or_vpc_access(
+                project_id,
+                normalized_ep_int,
+                auth_info.access_blocker_flags,
+            );
            ctx.set_project_id(project_id);
        }
        // When we just got a secret, we don't need to invalidate it.
        Ok(Cached::new_uncached(auth_info.secret))
    }

-    async fn get_allowed_ips_and_secret(
+    async fn get_allowed_ips(
        &self,
        ctx: &RequestContext,
        user_info: &ComputeUserInfo,
-    ) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), GetAuthInfoError> {
+    ) -> Result<CachedAllowedIps, GetAuthInfoError> {
        let normalized_ep = &user_info.endpoint.normalize();
        if let Some(allowed_ips) = self.caches.project_info.get_allowed_ips(normalized_ep) {
            Metrics::get()
                .proxy
-                .allowed_ips_cache_misses
+                .allowed_ips_cache_misses // TODO SR: Should we rename this variable to something like allowed_ip_cache_stats?
                .inc(CacheOutcome::Hit);
-            return Ok((allowed_ips, None));
+            return Ok(allowed_ips);
        }
        Metrics::get()
            .proxy
@@ -337,7 +360,10 @@ impl super::ControlPlaneApi for NeonControlPlaneClient {
            .inc(CacheOutcome::Miss);
        let auth_info = self.do_get_auth_info(ctx, user_info).await?;
        let allowed_ips = Arc::new(auth_info.allowed_ips);
+        let allowed_vpc_endpoint_ids = Arc::new(auth_info.allowed_vpc_endpoint_ids);
+        let access_blocker_flags = auth_info.access_blocker_flags;
        let user = &user_info.user;
+        let account_id = auth_info.account_id;
        if let Some(project_id) = auth_info.project_id {
            let normalized_ep_int = normalized_ep.into();
            self.caches.project_info.insert_role_secret(
@@ -351,12 +377,136 @@ impl super::ControlPlaneApi for NeonControlPlaneClient {
                normalized_ep_int,
                allowed_ips.clone(),
            );
+            self.caches.project_info.insert_allowed_vpc_endpoint_ids(
+                account_id,
+                project_id,
+                normalized_ep_int,
+                allowed_vpc_endpoint_ids.clone(),
+            );
+            self.caches.project_info.insert_block_public_or_vpc_access(
+                project_id,
+                normalized_ep_int,
+                access_blocker_flags,
+            );
            ctx.set_project_id(project_id);
        }
-        Ok((
-            Cached::new_uncached(allowed_ips),
-            Some(Cached::new_uncached(auth_info.secret)),
-        ))
+        Ok(Cached::new_uncached(allowed_ips))
+    }
+
+    async fn get_allowed_vpc_endpoint_ids(
+        &self,
+        ctx: &RequestContext,
+        user_info: &ComputeUserInfo,
+    ) -> Result<CachedAllowedVpcEndpointIds, GetAuthInfoError> {
+        let normalized_ep = &user_info.endpoint.normalize();
+        if let Some(allowed_vpc_endpoint_ids) = self
+            .caches
+            .project_info
+            .get_allowed_vpc_endpoint_ids(normalized_ep)
+        {
+            Metrics::get()
+                .proxy
+                .vpc_endpoint_id_cache_stats
+                .inc(CacheOutcome::Hit);
+            return Ok(allowed_vpc_endpoint_ids);
+        }
+
+        Metrics::get()
+            .proxy
+            .vpc_endpoint_id_cache_stats
+            .inc(CacheOutcome::Miss);
+
+        let auth_info = self.do_get_auth_info(ctx, user_info).await?;
+        let allowed_ips = Arc::new(auth_info.allowed_ips);
+        let allowed_vpc_endpoint_ids = Arc::new(auth_info.allowed_vpc_endpoint_ids);
+        let access_blocker_flags = auth_info.access_blocker_flags;
+        let user = &user_info.user;
+        let account_id = auth_info.account_id;
+        if let Some(project_id) = auth_info.project_id {
+            let normalized_ep_int = normalized_ep.into();
+            self.caches.project_info.insert_role_secret(
+                project_id,
+                normalized_ep_int,
+                user.into(),
+                auth_info.secret.clone(),
+            );
+            self.caches.project_info.insert_allowed_ips(
+                project_id,
+                normalized_ep_int,
+                allowed_ips.clone(),
+            );
+            self.caches.project_info.insert_allowed_vpc_endpoint_ids(
+                account_id,
+                project_id,
+                normalized_ep_int,
+                allowed_vpc_endpoint_ids.clone(),
+            );
+            self.caches.project_info.insert_block_public_or_vpc_access(
+                project_id,
+                normalized_ep_int,
+                access_blocker_flags,
+            );
+            ctx.set_project_id(project_id);
+        }
+        Ok(Cached::new_uncached(allowed_vpc_endpoint_ids))
+    }
+
+    async fn get_block_public_or_vpc_access(
+        &self,
+        ctx: &RequestContext,
+        user_info: &ComputeUserInfo,
+    ) -> Result<CachedAccessBlockerFlags, GetAuthInfoError> {
+        let normalized_ep = &user_info.endpoint.normalize();
+        if let Some(access_blocker_flags) = self
+            .caches
+            .project_info
+            .get_block_public_or_vpc_access(normalized_ep)
+        {
+            Metrics::get()
+                .proxy
+                .access_blocker_flags_cache_stats
+                .inc(CacheOutcome::Hit);
+            return Ok(access_blocker_flags);
+        }
+
+        Metrics::get()
+            .proxy
+            .access_blocker_flags_cache_stats
+            .inc(CacheOutcome::Miss);
+
+        let auth_info = self.do_get_auth_info(ctx, user_info).await?;
+        let allowed_ips = Arc::new(auth_info.allowed_ips);
+        let allowed_vpc_endpoint_ids = Arc::new(auth_info.allowed_vpc_endpoint_ids);
+        let access_blocker_flags = auth_info.access_blocker_flags;
+        let user = &user_info.user;
+        let account_id = auth_info.account_id;
+        if let Some(project_id) = auth_info.project_id {
+            let normalized_ep_int = normalized_ep.into();
+            self.caches.project_info.insert_role_secret(
+                project_id,
+                normalized_ep_int,
+                user.into(),
+                auth_info.secret.clone(),
+            );
+            self.caches.project_info.insert_allowed_ips(
+                project_id,
+                normalized_ep_int,
+                allowed_ips.clone(),
+            );
+            self.caches.project_info.insert_allowed_vpc_endpoint_ids(
+                account_id,
+                project_id,
+                normalized_ep_int,
+                allowed_vpc_endpoint_ids.clone(),
+            );
+            self.caches.project_info.insert_block_public_or_vpc_access(
+                project_id,
+                normalized_ep_int,
+                access_blocker_flags.clone(),
+            );
+            ctx.set_project_id(project_id);
+        }
+        Ok(Cached::new_uncached(access_blocker_flags))
    }

    #[tracing::instrument(skip_all)]
--- a/proxy/src/control_plane/client/mock.rs
+++ b/proxy/src/control_plane/client/mock.rs
@@ -13,12 +13,14 @@ use crate::auth::backend::ComputeUserInfo;
 use crate::auth::IpPattern;
 use crate::cache::Cached;
 use crate::context::RequestContext;
-use crate::control_plane::client::{CachedAllowedIps, CachedRoleSecret};
+use crate::control_plane::client::{
+    CachedAllowedIps, CachedAllowedVpcEndpointIds, CachedRoleSecret,
+};
 use crate::control_plane::errors::{
    ControlPlaneError, GetAuthInfoError, GetEndpointJwksError, WakeComputeError,
 };
 use crate::control_plane::messages::MetricsAuxInfo;
-use crate::control_plane::{AuthInfo, AuthSecret, CachedNodeInfo, NodeInfo};
+use crate::control_plane::{AccessBlockerFlags, AuthInfo, AuthSecret, CachedNodeInfo, NodeInfo};
 use crate::error::io_error;
 use crate::intern::RoleNameInt;
 use crate::types::{BranchId, EndpointId, ProjectId, RoleName};
@@ -121,7 +123,10 @@ impl MockControlPlane {
        Ok(AuthInfo {
            secret,
            allowed_ips,
+            allowed_vpc_endpoint_ids: vec![],
            project_id: None,
+            account_id: None,
+            access_blocker_flags: AccessBlockerFlags::default(),
        })
    }

@@ -214,16 +219,35 @@ impl super::ControlPlaneApi for MockControlPlane {
        ))
    }

-    async fn get_allowed_ips_and_secret(
+    async fn get_allowed_ips(
        &self,
        _ctx: &RequestContext,
        user_info: &ComputeUserInfo,
-    ) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), GetAuthInfoError> {
-        Ok((
-            Cached::new_uncached(Arc::new(
-                self.do_get_auth_info(user_info).await?.allowed_ips,
-            )),
-            None,
+    ) -> Result<CachedAllowedIps, GetAuthInfoError> {
+        Ok(Cached::new_uncached(Arc::new(
+            self.do_get_auth_info(user_info).await?.allowed_ips,
+        )))
+    }
+
+    async fn get_allowed_vpc_endpoint_ids(
+        &self,
+        _ctx: &RequestContext,
+        user_info: &ComputeUserInfo,
+    ) -> Result<CachedAllowedVpcEndpointIds, super::errors::GetAuthInfoError> {
+        Ok(Cached::new_uncached(Arc::new(
+            self.do_get_auth_info(user_info)
+                .await?
+                .allowed_vpc_endpoint_ids,
+        )))
+    }
+
+    async fn get_block_public_or_vpc_access(
+        &self,
+        _ctx: &RequestContext,
+        user_info: &ComputeUserInfo,
+    ) -> Result<super::CachedAccessBlockerFlags, super::errors::GetAuthInfoError> {
+        Ok(Cached::new_uncached(
+            self.do_get_auth_info(user_info).await?.access_blocker_flags,
        ))
    }

--- a/proxy/src/control_plane/client/mod.rs
+++ b/proxy/src/control_plane/client/mod.rs
@@ -6,7 +6,7 @@ use std::hash::Hash;
 use std::sync::Arc;
 use std::time::Duration;

-use dashmap::DashMap;
+use clashmap::ClashMap;
 use tokio::time::Instant;
 use tracing::{debug, info};

@@ -17,7 +17,8 @@ use crate::cache::project_info::ProjectInfoCacheImpl;
 use crate::config::{CacheOptions, EndpointCacheConfig, ProjectInfoCacheOptions};
 use crate::context::RequestContext;
 use crate::control_plane::{
-    errors, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, ControlPlaneApi, NodeInfoCache,
+    errors, CachedAccessBlockerFlags, CachedAllowedIps, CachedAllowedVpcEndpointIds,
+    CachedNodeInfo, CachedRoleSecret, ControlPlaneApi, NodeInfoCache,
 };
 use crate::error::ReportableError;
 use crate::metrics::ApiLockMetrics;
@@ -55,17 +56,45 @@ impl ControlPlaneApi for ControlPlaneClient {
        }
    }

-    async fn get_allowed_ips_and_secret(
+    async fn get_allowed_ips(
        &self,
        ctx: &RequestContext,
        user_info: &ComputeUserInfo,
-    ) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), errors::GetAuthInfoError> {
+    ) -> Result<CachedAllowedIps, errors::GetAuthInfoError> {
        match self {
-            Self::ProxyV1(api) => api.get_allowed_ips_and_secret(ctx, user_info).await,
+            Self::ProxyV1(api) => api.get_allowed_ips(ctx, user_info).await,
            #[cfg(any(test, feature = "testing"))]
-            Self::PostgresMock(api) => api.get_allowed_ips_and_secret(ctx, user_info).await,
+            Self::PostgresMock(api) => api.get_allowed_ips(ctx, user_info).await,
            #[cfg(test)]
-            Self::Test(api) => api.get_allowed_ips_and_secret(),
+            Self::Test(api) => api.get_allowed_ips(),
+        }
+    }
+
+    async fn get_allowed_vpc_endpoint_ids(
+        &self,
+        ctx: &RequestContext,
+        user_info: &ComputeUserInfo,
+    ) -> Result<CachedAllowedVpcEndpointIds, errors::GetAuthInfoError> {
+        match self {
+            Self::ProxyV1(api) => api.get_allowed_vpc_endpoint_ids(ctx, user_info).await,
+            #[cfg(any(test, feature = "testing"))]
+            Self::PostgresMock(api) => api.get_allowed_vpc_endpoint_ids(ctx, user_info).await,
+            #[cfg(test)]
+            Self::Test(api) => api.get_allowed_vpc_endpoint_ids(),
+        }
+    }
+
+    async fn get_block_public_or_vpc_access(
+        &self,
+        ctx: &RequestContext,
+        user_info: &ComputeUserInfo,
+    ) -> Result<CachedAccessBlockerFlags, errors::GetAuthInfoError> {
+        match self {
+            Self::ProxyV1(api) => api.get_block_public_or_vpc_access(ctx, user_info).await,
+            #[cfg(any(test, feature = "testing"))]
+            Self::PostgresMock(api) => api.get_block_public_or_vpc_access(ctx, user_info).await,
+            #[cfg(test)]
+            Self::Test(api) => api.get_block_public_or_vpc_access(),
        }
    }

@@ -102,9 +131,15 @@ impl ControlPlaneApi for ControlPlaneClient {
 pub(crate) trait TestControlPlaneClient: Send + Sync + 'static {
    fn wake_compute(&self) -> Result<CachedNodeInfo, errors::WakeComputeError>;

-    fn get_allowed_ips_and_secret(
+    fn get_allowed_ips(&self) -> Result<CachedAllowedIps, errors::GetAuthInfoError>;
+
+    fn get_allowed_vpc_endpoint_ids(
        &self,
-    ) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), errors::GetAuthInfoError>;
+    ) -> Result<CachedAllowedVpcEndpointIds, errors::GetAuthInfoError>;
+
+    fn get_block_public_or_vpc_access(
+        &self,
+    ) -> Result<CachedAccessBlockerFlags, errors::GetAuthInfoError>;

    fn dyn_clone(&self) -> Box<dyn TestControlPlaneClient>;
 }
@@ -148,7 +183,7 @@ impl ApiCaches {
 /// Various caches for [`control_plane`](super).
 pub struct ApiLocks<K> {
    name: &'static str,
-    node_locks: DashMap<K, Arc<DynamicLimiter>>,
+    node_locks: ClashMap<K, Arc<DynamicLimiter>>,
    config: RateLimiterConfig,
    timeout: Duration,
    epoch: std::time::Duration,
@@ -180,7 +215,7 @@ impl<K: Hash + Eq + Clone> ApiLocks<K> {
    ) -> prometheus::Result<Self> {
        Ok(Self {
            name,
-            node_locks: DashMap::with_shard_amount(shards),
+            node_locks: ClashMap::with_shard_amount(shards),
            config,
            timeout,
            epoch,
@@ -238,7 +273,7 @@ impl<K: Hash + Eq + Clone> ApiLocks<K> {
                let mut lock = shard.write();
                let timer = self.metrics.reclamation_lag_seconds.start_timer();
                let count = lock
-                    .extract_if(|_, semaphore| Arc::strong_count(semaphore.get_mut()) == 1)
+                    .extract_if(|(_, semaphore)| Arc::strong_count(semaphore) == 1)
                    .count();
                drop(lock);
                self.metrics.semaphores_unregistered.inc_by(count as u64);
--- a/proxy/src/control_plane/messages.rs
+++ b/proxy/src/control_plane/messages.rs
@@ -4,7 +4,7 @@ use measured::FixedCardinalityLabel;
 use serde::{Deserialize, Serialize};

 use crate::auth::IpPattern;
-use crate::intern::{BranchIdInt, EndpointIdInt, ProjectIdInt, RoleNameInt};
+use crate::intern::{AccountIdInt, BranchIdInt, EndpointIdInt, ProjectIdInt, RoleNameInt};
 use crate::proxy::retry::CouldRetry;

 /// Generic error response with human-readable description.
@@ -227,8 +227,11 @@ pub(crate) struct UserFacingMessage {
 pub(crate) struct GetEndpointAccessControl {
    pub(crate) role_secret: Box<str>,
    pub(crate) allowed_ips: Option<Vec<IpPattern>>,
+    pub(crate) allowed_vpc_endpoint_ids: Option<Vec<String>>,
    pub(crate) project_id: Option<ProjectIdInt>,
-    pub(crate) allowed_vpc_endpoint_ids: Option<Vec<EndpointIdInt>>,
+    pub(crate) account_id: Option<AccountIdInt>,
+    pub(crate) block_public_connections: Option<bool>,
+    pub(crate) block_vpc_connections: Option<bool>,
 }

 /// Response which holds compute node's `host:port` pair.
@@ -282,6 +285,10 @@ pub(crate) struct DatabaseInfo {
    pub(crate) aux: MetricsAuxInfo,
    #[serde(default)]
    pub(crate) allowed_ips: Option<Vec<IpPattern>>,
+    #[serde(default)]
+    pub(crate) allowed_vpc_endpoint_ids: Option<Vec<String>>,
+    #[serde(default)]
+    pub(crate) public_access_allowed: Option<bool>,
 }

 // Manually implement debug to omit sensitive info.
@@ -293,6 +300,7 @@ impl fmt::Debug for DatabaseInfo {
            .field("dbname", &self.dbname)
            .field("user", &self.user)
            .field("allowed_ips", &self.allowed_ips)
+            .field("allowed_vpc_endpoint_ids", &self.allowed_vpc_endpoint_ids)
            .finish_non_exhaustive()
    }
 }
@@ -457,7 +465,7 @@ mod tests {

    #[test]
    fn parse_get_role_secret() -> anyhow::Result<()> {
-        // Empty `allowed_ips` field.
+        // Empty `allowed_ips` and `allowed_vpc_endpoint_ids` fields.
        let json = json!({
            "role_secret": "secret",
        });
@@ -467,9 +475,21 @@ mod tests {
            "allowed_ips": ["8.8.8.8"],
        });
        serde_json::from_str::<GetEndpointAccessControl>(&json.to_string())?;
+        let json = json!({
+            "role_secret": "secret",
+            "allowed_vpc_endpoint_ids": ["vpce-0abcd1234567890ef"],
+        });
+        serde_json::from_str::<GetEndpointAccessControl>(&json.to_string())?;
        let json = json!({
            "role_secret": "secret",
            "allowed_ips": ["8.8.8.8"],
+            "allowed_vpc_endpoint_ids": ["vpce-0abcd1234567890ef"],
+        });
+        serde_json::from_str::<GetEndpointAccessControl>(&json.to_string())?;
+        let json = json!({
+            "role_secret": "secret",
+            "allowed_ips": ["8.8.8.8"],
+            "allowed_vpc_endpoint_ids": ["vpce-0abcd1234567890ef"],
            "project_id": "project",
        });
        serde_json::from_str::<GetEndpointAccessControl>(&json.to_string())?;
--- a/proxy/src/control_plane/mod.rs
+++ b/proxy/src/control_plane/mod.rs
@@ -19,6 +19,7 @@ use crate::cache::{Cached, TimedLru};
 use crate::config::ComputeConfig;
 use crate::context::RequestContext;
 use crate::control_plane::messages::{ControlPlaneErrorMessage, MetricsAuxInfo};
+use crate::intern::AccountIdInt;
 use crate::intern::ProjectIdInt;
 use crate::types::{EndpointCacheKey, EndpointId};
 use crate::{compute, scram};
@@ -52,8 +53,14 @@ pub(crate) struct AuthInfo {
    pub(crate) secret: Option<AuthSecret>,
    /// List of IP addresses allowed for the autorization.
    pub(crate) allowed_ips: Vec<IpPattern>,
+    /// List of VPC endpoint IDs allowed for the authorization.
+    pub(crate) allowed_vpc_endpoint_ids: Vec<String>,
    /// Project ID. This is used for cache invalidation.
    pub(crate) project_id: Option<ProjectIdInt>,
+    /// Account ID. This is used for cache invalidation.
+    pub(crate) account_id: Option<AccountIdInt>,
+    /// Are public connections or VPC connections blocked?
+    pub(crate) access_blocker_flags: AccessBlockerFlags,
 }

 /// Info for establishing a connection to a compute node.
@@ -95,11 +102,21 @@ impl NodeInfo {
    }
 }

+#[derive(Clone, Default, Eq, PartialEq, Debug)]
+pub(crate) struct AccessBlockerFlags {
+    pub public_access_blocked: bool,
+    pub vpc_access_blocked: bool,
+}
+
 pub(crate) type NodeInfoCache =
    TimedLru<EndpointCacheKey, Result<NodeInfo, Box<ControlPlaneErrorMessage>>>;
 pub(crate) type CachedNodeInfo = Cached<&'static NodeInfoCache, NodeInfo>;
 pub(crate) type CachedRoleSecret = Cached<&'static ProjectInfoCacheImpl, Option<AuthSecret>>;
 pub(crate) type CachedAllowedIps = Cached<&'static ProjectInfoCacheImpl, Arc<Vec<IpPattern>>>;
+pub(crate) type CachedAllowedVpcEndpointIds =
+    Cached<&'static ProjectInfoCacheImpl, Arc<Vec<String>>>;
+pub(crate) type CachedAccessBlockerFlags =
+    Cached<&'static ProjectInfoCacheImpl, AccessBlockerFlags>;

 /// This will allocate per each call, but the http requests alone
 /// already require a few allocations, so it should be fine.
@@ -113,11 +130,23 @@ pub(crate) trait ControlPlaneApi {
        user_info: &ComputeUserInfo,
    ) -> Result<CachedRoleSecret, errors::GetAuthInfoError>;

-    async fn get_allowed_ips_and_secret(
+    async fn get_allowed_ips(
        &self,
        ctx: &RequestContext,
        user_info: &ComputeUserInfo,
-    ) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), errors::GetAuthInfoError>;
+    ) -> Result<CachedAllowedIps, errors::GetAuthInfoError>;
+
+    async fn get_allowed_vpc_endpoint_ids(
+        &self,
+        ctx: &RequestContext,
+        user_info: &ComputeUserInfo,
+    ) -> Result<CachedAllowedVpcEndpointIds, errors::GetAuthInfoError>;
+
+    async fn get_block_public_or_vpc_access(
+        &self,
+        ctx: &RequestContext,
+        user_info: &ComputeUserInfo,
+    ) -> Result<CachedAccessBlockerFlags, errors::GetAuthInfoError>;

    async fn get_endpoint_jwks(
        &self,
--- a/proxy/src/intern.rs
+++ b/proxy/src/intern.rs
@@ -7,7 +7,7 @@ use std::sync::OnceLock;
 use lasso::{Capacity, MemoryLimits, Spur, ThreadedRodeo};
 use rustc_hash::FxHasher;

-use crate::types::{BranchId, EndpointId, ProjectId, RoleName};
+use crate::types::{AccountId, BranchId, EndpointId, ProjectId, RoleName};

 pub trait InternId: Sized + 'static {
    fn get_interner() -> &'static StringInterner<Self>;
@@ -206,6 +206,26 @@ impl From<ProjectId> for ProjectIdInt {
    }
 }

+#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
+pub struct AccountIdTag;
+impl InternId for AccountIdTag {
+    fn get_interner() -> &'static StringInterner<Self> {
+        static ACCOUNT_IDS: OnceLock<StringInterner<AccountIdTag>> = OnceLock::new();
+        ACCOUNT_IDS.get_or_init(Default::default)
+    }
+}
+pub type AccountIdInt = InternedString<AccountIdTag>;
+impl From<&AccountId> for AccountIdInt {
+    fn from(value: &AccountId) -> Self {
+        AccountIdTag::get_interner().get_or_intern(value)
+    }
+}
+impl From<AccountId> for AccountIdInt {
+    fn from(value: AccountId) -> Self {
+        AccountIdTag::get_interner().get_or_intern(&value)
+    }
+}
+
 #[cfg(test)]
 #[expect(clippy::unwrap_used)]
 mod tests {
--- a/proxy/src/metrics.rs
+++ b/proxy/src/metrics.rs
@@ -96,6 +96,16 @@ pub struct ProxyMetrics {
    #[metric(metadata = Thresholds::with_buckets([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0]))]
    pub allowed_ips_number: Histogram<10>,

+    /// Number of cache hits/misses for VPC endpoint IDs.
+    pub vpc_endpoint_id_cache_stats: CounterVec<StaticLabelSet<CacheOutcome>>,
+
+    /// Number of cache hits/misses for access blocker flags.
+    pub access_blocker_flags_cache_stats: CounterVec<StaticLabelSet<CacheOutcome>>,
+
+    /// Number of allowed VPC endpoints IDs
+    #[metric(metadata = Thresholds::with_buckets([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0]))]
+    pub allowed_vpc_endpoint_ids: Histogram<10>,
+
    /// Number of connections (per sni).
    pub accepted_connections_by_sni: CounterVec<StaticLabelSet<SniKind>>,

@@ -570,6 +580,9 @@ pub enum RedisEventsCount {
    CancelSession,
    PasswordUpdate,
    AllowedIpsUpdate,
+    AllowedVpcEndpointIdsUpdateForProjects,
+    AllowedVpcEndpointIdsUpdateForAllProjectsInOrg,
+    BlockPublicOrVpcAccessUpdate,
 }

 pub struct ThreadPoolWorkers(usize);
--- a/proxy/src/proxy/copy_bidirectional.rs
+++ b/proxy/src/proxy/copy_bidirectional.rs
@@ -201,25 +201,26 @@ impl CopyBuffer {
        W: AsyncWrite + ?Sized,
    {
        loop {
-            // If our buffer is empty, then we need to read some data to
-            // continue.
-            if self.pos == self.cap && !self.read_done {
-                self.pos = 0;
-                self.cap = 0;
-
+            // If there is some space left in our buffer, then we try to read some
+            // data to continue, thus maximizing the chances of a large write.
+            if self.cap < self.buf.len() && !self.read_done {
                match self.poll_fill_buf(cx, reader.as_mut()) {
                    Poll::Ready(Ok(())) => (),
                    Poll::Ready(Err(err)) => return Poll::Ready(Err(ErrorDirection::Read(err))),
                    Poll::Pending => {
-                        // Try flushing when the reader has no progress to avoid deadlock
-                        // when the reader depends on buffered writer.
-                        if self.need_flush {
-                            ready!(writer.as_mut().poll_flush(cx))
-                                .map_err(ErrorDirection::Write)?;
-                            self.need_flush = false;
-                        }
+                        // Ignore pending reads when our buffer is not empty, because
+                        // we can try to write data immediately.
+                        if self.pos == self.cap {
+                            // Try flushing when the reader has no progress to avoid deadlock
+                            // when the reader depends on buffered writer.
+                            if self.need_flush {
+                                ready!(writer.as_mut().poll_flush(cx))
+                                    .map_err(ErrorDirection::Write)?;
+                                self.need_flush = false;
+                            }

-                        return Poll::Pending;
+                            return Poll::Pending;
+                        }
                    }
                }
            }
@@ -246,9 +247,13 @@ impl CopyBuffer {
                "writer returned length larger than input slice"
            );

+            // All data has been written, the buffer can be considered empty again
+            self.pos = 0;
+            self.cap = 0;
+
            // If we've written all the data and we've seen EOF, flush out the
            // data and finish the transfer.
-            if self.pos == self.cap && self.read_done {
+            if self.read_done {
                ready!(writer.as_mut().poll_flush(cx)).map_err(ErrorDirection::Write)?;
                return Poll::Ready(Ok(self.amt));
            }
--- a/proxy/src/proxy/mod.rs
+++ b/proxy/src/proxy/mod.rs
@@ -283,7 +283,8 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
                            cancel_key_data,
                            ctx,
                            config.authentication_config.ip_allowlist_check_enabled,
-                            auth_backend,
+                            config.authentication_config.is_vpc_acccess_proxy,
+                            auth_backend.get_api(),
                        )
                        .await
                        .inspect_err(|e | debug!(error = ?e, "cancel_session failed")).ok();
--- a/proxy/src/proxy/tests/mod.rs
+++ b/proxy/src/proxy/tests/mod.rs
@@ -26,7 +26,7 @@ use crate::config::{ComputeConfig, RetryConfig};
 use crate::control_plane::client::{ControlPlaneClient, TestControlPlaneClient};
 use crate::control_plane::messages::{ControlPlaneErrorMessage, Details, MetricsAuxInfo, Status};
 use crate::control_plane::{
-    self, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, NodeInfo, NodeInfoCache,
+    self, CachedAllowedIps, CachedAllowedVpcEndpointIds, CachedNodeInfo, NodeInfo, NodeInfoCache,
 };
 use crate::error::ErrorKind;
 use crate::tls::client_config::compute_client_config_with_certs;
@@ -526,9 +526,19 @@ impl TestControlPlaneClient for TestConnectMechanism {
        }
    }

-    fn get_allowed_ips_and_secret(
+    fn get_allowed_ips(&self) -> Result<CachedAllowedIps, control_plane::errors::GetAuthInfoError> {
+        unimplemented!("not used in tests")
+    }
+
+    fn get_allowed_vpc_endpoint_ids(
        &self,
-    ) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), control_plane::errors::GetAuthInfoError>
+    ) -> Result<CachedAllowedVpcEndpointIds, control_plane::errors::GetAuthInfoError> {
+        unimplemented!("not used in tests")
+    }
+
+    fn get_block_public_or_vpc_access(
+        &self,
+    ) -> Result<control_plane::CachedAccessBlockerFlags, control_plane::errors::GetAuthInfoError>
    {
        unimplemented!("not used in tests")
    }
--- a/proxy/src/rate_limiter/leaky_bucket.rs
+++ b/proxy/src/rate_limiter/leaky_bucket.rs
@@ -2,7 +2,7 @@ use std::hash::Hash;
 use std::sync::atomic::{AtomicUsize, Ordering};

 use ahash::RandomState;
-use dashmap::DashMap;
+use clashmap::ClashMap;
 use rand::{thread_rng, Rng};
 use tokio::time::Instant;
 use tracing::info;
@@ -14,7 +14,7 @@ use crate::intern::EndpointIdInt;
 pub type EndpointRateLimiter = LeakyBucketRateLimiter<EndpointIdInt>;

 pub struct LeakyBucketRateLimiter<Key> {
-    map: DashMap<Key, LeakyBucketState, RandomState>,
+    map: ClashMap<Key, LeakyBucketState, RandomState>,
    config: utils::leaky_bucket::LeakyBucketConfig,
    access_count: AtomicUsize,
 }
@@ -27,7 +27,7 @@ impl<K: Hash + Eq> LeakyBucketRateLimiter<K> {

    pub fn new_with_shards(config: LeakyBucketConfig, shards: usize) -> Self {
        Self {
-            map: DashMap::with_hasher_and_shard_amount(RandomState::new(), shards),
+            map: ClashMap::with_hasher_and_shard_amount(RandomState::new(), shards),
            config: config.into(),
            access_count: AtomicUsize::new(0),
        }
@@ -58,7 +58,7 @@ impl<K: Hash + Eq> LeakyBucketRateLimiter<K> {
        let shard = thread_rng().gen_range(0..n);
        self.map.shards()[shard]
            .write()
-            .retain(|_, value| !value.get().bucket_is_empty(now));
+            .retain(|(_, value)| !value.bucket_is_empty(now));
    }
 }

--- a/proxy/src/rate_limiter/limiter.rs
+++ b/proxy/src/rate_limiter/limiter.rs
@@ -5,7 +5,7 @@ use std::sync::atomic::{AtomicUsize, Ordering};
 use std::sync::Mutex;

 use anyhow::bail;
-use dashmap::DashMap;
+use clashmap::ClashMap;
 use itertools::Itertools;
 use rand::rngs::StdRng;
 use rand::{Rng, SeedableRng};
@@ -62,7 +62,7 @@ impl GlobalRateLimiter {
 pub type WakeComputeRateLimiter = BucketRateLimiter<EndpointIdInt, StdRng, RandomState>;

 pub struct BucketRateLimiter<Key, Rand = StdRng, Hasher = RandomState> {
-    map: DashMap<Key, Vec<RateBucket>, Hasher>,
+    map: ClashMap<Key, Vec<RateBucket>, Hasher>,
    info: Cow<'static, [RateBucketInfo]>,
    access_count: AtomicUsize,
    rand: Mutex<Rand>,
@@ -202,7 +202,7 @@ impl<K: Hash + Eq, R: Rng, S: BuildHasher + Clone> BucketRateLimiter<K, R, S> {
        info!(buckets = ?info, "endpoint rate limiter");
        Self {
            info,
-            map: DashMap::with_hasher_and_shard_amount(hasher, 64),
+            map: ClashMap::with_hasher_and_shard_amount(hasher, 64),
            access_count: AtomicUsize::new(1), // start from 1 to avoid GC on the first request
            rand: Mutex::new(rand),
        }
--- a/proxy/src/redis/keys.rs
+++ b/proxy/src/redis/keys.rs
@@ -1,7 +1,8 @@
+use std::io::ErrorKind;
+
 use anyhow::Ok;
 use pq_proto::{id_to_cancel_key, CancelKeyData};
 use serde::{Deserialize, Serialize};
-use std::io::ErrorKind;

 pub mod keyspace {
    pub const CANCEL_PREFIX: &str = "cancel";
--- a/proxy/src/redis/kv_ops.rs
+++ b/proxy/src/redis/kv_ops.rs
@@ -1,7 +1,6 @@
 use redis::{AsyncCommands, ToRedisArgs};

 use super::connection_with_credentials_provider::ConnectionWithCredentialsProvider;
-
 use crate::rate_limiter::{GlobalRateLimiter, RateBucketInfo};

 pub struct RedisKVClient {
--- a/proxy/src/redis/notifications.rs
+++ b/proxy/src/redis/notifications.rs
@@ -10,7 +10,7 @@ use uuid::Uuid;

 use super::connection_with_credentials_provider::ConnectionWithCredentialsProvider;
 use crate::cache::project_info::ProjectInfoCache;
-use crate::intern::{ProjectIdInt, RoleNameInt};
+use crate::intern::{AccountIdInt, ProjectIdInt, RoleNameInt};
 use crate::metrics::{Metrics, RedisErrors, RedisEventsCount};

 const CPLANE_CHANNEL_NAME: &str = "neondb-proxy-ws-updates";
@@ -86,9 +86,7 @@ pub(crate) struct BlockPublicOrVpcAccessUpdated {

 #[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
 pub(crate) struct AllowedVpcEndpointsUpdatedForOrg {
-    // TODO: change type once the implementation is more fully fledged.
-    // See e.g. https://github.com/neondatabase/neon/pull/10073.
-    account_id: ProjectIdInt,
+    account_id: AccountIdInt,
 }

 #[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
@@ -205,6 +203,24 @@ impl<C: ProjectInfoCache + Send + Sync + 'static> MessageHandler<C> {
                        .proxy
                        .redis_events_count
                        .inc(RedisEventsCount::PasswordUpdate);
+                } else if matches!(
+                    msg,
+                    Notification::AllowedVpcEndpointsUpdatedForProjects { .. }
+                ) {
+                    Metrics::get()
+                        .proxy
+                        .redis_events_count
+                        .inc(RedisEventsCount::AllowedVpcEndpointIdsUpdateForProjects);
+                } else if matches!(msg, Notification::AllowedVpcEndpointsUpdatedForOrg { .. }) {
+                    Metrics::get()
+                        .proxy
+                        .redis_events_count
+                        .inc(RedisEventsCount::AllowedVpcEndpointIdsUpdateForAllProjectsInOrg);
+                } else if matches!(msg, Notification::BlockPublicOrVpcAccessUpdated { .. }) {
+                    Metrics::get()
+                        .proxy
+                        .redis_events_count
+                        .inc(RedisEventsCount::BlockPublicOrVpcAccessUpdate);
                }
                // TODO: add additional metrics for the other event types.

@@ -230,20 +246,26 @@ fn invalidate_cache<C: ProjectInfoCache>(cache: Arc<C>, msg: Notification) {
        Notification::AllowedIpsUpdate { allowed_ips_update } => {
            cache.invalidate_allowed_ips_for_project(allowed_ips_update.project_id);
        }
+        Notification::BlockPublicOrVpcAccessUpdated {
+            block_public_or_vpc_access_updated,
+        } => cache.invalidate_block_public_or_vpc_access_for_project(
+            block_public_or_vpc_access_updated.project_id,
+        ),
+        Notification::AllowedVpcEndpointsUpdatedForOrg {
+            allowed_vpc_endpoints_updated_for_org,
+        } => cache.invalidate_allowed_vpc_endpoint_ids_for_org(
+            allowed_vpc_endpoints_updated_for_org.account_id,
+        ),
+        Notification::AllowedVpcEndpointsUpdatedForProjects {
+            allowed_vpc_endpoints_updated_for_projects,
+        } => cache.invalidate_allowed_vpc_endpoint_ids_for_projects(
+            allowed_vpc_endpoints_updated_for_projects.project_ids,
+        ),
        Notification::PasswordUpdate { password_update } => cache
            .invalidate_role_secret_for_project(
                password_update.project_id,
                password_update.role_name,
            ),
-        Notification::BlockPublicOrVpcAccessUpdated { .. } => {
-            // https://github.com/neondatabase/neon/pull/10073
-        }
-        Notification::AllowedVpcEndpointsUpdatedForOrg { .. } => {
-            // https://github.com/neondatabase/neon/pull/10073
-        }
-        Notification::AllowedVpcEndpointsUpdatedForProjects { .. } => {
-            // https://github.com/neondatabase/neon/pull/10073
-        }
        Notification::UnknownTopic => unreachable!(),
    }
 }
--- a/proxy/src/serverless/backend.rs
+++ b/proxy/src/serverless/backend.rs
@@ -30,6 +30,7 @@ use crate::control_plane::locks::ApiLocks;
 use crate::control_plane::CachedNodeInfo;
 use crate::error::{ErrorKind, ReportableError, UserFacingError};
 use crate::intern::EndpointIdInt;
+use crate::protocol2::ConnectionInfoExtra;
 use crate::proxy::connect_compute::ConnectMechanism;
 use crate::proxy::retry::{CouldRetry, ShouldRetryWakeCompute};
 use crate::rate_limiter::EndpointRateLimiter;
@@ -57,23 +58,52 @@ impl PoolingBackend {

        let user_info = user_info.clone();
        let backend = self.auth_backend.as_ref().map(|()| user_info.clone());
-        let (allowed_ips, maybe_secret) = backend.get_allowed_ips_and_secret(ctx).await?;
+        let allowed_ips = backend.get_allowed_ips(ctx).await?;
+
        if self.config.authentication_config.ip_allowlist_check_enabled
            && !check_peer_addr_is_in_list(&ctx.peer_addr(), &allowed_ips)
        {
            return Err(AuthError::ip_address_not_allowed(ctx.peer_addr()));
        }
+
+        let access_blocker_flags = backend.get_block_public_or_vpc_access(ctx).await?;
+        if self.config.authentication_config.is_vpc_acccess_proxy {
+            if access_blocker_flags.vpc_access_blocked {
+                return Err(AuthError::NetworkNotAllowed);
+            }
+
+            let extra = ctx.extra();
+            let incoming_endpoint_id = match extra {
+                None => String::new(),
+                Some(ConnectionInfoExtra::Aws { vpce_id }) => {
+                    // Convert the vpce_id to a string
+                    String::from_utf8(vpce_id.to_vec()).unwrap_or_default()
+                }
+                Some(ConnectionInfoExtra::Azure { link_id }) => link_id.to_string(),
+            };
+
+            if incoming_endpoint_id.is_empty() {
+                return Err(AuthError::MissingVPCEndpointId);
+            }
+
+            let allowed_vpc_endpoint_ids = backend.get_allowed_vpc_endpoint_ids(ctx).await?;
+            // TODO: For now an empty VPC endpoint ID list means all are allowed. We should replace that.
+            if !allowed_vpc_endpoint_ids.is_empty()
+                && !allowed_vpc_endpoint_ids.contains(&incoming_endpoint_id)
+            {
+                return Err(AuthError::vpc_endpoint_id_not_allowed(incoming_endpoint_id));
+            }
+        } else if access_blocker_flags.public_access_blocked {
+            return Err(AuthError::NetworkNotAllowed);
+        }
+
        if !self
            .endpoint_rate_limiter
            .check(user_info.endpoint.clone().into(), 1)
        {
            return Err(AuthError::too_many_connections());
        }
-        let cached_secret = match maybe_secret {
-            Some(secret) => secret,
-            None => backend.get_role_secret(ctx).await?,
-        };
-
+        let cached_secret = backend.get_role_secret(ctx).await?;
        let secret = match cached_secret.value.clone() {
            Some(secret) => self.config.authentication_config.check_rate_limit(
                ctx,
--- a/proxy/src/serverless/conn_pool_lib.rs
+++ b/proxy/src/serverless/conn_pool_lib.rs
@@ -5,7 +5,7 @@ use std::sync::atomic::{self, AtomicUsize};
 use std::sync::{Arc, Weak};
 use std::time::Duration;

-use dashmap::DashMap;
+use clashmap::ClashMap;
 use parking_lot::RwLock;
 use postgres_client::ReadyForQueryStatus;
 use rand::Rng;
@@ -351,11 +351,11 @@ where
    //
    // That should be a fairly conteded map, so return reference to the per-endpoint
    // pool as early as possible and release the lock.
-    pub(crate) global_pool: DashMap<EndpointCacheKey, Arc<RwLock<P>>>,
+    pub(crate) global_pool: ClashMap<EndpointCacheKey, Arc<RwLock<P>>>,

    /// Number of endpoint-connection pools
    ///
-    /// [`DashMap::len`] iterates over all inner pools and acquires a read lock on each.
+    /// [`ClashMap::len`] iterates over all inner pools and acquires a read lock on each.
    /// That seems like far too much effort, so we're using a relaxed increment counter instead.
    /// It's only used for diagnostics.
    pub(crate) global_pool_size: AtomicUsize,
@@ -396,7 +396,7 @@ where
    pub(crate) fn new(config: &'static crate::config::HttpConfig) -> Arc<Self> {
        let shards = config.pool_options.pool_shards;
        Arc::new(Self {
-            global_pool: DashMap::with_shard_amount(shards),
+            global_pool: ClashMap::with_shard_amount(shards),
            global_pool_size: AtomicUsize::new(0),
            config,
            global_connections_count: Arc::new(AtomicUsize::new(0)),
@@ -442,10 +442,10 @@ where
            .start_timer();
        let current_len = shard.len();
        let mut clients_removed = 0;
-        shard.retain(|endpoint, x| {
+        shard.retain(|(endpoint, x)| {
            // if the current endpoint pool is unique (no other strong or weak references)
            // then it is currently not in use by any connections.
-            if let Some(pool) = Arc::get_mut(x.get_mut()) {
+            if let Some(pool) = Arc::get_mut(x) {
                let endpoints = pool.get_mut();
                clients_removed = endpoints.clear_closed();

--- a/proxy/src/types.rs
+++ b/proxy/src/types.rs
@@ -97,6 +97,8 @@ smol_str_wrapper!(EndpointId);
 smol_str_wrapper!(BranchId);
 // 90% of project strings are 23 characters or less.
 smol_str_wrapper!(ProjectId);
+// 90% of account strings are 23 characters or less.
+smol_str_wrapper!(AccountId);

 // will usually equal endpoint ID
 smol_str_wrapper!(EndpointCacheKey);
--- a/proxy/src/usage_metrics.rs
+++ b/proxy/src/usage_metrics.rs
@@ -10,9 +10,9 @@ use anyhow::{bail, Context};
 use async_compression::tokio::write::GzipEncoder;
 use bytes::Bytes;
 use chrono::{DateTime, Datelike, Timelike, Utc};
+use clashmap::mapref::entry::Entry;
+use clashmap::ClashMap;
 use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE};
-use dashmap::mapref::entry::Entry;
-use dashmap::DashMap;
 use once_cell::sync::Lazy;
 use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel};
 use serde::{Deserialize, Serialize};
@@ -137,7 +137,7 @@ type FastHasher = std::hash::BuildHasherDefault<rustc_hash::FxHasher>;

 #[derive(Default)]
 pub(crate) struct Metrics {
-    endpoints: DashMap<Ids, Arc<MetricCounter>, FastHasher>,
+    endpoints: ClashMap<Ids, Arc<MetricCounter>, FastHasher>,
 }

 impl Metrics {
@@ -213,7 +213,7 @@ pub async fn task_main(config: &MetricCollectionConfig) -> anyhow::Result<Infall
 }

 fn collect_and_clear_metrics<C: Clearable>(
-    endpoints: &DashMap<Ids, Arc<C>, FastHasher>,
+    endpoints: &ClashMap<Ids, Arc<C>, FastHasher>,
 ) -> Vec<(Ids, u64)> {
    let mut metrics_to_clear = Vec::new();

@@ -271,7 +271,7 @@ fn create_event_chunks<'a>(
 #[expect(clippy::too_many_arguments)]
 #[instrument(skip_all)]
 async fn collect_metrics_iteration(
-    endpoints: &DashMap<Ids, Arc<MetricCounter>, FastHasher>,
+    endpoints: &ClashMap<Ids, Arc<MetricCounter>, FastHasher>,
    client: &http::ClientWithMiddleware,
    metric_collection_endpoint: &reqwest::Url,
    storage: Option<&GenericRemoteStorage>,
--- a/rust-toolchain.toml
+++ b/rust-toolchain.toml
@@ -1,5 +1,5 @@
 [toolchain]
-channel = "1.84.0"
+channel = "1.84.1"
 profile = "default"
 # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy.
 # https://rust-lang.github.io/rustup/concepts/profiles.html
--- a/safekeeper/benches/receive_wal.rs
+++ b/safekeeper/benches/receive_wal.rs
@@ -88,13 +88,12 @@ fn bench_process_msg(c: &mut Criterion) {
                let (lsn, record) = walgen.next().expect("endless WAL");
                ProposerAcceptorMessage::AppendRequest(AppendRequest {
                    h: AppendRequestHeader {
+                        generation: 0,
                        term: 1,
-                        term_start_lsn: Lsn(0),
                        begin_lsn: lsn,
                        end_lsn: lsn + record.len() as u64,
                        commit_lsn: if commit { lsn } else { Lsn(0) }, // commit previous record
                        truncate_lsn: Lsn(0),
-                        proposer_uuid: [0; 16],
                    },
                    wal_data: record,
                })
@@ -160,13 +159,12 @@ fn bench_wal_acceptor(c: &mut Criterion) {
                    .take(n)
                    .map(|(lsn, record)| AppendRequest {
                        h: AppendRequestHeader {
+                            generation: 0,
                            term: 1,
-                            term_start_lsn: Lsn(0),
                            begin_lsn: lsn,
                            end_lsn: lsn + record.len() as u64,
                            commit_lsn: Lsn(0),
                            truncate_lsn: Lsn(0),
-                            proposer_uuid: [0; 16],
                        },
                        wal_data: record,
                    })
@@ -262,13 +260,12 @@ fn bench_wal_acceptor_throughput(c: &mut Criterion) {
            runtime.block_on(async {
                let reqgen = walgen.take(count).map(|(lsn, record)| AppendRequest {
                    h: AppendRequestHeader {
+                        generation: 0,
                        term: 1,
-                        term_start_lsn: Lsn(0),
                        begin_lsn: lsn,
                        end_lsn: lsn + record.len() as u64,
                        commit_lsn: if commit { lsn } else { Lsn(0) }, // commit previous record
                        truncate_lsn: Lsn(0),
-                        proposer_uuid: [0; 16],
                    },
                    wal_data: record,
                });
--- a/safekeeper/src/json_ctrl.rs
+++ b/safekeeper/src/json_ctrl.rs
@@ -8,7 +8,7 @@

 use anyhow::Context;
 use postgres_backend::QueryError;
-use safekeeper_api::membership::Configuration;
+use safekeeper_api::membership::{Configuration, INVALID_GENERATION};
 use safekeeper_api::{ServerInfo, Term};
 use serde::{Deserialize, Serialize};
 use tokio::io::{AsyncRead, AsyncWrite};
@@ -133,10 +133,10 @@ async fn send_proposer_elected(
    let history = TermHistory(history_entries);

    let proposer_elected_request = ProposerAcceptorMessage::Elected(ProposerElected {
+        generation: INVALID_GENERATION,
        term,
        start_streaming_at: lsn,
        term_history: history,
-        timeline_start_lsn: lsn,
    });

    tli.process_msg(&proposer_elected_request).await?;
@@ -170,13 +170,12 @@ pub async fn append_logical_message(

    let append_request = ProposerAcceptorMessage::AppendRequest(AppendRequest {
        h: AppendRequestHeader {
+            generation: INVALID_GENERATION,
            term: msg.term,
-            term_start_lsn: begin_lsn,
            begin_lsn,
            end_lsn,
            commit_lsn,
            truncate_lsn: msg.truncate_lsn,
-            proposer_uuid: [0u8; 16],
        },
        wal_data,
    });
--- a/safekeeper/src/receive_wal.rs
+++ b/safekeeper/src/receive_wal.rs
@@ -281,7 +281,7 @@ impl SafekeeperPostgresHandler {
            tokio::select! {
                // todo: add read|write .context to these errors
                r = network_reader.run(msg_tx, msg_rx, reply_tx, timeline, next_msg) => r,
-                r = network_write(pgb, reply_rx, pageserver_feedback_rx) => r,
+                r = network_write(pgb, reply_rx, pageserver_feedback_rx, proto_version) => r,
                _ = timeline_cancel.cancelled() => {
                    return Err(CopyStreamHandlerEnd::Cancelled);
                }
@@ -342,8 +342,8 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'_, IO> {
        let tli = match next_msg {
            ProposerAcceptorMessage::Greeting(ref greeting) => {
                info!(
-                    "start handshake with walproposer {} sysid {} timeline {}",
-                    self.peer_addr, greeting.system_id, greeting.tli,
+                    "start handshake with walproposer {} sysid {}",
+                    self.peer_addr, greeting.system_id,
                );
                let server_info = ServerInfo {
                    pg_version: greeting.pg_version,
@@ -459,6 +459,7 @@ async fn network_write<IO: AsyncRead + AsyncWrite + Unpin>(
    pgb_writer: &mut PostgresBackend<IO>,
    mut reply_rx: Receiver<AcceptorProposerMessage>,
    mut pageserver_feedback_rx: tokio::sync::broadcast::Receiver<PageserverFeedback>,
+    proto_version: u32,
 ) -> Result<(), CopyStreamHandlerEnd> {
    let mut buf = BytesMut::with_capacity(128);

@@ -496,7 +497,7 @@ async fn network_write<IO: AsyncRead + AsyncWrite + Unpin>(
        };

        buf.clear();
-        msg.serialize(&mut buf)?;
+        msg.serialize(&mut buf, proto_version)?;
        pgb_writer.write_message(&BeMessage::CopyData(&buf)).await?;
    }
 }
--- a/safekeeper/src/recovery.rs
+++ b/safekeeper/src/recovery.rs
@@ -7,6 +7,7 @@ use std::{fmt, pin::pin};
 use anyhow::{bail, Context};
 use futures::StreamExt;
 use postgres_protocol::message::backend::ReplicationMessage;
+use safekeeper_api::membership::INVALID_GENERATION;
 use safekeeper_api::models::{PeerInfo, TimelineStatus};
 use safekeeper_api::Term;
 use tokio::sync::mpsc::{channel, Receiver, Sender};
@@ -267,7 +268,10 @@ async fn recover(
    );

    // Now understand our term history.
-    let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { term: donor.term });
+    let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest {
+        generation: INVALID_GENERATION,
+        term: donor.term,
+    });
    let vote_response = match tli
        .process_msg(&vote_request)
        .await
@@ -302,10 +306,10 @@ async fn recover(

    // truncate WAL locally
    let pe = ProposerAcceptorMessage::Elected(ProposerElected {
+        generation: INVALID_GENERATION,
        term: donor.term,
        start_streaming_at: last_common_point.lsn,
        term_history: donor_th,
-        timeline_start_lsn: Lsn::INVALID,
    });
    // Successful ProposerElected handling always returns None. If term changed,
    // we'll find out that during the streaming. Note: it is expected to get
@@ -434,13 +438,12 @@ async fn network_io(
        match msg {
            ReplicationMessage::XLogData(xlog_data) => {
                let ar_hdr = AppendRequestHeader {
+                    generation: INVALID_GENERATION,
                    term: donor.term,
-                    term_start_lsn: Lsn::INVALID, // unused
                    begin_lsn: Lsn(xlog_data.wal_start()),
                    end_lsn: Lsn(xlog_data.wal_start()) + xlog_data.data().len() as u64,
                    commit_lsn: Lsn::INVALID, // do not attempt to advance, peer communication anyway does it
                    truncate_lsn: Lsn::INVALID, // do not attempt to advance
-                    proposer_uuid: [0; 16],
                };
                let ar = AppendRequest {
                    h: ar_hdr,
--- a/safekeeper/src/safekeeper.rs
+++ b/safekeeper/src/safekeeper.rs
@@ -5,6 +5,11 @@ use byteorder::{LittleEndian, ReadBytesExt};
 use bytes::{Buf, BufMut, Bytes, BytesMut};

 use postgres_ffi::{TimeLineID, MAX_SEND_SIZE};
+use safekeeper_api::membership;
+use safekeeper_api::membership::Generation;
+use safekeeper_api::membership::MemberSet;
+use safekeeper_api::membership::SafekeeperId;
+use safekeeper_api::membership::INVALID_GENERATION;
 use safekeeper_api::models::HotStandbyFeedback;
 use safekeeper_api::Term;
 use serde::{Deserialize, Serialize};
@@ -12,6 +17,7 @@ use std::cmp::max;
 use std::cmp::min;
 use std::fmt;
 use std::io::Read;
+use std::str::FromStr;
 use storage_broker::proto::SafekeeperTimelineInfo;

 use tracing::*;
@@ -29,7 +35,8 @@ use utils::{
    lsn::Lsn,
 };

-pub const SK_PROTOCOL_VERSION: u32 = 2;
+pub const SK_PROTO_VERSION_2: u32 = 2;
+pub const SK_PROTO_VERSION_3: u32 = 3;
 pub const UNKNOWN_SERVER_VERSION: u32 = 0;

 #[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
@@ -56,8 +63,28 @@ impl TermHistory {
        TermHistory(Vec::new())
    }

-    // Parse TermHistory as n_entries followed by TermLsn pairs
+    // Parse TermHistory as n_entries followed by TermLsn pairs in network order.
    pub fn from_bytes(bytes: &mut Bytes) -> Result<TermHistory> {
+        let n_entries = bytes
+            .get_u32_f()
+            .with_context(|| "TermHistory misses len")?;
+        let mut res = Vec::with_capacity(n_entries as usize);
+        for i in 0..n_entries {
+            let term = bytes
+                .get_u64_f()
+                .with_context(|| format!("TermHistory pos {} misses term", i))?;
+            let lsn = bytes
+                .get_u64_f()
+                .with_context(|| format!("TermHistory pos {} misses lsn", i))?
+                .into();
+            res.push(TermLsn { term, lsn })
+        }
+        Ok(TermHistory(res))
+    }
+
+    // Parse TermHistory as n_entries followed by TermLsn pairs in LE order.
+    // TODO remove once v2 protocol is fully dropped.
+    pub fn from_bytes_le(bytes: &mut Bytes) -> Result<TermHistory> {
        if bytes.remaining() < 4 {
            bail!("TermHistory misses len");
        }
@@ -197,6 +224,18 @@ impl AcceptorState {
 /// Initial Proposer -> Acceptor message
 #[derive(Debug, Deserialize)]
 pub struct ProposerGreeting {
+    pub tenant_id: TenantId,
+    pub timeline_id: TimelineId,
+    pub mconf: membership::Configuration,
+    /// Postgres server version
+    pub pg_version: u32,
+    pub system_id: SystemId,
+    pub wal_seg_size: u32,
+}
+
+/// V2 of the message; exists as a struct because we (de)serialized it as is.
+#[derive(Debug, Deserialize)]
+pub struct ProposerGreetingV2 {
    /// proposer-acceptor protocol version
    pub protocol_version: u32,
    /// Postgres server version
@@ -213,27 +252,35 @@ pub struct ProposerGreeting {
 /// (acceptor voted for).
 #[derive(Debug, Serialize)]
 pub struct AcceptorGreeting {
-    term: u64,
    node_id: NodeId,
+    mconf: membership::Configuration,
+    term: u64,
 }

 /// Vote request sent from proposer to safekeepers
-#[derive(Debug, Deserialize)]
+#[derive(Debug)]
 pub struct VoteRequest {
+    pub generation: Generation,
+    pub term: Term,
+}
+
+/// V2 of the message; exists as a struct because we (de)serialized it as is.
+#[derive(Debug, Deserialize)]
+pub struct VoteRequestV2 {
    pub term: Term,
 }

 /// Vote itself, sent from safekeeper to proposer
 #[derive(Debug, Serialize)]
 pub struct VoteResponse {
+    generation: Generation, // membership conf generation
    pub term: Term, // safekeeper's current term; if it is higher than proposer's, the compute is out of date.
-    vote_given: u64, // fixme u64 due to padding
+    vote_given: bool,
    // Safekeeper flush_lsn (end of WAL) + history of term switches allow
    // proposer to choose the most advanced one.
    pub flush_lsn: Lsn,
    truncate_lsn: Lsn,
    pub term_history: TermHistory,
-    timeline_start_lsn: Lsn,
 }

 /*
@@ -242,10 +289,10 @@ pub struct VoteResponse {
 */
 #[derive(Debug)]
 pub struct ProposerElected {
+    pub generation: Generation, // membership conf generation
    pub term: Term,
    pub start_streaming_at: Lsn,
    pub term_history: TermHistory,
-    pub timeline_start_lsn: Lsn,
 }

 /// Request with WAL message sent from proposer to safekeeper. Along the way it
@@ -257,6 +304,22 @@ pub struct AppendRequest {
 }
 #[derive(Debug, Clone, Deserialize)]
 pub struct AppendRequestHeader {
+    pub generation: Generation, // membership conf generation
+    // safekeeper's current term; if it is higher than proposer's, the compute is out of date.
+    pub term: Term,
+    /// start position of message in WAL
+    pub begin_lsn: Lsn,
+    /// end position of message in WAL
+    pub end_lsn: Lsn,
+    /// LSN committed by quorum of safekeepers
+    pub commit_lsn: Lsn,
+    /// minimal LSN which may be needed by proposer to perform recovery of some safekeeper
+    pub truncate_lsn: Lsn,
+}
+
+/// V2 of the message; exists as a struct because we (de)serialized it as is.
+#[derive(Debug, Clone, Deserialize)]
+pub struct AppendRequestHeaderV2 {
    // safekeeper's current term; if it is higher than proposer's, the compute is out of date.
    pub term: Term,
    // TODO: remove this field from the protocol, it in unused -- LSN of term
@@ -277,6 +340,9 @@ pub struct AppendRequestHeader {
 /// Report safekeeper state to proposer
 #[derive(Debug, Serialize, Clone)]
 pub struct AppendResponse {
+    // Membership conf generation. Not strictly required because on mismatch
+    // connection is reset, but let's sanity check it.
+    generation: Generation,
    // Current term of the safekeeper; if it is higher than proposer's, the
    // compute is out of date.
    pub term: Term,
@@ -293,8 +359,9 @@ pub struct AppendResponse {
 }

 impl AppendResponse {
-    fn term_only(term: Term) -> AppendResponse {
+    fn term_only(generation: Generation, term: Term) -> AppendResponse {
        AppendResponse {
+            generation,
            term,
            flush_lsn: Lsn(0),
            commit_lsn: Lsn(0),
@@ -315,72 +382,316 @@ pub enum ProposerAcceptorMessage {
    FlushWAL,
 }

-impl ProposerAcceptorMessage {
-    /// Parse proposer message.
-    pub fn parse(msg_bytes: Bytes, proto_version: u32) -> Result<ProposerAcceptorMessage> {
-        if proto_version != SK_PROTOCOL_VERSION {
-            bail!(
-                "incompatible protocol version {}, expected {}",
-                proto_version,
-                SK_PROTOCOL_VERSION
-            );
+/// Augment Bytes with fallible get_uN methods, where N is the integer width in bits.
+/// All reads are in network (big endian) order.
+trait BytesF {
+    fn get_u8_f(&mut self) -> Result<u8>;
+    fn get_u16_f(&mut self) -> Result<u16>;
+    fn get_u32_f(&mut self) -> Result<u32>;
+    fn get_u64_f(&mut self) -> Result<u64>;
+}
+
+impl BytesF for Bytes {
+    fn get_u8_f(&mut self) -> Result<u8> {
+        if self.is_empty() {
+            bail!("no bytes left, expected 1");
        }
-        // xxx using Reader is inefficient but easy to work with bincode
-        let mut stream = msg_bytes.reader();
-        // u64 is here to avoid padding; it will be removed once we stop packing C structs into the wire as is
-        let tag = stream.read_u64::<LittleEndian>()? as u8 as char;
-        match tag {
-            'g' => {
-                let msg = ProposerGreeting::des_from(&mut stream)?;
-                Ok(ProposerAcceptorMessage::Greeting(msg))
-            }
-            'v' => {
-                let msg = VoteRequest::des_from(&mut stream)?;
-                Ok(ProposerAcceptorMessage::VoteRequest(msg))
-            }
-            'e' => {
-                let mut msg_bytes = stream.into_inner();
-                if msg_bytes.remaining() < 16 {
-                    bail!("ProposerElected message is not complete");
-                }
-                let term = msg_bytes.get_u64_le();
-                let start_streaming_at = msg_bytes.get_u64_le().into();
-                let term_history = TermHistory::from_bytes(&mut msg_bytes)?;
-                if msg_bytes.remaining() < 8 {
-                    bail!("ProposerElected message is not complete");
-                }
-                let timeline_start_lsn = msg_bytes.get_u64_le().into();
-                let msg = ProposerElected {
-                    term,
-                    start_streaming_at,
-                    timeline_start_lsn,
-                    term_history,
+        Ok(self.get_u8())
+    }
+    fn get_u16_f(&mut self) -> Result<u16> {
+        if self.remaining() < 2 {
+            bail!("only {} bytes left, expected 2", self.remaining());
+        }
+        Ok(self.get_u16()) // network (big endian) order; length was checked above
+    }
+    fn get_u32_f(&mut self) -> Result<u32> {
+        if self.remaining() < 4 {
+            bail!("only {} bytes left, expected 4", self.remaining());
+        }
+        Ok(self.get_u32())
+    }
+    fn get_u64_f(&mut self) -> Result<u64> {
+        if self.remaining() < 8 {
+            bail!("only {} bytes left, expected 8", self.remaining());
+        }
+        Ok(self.get_u64())
+    }
+}
+
+impl ProposerAcceptorMessage {
+    /// Read cstring from Bytes.
+    fn get_cstr(buf: &mut Bytes) -> Result<String> {
+        let pos = buf
+            .iter()
+            .position(|x| *x == 0)
+            .ok_or_else(|| anyhow::anyhow!("missing cstring terminator"))?;
+        let result = buf.split_to(pos);
+        buf.advance(1); // drop the null terminator
+        match std::str::from_utf8(&result) {
+            Ok(s) => Ok(s.to_string()),
+            Err(e) => bail!("invalid utf8 in cstring: {}", e),
+        }
+    }
+
+    /// Read membership::Configuration from Bytes.
+    fn get_mconf(buf: &mut Bytes) -> Result<membership::Configuration> {
+        let generation = buf.get_u32_f().with_context(|| "reading generation")?;
+        let members_len = buf.get_u32_f().with_context(|| "reading members_len")?;
+        // Main member set must have at least someone in valid configuration.
+        // Empty conf is allowed until we fully migrate.
+        if generation != INVALID_GENERATION && members_len == 0 {
+            bail!("empty members_len");
+        }
+        let mut members = MemberSet::empty();
+        for i in 0..members_len {
+            let id = buf
+                .get_u64_f()
+                .with_context(|| format!("reading member {} node_id", i))?;
+            let host = Self::get_cstr(buf).with_context(|| format!("reading member {} host", i))?;
+            let pg_port = buf
+                .get_u16_f()
+                .with_context(|| format!("reading member {} port", i))?;
+            let sk = SafekeeperId {
+                id: NodeId(id),
+                host,
+                pg_port,
+            };
+            members.add(sk)?;
+        }
+        let new_members_len = buf.get_u32_f().with_context(|| "reading new_members_len")?;
+        // Non joint conf.
+        if new_members_len == 0 {
+            Ok(membership::Configuration {
+                generation,
+                members,
+                new_members: None,
+            })
+        } else {
+            let mut new_members = MemberSet::empty();
+            for i in 0..new_members_len {
+                let id = buf
+                    .get_u64_f()
+                    .with_context(|| format!("reading new member {} node_id", i))?;
+                let host = Self::get_cstr(buf)
+                    .with_context(|| format!("reading new member {} host", i))?;
+                let pg_port = buf
+                    .get_u16_f()
+                    .with_context(|| format!("reading new member {} port", i))?;
+                let sk = SafekeeperId {
+                    id: NodeId(id),
+                    host,
+                    pg_port,
                };
-                Ok(ProposerAcceptorMessage::Elected(msg))
+                new_members.add(sk)?;
            }
-            'a' => {
-                // read header followed by wal data
-                let hdr = AppendRequestHeader::des_from(&mut stream)?;
-                let rec_size = hdr
-                    .end_lsn
-                    .checked_sub(hdr.begin_lsn)
-                    .context("begin_lsn > end_lsn in AppendRequest")?
-                    .0 as usize;
-                if rec_size > MAX_SEND_SIZE {
-                    bail!(
-                        "AppendRequest is longer than MAX_SEND_SIZE ({})",
-                        MAX_SEND_SIZE
-                    );
+            Ok(membership::Configuration {
+                generation,
+                members,
+                new_members: Some(new_members),
+            })
+        }
+    }
+
+    /// Parse proposer message.
+    pub fn parse(mut msg_bytes: Bytes, proto_version: u32) -> Result<ProposerAcceptorMessage> {
+        if proto_version == SK_PROTO_VERSION_3 {
+            if msg_bytes.is_empty() {
+                bail!("ProposerAcceptorMessage is not complete: missing tag");
+            }
+            let tag = msg_bytes.get_u8_f().with_context(|| {
+                "ProposerAcceptorMessage is not complete: missing tag".to_string()
+            })? as char;
+            match tag {
+                'g' => {
+                    let tenant_id_str =
+                        Self::get_cstr(&mut msg_bytes).with_context(|| "reading tenant_id")?;
+                    let tenant_id = TenantId::from_str(&tenant_id_str)?;
+                    let timeline_id_str =
+                        Self::get_cstr(&mut msg_bytes).with_context(|| "reading timeline_id")?;
+                    let timeline_id = TimelineId::from_str(&timeline_id_str)?;
+                    let mconf = Self::get_mconf(&mut msg_bytes)?;
+                    let pg_version = msg_bytes
+                        .get_u32_f()
+                        .with_context(|| "reading pg_version")?;
+                    let system_id = msg_bytes.get_u64_f().with_context(|| "reading system_id")?;
+                    let wal_seg_size = msg_bytes
+                        .get_u32_f()
+                        .with_context(|| "reading wal_seg_size")?;
+                    let g = ProposerGreeting {
+                        tenant_id,
+                        timeline_id,
+                        mconf,
+                        pg_version,
+                        system_id,
+                        wal_seg_size,
+                    };
+                    Ok(ProposerAcceptorMessage::Greeting(g))
                }
+                'v' => {
+                    let generation = msg_bytes
+                        .get_u32_f()
+                        .with_context(|| "reading generation")?;
+                    let term = msg_bytes.get_u64_f().with_context(|| "reading term")?;
+                    let v = VoteRequest { generation, term };
+                    Ok(ProposerAcceptorMessage::VoteRequest(v))
+                }
+                'e' => {
+                    let generation = msg_bytes
+                        .get_u32_f()
+                        .with_context(|| "reading generation")?;
+                    let term = msg_bytes.get_u64_f().with_context(|| "reading term")?;
+                    let start_streaming_at: Lsn = msg_bytes
+                        .get_u64_f()
+                        .with_context(|| "reading start_streaming_at")?
+                        .into();
+                    let term_history = TermHistory::from_bytes(&mut msg_bytes)?;
+                    let msg = ProposerElected {
+                        generation,
+                        term,
+                        start_streaming_at,
+                        term_history,
+                    };
+                    Ok(ProposerAcceptorMessage::Elected(msg))
+                }
+                'a' => {
+                    let generation = msg_bytes
+                        .get_u32_f()
+                        .with_context(|| "reading generation")?;
+                    let term = msg_bytes.get_u64_f().with_context(|| "reading term")?;
+                    let begin_lsn: Lsn = msg_bytes
+                        .get_u64_f()
+                        .with_context(|| "reading begin_lsn")?
+                        .into();
+                    let end_lsn: Lsn = msg_bytes
+                        .get_u64_f()
+                        .with_context(|| "reading end_lsn")?
+                        .into();
+                    let commit_lsn: Lsn = msg_bytes
+                        .get_u64_f()
+                        .with_context(|| "reading commit_lsn")?
+                        .into();
+                    let truncate_lsn: Lsn = msg_bytes
+                        .get_u64_f()
+                        .with_context(|| "reading truncate_lsn")?
+                        .into();
+                    let hdr = AppendRequestHeader {
+                        generation,
+                        term,
+                        begin_lsn,
+                        end_lsn,
+                        commit_lsn,
+                        truncate_lsn,
+                    };
+                    let rec_size = hdr
+                        .end_lsn
+                        .checked_sub(hdr.begin_lsn)
+                        .context("begin_lsn > end_lsn in AppendRequest")?
+                        .0 as usize;
+                    if rec_size > MAX_SEND_SIZE {
+                        bail!(
+                            "AppendRequest is longer than MAX_SEND_SIZE ({})",
+                            MAX_SEND_SIZE
+                        );
+                    }
+                    if msg_bytes.remaining() < rec_size {
+                        bail!(
+                            "reading WAL: only {} bytes left, wanted {}",
+                            msg_bytes.remaining(),
+                            rec_size
+                        );
+                    }
+                    let wal_data = msg_bytes.copy_to_bytes(rec_size);
+                    let msg = AppendRequest { h: hdr, wal_data };

-                let mut wal_data_vec: Vec<u8> = vec![0; rec_size];
-                stream.read_exact(&mut wal_data_vec)?;
-                let wal_data = Bytes::from(wal_data_vec);
-                let msg = AppendRequest { h: hdr, wal_data };
-
-                Ok(ProposerAcceptorMessage::AppendRequest(msg))
+                    Ok(ProposerAcceptorMessage::AppendRequest(msg))
+                }
+                _ => bail!("unknown proposer-acceptor message tag: {}", tag),
            }
-            _ => bail!("unknown proposer-acceptor message tag: {}", tag),
+        } else if proto_version == SK_PROTO_VERSION_2 {
+            // xxx using Reader is inefficient but easy to work with bincode
+            let mut stream = msg_bytes.reader();
+            // u64 is here to avoid padding; it will be removed once we stop packing C structs into the wire as is
+            let tag = stream.read_u64::<LittleEndian>()? as u8 as char;
+            match tag {
+                'g' => {
+                    let msgv2 = ProposerGreetingV2::des_from(&mut stream)?;
+                    let g = ProposerGreeting {
+                        tenant_id: msgv2.tenant_id,
+                        timeline_id: msgv2.timeline_id,
+                        mconf: membership::Configuration {
+                            generation: INVALID_GENERATION,
+                            members: MemberSet::empty(),
+                            new_members: None,
+                        },
+                        pg_version: msgv2.pg_version,
+                        system_id: msgv2.system_id,
+                        wal_seg_size: msgv2.wal_seg_size,
+                    };
+                    Ok(ProposerAcceptorMessage::Greeting(g))
+                }
+                'v' => {
+                    let msg = VoteRequestV2::des_from(&mut stream)?;
+                    let v = VoteRequest {
+                        generation: INVALID_GENERATION,
+                        term: msg.term,
+                    };
+                    Ok(ProposerAcceptorMessage::VoteRequest(v))
+                }
+                'e' => {
+                    let mut msg_bytes = stream.into_inner();
+                    if msg_bytes.remaining() < 16 {
+                        bail!("ProposerElected message is not complete");
+                    }
+                    let term = msg_bytes.get_u64_le();
+                    let start_streaming_at = msg_bytes.get_u64_le().into();
+                    let term_history = TermHistory::from_bytes_le(&mut msg_bytes)?;
+                    if msg_bytes.remaining() < 8 {
+                        bail!("ProposerElected message is not complete");
+                    }
+                    let _timeline_start_lsn = msg_bytes.get_u64_le();
+                    let msg = ProposerElected {
+                        generation: INVALID_GENERATION,
+                        term,
+                        start_streaming_at,
+                        term_history,
+                    };
+                    Ok(ProposerAcceptorMessage::Elected(msg))
+                }
+                'a' => {
+                    // read header followed by wal data
+                    let hdrv2 = AppendRequestHeaderV2::des_from(&mut stream)?;
+                    let hdr = AppendRequestHeader {
+                        generation: INVALID_GENERATION,
+                        term: hdrv2.term,
+                        begin_lsn: hdrv2.begin_lsn,
+                        end_lsn: hdrv2.end_lsn,
+                        commit_lsn: hdrv2.commit_lsn,
+                        truncate_lsn: hdrv2.truncate_lsn,
+                    };
+                    let rec_size = hdr
+                        .end_lsn
+                        .checked_sub(hdr.begin_lsn)
+                        .context("begin_lsn > end_lsn in AppendRequest")?
+                        .0 as usize;
+                    if rec_size > MAX_SEND_SIZE {
+                        bail!(
+                            "AppendRequest is longer than MAX_SEND_SIZE ({})",
+                            MAX_SEND_SIZE
+                        );
+                    }
+
+                    let mut wal_data_vec: Vec<u8> = vec![0; rec_size];
+                    stream.read_exact(&mut wal_data_vec)?;
+                    let wal_data = Bytes::from(wal_data_vec);
+
+                    let msg = AppendRequest { h: hdr, wal_data };
+
+                    Ok(ProposerAcceptorMessage::AppendRequest(msg))
+                }
+                _ => bail!("unknown proposer-acceptor message tag: {}", tag),
+            }
+        } else {
+            bail!("unsupported protocol version {}", proto_version);
        }
    }

@@ -394,36 +705,21 @@ impl ProposerAcceptorMessage {
        // We explicitly list all fields, to draw attention here when new fields are added.
        let mut size = BASE_SIZE;
        size += match self {
-            Self::Greeting(ProposerGreeting {
-                protocol_version: _,
-                pg_version: _,
-                proposer_id: _,
-                system_id: _,
-                timeline_id: _,
-                tenant_id: _,
-                tli: _,
-                wal_seg_size: _,
-            }) => 0,
+            Self::Greeting(_) => 0,

-            Self::VoteRequest(VoteRequest { term: _ }) => 0,
+            Self::VoteRequest(_) => 0,

-            Self::Elected(ProposerElected {
-                term: _,
-                start_streaming_at: _,
-                term_history: _,
-                timeline_start_lsn: _,
-            }) => 0,
+            Self::Elected(_) => 0,

            Self::AppendRequest(AppendRequest {
                h:
                    AppendRequestHeader {
+                        generation: _,
                        term: _,
-                        term_start_lsn: _,
                        begin_lsn: _,
                        end_lsn: _,
                        commit_lsn: _,
                        truncate_lsn: _,
-                        proposer_uuid: _,
                    },
                wal_data,
            }) => wal_data.len(),
@@ -431,13 +727,12 @@ impl ProposerAcceptorMessage {
            Self::NoFlushAppendRequest(AppendRequest {
                h:
                    AppendRequestHeader {
+                        generation: _,
                        term: _,
-                        term_start_lsn: _,
                        begin_lsn: _,
                        end_lsn: _,
                        commit_lsn: _,
                        truncate_lsn: _,
-                        proposer_uuid: _,
                    },
                wal_data,
            }) => wal_data.len(),
@@ -458,45 +753,118 @@ pub enum AcceptorProposerMessage {
 }

 impl AcceptorProposerMessage {
-    /// Serialize acceptor -> proposer message.
-    pub fn serialize(&self, buf: &mut BytesMut) -> Result<()> {
-        match self {
-            AcceptorProposerMessage::Greeting(msg) => {
-                buf.put_u64_le('g' as u64);
-                buf.put_u64_le(msg.term);
-                buf.put_u64_le(msg.node_id.0);
-            }
-            AcceptorProposerMessage::VoteResponse(msg) => {
-                buf.put_u64_le('v' as u64);
-                buf.put_u64_le(msg.term);
-                buf.put_u64_le(msg.vote_given);
-                buf.put_u64_le(msg.flush_lsn.into());
-                buf.put_u64_le(msg.truncate_lsn.into());
-                buf.put_u32_le(msg.term_history.0.len() as u32);
-                for e in &msg.term_history.0 {
-                    buf.put_u64_le(e.term);
-                    buf.put_u64_le(e.lsn.into());
-                }
-                buf.put_u64_le(msg.timeline_start_lsn.into());
-            }
-            AcceptorProposerMessage::AppendResponse(msg) => {
-                buf.put_u64_le('a' as u64);
-                buf.put_u64_le(msg.term);
-                buf.put_u64_le(msg.flush_lsn.into());
-                buf.put_u64_le(msg.commit_lsn.into());
-                buf.put_i64_le(msg.hs_feedback.ts);
-                buf.put_u64_le(msg.hs_feedback.xmin);
-                buf.put_u64_le(msg.hs_feedback.catalog_xmin);
+    fn put_cstr(buf: &mut BytesMut, s: &str) {
+        buf.put_slice(s.as_bytes());
+        buf.put_u8(0); // null terminator
+    }

-                // AsyncReadMessage in walproposer.c will not try to decode pageserver_feedback
-                // if it is not present.
-                if let Some(ref msg) = msg.pageserver_feedback {
-                    msg.serialize(buf);
-                }
-            }
+    /// Serialize membership::Configuration into buf.
+    fn serialize_mconf(buf: &mut BytesMut, mconf: &membership::Configuration) {
+        buf.put_u32(mconf.generation);
+        buf.put_u32(mconf.members.m.len() as u32);
+        for sk in &mconf.members.m {
+            buf.put_u64(sk.id.0);
+            Self::put_cstr(buf, &sk.host);
+            buf.put_u16(sk.pg_port);
        }
+        if let Some(ref new_members) = mconf.new_members {
+            buf.put_u32(new_members.m.len() as u32);
+            for sk in &new_members.m {
+                buf.put_u64(sk.id.0);
+                Self::put_cstr(buf, &sk.host);
+                buf.put_u16(sk.pg_port);
+            }
+        } else {
+            buf.put_u32(0);
+        }
+    }

-        Ok(())
+    /// Serialize acceptor -> proposer message.
+    pub fn serialize(&self, buf: &mut BytesMut, proto_version: u32) -> Result<()> {
+        if proto_version == SK_PROTO_VERSION_3 {
+            match self {
+                AcceptorProposerMessage::Greeting(msg) => {
+                    buf.put_u8(b'g');
+                    buf.put_u64(msg.node_id.0);
+                    Self::serialize_mconf(buf, &msg.mconf);
+                    buf.put_u64(msg.term)
+                }
+                AcceptorProposerMessage::VoteResponse(msg) => {
+                    buf.put_u8(b'v');
+                    buf.put_u32(msg.generation);
+                    buf.put_u64(msg.term);
+                    buf.put_u8(msg.vote_given as u8);
+                    buf.put_u64(msg.flush_lsn.into());
+                    buf.put_u64(msg.truncate_lsn.into());
+                    buf.put_u32(msg.term_history.0.len() as u32);
+                    for e in &msg.term_history.0 {
+                        buf.put_u64(e.term);
+                        buf.put_u64(e.lsn.into());
+                    }
+                }
+                AcceptorProposerMessage::AppendResponse(msg) => {
+                    buf.put_u8(b'a');
+                    buf.put_u32(msg.generation);
+                    buf.put_u64(msg.term);
+                    buf.put_u64(msg.flush_lsn.into());
+                    buf.put_u64(msg.commit_lsn.into());
+                    buf.put_i64(msg.hs_feedback.ts);
+                    buf.put_u64(msg.hs_feedback.xmin);
+                    buf.put_u64(msg.hs_feedback.catalog_xmin);
+
+                    // AsyncReadMessage in walproposer.c will not try to decode pageserver_feedback
+                    // if it is not present.
+                    if let Some(ref msg) = msg.pageserver_feedback {
+                        msg.serialize(buf);
+                    }
+                }
+            }
+            Ok(())
+        // TODO remove 3 after converting all msgs
+        } else if proto_version == SK_PROTO_VERSION_2 {
+            match self {
+                AcceptorProposerMessage::Greeting(msg) => {
+                    buf.put_u64_le('g' as u64);
+                    // v2 didn't have mconf and fields were reordered
+                    buf.put_u64_le(msg.term);
+                    buf.put_u64_le(msg.node_id.0);
+                }
+                AcceptorProposerMessage::VoteResponse(msg) => {
+                    // v2 didn't have generation, had u64 vote_given and timeline_start_lsn
+                    buf.put_u64_le('v' as u64);
+                    buf.put_u64_le(msg.term);
+                    buf.put_u64_le(msg.vote_given as u64);
+                    buf.put_u64_le(msg.flush_lsn.into());
+                    buf.put_u64_le(msg.truncate_lsn.into());
+                    buf.put_u32_le(msg.term_history.0.len() as u32);
+                    for e in &msg.term_history.0 {
+                        buf.put_u64_le(e.term);
+                        buf.put_u64_le(e.lsn.into());
+                    }
+                    // removed timeline_start_lsn
+                    buf.put_u64_le(0);
+                }
+                AcceptorProposerMessage::AppendResponse(msg) => {
+                    // v2 didn't have generation
+                    buf.put_u64_le('a' as u64);
+                    buf.put_u64_le(msg.term);
+                    buf.put_u64_le(msg.flush_lsn.into());
+                    buf.put_u64_le(msg.commit_lsn.into());
+                    buf.put_i64_le(msg.hs_feedback.ts);
+                    buf.put_u64_le(msg.hs_feedback.xmin);
+                    buf.put_u64_le(msg.hs_feedback.catalog_xmin);
+
+                    // AsyncReadMessage in walproposer.c will not try to decode pageserver_feedback
+                    // if it is not present.
+                    if let Some(ref msg) = msg.pageserver_feedback {
+                        msg.serialize(buf);
+                    }
+                }
+            }
+            Ok(())
+        } else {
+            bail!("unsupported protocol version {}", proto_version);
+        }
    }
 }

@@ -593,14 +961,6 @@ where
        &mut self,
        msg: &ProposerGreeting,
    ) -> Result<Option<AcceptorProposerMessage>> {
-        // Check protocol compatibility
-        if msg.protocol_version != SK_PROTOCOL_VERSION {
-            bail!(
-                "incompatible protocol version {}, expected {}",
-                msg.protocol_version,
-                SK_PROTOCOL_VERSION
-            );
-        }
        /* Postgres major version mismatch is treated as fatal error
         * because safekeepers parse WAL headers and the format
         * may change between versions.
@@ -655,15 +1015,16 @@ where
            self.state.finish_change(&state).await?;
        }

-        info!(
-            "processed greeting from walproposer {}, sending term {:?}",
-            msg.proposer_id.map(|b| format!("{:X}", b)).join(""),
-            self.state.acceptor_state.term
-        );
-        Ok(Some(AcceptorProposerMessage::Greeting(AcceptorGreeting {
-            term: self.state.acceptor_state.term,
+        let apg = AcceptorGreeting {
            node_id: self.node_id,
-        })))
+            mconf: self.state.mconf.clone(),
+            term: self.state.acceptor_state.term,
+        };
+        info!(
+            "processed greeting {:?} from walproposer, sending {:?}",
+            msg, apg
+        );
+        Ok(Some(AcceptorProposerMessage::Greeting(apg)))
    }

    /// Give vote for the given term, if we haven't done that previously.
@@ -684,12 +1045,12 @@ where
        self.wal_store.flush_wal().await?;
        // initialize with refusal
        let mut resp = VoteResponse {
+            generation: self.state.mconf.generation,
            term: self.state.acceptor_state.term,
-            vote_given: false as u64,
+            vote_given: false,
            flush_lsn: self.flush_lsn(),
            truncate_lsn: self.state.inmem.peer_horizon_lsn,
            term_history: self.get_term_history(),
-            timeline_start_lsn: self.state.timeline_start_lsn,
        };
        if self.state.acceptor_state.term < msg.term {
            let mut state = self.state.start_change();
@@ -698,15 +1059,16 @@ where
            self.state.finish_change(&state).await?;

            resp.term = self.state.acceptor_state.term;
-            resp.vote_given = true as u64;
+            resp.vote_given = true;
        }
-        info!("processed VoteRequest for term {}: {:?}", msg.term, &resp);
+        info!("processed {:?}: sending {:?}", msg, &resp);
        Ok(Some(AcceptorProposerMessage::VoteResponse(resp)))
    }

    /// Form AppendResponse from current state.
    fn append_response(&self) -> AppendResponse {
        let ar = AppendResponse {
+            generation: self.state.mconf.generation,
            term: self.state.acceptor_state.term,
            flush_lsn: self.flush_lsn(),
            commit_lsn: self.state.commit_lsn,
@@ -805,18 +1167,22 @@ where
            // Here we learn initial LSN for the first time, set fields
            // interested in that.

-            if state.timeline_start_lsn == Lsn(0) {
-                // Remember point where WAL begins globally.
-                state.timeline_start_lsn = msg.timeline_start_lsn;
-                info!(
-                    "setting timeline_start_lsn to {:?}",
-                    state.timeline_start_lsn
-                );
+            if let Some(start_lsn) = msg.term_history.0.first() {
+                if state.timeline_start_lsn == Lsn(0) {
+                    // Remember point where WAL begins globally. In the future it
+                    // will be initialized immediately on timeline creation.
+                    state.timeline_start_lsn = start_lsn.lsn;
+                    info!(
+                        "setting timeline_start_lsn to {:?}",
+                        state.timeline_start_lsn
+                    );
+                }
            }
+
            if state.peer_horizon_lsn == Lsn(0) {
                // Update peer_horizon_lsn as soon as we know where timeline starts.
                // It means that peer_horizon_lsn cannot be zero after we know timeline_start_lsn.
-                state.peer_horizon_lsn = msg.timeline_start_lsn;
+                state.peer_horizon_lsn = state.timeline_start_lsn;
            }
            if state.local_start_lsn == Lsn(0) {
                state.local_start_lsn = msg.start_streaming_at;
@@ -896,7 +1262,10 @@ where

        // If our term is higher, immediately refuse the message.
        if self.state.acceptor_state.term > msg.h.term {
-            let resp = AppendResponse::term_only(self.state.acceptor_state.term);
+            let resp = AppendResponse::term_only(
+                self.state.mconf.generation,
+                self.state.acceptor_state.term,
+            );
            return Ok(Some(AcceptorProposerMessage::AppendResponse(resp)));
        }

@@ -924,10 +1293,8 @@ where
            );
        }

-        // Now we know that we are in the same term as the proposer,
-        // processing the message.
-
-        self.state.inmem.proposer_uuid = msg.h.proposer_uuid;
+        // Now we know that we are in the same term as the proposer, process the
+        // message.

        // do the job
        if !msg.wal_data.is_empty() {
@@ -1097,10 +1464,13 @@ mod tests {
        let mut sk = SafeKeeper::new(TimelineState::new(storage), wal_store, NodeId(0)).unwrap();

        // check voting for 1 is ok
-        let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { term: 1 });
+        let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest {
+            generation: 0,
+            term: 1,
+        });
        let mut vote_resp = sk.process_msg(&vote_request).await;
        match vote_resp.unwrap() {
-            Some(AcceptorProposerMessage::VoteResponse(resp)) => assert!(resp.vote_given != 0),
+            Some(AcceptorProposerMessage::VoteResponse(resp)) => assert!(resp.vote_given),
            r => panic!("unexpected response: {:?}", r),
        }

@@ -1115,7 +1485,7 @@ mod tests {
        // and ensure voting second time for 1 is not ok
        vote_resp = sk.process_msg(&vote_request).await;
        match vote_resp.unwrap() {
-            Some(AcceptorProposerMessage::VoteResponse(resp)) => assert!(resp.vote_given == 0),
+            Some(AcceptorProposerMessage::VoteResponse(resp)) => assert!(!resp.vote_given),
            r => panic!("unexpected response: {:?}", r),
        }
    }
@@ -1130,13 +1500,12 @@ mod tests {
        let mut sk = SafeKeeper::new(TimelineState::new(storage), wal_store, NodeId(0)).unwrap();

        let mut ar_hdr = AppendRequestHeader {
+            generation: 0,
            term: 2,
-            term_start_lsn: Lsn(3),
            begin_lsn: Lsn(1),
            end_lsn: Lsn(2),
            commit_lsn: Lsn(0),
            truncate_lsn: Lsn(0),
-            proposer_uuid: [0; 16],
        };
        let mut append_request = AppendRequest {
            h: ar_hdr.clone(),
@@ -1144,6 +1513,7 @@ mod tests {
        };

        let pem = ProposerElected {
+            generation: 0,
            term: 2,
            start_streaming_at: Lsn(1),
            term_history: TermHistory(vec![
@@ -1156,7 +1526,6 @@ mod tests {
                    lsn: Lsn(3),
                },
            ]),
-            timeline_start_lsn: Lsn(1),
        };
        sk.process_msg(&ProposerAcceptorMessage::Elected(pem))
            .await
@@ -1191,26 +1560,25 @@ mod tests {
        let mut sk = SafeKeeper::new(TimelineState::new(storage), wal_store, NodeId(0)).unwrap();

        let pem = ProposerElected {
+            generation: 0,
            term: 1,
            start_streaming_at: Lsn(1),
            term_history: TermHistory(vec![TermLsn {
                term: 1,
                lsn: Lsn(1),
            }]),
-            timeline_start_lsn: Lsn(1),
        };
        sk.process_msg(&ProposerAcceptorMessage::Elected(pem))
            .await
            .unwrap();

        let ar_hdr = AppendRequestHeader {
+            generation: 0,
            term: 1,
-            term_start_lsn: Lsn(3),
            begin_lsn: Lsn(1),
            end_lsn: Lsn(2),
            commit_lsn: Lsn(0),
            truncate_lsn: Lsn(0),
-            proposer_uuid: [0; 16],
        };
        let append_request = AppendRequest {
            h: ar_hdr.clone(),
--- a/safekeeper/src/test_utils.rs
+++ b/safekeeper/src/test_utils.rs
@@ -73,10 +73,10 @@ impl Env {
        // Emulate an initial election.
        safekeeper
            .process_msg(&ProposerAcceptorMessage::Elected(ProposerElected {
+                generation: 0,
                term: 1,
                start_streaming_at: start_lsn,
                term_history: TermHistory(vec![(1, start_lsn).into()]),
-                timeline_start_lsn: start_lsn,
            }))
            .await?;

@@ -142,13 +142,12 @@ impl Env {

            let req = AppendRequest {
                h: AppendRequestHeader {
+                    generation: 0,
                    term: 1,
-                    term_start_lsn: start_lsn,
                    begin_lsn: lsn,
                    end_lsn: lsn + record.len() as u64,
                    commit_lsn: lsn,
                    truncate_lsn: Lsn(0),
-                    proposer_uuid: [0; 16],
                },
                wal_data: record,
            };
--- a/safekeeper/src/wal_backup_partial.rs
+++ b/safekeeper/src/wal_backup_partial.rs
@@ -535,6 +535,10 @@ pub async fn main_task(
        // limit concurrent uploads
        let _upload_permit = tokio::select! {
            acq = limiter.acquire_partial_backup() => acq,
+            _ = backup.tli.cancel.cancelled() => {
+                info!("timeline canceled");
+                return None;
+            }
            _ = cancel.cancelled() => {
                info!("task canceled");
                return None;
--- a/Show More
+++ b/Show More