Add neon.safekeeper_connstrings GUC

This GUC is very similar to neon.safekeepers, but instead of being formatted as host:port, it is a Postgres connection string. The purpose for changing how we connect to safekeepers is so that we can pass various libpq SSL keyword parameters in the connection string. A future PR will remove the `neon.safekeepers` GUC. Signed-off-by: Tristan Partin <tristan@neon.tech>
storcon: fix split aborts removing other tenants (#11837 )
2026-06-04 22:10:39 +00:00 · 2025-05-06 09:31:51 -05:00 · 2025-05-06 13:57:34 +00:00 · 2025-05-06 12:27:16 +00:00 · 2025-05-06 12:26:13 +00:00 · 2025-05-06 10:52:21 +00:00
116 changed files with 2030 additions and 924 deletions
--- a/.github/actionlint.yml
+++ b/.github/actionlint.yml
@@ -33,9 +33,14 @@ config-variables:
  - REMOTE_STORAGE_AZURE_CONTAINER
  - REMOTE_STORAGE_AZURE_REGION
  - SLACK_CICD_CHANNEL_ID
+  - SLACK_COMPUTE_CHANNEL_ID
  - SLACK_ON_CALL_DEVPROD_STREAM
  - SLACK_ON_CALL_QA_STAGING_STREAM
  - SLACK_ON_CALL_STORAGE_STAGING_STREAM
+  - SLACK_ONCALL_COMPUTE_GROUP
+  - SLACK_ONCALL_PROXY_GROUP
+  - SLACK_ONCALL_STORAGE_GROUP
+  - SLACK_PROXY_CHANNEL_ID
  - SLACK_RUST_CHANNEL_ID
  - SLACK_STORAGE_CHANNEL_ID
  - SLACK_UPCOMING_RELEASE_CHANNEL_ID
--- a/.github/scripts/lint-release-pr.sh
+++ b/.github/scripts/lint-release-pr.sh
@@ -41,7 +41,7 @@ echo "Merge base of ${MAIN_BRANCH} and ${RELEASE_BRANCH}: ${MERGE_BASE}"
 LAST_COMMIT=$(git rev-parse HEAD)

 MERGE_COMMIT_MESSAGE=$(git log -1 --format=%s "${LAST_COMMIT}")
-EXPECTED_MESSAGE_REGEX="^$COMPONENT release [0-9]{4}-[0-9]{2}-[0-9]{2}$"
+EXPECTED_MESSAGE_REGEX="^$COMPONENT release [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2} UTC$"

 if ! [[ "${MERGE_COMMIT_MESSAGE}" =~ ${EXPECTED_MESSAGE_REGEX} ]]; then
  report_error "Merge commit message does not match expected pattern: '<component> release YYYY-MM-DD'
--- a/.github/workflows/_create-release-pr.yml
+++ b/.github/workflows/_create-release-pr.yml
@@ -1,103 +0,0 @@
-name: Create Release PR
-
-on:
-  workflow_call:
-    inputs:
-      component-name:
-        description: 'Component name'
-        required: true
-        type: string
-      source-branch:
-        description: 'Source branch'
-        required: true
-        type: string
-    secrets:
-      ci-access-token:
-        description: 'CI access token'
-        required: true
-
-defaults:
-  run:
-    shell: bash -euo pipefail {0}
-
-permissions:
-  contents: read
-
-jobs:
-  create-release-branch:
-    runs-on: ubuntu-22.04
-
-    permissions:
-      contents: write # for `git push`
-
-    steps:
-    - name: Harden the runner (Audit all outbound calls)
-      uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
-      with:
-        egress-policy: audit
-
-    - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-      with:
-        ref: ${{ inputs.source-branch }}
-        fetch-depth: 0
-
-    - name: Set variables
-      id: vars
-      env:
-        COMPONENT_NAME: ${{ inputs.component-name }}
-        RELEASE_BRANCH: >-
-          ${{
-            false
-            || inputs.component-name == 'Storage' && 'release'
-            || inputs.component-name == 'Proxy' && 'release-proxy'
-            || inputs.component-name == 'Compute' && 'release-compute'
-          }}
-      run: |
-        now_date=$(date -u +'%Y-%m-%d')
-        now_time=$(date -u +'%H-%M-%Z')
-        {
-          echo "title=${COMPONENT_NAME} release ${now_date}"
-          echo "rc-branch=rc/${RELEASE_BRANCH}/${now_date}_${now_time}"
-          echo "release-branch=${RELEASE_BRANCH}"
-        } | tee -a ${GITHUB_OUTPUT}
-
-    - name: Configure git
-      run: |
-        git config user.name "github-actions[bot]"
-        git config user.email "41898282+github-actions[bot]@users.noreply.github.com"
-
-    - name: Create RC branch
-      env:
-        RELEASE_BRANCH: ${{ steps.vars.outputs.release-branch }}
-        RC_BRANCH: ${{ steps.vars.outputs.rc-branch }}
-        TITLE: ${{ steps.vars.outputs.title }}
-      run: |
-        git switch -c "${RC_BRANCH}"
-
-        # Manually create a merge commit on the current branch, keeping the
-        # tree and setting the parents to the current HEAD and the HEAD of the
-        # release branch. This commit is what we'll fast-forward the release
-        # branch to when merging the release branch.
-        # For details on why, look at
-        # https://docs.neon.build/overview/repositories/neon.html#background-on-commit-history-of-release-prs
-        current_tree=$(git rev-parse 'HEAD^{tree}')
-        release_head=$(git rev-parse "origin/${RELEASE_BRANCH}")
-        current_head=$(git rev-parse HEAD)
-        merge_commit=$(git commit-tree -p "${current_head}" -p "${release_head}" -m "${TITLE}" "${current_tree}")
-
-        # Fast-forward the current branch to the newly created merge_commit
-        git merge --ff-only ${merge_commit}
-
-        git push origin "${RC_BRANCH}"
-
-    - name: Create a PR into ${{ steps.vars.outputs.release-branch }}
-      env:
-        GH_TOKEN: ${{ secrets.ci-access-token }}
-        RC_BRANCH: ${{ steps.vars.outputs.rc-branch }}
-        RELEASE_BRANCH: ${{ steps.vars.outputs.release-branch }}
-        TITLE: ${{ steps.vars.outputs.title }}
-      run: |
-        gh pr create --title "${TITLE}" \
-                     --body "" \
-                     --head "${RC_BRANCH}" \
-                     --base "${RELEASE_BRANCH}"
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -53,6 +53,77 @@ concurrency:
  cancel-in-progress: true

 jobs:
+  cleanup:
+    runs-on: [ self-hosted, us-east-2, x64 ]
+    container:
+      image: ghcr.io/neondatabase/build-tools:pinned-bookworm
+      credentials:
+        username: ${{ github.actor }}
+        password: ${{ secrets.GITHUB_TOKEN }}
+      options: --init
+    env:
+      ORG_ID: org-solitary-dew-09443886
+      LIMIT: 100
+      SEARCH: "GITHUB_RUN_ID="
+      BASE_URL: https://console-stage.neon.build/api/v2
+      DRY_RUN: "false"  # Set to "true" to just test out the workflow
+
+    steps:
+    - name: Harden the runner (Audit all outbound calls)
+      uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
+      with:
+        egress-policy: audit
+
+    - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+
+    - name: Cleanup inactive Neon projects left over from prior runs
+      env:
+        API_KEY: ${{ secrets.NEON_STAGING_API_KEY }}
+      run: |
+        set -euo pipefail
+
+        NOW=$(date -u +%s)
+        DAYS_AGO=$((NOW - 5 * 86400))
+
+        REQUEST_URL="$BASE_URL/projects?limit=$LIMIT&search=$(printf '%s' "$SEARCH" | jq -sRr @uri)&org_id=$ORG_ID"
+
+        echo "Requesting project list from:"
+        echo "$REQUEST_URL"
+
+        response=$(curl -s -X GET "$REQUEST_URL" \
+          --header "Accept: application/json" \
+          --header "Content-Type: application/json" \
+          --header "Authorization: Bearer ${API_KEY}" )
+
+        echo "Response:"
+        echo "$response" | jq .
+
+        projects_to_delete=$(echo "$response" | jq --argjson cutoff "$DAYS_AGO" '
+          .projects[]
+          | select(.compute_last_active_at != null)
+          | select((.compute_last_active_at | fromdateiso8601) < $cutoff)
+          | {id, name, compute_last_active_at}
+        ')
+
+        if [ -z "$projects_to_delete" ]; then
+          echo "No projects eligible for deletion."
+          exit 0
+        fi
+
+        echo "Projects that will be deleted:"
+        echo "$projects_to_delete" | jq -r '.id'
+
+        if [ "$DRY_RUN" = "false" ]; then
+          echo "$projects_to_delete" | jq -r '.id' | while read -r project_id; do
+            echo "Deleting project: $project_id"
+            curl -s -X DELETE "$BASE_URL/projects/$project_id" \
+              --header "Accept: application/json" \
+              --header "Content-Type: application/json" \
+              --header "Authorization: Bearer ${API_KEY}" 
+          done
+        else
+          echo "Dry run enabled — no projects were deleted."
+        fi
  bench:
    if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
    permissions:
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -69,7 +69,7 @@ jobs:
          submodules: true

      - name: Check for file changes
-        uses: dorny/paths-filter@de90cc6fb38fc0963ad72b210f1f284cd68cea36  # v3.0.2
+        uses: step-security/paths-filter@v3
        id: files-changed
        with:
          token: ${{ secrets.GITHUB_TOKEN }}
@@ -824,7 +824,7 @@ jobs:
          - pg: v17
            debian: bookworm
    env:
-      VM_BUILDER_VERSION: v0.42.2
+      VM_BUILDER_VERSION: v0.46.0

    steps:
      - name: Harden the runner (Audit all outbound calls)
@@ -1434,10 +1434,10 @@ jobs:
            ;;
          esac

-  notify-storage-release-deploy-failure:
-    needs: [ deploy ]
+  notify-release-deploy-failure:
+    needs: [ meta, deploy ]
    # We want this to run even if (transitive) dependencies are skipped, because deploy should really be successful on release branch workflow runs.
-    if: github.ref_name == 'release' && needs.deploy.result != 'success' && always()
+    if: contains(fromJSON('["storage-release", "compute-release", "proxy-release"]'), needs.meta.outputs.run-kind) && needs.deploy.result != 'success' && always()
    runs-on: ubuntu-22.04
    steps:
      - name: Harden the runner (Audit all outbound calls)
@@ -1445,15 +1445,40 @@ jobs:
        with:
          egress-policy: audit

-      - name: Post release-deploy failure to team-storage slack channel
+      - name: Post release-deploy failure to team slack channel
        uses: slackapi/slack-github-action@485a9d42d3a73031f12ec201c457e2162c45d02d # v2.0.0
+        env:
+          TEAM_ONCALL: >-
+            ${{
+              fromJSON(format('{
+                "storage-release": "<!subteam^{0}|@oncall-storage>",
+                "compute-release": "<!subteam^{1}|@oncall-compute>",
+                "proxy-release":   "<!subteam^{2}|@oncall-proxy>"
+              }',
+                vars.SLACK_ONCALL_STORAGE_GROUP,
+                vars.SLACK_ONCALL_COMPUTE_GROUP,
+                vars.SLACK_ONCALL_PROXY_GROUP
+              ))[needs.meta.outputs.run-kind]
+            }}
+          CHANNEL: >-
+            ${{
+              fromJSON(format('{
+                "storage-release": "{0}",
+                "compute-release": "{1}",
+                "proxy-release":   "{2}"
+              }',
+                vars.SLACK_STORAGE_CHANNEL_ID,
+                vars.SLACK_COMPUTE_CHANNEL_ID,
+                vars.SLACK_PROXY_CHANNEL_ID
+              ))[needs.meta.outputs.run-kind]
+            }}
        with:
          method: chat.postMessage
          token: ${{ secrets.SLACK_BOT_TOKEN }}
          payload: |
-            channel: ${{ vars.SLACK_STORAGE_CHANNEL_ID }}
+            channel: ${{ env.CHANNEL }}
            text: |
-              🔴 <!subteam^S06CJ87UMNY|@oncall-storage>: deploy job on release branch had unexpected status "${{ needs.deploy.result }}" <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>.
+              🔴 ${{ env.TEAM_ONCALL }}: deploy job on release branch had unexpected status "${{ needs.deploy.result }}" <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>.

  # The job runs on `release` branch and copies compatibility data and Neon artifact from the last *release PR* to the latest directory
  promote-compatibility-data:
--- a/.github/workflows/cloud-extensions.yml
+++ b/.github/workflows/cloud-extensions.yml
@@ -68,7 +68,7 @@ jobs:
        id: create-neon-project
        uses: ./.github/actions/neon-project-create
        with:
-          region_id: ${{ inputs.region_id }}
+          region_id: ${{ inputs.region_id || 'aws-us-east-2' }}
          postgres_version: ${{ matrix.pg-version }}
          project_settings: ${{ steps.project-settings.outputs.settings }}
          # We need these settings to get the expected output results.
--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -53,7 +53,7 @@ jobs:
          submodules: true

      - name: Check for Postgres changes
-        uses: dorny/paths-filter@1441771bbfdd59dcd748680ee64ebd8faab1a242  #v3
+        uses: step-security/paths-filter@v3
        id: files_changed
        with:
          token: ${{ github.token }}
--- a/.github/workflows/release-compute.yml
+++ b/.github/workflows/release-compute.yml
@@ -0,0 +1,12 @@
+name: Create compute release PR
+
+on:
+  schedule:
+    - cron: '0 7 * * FRI'
+
+jobs:
+  create-release-pr:
+    uses: ./.github/workflows/release.yml
+    with:
+      component: compute
+    secrets: inherit
--- a/.github/workflows/release-proxy.yml
+++ b/.github/workflows/release-proxy.yml
@@ -0,0 +1,12 @@
+name: Create proxy release PR
+
+on:
+  schedule:
+    - cron: '0 6 * * TUE'
+
+jobs:
+  create-release-pr:
+    uses: ./.github/workflows/release.yml
+    with:
+      component: proxy
+    secrets: inherit
--- a/.github/workflows/release-storage.yml
+++ b/.github/workflows/release-storage.yml
@@ -0,0 +1,12 @@
+name: Create storage release PR
+
+on:
+  schedule:
+    - cron: '0 6 * * FRI'
+
+jobs:
+  create-release-pr:
+    uses: ./.github/workflows/release.yml
+    with:
+      component: storage
+    secrets: inherit
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -1,25 +1,34 @@
-name: Create Release Branch
+name: Create release PR

 on:
-  schedule:
-    # It should be kept in sync with if-condition in jobs
-    - cron: '0 6 * * TUE' # Proxy release
-    - cron: '0 6 * * FRI' # Storage release
-    - cron: '0 7 * * FRI' # Compute release
  workflow_dispatch:
    inputs:
-      create-storage-release-branch:
-        type: boolean
-        description: 'Create Storage release PR'
+      component:
+        description: "Component to release"
+        required: true
+        type: choice
+        options:
+          - compute
+          - proxy
+          - storage
+      cherry-pick:
+        description: "Commits to cherry-pick (space separated, makes this a hotfix based on previous release)"
        required: false
-      create-proxy-release-branch:
-        type: boolean
-        description: 'Create Proxy release PR'
-        required: false
-      create-compute-release-branch:
-        type: boolean
-        description: 'Create Compute release PR'
+        type: string
+        default: ''
+
+  workflow_call:
+    inputs:
+      component:
+        description: "Component to release"
+        required: true
+        type: string
+      cherry-pick:
+        description: "Commits to cherry-pick (space separated, makes this a hotfix based on previous release)"
        required: false
+        type: string
+        default: ''
+

 # No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
 permissions: {}
@@ -29,41 +38,31 @@ defaults:
    shell: bash -euo pipefail {0}

 jobs:
-  create-storage-release-branch:
-    if: ${{ github.event.schedule == '0 6 * * FRI' || inputs.create-storage-release-branch }}
+  create-release-pr:
+    runs-on: ubuntu-22.04

    permissions:
      contents: write

-    uses: ./.github/workflows/_create-release-pr.yml
-    with:
-      component-name: 'Storage'
-      source-branch: ${{ github.ref_name }}
-    secrets:
-      ci-access-token: ${{ secrets.CI_ACCESS_TOKEN }}
+    steps:
+      - name: Harden the runner (Audit all outbound calls)
+        uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
+        with:
+          egress-policy: audit

-  create-proxy-release-branch:
-    if: ${{ github.event.schedule == '0 6 * * TUE' || inputs.create-proxy-release-branch }}
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          fetch-depth: 0

-    permissions:
-      contents: write
+      - name: Configure git
+        run: |
+          git config user.name "github-actions[bot]"
+          git config user.email "41898282+github-actions[bot]@users.noreply.github.com"

-    uses: ./.github/workflows/_create-release-pr.yml
-    with:
-      component-name: 'Proxy'
-      source-branch: ${{ github.ref_name }}
-    secrets:
-      ci-access-token: ${{ secrets.CI_ACCESS_TOKEN }}
-
-  create-compute-release-branch:
-    if: ${{ github.event.schedule == '0 7 * * FRI' || inputs.create-compute-release-branch }}
-
-    permissions:
-      contents: write
-
-    uses: ./.github/workflows/_create-release-pr.yml
-    with:
-      component-name: 'Compute'
-      source-branch: ${{ github.ref_name }}
-    secrets:
-      ci-access-token: ${{ secrets.CI_ACCESS_TOKEN }}
+      - name: Create release PR
+        uses: neondatabase/dev-actions/release-pr@290dec821d86fa8a93f019e8c69720f5865b5677
+        with:
+          component: ${{ inputs.component }}
+          cherry-pick: ${{ inputs.cherry-pick }}
+        env:
+          GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
--- a/1
+++ b/1
@@ -239,6 +239,7 @@ walproposer-lib: neon-pg-ext-v17
 		-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile walproposer-lib
 	cp $(POSTGRES_INSTALL_DIR)/v17/lib/libpgport.a $(POSTGRES_INSTALL_DIR)/build/walproposer-lib
 	cp $(POSTGRES_INSTALL_DIR)/v17/lib/libpgcommon.a $(POSTGRES_INSTALL_DIR)/build/walproposer-lib
+	cp $(POSTGRES_INSTALL_DIR)/v17/lib/libpq.so $(POSTGRES_INSTALL_DIR)/build/walproposer-lib
 	$(AR) d $(POSTGRES_INSTALL_DIR)/build/walproposer-lib/libpgport.a \
 		pg_strong_random.o
 	$(AR) d $(POSTGRES_INSTALL_DIR)/build/walproposer-lib/libpgcommon.a \
--- a/compute/compute-node.Dockerfile
+++ b/compute/compute-node.Dockerfile
@@ -1085,6 +1085,23 @@ RUN cargo install --locked --version 0.12.9 cargo-pgrx && \

 USER root

+#########################################################################################
+#
+# Layer "rust extensions pgrx14"
+#
+# Version 14 is now required by a few
+# This layer should be used as a base for new pgrx extensions,
+# and eventually get merged with `rust-extensions-build`
+#
+#########################################################################################
+FROM pg-build-nonroot-with-cargo AS rust-extensions-build-pgrx14
+ARG PG_VERSION
+
+RUN cargo install --locked --version 0.14.1 cargo-pgrx && \
+    /bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config'
+
+USER root
+
 #########################################################################################
 #
 # Layers "pg-onnx-build" and "pgrag-build"
@@ -1100,11 +1117,11 @@ RUN wget https://github.com/microsoft/onnxruntime/archive/refs/tags/v1.18.1.tar.
    mkdir onnxruntime-src && cd onnxruntime-src && tar xzf ../onnxruntime.tar.gz --strip-components=1 -C . && \
    echo "#nothing to test here" > neon-test.sh

-RUN wget https://github.com/neondatabase-labs/pgrag/archive/refs/tags/v0.0.0.tar.gz -O pgrag.tar.gz &&  \
-    echo "2cbe394c1e74fc8bcad9b52d5fbbfb783aef834ca3ce44626cfd770573700bb4 pgrag.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/neondatabase-labs/pgrag/archive/refs/tags/v0.1.1.tar.gz -O pgrag.tar.gz &&  \
+    echo "087b2ecd11ba307dc968042ef2e9e43dc04d9ba60e8306e882c407bbe1350a50 pgrag.tar.gz" | sha256sum --check && \
    mkdir pgrag-src && cd pgrag-src && tar xzf ../pgrag.tar.gz --strip-components=1 -C .

-FROM rust-extensions-build-pgrx12 AS pgrag-build
+FROM rust-extensions-build-pgrx14 AS pgrag-build
 COPY --from=pgrag-src /ext-src/ /ext-src/

 # Install build-time dependencies
@@ -1124,19 +1141,19 @@ RUN . venv/bin/activate && \

 WORKDIR /ext-src/pgrag-src
 RUN cd exts/rag && \
-    sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
+    sed -i 's/pgrx = "0.14.1"/pgrx = { version = "0.14.1", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
    cargo pgrx install --release && \
    echo "trusted = true" >> /usr/local/pgsql/share/extension/rag.control

 RUN cd exts/rag_bge_small_en_v15 && \
-    sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
+    sed -i 's/pgrx = "0.14.1"/pgrx = { version = "0.14.1", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
    ORT_LIB_LOCATION=/ext-src/onnxruntime-src/build/Linux \
        REMOTE_ONNX_URL=http://pg-ext-s3-gateway/pgrag-data/bge_small_en_v15.onnx \
        cargo pgrx install --release --features remote_onnx && \
    echo "trusted = true" >> /usr/local/pgsql/share/extension/rag_bge_small_en_v15.control

 RUN cd exts/rag_jina_reranker_v1_tiny_en && \
-    sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
+    sed -i 's/pgrx = "0.14.1"/pgrx = { version = "0.14.1", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
    ORT_LIB_LOCATION=/ext-src/onnxruntime-src/build/Linux \
        REMOTE_ONNX_URL=http://pg-ext-s3-gateway/pgrag-data/jina_reranker_v1_tiny_en.onnx \
        cargo pgrx install --release --features remote_onnx && \
@@ -1319,6 +1336,39 @@ COPY --from=pg_session_jwt-src /ext-src/ /ext-src/
 WORKDIR /ext-src/pg_session_jwt-src
 RUN cargo pgrx install --release

+#########################################################################################
+#
+# Layer "pg-anon-pg-build"
+# compile anon extension
+#
+#########################################################################################
+FROM pg-build AS pg_anon-src
+ARG PG_VERSION
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
+WORKDIR /ext-src
+COPY compute/patches/anon_v2.patch .
+
+# This is an experimental extension, never got to real production.
+# !Do not remove! It can be present in shared_preload_libraries and compute will fail to start if library is not found.
+ENV PATH="/usr/local/pgsql/bin/:$PATH"
+RUN wget https://gitlab.com/dalibo/postgresql_anonymizer/-/archive/latest/postgresql_anonymizer-latest.tar.gz -O pg_anon.tar.gz && \
+    mkdir pg_anon-src && cd pg_anon-src && tar xzf ../pg_anon.tar.gz --strip-components=1 -C . && \
+    find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt && \
+    sed -i 's/pgrx = "0.14.1"/pgrx = { version = "=0.14.1", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
+    patch -p1 < /ext-src/anon_v2.patch
+
+FROM rust-extensions-build-pgrx14 AS pg-anon-pg-build
+ARG PG_VERSION
+COPY --from=pg_anon-src /ext-src/ /ext-src/
+WORKDIR /ext-src
+RUN cd pg_anon-src && \
+    make -j $(getconf _NPROCESSORS_ONLN) extension PG_CONFIG=/usr/local/pgsql/bin/pg_config PGVER=pg$(echo "$PG_VERSION" | sed 's/^v//') && \
+    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config PGVER=pg$(echo "$PG_VERSION" | sed 's/^v//') && \
+    chmod -R a+r ../pg_anon-src && \
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control;
+
+########################################################################################
+
 #########################################################################################
 #
 # Layer "wal2json-build"
@@ -1615,6 +1665,7 @@ COPY --from=pg_uuidv7-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg_roaringbitmap-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg_semver-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=wal2json-build /usr/local/pgsql /usr/local/pgsql
+COPY --from=pg-anon-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg_ivm-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg_partman-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg_mooncake-build /usr/local/pgsql/ /usr/local/pgsql/
--- a/compute/etc/neon_collector.jsonnet
+++ b/compute/etc/neon_collector.jsonnet
@@ -23,6 +23,8 @@
    import 'sql_exporter/getpage_prefetch_requests_total.libsonnet',
    import 'sql_exporter/getpage_prefetches_buffered.libsonnet',
    import 'sql_exporter/getpage_sync_requests_total.libsonnet',
+    import 'sql_exporter/compute_getpage_stuck_requests_total.libsonnet',
+    import 'sql_exporter/compute_getpage_max_inflight_stuck_time_ms.libsonnet',
    import 'sql_exporter/getpage_wait_seconds_bucket.libsonnet',
    import 'sql_exporter/getpage_wait_seconds_count.libsonnet',
    import 'sql_exporter/getpage_wait_seconds_sum.libsonnet',
--- a/compute/etc/sql_exporter/compute_getpage_max_inflight_stuck_time_ms.libsonnet
+++ b/compute/etc/sql_exporter/compute_getpage_max_inflight_stuck_time_ms.libsonnet
@@ -0,0 +1,9 @@
+{
+  metric_name: 'compute_getpage_max_inflight_stuck_time_ms',
+  type: 'gauge',
+  help: 'Max wait time for stuck requests among all backends. Includes only active stuck requests, terminated or disconnected ones are not accounted for',
+  values: [
+    'compute_getpage_max_inflight_stuck_time_ms',
+  ],
+  query_ref: 'neon_perf_counters',
+}
--- a/compute/etc/sql_exporter/compute_getpage_stuck_requests_total.libsonnet
+++ b/compute/etc/sql_exporter/compute_getpage_stuck_requests_total.libsonnet
@@ -0,0 +1,9 @@
+{
+  metric_name: 'compute_getpage_stuck_requests_total',
+  type: 'counter',
+  help: 'Total number of Getpage requests left without an answer for more than pageserver_response_log_timeout but less than pageserver_response_disconnect_timeout',
+  values: [
+    'compute_getpage_stuck_requests_total',
+  ],
+  query_ref: 'neon_perf_counters',
+}
--- a/compute/etc/sql_exporter/neon_perf_counters.sql
+++ b/compute/etc/sql_exporter/neon_perf_counters.sql
@@ -9,6 +9,8 @@ SELECT d.* FROM pg_catalog.jsonb_to_record((SELECT jb FROM c)) AS d(
  getpage_wait_seconds_sum numeric,
  getpage_prefetch_requests_total numeric,
  getpage_sync_requests_total numeric,
+  compute_getpage_stuck_requests_total numeric,
+  compute_getpage_max_inflight_stuck_time_ms numeric,
  getpage_prefetch_misses_total numeric,
  getpage_prefetch_discards_total numeric,
  getpage_prefetches_buffered numeric,
--- a/compute/patches/anon_v2.patch
+++ b/compute/patches/anon_v2.patch
@@ -0,0 +1,129 @@
+diff --git a/sql/anon.sql b/sql/anon.sql
+index 0cdc769..f6cc950 100644
+--- a/sql/anon.sql
+++ b/sql/anon.sql
+@@ -1141,3 +1141,8 @@ $$
+ -- TODO : https://en.wikipedia.org/wiki/L-diversity
+ 
+ -- TODO : https://en.wikipedia.org/wiki/T-closeness
+
+-- NEON Patches
+
+GRANT ALL ON SCHEMA anon to neon_superuser;
+GRANT ALL ON ALL TABLES IN SCHEMA anon TO neon_superuser;
+diff --git a/sql/init.sql b/sql/init.sql
+index 7da6553..9b6164b 100644
+--- a/sql/init.sql
+++ b/sql/init.sql
+@@ -74,50 +74,49 @@ $$
+ 
+ SECURITY LABEL FOR anon ON FUNCTION anon.load_csv IS 'UNTRUSTED';
+ 
+--- load fake data from a given path
+-CREATE OR REPLACE FUNCTION anon.init(
+-  datapath TEXT
+-)
+CREATE OR REPLACE FUNCTION anon.load_fake_data()
+ RETURNS BOOLEAN
+ AS $$
+ DECLARE
+-  datapath_check TEXT;
+   success BOOLEAN;
+  sharedir TEXT;
+  datapath TEXT;
+ BEGIN
+ 
+-  IF anon.is_initialized() THEN
+-    RAISE NOTICE 'The anon extension is already initialized.';
+-    RETURN TRUE;
+-  END IF;
+  datapath := '/extension/anon/';
+  -- find the local extension directory
+  SELECT setting INTO sharedir
+  FROM pg_catalog.pg_config
+  WHERE name = 'SHAREDIR';
+ 
+   SELECT bool_or(results) INTO success
+   FROM unnest(array[
+-    anon.load_csv('anon.identifiers_category',datapath||'/identifiers_category.csv'),
+-    anon.load_csv('anon.identifier',datapath ||'/identifier.csv'),
+-    anon.load_csv('anon.address',datapath ||'/address.csv'),
+-    anon.load_csv('anon.city',datapath ||'/city.csv'),
+-    anon.load_csv('anon.company',datapath ||'/company.csv'),
+-    anon.load_csv('anon.country',datapath ||'/country.csv'),
+-    anon.load_csv('anon.email', datapath ||'/email.csv'),
+-    anon.load_csv('anon.first_name',datapath ||'/first_name.csv'),
+-    anon.load_csv('anon.iban',datapath ||'/iban.csv'),
+-    anon.load_csv('anon.last_name',datapath ||'/last_name.csv'),
+-    anon.load_csv('anon.postcode',datapath ||'/postcode.csv'),
+-    anon.load_csv('anon.siret',datapath ||'/siret.csv'),
+-    anon.load_csv('anon.lorem_ipsum',datapath ||'/lorem_ipsum.csv')
+    anon.load_csv('anon.identifiers_category',sharedir || datapath || '/identifiers_category.csv'),
+    anon.load_csv('anon.identifier',sharedir || datapath || '/identifier.csv'),
+    anon.load_csv('anon.address',sharedir || datapath || '/address.csv'),
+    anon.load_csv('anon.city',sharedir || datapath || '/city.csv'),
+    anon.load_csv('anon.company',sharedir || datapath || '/company.csv'),
+    anon.load_csv('anon.country',sharedir || datapath || '/country.csv'),
+    anon.load_csv('anon.email', sharedir || datapath || '/email.csv'),
+    anon.load_csv('anon.first_name',sharedir || datapath || '/first_name.csv'),
+    anon.load_csv('anon.iban',sharedir || datapath || '/iban.csv'),
+    anon.load_csv('anon.last_name',sharedir || datapath || '/last_name.csv'),
+    anon.load_csv('anon.postcode',sharedir || datapath || '/postcode.csv'),
+    anon.load_csv('anon.siret',sharedir || datapath || '/siret.csv'),
+    anon.load_csv('anon.lorem_ipsum',sharedir || datapath || '/lorem_ipsum.csv')
+   ]) results;
+   RETURN success;
+-
+ END;
+ $$
+-  LANGUAGE PLPGSQL
+  LANGUAGE plpgsql
+   VOLATILE
+   RETURNS NULL ON NULL INPUT
+-  PARALLEL UNSAFE -- because load_csv is unsafe
+-  SECURITY INVOKER
+  PARALLEL UNSAFE -- because of the EXCEPTION
+  SECURITY DEFINER
+   SET search_path=''
+ ;
+-SECURITY LABEL FOR anon ON FUNCTION anon.init(TEXT) IS 'UNTRUSTED';
+
+SECURITY LABEL FOR anon ON FUNCTION anon.load_fake_data IS 'UNTRUSTED';
+ 
+ -- People tend to forget the anon.init() step
+ -- This is a friendly notice for them
+@@ -144,7 +143,7 @@ SECURITY LABEL FOR anon ON FUNCTION anon.notice_if_not_init IS 'UNTRUSTED';
+ CREATE OR REPLACE FUNCTION anon.load(TEXT)
+ RETURNS BOOLEAN AS
+ $$
+-  SELECT anon.init($1);
+  SELECT anon.init();
+ $$
+   LANGUAGE SQL
+   VOLATILE
+@@ -159,16 +158,16 @@ SECURITY LABEL FOR anon ON FUNCTION anon.load(TEXT) IS 'UNTRUSTED';
+ CREATE OR REPLACE FUNCTION anon.init()
+ RETURNS BOOLEAN
+ AS $$
+-  WITH conf AS (
+-        -- find the local extension directory
+-        SELECT setting AS sharedir
+-        FROM pg_catalog.pg_config
+-        WHERE name = 'SHAREDIR'
+-    )
+-  SELECT anon.init(conf.sharedir || '/extension/anon/')
+-  FROM conf;
+BEGIN
+  IF anon.is_initialized() THEN
+    RAISE NOTICE 'The anon extension is already initialized.';
+    RETURN TRUE;
+  END IF;
+
+  RETURN anon.load_fake_data();
+END;
+ $$
+-  LANGUAGE SQL
+  LANGUAGE plpgsql
+   VOLATILE
+   PARALLEL UNSAFE -- because init is unsafe
+   SECURITY INVOKER
--- a/compute/vm-image-spec-bookworm.yaml
+++ b/compute/vm-image-spec-bookworm.yaml
@@ -22,7 +22,7 @@ commands:
  - name: local_proxy
    user: postgres
    sysvInitAction: respawn
-    shell: 'RUST_LOG="info,proxy::serverless::sql_over_http=warn" /usr/local/bin/local_proxy --config-path /etc/local_proxy/config.json --pid-path /etc/local_proxy/pid --http 0.0.0.0:10432'
+    shell: 'RUST_LOG="error" /usr/local/bin/local_proxy --config-path /etc/local_proxy/config.json --pid-path /etc/local_proxy/pid --http 0.0.0.0:10432'
  - name: postgres-exporter
    user: nobody
    sysvInitAction: respawn
--- a/compute/vm-image-spec-bullseye.yaml
+++ b/compute/vm-image-spec-bullseye.yaml
@@ -22,7 +22,7 @@ commands:
  - name: local_proxy
    user: postgres
    sysvInitAction: respawn
-    shell: 'RUST_LOG="info,proxy::serverless::sql_over_http=warn" /usr/local/bin/local_proxy --config-path /etc/local_proxy/config.json --pid-path /etc/local_proxy/pid --http 0.0.0.0:10432'
+    shell: 'RUST_LOG="error" /usr/local/bin/local_proxy --config-path /etc/local_proxy/config.json --pid-path /etc/local_proxy/pid --http 0.0.0.0:10432'
  - name: postgres-exporter
    user: nobody
    sysvInitAction: respawn
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -218,7 +218,9 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
            if matches!(spec.mode, ComputeMode::Primary) {
                spec.cluster
                    .settings
-                    .find("neon.safekeepers")
+                    .find("neon.safekeeper_connstrings")
+                    // TODO(tristan957): Remove the compatibility code here.
+                    .or(spec.cluster.settings.find("neon.safekeepers"))
                    .ok_or("safekeeper connstrings should be provided")?
                    .split(',')
                    .map(|str| str.to_string())
--- a/compute_tools/src/config.rs
+++ b/compute_tools/src/config.rs
@@ -75,7 +75,7 @@ pub fn write_postgres_conf(
        neon_safekeepers_value.push_str(&spec.safekeeper_connstrings.join(","));
        writeln!(
            file,
-            "neon.safekeepers={}",
+            "neon.safekeeper_connstrings={}",
            escape_conf_value(&neon_safekeepers_value)
        )?;
    }
--- a/compute_tools/src/monitor.rs
+++ b/compute_tools/src/monitor.rs
@@ -424,10 +424,10 @@ pub fn launch_monitor(compute: &Arc<ComputeNode>) -> thread::JoinHandle<()> {
        experimental,
    };

-    let span = span!(Level::INFO, "compute_monitor");
    thread::Builder::new()
        .name("compute-monitor".into())
        .spawn(move || {
+            let span = span!(Level::INFO, "compute_monitor");
            let _enter = span.enter();
            monitor.run();
        })
--- a/compute_tools/tests/pg_helpers_tests.rs
+++ b/compute_tools/tests/pg_helpers_tests.rs
@@ -30,7 +30,7 @@ mod pg_helpers_tests {
            r#"fsync = off
 wal_level = logical
 hot_standby = on
-neon.safekeepers = '127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501'
+neon.safekeeper_connstrings = 'host=127.0.0.1 port=6502,host=127.0.0.1 port=6503,host=127.0.0.1 port=6501'
 wal_log_hints = on
 log_connections = on
 shared_buffers = 32768
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -632,7 +632,7 @@ struct EndpointStartCmdArgs {

    #[clap(
        long,
-        help = "Safekeepers membership generation to prefix neon.safekeepers with. Normally neon_local sets it on its own, but this option allows to override. Non zero value forces endpoint to use membership configurations."
+        help = "Safekeepers membership generation to prefix neon.safekeeper_connstrings with. Normally neon_local sets it on its own, but this option allows to override. Non zero value forces endpoint to use membership configurations."
    )]
    safekeepers_generation: Option<u32>,
    #[clap(
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -454,10 +454,10 @@ impl Endpoint {
                        .env
                        .safekeepers
                        .iter()
-                        .map(|sk| format!("localhost:{}", sk.get_compute_port()))
+                        .map(|sk| format!("host=localhost port={}", sk.get_compute_port()))
                        .collect::<Vec<String>>()
                        .join(",");
-                    conf.append("neon.safekeepers", &safekeepers);
+                    conf.append("neon.safekeeper_connstrings", &safekeepers);
                } else {
                    // We only use setup without safekeepers for tests,
                    // and don't care about data durability on pageserver,
@@ -623,7 +623,8 @@ impl Endpoint {
                    .iter()
                    .find(|node| node.id == sk_id)
                    .ok_or_else(|| anyhow!("safekeeper {sk_id} does not exist"))?;
-                safekeeper_connstrings.push(format!("127.0.0.1:{}", sk.get_compute_port()));
+                safekeeper_connstrings
+                    .push(format!("host=127.0.0.1 port={}", sk.get_compute_port()));
            }
        }
        Ok(safekeeper_connstrings)
--- a/control_plane/src/safekeeper.rs
+++ b/control_plane/src/safekeeper.rs
@@ -112,7 +112,7 @@ impl SafekeeperNode {
    }

    /// Initializes a safekeeper node by creating all necessary files,
-    /// e.g. SSL certificates.
+    /// e.g. SSL certificates and JWT token file.
    pub fn initialize(&self) -> anyhow::Result<()> {
        if self.env.generate_local_ssl_certs {
            self.env.generate_ssl_cert(
@@ -120,6 +120,17 @@ impl SafekeeperNode {
                &self.datadir_path().join("server.key"),
            )?;
        }
+
+        // Generate a token file for authentication with other safekeepers
+        if self.conf.auth_enabled {
+            let token = self
+                .env
+                .generate_auth_token(&Claims::new(None, Scope::SafekeeperData))?;
+
+            let token_path = self.datadir_path().join("peer_jwt_token");
+            std::fs::write(token_path, token)?;
+        }
+
        Ok(())
    }

@@ -218,14 +229,26 @@ impl SafekeeperNode {
            args.push(format!("--ssl-ca-file={}", ssl_ca_file.to_str().unwrap()));
        }

+        if self.conf.auth_enabled {
+            let token_path = self.datadir_path().join("peer_jwt_token");
+            let token_path_str = token_path
+                .to_str()
+                .with_context(|| {
+                    format!("Token path {token_path:?} cannot be represented as a unicode string")
+                })?
+                .to_owned();
+            args.extend(["--auth-token-path".to_owned(), token_path_str]);
+        }
+
        args.extend_from_slice(extra_opts);

+        let env_variables = Vec::new();
        background_process::start_process(
            &format!("safekeeper-{id}"),
            &datadir,
            &self.env.safekeeper_bin(),
            &args,
-            self.safekeeper_env_variables()?,
+            env_variables,
            background_process::InitialPidFile::Expect(self.pid_file()),
            retry_timeout,
            || async {
@@ -239,18 +262,6 @@ impl SafekeeperNode {
        .await
    }

-    fn safekeeper_env_variables(&self) -> anyhow::Result<Vec<(String, String)>> {
-        // Generate a token to connect from safekeeper to peers
-        if self.conf.auth_enabled {
-            let token = self
-                .env
-                .generate_auth_token(&Claims::new(None, Scope::SafekeeperData))?;
-            Ok(vec![("SAFEKEEPER_AUTH_TOKEN".to_owned(), token)])
-        } else {
-            Ok(Vec::new())
-        }
-    }
-
    ///
    /// Stop the server.
    ///
--- a/docker-compose/compute_wrapper/var/db/postgres/configs/config.json
+++ b/docker-compose/compute_wrapper/var/db/postgres/configs/config.json
@@ -104,6 +104,11 @@
                    "value": "safekeeper1:5454,safekeeper2:5454,safekeeper3:5454",
                    "vartype": "string"
                },
+                {
+                    "name": "neon.safekeeper_connstrings",
+                    "value": "host=safekeeper1 port=5454,host=safekeeper2 port=5454,host=safekeeper3 port=5454",
+                    "vartype": "string"
+                },
                {
                    "name": "neon.timeline_id",
                    "value": "TIMELINE_ID",
--- a/docker-compose/pageserver_config/pageserver.toml
+++ b/docker-compose/pageserver_config/pageserver.toml
@@ -3,3 +3,5 @@ pg_distrib_dir='/usr/local/'
 listen_pg_addr='0.0.0.0:6400'
 listen_http_addr='0.0.0.0:9898'
 remote_storage={ endpoint='http://minio:9000', bucket_name='neon', bucket_region='eu-north-1', prefix_in_bucket='/pageserver' }
+control_plane_api='http://0.0.0.0:6666' # No storage controller in docker compose, specify a junk address
+control_plane_emergency_mode=true
--- a/docs/authentication.md
+++ b/docs/authentication.md
@@ -24,7 +24,7 @@ because configs may be parsed and dumped into logs.
 #### Tokens generation and validation
 JWT tokens are signed using a private key.
 Compute/pageserver/safekeeper use the private key's public counterpart to validate JWT tokens.
-These components should not have access to the private key and may only get tokens from their configuration or external clients. 
+These components should not have access to the private key and may only get tokens from their configuration or external clients.

 The key pair is generated once for an installation of compute/pageserver/safekeeper, e.g. by `neon_local init`.
 There is currently no way to rotate the key without bringing down all components.
@@ -117,8 +117,8 @@ pageserver uses JWT tokens for authentication, so the password is really a
 token.)

 Compute connects to Safekeepers to write and commit data. The list of safekeeper
-addresses is given in the `neon.safekeepers` GUC. The connections to the
-safekeepers take the password from the `$NEON_AUTH_TOKEN` environment
+addresses is given in the `neon.safekeeper_connstrings` GUC. The connections to
+the safekeepers take the password from the `$NEON_AUTH_TOKEN` environment
 variable, if set.

 The `compute_ctl` binary that runs before the PostgreSQL server, and launches
--- a/docs/consumption_metrics.md
+++ b/docs/consumption_metrics.md
@@ -38,11 +38,6 @@ Currently, the following metrics are collected:
 Amount of WAL produced , by a timeline, i.e. last_record_lsn
 This is an absolute, per-timeline metric.

- `resident_size`
-
-Size of all the layer files in the tenant's directory on disk on the pageserver.
-This is an absolute, per-tenant metric.
-
 - `remote_storage_size`

 Size of the remote storage (S3) directory.
--- a/docs/rfcs/035-safekeeper-dynamic-membership-change.md
+++ b/docs/rfcs/035-safekeeper-dynamic-membership-change.md
@@ -269,11 +269,13 @@ calls should be retried until they succeed.

 When compute receives safekeepers list from control plane it needs to know the
 generation to checked whether it should be updated (note that compute may get
-safekeeper list from either cplane or safekeepers). Currently `neon.safekeepers`
-GUC is just a comma separates list of `host:port`. Let's prefix it with
-`g#<generation>:` to this end, so it will look like
+safekeeper list from either cplane or safekeepers). Currently
+`neon.safekeeper_connstrings` GUC is just a comma separates list of Postgres
+connection strings. Let's prefix it with `g#<generation>:` to this end, so it
+will look like:
+
 ```
-g#42:safekeeper-0.eu-central-1.aws.neon.tech:6401,safekeeper-2.eu-central-1.aws.neon.tech:6401,safekeeper-1.eu-central-1.aws.neon.tech:6401
+g#42:host=safekeeper-0.eu-central-1.aws.neon.tech port=6401,host=safekeeper-2.eu-central-1.aws.neon.tech port=6401,host=safekeeper-1.eu-central-1.aws.neon.tech port=6401
 ```

 To summarize, list of cplane changes:
@@ -281,7 +283,7 @@ To summarize, list of cplane changes:
 - `/notify-safekeepers` endpoint.
 - Branch creation call may return list of safekeepers and when it is
  present cplane should adopt it instead of choosing on its own like it does currently.
- `neon.safekeepers` GUC should be prefixed with `g#<generation>:`.
+- `neon.safekeeper_connstrings` GUC should be prefixed with `g#<generation>:`.

 ### storage_controller implementation

@@ -455,16 +457,16 @@ So, main loop of per sk reconcile reads `safekeeper_timeline_pending_ops`
 joined with timeline configuration to get current conf (with generation `n`)
 for the safekeeper and does the jobs, infinitely retrying failures:
 1) If node is member (`include`):
-  - Check if timeline exists on it, if not, call pull_timeline on it from 
+  - Check if timeline exists on it, if not, call pull_timeline on it from
     other members
  - Call switch configuration to the current
 2) If node is not member (`exclude`):
  - Call switch configuration to the current, 404 is ok.
 3) If timeline is deleted (`delete`), call delete.

-In cases 1 and 2 remove `safekeeper_timeline_pending_ops` for the sk and 
+In cases 1 and 2 remove `safekeeper_timeline_pending_ops` for the sk and
 timeline with generation <= `n` if `op_type` is not `delete`.
-In case 3 also remove `safekeeper_timeline_pending_ops` 
+In case 3 also remove `safekeeper_timeline_pending_ops`
 entry + remove `timelines` entry if there is nothing left  in `safekeeper_timeline_pending_ops` for the timeline.

 Let's consider in details how APIs can be implemented from this angle.
@@ -483,7 +485,7 @@ corruption. The following sequence works:
   changes once ingestion starts, insert must not overwrite it (as well as other
   fields like membership conf). On the contrary, start_lsn used in the next
   step must be set to the value in the db. cplane_notified_generation can be set
-   to 1 (initial generation) in insert to avoid notifying cplane about initial 
+   to 1 (initial generation) in insert to avoid notifying cplane about initial
   conf as cplane will receive it in timeline creation request anyway.
 3) Issue timeline creation calls to at least majority of safekeepers. Using
   majority here is not necessary but handy because it guarantees that any live
@@ -492,15 +494,15 @@ corruption. The following sequence works:
   create timeline special init case. OFC if timeline is already exists call is
   ignored.
 4) For minority of safekeepers which could have missed creation insert
-   entries to `safekeeper_timeline_pending_ops`. We won't miss this insertion 
-   because response to cplane is sent only after it has happened, and cplane 
+   entries to `safekeeper_timeline_pending_ops`. We won't miss this insertion
+   because response to cplane is sent only after it has happened, and cplane
   retries the call until 200 response.

   There is a small question how request handler (timeline creation in this
   case) would interact with per sk reconciler. As always I prefer to do the
   simplest possible thing and here it seems to be just waking it up so it
   re-reads the db for work to do. Passing work in memory is faster, but
-   that shouldn't matter, and path to scan db for work will exist anyway, 
+   that shouldn't matter, and path to scan db for work will exist anyway,
   simpler to reuse it.

 For pg version / wal segment size: while we may persist them in `timelines`
@@ -514,13 +516,13 @@ Timeline migration.
   as well as deliver this conf to current ones; poke per sk reconcilers to work
   on it. Also any conf change should also poke cplane notifier task(s).
 2) Once it becomes possible per alg description above, get out of joint conf
-   with another CAS. Task should get wakeups from per sk reconcilers because 
+   with another CAS. Task should get wakeups from per sk reconcilers because
   conf switch is required for advancement; however retries should be sleep
-   based as well as LSN advancement might be needed, though in happy path 
+   based as well as LSN advancement might be needed, though in happy path
   it isn't. To see whether further transition is possible on wakup migration
   executor polls safekeepers per the algorithm. CAS creating new conf with only
   new members should again insert entries to `safekeeper_timeline_pending_ops`
-   to switch them there, as well as `exclude` rows to remove timeline from 
+   to switch them there, as well as `exclude` rows to remove timeline from
   old members.

 Timeline deletion: just set `deleted_at` on the timeline row and insert
@@ -601,7 +603,7 @@ Let's have the following implementation bits for gradual rollout:
  (and returns them in response to cplane) only when it is set to
  true.
 - control_plane [see above](storage_controller-<->-control-plane interface-and-changes)
-  prefixes `neon.safekeepers` GUC with generation number. When it is 0
+  prefixes `neon.safekeeper_connstrings` GUC with generation number. When it is 0
  (or prefix not present at all), walproposer behaves as currently, committing on
  the provided safekeeper list -- generations are disabled.
  If it is non 0 it follows this RFC rules.
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -111,7 +111,7 @@ pub struct ComputeSpec {
    pub endpoint_id: Option<String>,

    /// Safekeeper membership config generation. It is put in
-    /// neon.safekeepers GUC and serves two purposes:
+    /// neon.safekeeper_connstrings GUC and serves two purposes:
    /// 1) Non zero value forces walproposer to use membership configurations.
    /// 2) If walproposer wants to update list of safekeepers to connect to
    ///    taking them from some safekeeper mconf, it should check what value
--- a/libs/compute_api/tests/cluster_spec.json
+++ b/libs/compute_api/tests/cluster_spec.json
@@ -85,8 +85,8 @@
                "vartype": "bool"
            },
            {
-                "name": "neon.safekeepers",
-                "value": "127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501",
+                "name": "neon.safekeeper_connstrings",
+                "value": "host=127.0.0.1 port=6502,host=127.0.0.1 port=6503,host=127.0.0.1 port=6501",
                "vartype": "string"
            },
            {
--- a/libs/metrics/src/more_process_metrics.rs
+++ b/libs/metrics/src/more_process_metrics.rs
@@ -16,6 +16,7 @@ pub struct Collector {
 const NMETRICS: usize = 2;

 static CLK_TCK_F64: Lazy<f64> = Lazy::new(|| {
+    // SAFETY: libc::sysconf is safe, it merely returns a value.
    let long = unsafe { libc::sysconf(libc::_SC_CLK_TCK) };
    if long == -1 {
        panic!("sysconf(_SC_CLK_TCK) failed");
--- a/libs/postgres_backend/src/lib.rs
+++ b/libs/postgres_backend/src/lib.rs
@@ -841,6 +841,10 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {

        let expected_end = match &end {
            ServerInitiated(_) | CopyDone | CopyFail | Terminate | EOF | Cancelled => true,
+            // The timeline doesn't exist and we have been requested to not auto-create it.
+            // Compute requests for timelines that haven't been created yet
+            // might reach us before the storcon request to create those timelines.
+            TimelineNoCreate => true,
            CopyStreamHandlerEnd::Disconnected(ConnectionError::Io(io_error))
                if is_expected_io_error(io_error) =>
            {
@@ -1059,6 +1063,8 @@ pub enum CopyStreamHandlerEnd {
    Terminate,
    #[error("EOF on COPY stream")]
    EOF,
+    #[error("timeline not found, and allow_timeline_creation is false")]
+    TimelineNoCreate,
    /// The connection was lost
    #[error("connection error: {0}")]
    Disconnected(#[from] ConnectionError),
--- a/libs/safekeeper_api/src/models.rs
+++ b/libs/safekeeper_api/src/models.rs
@@ -303,7 +303,8 @@ pub struct PullTimelineRequest {

 #[derive(Debug, Serialize, Deserialize)]
 pub struct PullTimelineResponse {
-    // Donor safekeeper host
-    pub safekeeper_host: String,
+    /// Donor safekeeper host.
+    /// None if no pull happened because the timeline already exists.
+    pub safekeeper_host: Option<String>,
    // TODO: add more fields?
 }
--- a/libs/walproposer/build.rs
+++ b/libs/walproposer/build.rs
@@ -35,6 +35,7 @@ fn main() -> anyhow::Result<()> {
    println!("cargo:rustc-link-lib=static=walproposer");
    println!("cargo:rustc-link-lib=static=pgport");
    println!("cargo:rustc-link-lib=static=pgcommon");
+    println!("cargo:rustc-link-lib=pq");
    println!("cargo:rustc-link-search={walproposer_lib_search_str}");

    // Rebuild crate when libwalproposer.a changes
--- a/libs/walproposer/src/walproposer.rs
+++ b/libs/walproposer/src/walproposer.rs
@@ -171,8 +171,8 @@ pub enum WaitResult {
 pub struct Config {
    /// Tenant and timeline id
    pub ttid: TenantTimelineId,
-    /// List of safekeepers in format `host:port`
-    pub safekeepers_list: Vec<String>,
+    /// List of safekeeper connection strings
+    pub safekeeper_connstrings: Vec<String>,
    /// Safekeeper reconnect timeout in milliseconds
    pub safekeeper_reconnect_timeout: i32,
    /// Safekeeper connection timeout in milliseconds
@@ -185,7 +185,7 @@ pub struct Config {
 /// WalProposer main struct. C methods are reexported as Rust functions.
 pub struct Wrapper {
    wp: *mut WalProposer,
-    _safekeepers_list_vec: Vec<u8>,
+    _safekeeper_connstrings_vec: Vec<u8>,
 }

 impl Wrapper {
@@ -197,18 +197,19 @@ impl Wrapper {
            .unwrap()
            .into_raw();

-        let mut safekeepers_list_vec = CString::new(config.safekeepers_list.join(","))
+        let mut safekeeper_connstrings_vec = CString::new(config.safekeeper_connstrings.join(","))
            .unwrap()
            .into_bytes_with_nul();
-        assert!(safekeepers_list_vec.len() == safekeepers_list_vec.capacity());
-        let safekeepers_list = safekeepers_list_vec.as_mut_ptr() as *mut std::ffi::c_char;
+        assert!(safekeeper_connstrings_vec.len() == safekeeper_connstrings_vec.capacity());
+        let safekeeper_connstrings =
+            safekeeper_connstrings_vec.as_mut_ptr() as *mut std::ffi::c_char;

        let callback_data = Box::into_raw(Box::new(api)) as *mut ::std::os::raw::c_void;

        let c_config = WalProposerConfig {
            neon_tenant,
            neon_timeline,
-            safekeepers_list,
+            safekeeper_connstrings,
            safekeeper_reconnect_timeout: config.safekeeper_reconnect_timeout,
            safekeeper_connection_timeout: config.safekeeper_connection_timeout,
            wal_segment_size: WAL_SEGMENT_SIZE as i32, // default 16MB
@@ -224,7 +225,7 @@ impl Wrapper {
        let wp = unsafe { WalProposerCreate(c_config, api) };
        Wrapper {
            wp,
-            _safekeepers_list_vec: safekeepers_list_vec,
+            _safekeeper_connstrings_vec: safekeeper_connstrings_vec,
        }
    }

@@ -575,7 +576,7 @@ mod tests {
        });
        let config = crate::walproposer::Config {
            ttid,
-            safekeepers_list: vec!["localhost:5000".to_string()],
+            safekeeper_connstrings: vec!["host=localhost port=5000".to_string()],
            safekeeper_reconnect_timeout: 1000,
            safekeeper_connection_timeout: 10000,
            sync_safekeepers: true,
--- a/pageserver/ctl/src/layers.rs
+++ b/pageserver/ctl/src/layers.rs
@@ -10,6 +10,7 @@ use pageserver::tenant::storage_layer::{DeltaLayer, ImageLayer, delta_layer, ima
 use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
 use pageserver::virtual_file::api::IoMode;
 use pageserver::{page_cache, virtual_file};
+use pageserver_api::key::Key;
 use utils::id::{TenantId, TimelineId};

 use crate::layer_map_analyzer::parse_filename;
@@ -27,6 +28,7 @@ pub(crate) enum LayerCmd {
        path: PathBuf,
        tenant: String,
        timeline: String,
+        key: Option<Key>,
    },
    /// Dump all information of a layer file
    DumpLayer {
@@ -100,6 +102,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
            path,
            tenant,
            timeline,
+            key,
        } => {
            let timeline_path = path
                .join(TENANTS_SEGMENT_NAME)
@@ -107,21 +110,37 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
                .join(TIMELINES_SEGMENT_NAME)
                .join(timeline);
            let mut idx = 0;
+            let mut to_print = Vec::default();
            for layer in fs::read_dir(timeline_path)? {
                let layer = layer?;
                if let Ok(layer_file) = parse_filename(&layer.file_name().into_string().unwrap()) {
-                    println!(
-                        "[{:3}]  key:{}-{}\n       lsn:{}-{}\n       delta:{}",
-                        idx,
-                        layer_file.key_range.start,
-                        layer_file.key_range.end,
-                        layer_file.lsn_range.start,
-                        layer_file.lsn_range.end,
-                        layer_file.is_delta,
-                    );
+                    if let Some(key) = key {
+                        if layer_file.key_range.start <= *key && *key < layer_file.key_range.end {
+                            to_print.push((idx, layer_file));
+                        }
+                    } else {
+                        to_print.push((idx, layer_file));
+                    }
                    idx += 1;
                }
            }
+
+            if key.is_some() {
+                to_print
+                    .sort_by_key(|(_idx, layer_file)| std::cmp::Reverse(layer_file.lsn_range.end));
+            }
+
+            for (idx, layer_file) in to_print {
+                println!(
+                    "[{:3}]  key:{}-{}\n       lsn:{}-{}\n       delta:{}",
+                    idx,
+                    layer_file.key_range.start,
+                    layer_file.key_range.end,
+                    layer_file.lsn_range.start,
+                    layer_file.lsn_range.end,
+                    layer_file.is_delta,
+                );
+            }
            Ok(())
        }
        LayerCmd::DumpLayer {
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -504,7 +504,7 @@ fn start_pageserver(
    // Set up deletion queue
    let (deletion_queue, deletion_workers) = DeletionQueue::new(
        remote_storage.clone(),
-        StorageControllerUpcallClient::new(conf, &shutdown_pageserver)?,
+        StorageControllerUpcallClient::new(conf, &shutdown_pageserver),
        conf,
    );
    deletion_workers.spawn_with(BACKGROUND_RUNTIME.handle());
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -150,7 +150,7 @@ pub struct PageServerConf {
    /// not terrible.
    pub background_task_maximum_delay: Duration,

-    pub control_plane_api: Option<Url>,
+    pub control_plane_api: Url,

    /// JWT token for use with the control plane API.
    pub control_plane_api_token: Option<SecretString>,
@@ -438,7 +438,8 @@ impl PageServerConf {
            test_remote_failures,
            ondemand_download_behavior_treat_error_as_warn,
            background_task_maximum_delay,
-            control_plane_api,
+            control_plane_api: control_plane_api
+                .ok_or_else(|| anyhow::anyhow!("`control_plane_api` must be set"))?,
            control_plane_emergency_mode,
            heatmap_upload_concurrency,
            secondary_download_concurrency,
@@ -573,6 +574,7 @@ impl PageServerConf {
            background_task_maximum_delay: Duration::ZERO,
            load_previous_heatmap: Some(true),
            generate_unarchival_heatmap: Some(true),
+            control_plane_api: Some(Url::parse("http://localhost:6666").unwrap()),
            ..Default::default()
        };
        PageServerConf::parse_and_validate(NodeId(0), config_toml, &repo_dir).unwrap()
@@ -641,9 +643,12 @@ mod tests {
    use super::PageServerConf;

    #[test]
-    fn test_empty_config_toml_is_valid() {
-        // we use Default impl of everything in this situation
+    fn test_minimal_config_toml_is_valid() {
+        // The minimal valid config for running a pageserver:
+        // - control_plane_api is mandatory, as pageservers cannot run in isolation
+        // - we use Default impl of everything else in this situation
        let input = r#"
+            control_plane_api = "http://localhost:6666"
        "#;
        let config_toml = toml_edit::de::from_str::<pageserver_api::config::ConfigToml>(input)
            .expect("empty config is valid");
--- a/pageserver/src/consumption_metrics/metrics.rs
+++ b/pageserver/src/consumption_metrics/metrics.rs
@@ -30,9 +30,6 @@ pub(super) enum Name {
    /// Tenant remote size
    #[serde(rename = "remote_storage_size")]
    RemoteSize,
-    /// Tenant resident size
-    #[serde(rename = "resident_size")]
-    ResidentSize,
    /// Tenant synthetic size
    #[serde(rename = "synthetic_storage_size")]
    SyntheticSize,
@@ -187,18 +184,6 @@ impl MetricsKey {
        .absolute_values()
    }

-    /// Sum of [`Timeline::resident_physical_size`] for each `Tenant`.
-    ///
-    /// [`Timeline::resident_physical_size`]: crate::tenant::Timeline::resident_physical_size
-    const fn resident_size(tenant_id: TenantId) -> AbsoluteValueFactory {
-        MetricsKey {
-            tenant_id,
-            timeline_id: None,
-            metric: Name::ResidentSize,
-        }
-        .absolute_values()
-    }
-
    /// [`TenantShard::cached_synthetic_size`] as refreshed by [`calculate_synthetic_size_worker`].
    ///
    /// [`TenantShard::cached_synthetic_size`]: crate::tenant::TenantShard::cached_synthetic_size
@@ -261,10 +246,7 @@ where
    let mut tenants = std::pin::pin!(tenants);

    while let Some((tenant_id, tenant)) = tenants.next().await {
-        let mut tenant_resident_size = 0;
-
        let timelines = tenant.list_timelines();
-        let timelines_len = timelines.len();
        for timeline in timelines {
            let timeline_id = timeline.timeline_id;

@@ -287,16 +269,9 @@ where
                    continue;
                }
            }
-
-            tenant_resident_size += timeline.resident_physical_size();
        }

-        if timelines_len == 0 {
-            // Force set it to 1 byte to avoid not being reported -- all timelines are offloaded.
-            tenant_resident_size = 1;
-        }
-
-        let snap = TenantSnapshot::collect(&tenant, tenant_resident_size);
+        let snap = TenantSnapshot::collect(&tenant);
        snap.to_metrics(tenant_id, Utc::now(), cache, &mut current_metrics);
    }

@@ -305,19 +280,14 @@ where

 /// In-between abstraction to allow testing metrics without actual Tenants.
 struct TenantSnapshot {
-    resident_size: u64,
    remote_size: u64,
    synthetic_size: u64,
 }

 impl TenantSnapshot {
    /// Collect tenant status to have metrics created out of it.
-    ///
-    /// `resident_size` is calculated of the timelines we had access to for other metrics, so we
-    /// cannot just list timelines here.
-    fn collect(t: &Arc<crate::tenant::TenantShard>, resident_size: u64) -> Self {
+    fn collect(t: &Arc<crate::tenant::TenantShard>) -> Self {
        TenantSnapshot {
-            resident_size,
            remote_size: t.remote_size(),
            // Note that this metric is calculated in a separate bgworker
            // Here we only use cached value, which may lag behind the real latest one
@@ -334,8 +304,6 @@ impl TenantSnapshot {
    ) {
        let remote_size = MetricsKey::remote_storage_size(tenant_id).at(now, self.remote_size);

-        let resident_size = MetricsKey::resident_size(tenant_id).at(now, self.resident_size);
-
        let synthetic_size = {
            let factory = MetricsKey::synthetic_size(tenant_id);
            let mut synthetic_size = self.synthetic_size;
@@ -355,11 +323,7 @@ impl TenantSnapshot {
            }
        };

-        metrics.extend(
-            [Some(remote_size), Some(resident_size), synthetic_size]
-                .into_iter()
-                .flatten(),
-        );
+        metrics.extend([Some(remote_size), synthetic_size].into_iter().flatten());
    }
 }

--- a/pageserver/src/consumption_metrics/metrics/tests.rs
+++ b/pageserver/src/consumption_metrics/metrics/tests.rs
@@ -224,7 +224,6 @@ fn post_restart_synthetic_size_uses_cached_if_available() {
    let tenant_id = TenantId::generate();

    let ts = TenantSnapshot {
-        resident_size: 1000,
        remote_size: 1000,
        // not yet calculated
        synthetic_size: 0,
@@ -245,7 +244,6 @@ fn post_restart_synthetic_size_uses_cached_if_available() {
        metrics,
        &[
            MetricsKey::remote_storage_size(tenant_id).at(now, 1000),
-            MetricsKey::resident_size(tenant_id).at(now, 1000),
            MetricsKey::synthetic_size(tenant_id).at(now, 1000),
        ]
    );
@@ -256,7 +254,6 @@ fn post_restart_synthetic_size_is_not_sent_when_not_cached() {
    let tenant_id = TenantId::generate();

    let ts = TenantSnapshot {
-        resident_size: 1000,
        remote_size: 1000,
        // not yet calculated
        synthetic_size: 0,
@@ -274,7 +271,6 @@ fn post_restart_synthetic_size_is_not_sent_when_not_cached() {
        metrics,
        &[
            MetricsKey::remote_storage_size(tenant_id).at(now, 1000),
-            MetricsKey::resident_size(tenant_id).at(now, 1000),
            // no synthetic size here
        ]
    );
@@ -295,14 +291,13 @@ pub(crate) const fn metric_examples_old(
    timeline_id: TimelineId,
    now: DateTime<Utc>,
    before: DateTime<Utc>,
-) -> [RawMetric; 6] {
+) -> [RawMetric; 5] {
    [
        MetricsKey::written_size(tenant_id, timeline_id).at_old_format(now, 0),
        MetricsKey::written_size_delta(tenant_id, timeline_id)
            .from_until_old_format(before, now, 0),
        MetricsKey::timeline_logical_size(tenant_id, timeline_id).at_old_format(now, 0),
        MetricsKey::remote_storage_size(tenant_id).at_old_format(now, 0),
-        MetricsKey::resident_size(tenant_id).at_old_format(now, 0),
        MetricsKey::synthetic_size(tenant_id).at_old_format(now, 1),
    ]
 }
@@ -312,13 +307,12 @@ pub(crate) const fn metric_examples(
    timeline_id: TimelineId,
    now: DateTime<Utc>,
    before: DateTime<Utc>,
-) -> [NewRawMetric; 6] {
+) -> [NewRawMetric; 5] {
    [
        MetricsKey::written_size(tenant_id, timeline_id).at(now, 0),
        MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(before, now, 0),
        MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0),
        MetricsKey::remote_storage_size(tenant_id).at(now, 0),
-        MetricsKey::resident_size(tenant_id).at(now, 0),
        MetricsKey::synthetic_size(tenant_id).at(now, 1),
    ]
 }
--- a/pageserver/src/consumption_metrics/upload.rs
+++ b/pageserver/src/consumption_metrics/upload.rs
@@ -521,10 +521,6 @@ mod tests {
                line!(),
                r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"remote_storage_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000"}"#,
            ),
-            (
-                line!(),
-                r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"resident_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000"}"#,
-            ),
            (
                line!(),
                r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"synthetic_storage_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":1,"tenant_id":"00000000000000000000000000000000"}"#,
@@ -564,7 +560,7 @@ mod tests {
        assert_eq!(upgraded_samples, new_samples);
    }

-    fn metric_samples_old() -> [RawMetric; 6] {
+    fn metric_samples_old() -> [RawMetric; 5] {
        let tenant_id = TenantId::from_array([0; 16]);
        let timeline_id = TimelineId::from_array([0xff; 16]);

@@ -576,7 +572,7 @@ mod tests {
        super::super::metrics::metric_examples_old(tenant_id, timeline_id, now, before)
    }

-    fn metric_samples() -> [NewRawMetric; 6] {
+    fn metric_samples() -> [NewRawMetric; 5] {
        let tenant_id = TenantId::from_array([0; 16]);
        let timeline_id = TimelineId::from_array([0xff; 16]);

--- a/pageserver/src/controller_upcall_client.rs
+++ b/pageserver/src/controller_upcall_client.rs
@@ -58,14 +58,8 @@ pub trait StorageControllerUpcallApi {
 impl StorageControllerUpcallClient {
    /// A None return value indicates that the input `conf` object does not have control
    /// plane API enabled.
-    pub fn new(
-        conf: &'static PageServerConf,
-        cancel: &CancellationToken,
-    ) -> Result<Option<Self>, reqwest::Error> {
-        let mut url = match conf.control_plane_api.as_ref() {
-            Some(u) => u.clone(),
-            None => return Ok(None),
-        };
+    pub fn new(conf: &'static PageServerConf, cancel: &CancellationToken) -> Self {
+        let mut url = conf.control_plane_api.clone();

        if let Ok(mut segs) = url.path_segments_mut() {
            // This ensures that `url` ends with a slash if it doesn't already.
@@ -85,15 +79,17 @@ impl StorageControllerUpcallClient {
        }

        for cert in &conf.ssl_ca_certs {
-            client = client.add_root_certificate(Certificate::from_der(cert.contents())?);
+            client = client.add_root_certificate(
+                Certificate::from_der(cert.contents()).expect("Invalid certificate in config"),
+            );
        }

-        Ok(Some(Self {
-            http_client: client.build()?,
+        Self {
+            http_client: client.build().expect("Failed to construct HTTP client"),
            base_url: url,
            node_id: conf.id,
            cancel: cancel.clone(),
-        }))
+        }
    }

    #[tracing::instrument(skip_all)]
--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -585,7 +585,7 @@ impl DeletionQueue {
    /// we don't spawn those inside new() so that the caller can use their runtime/spans of choice.
    pub fn new<C>(
        remote_storage: GenericRemoteStorage,
-        controller_upcall_client: Option<C>,
+        controller_upcall_client: C,
        conf: &'static PageServerConf,
    ) -> (Self, DeletionQueueWorkers<C>)
    where
@@ -701,7 +701,7 @@ mod test {
        async fn restart(&mut self) {
            let (deletion_queue, workers) = DeletionQueue::new(
                self.storage.clone(),
-                Some(self.mock_control_plane.clone()),
+                self.mock_control_plane.clone(),
                self.harness.conf,
            );

@@ -821,11 +821,8 @@ mod test {

        let mock_control_plane = MockStorageController::new();

-        let (deletion_queue, worker) = DeletionQueue::new(
-            storage.clone(),
-            Some(mock_control_plane.clone()),
-            harness.conf,
-        );
+        let (deletion_queue, worker) =
+            DeletionQueue::new(storage.clone(), mock_control_plane.clone(), harness.conf);

        let worker_join = worker.spawn_with(&tokio::runtime::Handle::current());

--- a/pageserver/src/deletion_queue/validator.rs
+++ b/pageserver/src/deletion_queue/validator.rs
@@ -53,7 +53,7 @@ where
    tx: tokio::sync::mpsc::Sender<DeleterMessage>,

    // Client for calling into control plane API for validation of deletes
-    controller_upcall_client: Option<C>,
+    controller_upcall_client: C,

    // DeletionLists which are waiting generation validation.  Not safe to
    // execute until [`validate`] has processed them.
@@ -86,7 +86,7 @@ where
        conf: &'static PageServerConf,
        rx: tokio::sync::mpsc::Receiver<ValidatorQueueMessage>,
        tx: tokio::sync::mpsc::Sender<DeleterMessage>,
-        controller_upcall_client: Option<C>,
+        controller_upcall_client: C,
        lsn_table: Arc<std::sync::RwLock<VisibleLsnUpdates>>,
        cancel: CancellationToken,
    ) -> Self {
@@ -137,20 +137,16 @@ where
            return Ok(());
        }

-        let tenants_valid = if let Some(controller_upcall_client) = &self.controller_upcall_client {
-            match controller_upcall_client
-                .validate(tenant_generations.iter().map(|(k, v)| (*k, *v)).collect())
-                .await
-            {
-                Ok(tenants) => tenants,
-                Err(RetryForeverError::ShuttingDown) => {
-                    // The only way a validation call returns an error is when the cancellation token fires
-                    return Err(DeletionQueueError::ShuttingDown);
-                }
+        let tenants_valid = match self
+            .controller_upcall_client
+            .validate(tenant_generations.iter().map(|(k, v)| (*k, *v)).collect())
+            .await
+        {
+            Ok(tenants) => tenants,
+            Err(RetryForeverError::ShuttingDown) => {
+                // The only way a validation call returns an error is when the cancellation token fires
+                return Err(DeletionQueueError::ShuttingDown);
            }
-        } else {
-            // Control plane API disabled.  In legacy mode we consider everything valid.
-            tenant_generations.keys().map(|k| (*k, true)).collect()
        };

        let mut validated_sequence: Option<u64> = None;
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -497,6 +497,24 @@ pub(crate) static WAIT_LSN_IN_PROGRESS_GLOBAL_MICROS: Lazy<IntCounter> = Lazy::n
    .expect("failed to define a metric")
 });

+pub(crate) static ONDEMAND_DOWNLOAD_BYTES: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "pageserver_ondemand_download_bytes_total",
+        "Total bytes of layers on-demand downloaded",
+        &["task_kind"]
+    )
+    .expect("failed to define a metric")
+});
+
+pub(crate) static ONDEMAND_DOWNLOAD_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "pageserver_ondemand_download_count",
+        "Total count of layers on-demand downloaded",
+        &["task_kind"]
+    )
+    .expect("failed to define a metric")
+});
+
 pub(crate) mod wait_ondemand_download_time {
    use super::*;
    const WAIT_ONDEMAND_DOWNLOAD_TIME_BUCKETS: &[f64] = &[
@@ -2180,6 +2198,10 @@ impl BasebackupQueryTimeOngoingRecording<'_> {
        // If you want to change categorize of a specific error, also change it in `log_query_error`.
        let metric = match res {
            Ok(_) => &self.parent.ok,
+            Err(QueryError::Shutdown) => {
+                // Do not observe ok/err for shutdown
+                return;
+            }
            Err(QueryError::Disconnected(ConnectionError::Io(io_error)))
                if is_expected_io_error(io_error) =>
            {
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -1035,10 +1035,25 @@ impl PageServerHandler {
                // avoid a somewhat costly Span::record() by constructing the entire span in one go.
                macro_rules! mkspan {
                    (before shard routing) => {{
-                        tracing::info_span!(parent: &parent_span, "handle_get_page_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.hdr.request_lsn)
+                        tracing::info_span!(
+                            parent: &parent_span,
+                            "handle_get_page_request",
+                            rel = %req.rel,
+                            blkno = %req.blkno,
+                            req_lsn = %req.hdr.request_lsn,
+                            not_modified_since_lsn = %req.hdr.not_modified_since
+                        )
                    }};
                    ($shard_id:expr) => {{
-                        tracing::info_span!(parent: &parent_span, "handle_get_page_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.hdr.request_lsn, shard_id = %$shard_id)
+                        tracing::info_span!(
+                            parent: &parent_span,
+                            "handle_get_page_request",
+                            rel = %req.rel,
+                            blkno = %req.blkno,
+                            req_lsn = %req.hdr.request_lsn,
+                            not_modified_since_lsn = %req.hdr.not_modified_since,
+                            shard_id = %$shard_id
+                        )
                    }};
                }

@@ -1102,6 +1117,7 @@ impl PageServerHandler {
                            shard_id = %shard.get_shard_identity().shard_slug(),
                            timeline_id = %timeline_id,
                            lsn = %req.hdr.request_lsn,
+                            not_modified_since_lsn = %req.hdr.not_modified_since,
                            request_id = %req.hdr.reqid,
                            key = %key,
                            )
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -1084,8 +1084,17 @@ impl Timeline {
        let mut result = HashMap::new();
        for (k, v) in kv {
            let v = v?;
+            if v.is_empty() {
+                // This is a tombstone -- we can skip it.
+                // Originally, the replorigin code uses `Lsn::INVALID` to represent a tombstone. However, as it part of
+                // the sparse keyspace and the sparse keyspace uses an empty image to universally represent a tombstone,
+                // we also need to consider that. Such tombstones might be written on the detach ancestor code path to
+                // avoid the value going into the child branch. (See [`crate::tenant::timeline::detach_ancestor::generate_tombstone_image_layer`] for more details.)
+                continue;
+            }
            let origin_id = k.field6 as RepOriginId;
-            let origin_lsn = Lsn::des(&v).unwrap();
+            let origin_lsn = Lsn::des(&v)
+                .with_context(|| format!("decode replorigin value for {}: {v:?}", origin_id))?;
            if origin_lsn != Lsn::INVALID {
                result.insert(origin_id, origin_lsn);
            }
@@ -2578,6 +2587,11 @@ impl DatadirModification<'_> {
        }
    }

+    #[cfg(test)]
+    pub fn put_for_unit_test(&mut self, key: Key, val: Value) {
+        self.put(key, val);
+    }
+
    fn put(&mut self, key: Key, val: Value) {
        if Self::is_data_key(&key) {
            self.put_data(key.to_compact(), val)
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -4254,9 +4254,7 @@ impl TenantShard {
        deletion_queue_client: DeletionQueueClient,
        l0_flush_global_state: L0FlushGlobalState,
    ) -> TenantShard {
-        debug_assert!(
-            !attached_conf.location.generation.is_none() || conf.control_plane_api.is_none()
-        );
+        assert!(!attached_conf.location.generation.is_none());

        let (state, mut rx) = watch::channel(state);

@@ -5949,7 +5947,9 @@ mod tests {
    use itertools::Itertools;
    #[cfg(feature = "testing")]
    use models::CompactLsnRange;
-    use pageserver_api::key::{AUX_KEY_PREFIX, Key, NON_INHERITED_RANGE, RELATION_SIZE_PREFIX};
+    use pageserver_api::key::{
+        AUX_KEY_PREFIX, Key, NON_INHERITED_RANGE, RELATION_SIZE_PREFIX, repl_origin_key,
+    };
    use pageserver_api::keyspace::KeySpace;
    #[cfg(feature = "testing")]
    use pageserver_api::keyspace::KeySpaceRandomAccum;
@@ -8185,6 +8185,54 @@ mod tests {
        assert_eq!(files.get("pg_logical/mappings/test2"), None);
    }

+    #[tokio::test]
+    async fn test_repl_origin_tombstones() {
+        let harness = TenantHarness::create("test_repl_origin_tombstones")
+            .await
+            .unwrap();
+
+        let (tenant, ctx) = harness.load().await;
+        let io_concurrency = IoConcurrency::spawn_for_test();
+
+        let mut lsn = Lsn(0x08);
+
+        let tline: Arc<Timeline> = tenant
+            .create_test_timeline(TIMELINE_ID, lsn, DEFAULT_PG_VERSION, &ctx)
+            .await
+            .unwrap();
+
+        let repl_lsn = Lsn(0x10);
+        {
+            lsn += 8;
+            let mut modification = tline.begin_modification(lsn);
+            modification.put_for_unit_test(repl_origin_key(2), Value::Image(Bytes::new()));
+            modification.set_replorigin(1, repl_lsn).await.unwrap();
+            modification.commit(&ctx).await.unwrap();
+        }
+
+        // we can read everything from the storage
+        let repl_origins = tline
+            .get_replorigins(lsn, &ctx, io_concurrency.clone())
+            .await
+            .unwrap();
+        assert_eq!(repl_origins.len(), 1);
+        assert_eq!(repl_origins[&1], lsn);
+
+        {
+            lsn += 8;
+            let mut modification = tline.begin_modification(lsn);
+            modification.put_for_unit_test(
+                repl_origin_key(3),
+                Value::Image(Bytes::copy_from_slice(b"cannot_decode_this")),
+            );
+            modification.commit(&ctx).await.unwrap();
+        }
+        let result = tline
+            .get_replorigins(lsn, &ctx, io_concurrency.clone())
+            .await;
+        assert!(result.is_err());
+    }
+
    #[tokio::test]
    async fn test_metadata_image_creation() -> anyhow::Result<()> {
        let harness = TenantHarness::create("test_metadata_image_creation").await?;
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -346,7 +346,8 @@ async fn init_load_generations(
            "Emergency mode!  Tenants will be attached unsafely using their last known generation"
        );
        emergency_generations(tenant_confs)
-    } else if let Some(client) = StorageControllerUpcallClient::new(conf, cancel)? {
+    } else {
+        let client = StorageControllerUpcallClient::new(conf, cancel);
        info!("Calling {} API to re-attach tenants", client.base_url());
        // If we are configured to use the control plane API, then it is the source of truth for what tenants to load.
        match client.re_attach(conf).await {
@@ -360,9 +361,6 @@ async fn init_load_generations(
                anyhow::bail!("Shut down while waiting for control plane re-attach response")
            }
        }
-    } else {
-        info!("Control plane API not configured, tenant generations are disabled");
-        return Ok(None);
    };

    // The deletion queue needs to know about the startup attachment state to decide which (if any) stored
@@ -1153,17 +1151,8 @@ impl TenantManager {
                // Testing hack: if we are configured with no control plane, then drop the generation
                // from upserts.  This enables creating generation-less tenants even though neon_local
                // always uses generations when calling the location conf API.
-                let attached_conf = if cfg!(feature = "testing") {
-                    let mut conf = AttachedTenantConf::try_from(new_location_config)
-                        .map_err(UpsertLocationError::BadRequest)?;
-                    if self.conf.control_plane_api.is_none() {
-                        conf.location.generation = Generation::none();
-                    }
-                    conf
-                } else {
-                    AttachedTenantConf::try_from(new_location_config)
-                        .map_err(UpsertLocationError::BadRequest)?
-                };
+                let attached_conf = AttachedTenantConf::try_from(new_location_config)
+                    .map_err(UpsertLocationError::BadRequest)?;

                let tenant = tenant_spawn(
                    self.conf,
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -1441,14 +1441,6 @@ impl DeltaLayerInner {
        offset
    }

-    pub fn iter<'a>(&'a self, ctx: &'a RequestContext) -> DeltaLayerIterator<'a> {
-        self.iter_with_options(
-            ctx,
-            1024 * 8192, // The default value. Unit tests might use a different value. 1024 * 8K = 8MB buffer.
-            1024,        // The default value. Unit tests might use a different value
-        )
-    }
-
    pub fn iter_with_options<'a>(
        &'a self,
        ctx: &'a RequestContext,
@@ -1634,7 +1626,6 @@ pub(crate) mod test {
    use crate::tenant::disk_btree::tests::TestDisk;
    use crate::tenant::harness::{TIMELINE_ID, TenantHarness};
    use crate::tenant::storage_layer::{Layer, ResidentLayer};
-    use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner;
    use crate::tenant::{TenantShard, Timeline};

    /// Construct an index for a fictional delta layer and and then
@@ -2311,8 +2302,7 @@ pub(crate) mod test {
            for batch_size in [1, 2, 4, 8, 3, 7, 13] {
                println!("running with batch_size={batch_size} max_read_size={max_read_size}");
                // Test if the batch size is correctly determined
-                let mut iter = delta_layer.iter(&ctx);
-                iter.planner = StreamingVectoredReadPlanner::new(max_read_size, batch_size);
+                let mut iter = delta_layer.iter_with_options(&ctx, max_read_size, batch_size);
                let mut num_items = 0;
                for _ in 0..3 {
                    iter.next_batch().await.unwrap();
@@ -2329,8 +2319,7 @@ pub(crate) mod test {
                    iter.key_values_batch.clear();
                }
                // Test if the result is correct
-                let mut iter = delta_layer.iter(&ctx);
-                iter.planner = StreamingVectoredReadPlanner::new(max_read_size, batch_size);
+                let mut iter = delta_layer.iter_with_options(&ctx, max_read_size, batch_size);
                assert_delta_iter_equal(&mut iter, &test_deltas).await;
            }
        }
--- a/pageserver/src/tenant/storage_layer/filter_iterator.rs
+++ b/pageserver/src/tenant/storage_layer/filter_iterator.rs
@@ -157,7 +157,7 @@ mod tests {
            .await
            .unwrap();

-        let merge_iter = MergeIterator::create(
+        let merge_iter = MergeIterator::create_for_testing(
            &[resident_layer_1.get_as_delta(&ctx).await.unwrap()],
            &[],
            &ctx,
@@ -182,7 +182,7 @@ mod tests {
        result.extend(test_deltas1[90..100].iter().cloned());
        assert_filter_iter_equal(&mut filter_iter, &result).await;

-        let merge_iter = MergeIterator::create(
+        let merge_iter = MergeIterator::create_for_testing(
            &[resident_layer_1.get_as_delta(&ctx).await.unwrap()],
            &[],
            &ctx,
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -684,14 +684,6 @@ impl ImageLayerInner {
        }
    }

-    pub(crate) fn iter<'a>(&'a self, ctx: &'a RequestContext) -> ImageLayerIterator<'a> {
-        self.iter_with_options(
-            ctx,
-            1024 * 8192, // The default value. Unit tests might use a different value. 1024 * 8K = 8MB buffer.
-            1024,        // The default value. Unit tests might use a different value
-        )
-    }
-
    pub(crate) fn iter_with_options<'a>(
        &'a self,
        ctx: &'a RequestContext,
@@ -1240,7 +1232,6 @@ mod test {
    use crate::context::RequestContext;
    use crate::tenant::harness::{TIMELINE_ID, TenantHarness};
    use crate::tenant::storage_layer::{Layer, ResidentLayer};
-    use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner;
    use crate::tenant::{TenantShard, Timeline};

    #[tokio::test]
@@ -1507,8 +1498,7 @@ mod test {
            for batch_size in [1, 2, 4, 8, 3, 7, 13] {
                println!("running with batch_size={batch_size} max_read_size={max_read_size}");
                // Test if the batch size is correctly determined
-                let mut iter = img_layer.iter(&ctx);
-                iter.planner = StreamingVectoredReadPlanner::new(max_read_size, batch_size);
+                let mut iter = img_layer.iter_with_options(&ctx, max_read_size, batch_size);
                let mut num_items = 0;
                for _ in 0..3 {
                    iter.next_batch().await.unwrap();
@@ -1525,8 +1515,7 @@ mod test {
                    iter.key_values_batch.clear();
                }
                // Test if the result is correct
-                let mut iter = img_layer.iter(&ctx);
-                iter.planner = StreamingVectoredReadPlanner::new(max_read_size, batch_size);
+                let mut iter = img_layer.iter_with_options(&ctx, max_read_size, batch_size);
                assert_img_iter_equal(&mut iter, &test_imgs, Lsn(0x10)).await;
            }
        }
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -4,6 +4,7 @@ use std::sync::{Arc, Weak};
 use std::time::{Duration, SystemTime};

 use crate::PERF_TRACE_TARGET;
+use crate::metrics::{ONDEMAND_DOWNLOAD_BYTES, ONDEMAND_DOWNLOAD_COUNT};
 use anyhow::Context;
 use camino::{Utf8Path, Utf8PathBuf};
 use pageserver_api::keyspace::KeySpace;
@@ -1255,6 +1256,14 @@ impl LayerInner {

                self.access_stats.record_residence_event();

+                let task_kind: &'static str = ctx.task_kind().into();
+                ONDEMAND_DOWNLOAD_BYTES
+                    .with_label_values(&[task_kind])
+                    .inc_by(self.desc.file_size);
+                ONDEMAND_DOWNLOAD_COUNT
+                    .with_label_values(&[task_kind])
+                    .inc();
+
                Ok(self.initialize_after_layer_is_on_disk(permit))
            }
            Err(e) => {
--- a/pageserver/src/tenant/storage_layer/merge_iterator.rs
+++ b/pageserver/src/tenant/storage_layer/merge_iterator.rs
@@ -19,14 +19,6 @@ pub(crate) enum LayerRef<'a> {
 }

 impl<'a> LayerRef<'a> {
-    #[allow(dead_code)]
-    fn iter(self, ctx: &'a RequestContext) -> LayerIterRef<'a> {
-        match self {
-            Self::Image(x) => LayerIterRef::Image(x.iter(ctx)),
-            Self::Delta(x) => LayerIterRef::Delta(x.iter(ctx)),
-        }
-    }
-
    fn iter_with_options(
        self,
        ctx: &'a RequestContext,
@@ -322,6 +314,28 @@ impl MergeIteratorItem for ((Key, Lsn, Value), Arc<PersistentLayerKey>) {
 }

 impl<'a> MergeIterator<'a> {
+    #[cfg(test)]
+    pub(crate) fn create_for_testing(
+        deltas: &[&'a DeltaLayerInner],
+        images: &[&'a ImageLayerInner],
+        ctx: &'a RequestContext,
+    ) -> Self {
+        Self::create_with_options(deltas, images, ctx, 1024 * 8192, 1024)
+    }
+
+    /// Create a new merge iterator with custom options.
+    ///
+    /// Adjust `max_read_size` and `max_batch_size` to trade memory usage for performance. The size should scale
+    /// with the number of layers to compact. If there are a lot of layers, consider reducing the values, so that
+    /// the buffer does not take too much memory.
+    ///
+    /// The default options for L0 compactions are:
+    /// - max_read_size: 1024 * 8192 (8MB)
+    /// - max_batch_size: 1024
+    ///
+    /// The default options for gc-compaction are:
+    /// - max_read_size: 128 * 8192 (1MB)
+    /// - max_batch_size: 128
    pub fn create_with_options(
        deltas: &[&'a DeltaLayerInner],
        images: &[&'a ImageLayerInner],
@@ -351,14 +365,6 @@ impl<'a> MergeIterator<'a> {
        }
    }

-    pub fn create(
-        deltas: &[&'a DeltaLayerInner],
-        images: &[&'a ImageLayerInner],
-        ctx: &'a RequestContext,
-    ) -> Self {
-        Self::create_with_options(deltas, images, ctx, 1024 * 8192, 1024)
-    }
-
    pub(crate) async fn next_inner<R: MergeIteratorItem>(&mut self) -> anyhow::Result<Option<R>> {
        while let Some(mut iter) = self.heap.peek_mut() {
            if !iter.is_loaded() {
@@ -477,7 +483,7 @@ mod tests {
        let resident_layer_2 = produce_delta_layer(&tenant, &tline, test_deltas2.clone(), &ctx)
            .await
            .unwrap();
-        let mut merge_iter = MergeIterator::create(
+        let mut merge_iter = MergeIterator::create_for_testing(
            &[
                resident_layer_2.get_as_delta(&ctx).await.unwrap(),
                resident_layer_1.get_as_delta(&ctx).await.unwrap(),
@@ -549,7 +555,7 @@ mod tests {
        let resident_layer_3 = produce_delta_layer(&tenant, &tline, test_deltas3.clone(), &ctx)
            .await
            .unwrap();
-        let mut merge_iter = MergeIterator::create(
+        let mut merge_iter = MergeIterator::create_for_testing(
            &[
                resident_layer_1.get_as_delta(&ctx).await.unwrap(),
                resident_layer_2.get_as_delta(&ctx).await.unwrap(),
@@ -670,7 +676,7 @@ mod tests {
        // Test with different layer order for MergeIterator::create to ensure the order
        // is stable.

-        let mut merge_iter = MergeIterator::create(
+        let mut merge_iter = MergeIterator::create_for_testing(
            &[
                resident_layer_4.get_as_delta(&ctx).await.unwrap(),
                resident_layer_1.get_as_delta(&ctx).await.unwrap(),
@@ -682,7 +688,7 @@ mod tests {
        );
        assert_merge_iter_equal(&mut merge_iter, &expect).await;

-        let mut merge_iter = MergeIterator::create(
+        let mut merge_iter = MergeIterator::create_for_testing(
            &[
                resident_layer_1.get_as_delta(&ctx).await.unwrap(),
                resident_layer_4.get_as_delta(&ctx).await.unwrap(),
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -1994,7 +1994,13 @@ impl Timeline {
                let l = l.get_as_delta(ctx).await.map_err(CompactionError::Other)?;
                deltas.push(l);
            }
-            MergeIterator::create(&deltas, &[], ctx)
+            MergeIterator::create_with_options(
+                &deltas,
+                &[],
+                ctx,
+                1024 * 8192, /* 8 MiB buffer per layer iterator */
+                1024,
+            )
        };

        // This iterator walks through all keys and is needed to calculate size used by each key
@@ -2828,7 +2834,7 @@ impl Timeline {
        Ok(())
    }

-    /// Check if the memory usage is within the limit.
+    /// Check to bail out of gc compaction early if it would use too much memory.
    async fn check_memory_usage(
        self: &Arc<Self>,
        layer_selection: &[Layer],
@@ -2841,7 +2847,8 @@ impl Timeline {
            let layer_desc = layer.layer_desc();
            if layer_desc.is_delta() {
                // Delta layers at most have 1MB buffer; 3x to make it safe (there're deltas as large as 16KB).
-                // Multiply the layer size so that tests can pass.
+                // Scale it by target_layer_size_bytes so that tests can pass (some tests, e.g., `test_pageserver_gc_compaction_preempt
+                // use 3MB layer size and we need to account for that).
                estimated_memory_usage_mb +=
                    3.0 * (layer_desc.file_size / target_layer_size_bytes) as f64;
                num_delta_layers += 1;
--- a/pageserver/src/tenant/timeline/detach_ancestor.rs
+++ b/pageserver/src/tenant/timeline/detach_ancestor.rs
@@ -178,7 +178,7 @@ impl Attempt {
    }
 }

-async fn generate_tombstone_image_layer(
+pub(crate) async fn generate_tombstone_image_layer(
    detached: &Arc<Timeline>,
    ancestor: &Arc<Timeline>,
    ancestor_lsn: Lsn,
--- a/pageserver/src/tenant/timeline/import_pgdata.rs
+++ b/pageserver/src/tenant/timeline/import_pgdata.rs
@@ -163,8 +163,7 @@ pub async fn doit(
        // Ensure at-least-once delivery of the upcall to storage controller
        // before we mark the task as done and never come here again.
        //
-        let storcon_client = StorageControllerUpcallClient::new(timeline.conf, &cancel)?
-            .expect("storcon configured");
+        let storcon_client = StorageControllerUpcallClient::new(timeline.conf, &cancel);
        storcon_client
            .put_timeline_import_status(
                timeline.tenant_shard_id,
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -14,8 +14,6 @@
 use std::fs::File;
 use std::io::{Error, ErrorKind};
 use std::os::fd::{AsRawFd, FromRawFd, IntoRawFd, OwnedFd, RawFd};
-#[cfg(target_os = "linux")]
-use std::os::unix::fs::OpenOptionsExt;
 use std::sync::LazyLock;
 use std::sync::atomic::{AtomicBool, AtomicU8, AtomicUsize, Ordering};

@@ -99,7 +97,7 @@ impl VirtualFile {

    pub async fn open_with_options_v2<P: AsRef<Utf8Path>>(
        path: P,
-        open_options: &OpenOptions,
+        #[cfg_attr(not(target_os = "linux"), allow(unused_mut))] mut open_options: OpenOptions,
        ctx: &RequestContext,
    ) -> Result<Self, std::io::Error> {
        let mode = get_io_mode();
@@ -112,21 +110,16 @@ impl VirtualFile {
            #[cfg(target_os = "linux")]
            (IoMode::DirectRw, _) => true,
        };
-        let open_options = open_options.clone();
-        let open_options = if set_o_direct {
+        if set_o_direct {
            #[cfg(target_os = "linux")]
            {
-                let mut open_options = open_options;
-                open_options.custom_flags(nix::libc::O_DIRECT);
-                open_options
+                open_options = open_options.custom_flags(nix::libc::O_DIRECT);
            }
            #[cfg(not(target_os = "linux"))]
            unreachable!(
                "O_DIRECT is not supported on this platform, IoMode's that result in set_o_direct=true shouldn't even be defined"
            );
-        } else {
-            open_options
-        };
+        }
        let inner = VirtualFileInner::open_with_options(path, open_options, ctx).await?;
        Ok(VirtualFile { inner, _mode: mode })
    }
@@ -530,7 +523,7 @@ impl VirtualFileInner {
        path: P,
        ctx: &RequestContext,
    ) -> Result<VirtualFileInner, std::io::Error> {
-        Self::open_with_options(path.as_ref(), OpenOptions::new().read(true).clone(), ctx).await
+        Self::open_with_options(path.as_ref(), OpenOptions::new().read(true), ctx).await
    }

    /// Open a file with given options.
@@ -558,10 +551,11 @@ impl VirtualFileInner {
        // It would perhaps be nicer to check just for the read and write flags
        // explicitly, but OpenOptions doesn't contain any functions to read flags,
        // only to set them.
-        let mut reopen_options = open_options.clone();
-        reopen_options.create(false);
-        reopen_options.create_new(false);
-        reopen_options.truncate(false);
+        let reopen_options = open_options
+            .clone()
+            .create(false)
+            .create_new(false)
+            .truncate(false);

        let vfile = VirtualFileInner {
            handle: RwLock::new(handle),
@@ -1307,7 +1301,7 @@ mod tests {
                opts: OpenOptions,
                ctx: &RequestContext,
            ) -> Result<MaybeVirtualFile, anyhow::Error> {
-                let vf = VirtualFile::open_with_options_v2(&path, &opts, ctx).await?;
+                let vf = VirtualFile::open_with_options_v2(&path, opts, ctx).await?;
                Ok(MaybeVirtualFile::VirtualFile(vf))
            }
        }
@@ -1374,7 +1368,7 @@ mod tests {
        let _ = file_a.read_string_at(0, 1, &ctx).await.unwrap_err();

        // Close the file and re-open for reading
-        let mut file_a = A::open(path_a, OpenOptions::new().read(true).to_owned(), &ctx).await?;
+        let mut file_a = A::open(path_a, OpenOptions::new().read(true), &ctx).await?;

        // cannot write to a file opened in read-only mode
        let _ = file_a
@@ -1393,8 +1387,7 @@ mod tests {
                .read(true)
                .write(true)
                .create(true)
-                .truncate(true)
-                .to_owned(),
+                .truncate(true),
            &ctx,
        )
        .await?;
@@ -1412,12 +1405,7 @@ mod tests {

        let mut vfiles = Vec::new();
        for _ in 0..100 {
-            let mut vfile = A::open(
-                path_b.clone(),
-                OpenOptions::new().read(true).to_owned(),
-                &ctx,
-            )
-            .await?;
+            let mut vfile = A::open(path_b.clone(), OpenOptions::new().read(true), &ctx).await?;
            assert_eq!("FOOBAR", vfile.read_string_at(0, 6, &ctx).await?);
            vfiles.push(vfile);
        }
@@ -1466,7 +1454,7 @@ mod tests {
        for _ in 0..VIRTUAL_FILES {
            let f = VirtualFileInner::open_with_options(
                &test_file_path,
-                OpenOptions::new().read(true).clone(),
+                OpenOptions::new().read(true),
                &ctx,
            )
            .await?;
--- a/pageserver/src/virtual_file/open_options.rs
+++ b/pageserver/src/virtual_file/open_options.rs
@@ -1,6 +1,7 @@
 //! Enum-dispatch to the `OpenOptions` type of the respective [`super::IoEngineKind`];

 use std::os::fd::OwnedFd;
+use std::os::unix::fs::OpenOptionsExt;
 use std::path::Path;

 use super::io_engine::IoEngine;
@@ -43,7 +44,7 @@ impl OpenOptions {
        self.write
    }

-    pub fn read(&mut self, read: bool) -> &mut OpenOptions {
+    pub fn read(mut self, read: bool) -> Self {
        match &mut self.inner {
            Inner::StdFs(x) => {
                let _ = x.read(read);
@@ -56,7 +57,7 @@ impl OpenOptions {
        self
    }

-    pub fn write(&mut self, write: bool) -> &mut OpenOptions {
+    pub fn write(mut self, write: bool) -> Self {
        self.write = write;
        match &mut self.inner {
            Inner::StdFs(x) => {
@@ -70,7 +71,7 @@ impl OpenOptions {
        self
    }

-    pub fn create(&mut self, create: bool) -> &mut OpenOptions {
+    pub fn create(mut self, create: bool) -> Self {
        match &mut self.inner {
            Inner::StdFs(x) => {
                let _ = x.create(create);
@@ -83,7 +84,7 @@ impl OpenOptions {
        self
    }

-    pub fn create_new(&mut self, create_new: bool) -> &mut OpenOptions {
+    pub fn create_new(mut self, create_new: bool) -> Self {
        match &mut self.inner {
            Inner::StdFs(x) => {
                let _ = x.create_new(create_new);
@@ -96,7 +97,7 @@ impl OpenOptions {
        self
    }

-    pub fn truncate(&mut self, truncate: bool) -> &mut OpenOptions {
+    pub fn truncate(mut self, truncate: bool) -> Self {
        match &mut self.inner {
            Inner::StdFs(x) => {
                let _ = x.truncate(truncate);
@@ -124,10 +125,8 @@ impl OpenOptions {
            }
        }
    }
-}

-impl std::os::unix::prelude::OpenOptionsExt for OpenOptions {
-    fn mode(&mut self, mode: u32) -> &mut OpenOptions {
+    pub fn mode(mut self, mode: u32) -> Self {
        match &mut self.inner {
            Inner::StdFs(x) => {
                let _ = x.mode(mode);
@@ -140,7 +139,7 @@ impl std::os::unix::prelude::OpenOptionsExt for OpenOptions {
        self
    }

-    fn custom_flags(&mut self, flags: i32) -> &mut OpenOptions {
+    pub fn custom_flags(mut self, flags: i32) -> Self {
        match &mut self.inner {
            Inner::StdFs(x) => {
                let _ = x.custom_flags(flags);
--- a/pgxn/neon/Makefile
+++ b/pgxn/neon/Makefile
@@ -36,6 +36,8 @@ DATA = \
 	neon--1.2--1.3.sql \
 	neon--1.3--1.4.sql \
 	neon--1.4--1.5.sql \
+	neon--1.5--1.6.sql \
+	neon--1.6--1.5.sql \
 	neon--1.5--1.4.sql \
 	neon--1.4--1.3.sql \
 	neon--1.3--1.2.sql \
--- a/pgxn/neon/communicator.c
+++ b/pgxn/neon/communicator.c
@@ -687,8 +687,14 @@ prefetch_wait_for(uint64 ring_index)
 		END_PREFETCH_RECEIVE_WORK();
 		CHECK_FOR_INTERRUPTS();
 	}
-
-	return result;
+	if (result)
+	{
+		/* Check that slot is actually received (srver can be disconnected in prefetch_pump_state called from CHECK_FOR_INTERRUPTS */
+		PrefetchRequest *slot = GetPrfSlot(ring_index);
+		return slot->status == PRFS_RECEIVED;
+	}
+	return false;
+;
 }

 /*
--- a/pgxn/neon/file_cache.c
+++ b/pgxn/neon/file_cache.c
@@ -98,7 +98,6 @@
 #define MB					((uint64)1024*1024)

 #define SIZE_MB_TO_CHUNKS(size) ((uint32)((size) * MB / BLCKSZ >> lfc_chunk_size_log))
-
 #define BLOCK_TO_CHUNK_OFF(blkno) ((blkno) & (lfc_blocks_per_chunk-1))

 /*
@@ -135,6 +134,15 @@ typedef struct FileCacheEntry
 #define N_COND_VARS 	64
 #define CV_WAIT_TIMEOUT	10

+#define MAX_PREWARM_WORKERS 8
+
+typedef struct PrewarmWorkerState
+{
+	uint32		prewarmed_pages;
+	uint32		skipped_pages;
+	TimestampTz completed;
+} PrewarmWorkerState;
+
 typedef struct FileCacheControl
 {
 	uint64		generation;		/* generation is needed to handle correct hash
@@ -156,25 +164,43 @@ typedef struct FileCacheControl
 	dlist_head  holes;          /* double linked list of punched holes */
 	HyperLogLogState wss_estimation; /* estimation of working set size */
 	ConditionVariable cv[N_COND_VARS]; /* turnstile of condition variables */
+	PrewarmWorkerState prewarm_workers[MAX_PREWARM_WORKERS];
+	size_t n_prewarm_workers;
+	size_t n_prewarm_entries;
+	size_t total_prewarm_pages;
+	size_t prewarm_batch;
+	bool   prewarm_active;
+	bool   prewarm_canceled;
+	dsm_handle prewarm_lfc_state_handle;
 } FileCacheControl;

-bool lfc_store_prefetch_result;
+#define FILE_CACHE_STATE_MAGIC 0xfcfcfcfc
+
+#define FILE_CACHE_STATE_BITMAP(fcs)	((uint8*)&(fcs)->chunks[(fcs)->n_chunks])
+#define FILE_CACHE_STATE_SIZE_FOR_CHUNKS(n_chunks)	(sizeof(FileCacheState) + (n_chunks)*sizeof(BufferTag) + (((n_chunks) * lfc_blocks_per_chunk)+7)/8)
+#define FILE_CACHE_STATE_SIZE(fcs)		(sizeof(FileCacheState) + (fcs->n_chunks)*sizeof(BufferTag) + (((fcs->n_chunks) << fcs->chunk_size_log)+7)/8)

 static HTAB *lfc_hash;
 static int	lfc_desc = -1;
 static LWLockId lfc_lock;
 static int	lfc_max_size;
 static int	lfc_size_limit;
+static int	lfc_prewarm_limit;
+static int	lfc_prewarm_batch;
 static int	lfc_chunk_size_log = MAX_BLOCKS_PER_CHUNK_LOG;
 static int	lfc_blocks_per_chunk = MAX_BLOCKS_PER_CHUNK;
 static char *lfc_path;
 static uint64 lfc_generation;
 static FileCacheControl *lfc_ctl;
+static bool lfc_do_prewarm;
 static shmem_startup_hook_type prev_shmem_startup_hook;
 #if PG_VERSION_NUM>=150000
 static shmem_request_hook_type prev_shmem_request_hook;
 #endif

+bool lfc_store_prefetch_result;
+bool lfc_prewarm_update_ws_estimation;
+
 #define LFC_ENABLED() (lfc_ctl->limit != 0)

 /*
@@ -500,6 +526,17 @@ lfc_init(void)
 							NULL,
 							NULL);

+	DefineCustomBoolVariable("neon.prewarm_update_ws_estimation",
+							"Consider prewarmed pages for working set estimation",
+							NULL,
+							&lfc_prewarm_update_ws_estimation,
+							true,
+							PGC_SUSET,
+							0,
+							NULL,
+							NULL,
+							NULL);
+
 	DefineCustomIntVariable("neon.max_file_cache_size",
 							"Maximal size of Neon local file cache",
 							NULL,
@@ -550,6 +587,32 @@ lfc_init(void)
 							lfc_change_chunk_size,
 							NULL);

+	DefineCustomIntVariable("neon.file_cache_prewarm_limit",
+							"Maximal number of prewarmed chunks",
+							NULL,
+							&lfc_prewarm_limit,
+							INT_MAX,	/* no limit by default */
+							0,
+							INT_MAX,
+							PGC_SIGHUP,
+							0,
+							NULL,
+							NULL,
+							NULL);
+
+	DefineCustomIntVariable("neon.file_cache_prewarm_batch",
+							"Number of pages retrivied by prewarm from page server",
+							NULL,
+							&lfc_prewarm_batch,
+							64,
+							1,
+							INT_MAX,
+							PGC_SIGHUP,
+							0,
+							NULL,
+							NULL,
+							NULL);
+
 	if (lfc_max_size == 0)
 		return;

@@ -563,6 +626,317 @@ lfc_init(void)
 #endif
 }

+FileCacheState*
+lfc_get_state(size_t max_entries)
+{
+	FileCacheState* fcs = NULL;
+
+	if (lfc_maybe_disabled() || max_entries == 0)	/* fast exit if file cache is disabled */
+		return NULL;
+
+	LWLockAcquire(lfc_lock, LW_SHARED);
+
+	if (LFC_ENABLED())
+	{
+		dlist_iter iter;
+		size_t i = 0;
+		uint8* bitmap;
+		size_t n_pages = 0;
+		size_t n_entries = Min(max_entries, lfc_ctl->used - lfc_ctl->pinned);
+		size_t state_size = FILE_CACHE_STATE_SIZE_FOR_CHUNKS(n_entries);
+		fcs = (FileCacheState*)palloc0(state_size);
+		SET_VARSIZE(fcs, state_size);
+		fcs->magic = FILE_CACHE_STATE_MAGIC;
+		fcs->chunk_size_log = lfc_chunk_size_log;
+		fcs->n_chunks = n_entries;
+		bitmap = FILE_CACHE_STATE_BITMAP(fcs);
+
+		dlist_reverse_foreach(iter, &lfc_ctl->lru)
+		{
+			FileCacheEntry *entry = dlist_container(FileCacheEntry, list_node, iter.cur);
+			fcs->chunks[i] = entry->key;
+			for (int j = 0; j < lfc_blocks_per_chunk; j++)
+			{
+				if (GET_STATE(entry, j) != UNAVAILABLE)
+				{
+					BITMAP_SET(bitmap, i*lfc_blocks_per_chunk + j);
+					n_pages += 1;
+				}
+			}
+			if (++i == n_entries)
+				break;
+		}
+		Assert(i == n_entries);
+		fcs->n_pages = n_pages;
+		Assert(pg_popcount((char*)bitmap, ((n_entries << lfc_chunk_size_log) + 7)/8) == n_pages);
+		elog(LOG, "LFC: save state of %d chunks %d pages", (int)n_entries, (int)n_pages);
+	}
+
+	LWLockRelease(lfc_lock);
+
+	return fcs;
+}
+
+/*
+ * Prewarm LFC cache to the specified state. It uses lfc_prefetch function to load prewarmed page without hoilding shared buffer lock
+ * and avoid race conditions with other backends.
+ */
+void
+lfc_prewarm(FileCacheState* fcs, uint32 n_workers)
+{
+	size_t fcs_chunk_size_log;
+	size_t n_entries;
+	size_t prewarm_batch = Min(lfc_prewarm_batch, readahead_buffer_size);
+	size_t fcs_size;
+	dsm_segment *seg;
+	BackgroundWorkerHandle* bgw_handle[MAX_PREWARM_WORKERS];
+
+
+	if (!lfc_ensure_opened())
+		return;
+
+	if (prewarm_batch == 0 || lfc_prewarm_limit == 0 || n_workers == 0)
+	{
+		elog(LOG, "LFC: prewarm is disabled");
+		return;
+	}
+
+	if (n_workers > MAX_PREWARM_WORKERS)
+	{
+		elog(ERROR, "LFC: Too much prewarm workers, maximum is %d", MAX_PREWARM_WORKERS);
+	}
+
+	if (fcs == NULL || fcs->n_chunks == 0)
+	{
+		elog(LOG, "LFC: nothing to prewarm");
+		return;
+	}
+
+	if (fcs->magic != FILE_CACHE_STATE_MAGIC)
+	{
+		elog(ERROR, "LFC: Invalid file cache state magic: %X", fcs->magic);
+	}
+
+	fcs_size = VARSIZE(fcs);
+	if (FILE_CACHE_STATE_SIZE(fcs) != fcs_size)
+	{
+		elog(ERROR, "LFC: Invalid file cache state size: %u vs. %u", (unsigned)FILE_CACHE_STATE_SIZE(fcs), VARSIZE(fcs));
+	}
+
+	fcs_chunk_size_log = fcs->chunk_size_log;
+	if (fcs_chunk_size_log > MAX_BLOCKS_PER_CHUNK_LOG)
+	{
+		elog(ERROR, "LFC: Invalid chunk size log: %u", fcs->chunk_size_log);
+	}
+
+	n_entries = Min(fcs->n_chunks, lfc_prewarm_limit);
+	Assert(n_entries != 0);
+
+	LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
+
+	/* Do not prewarm more entries than LFC limit */
+	if (lfc_ctl->limit <= lfc_ctl->size)
+	{
+		elog(LOG, "LFC: skip prewarm because LFC is already filled");
+		LWLockRelease(lfc_lock);
+		return;
+	}
+
+	if (lfc_ctl->prewarm_active)
+	{
+		LWLockRelease(lfc_lock);
+		elog(ERROR, "LFC: skip prewarm because another prewarm is still active");
+	}
+	lfc_ctl->n_prewarm_entries = n_entries;
+	lfc_ctl->n_prewarm_workers = n_workers;
+	lfc_ctl->prewarm_active = true;
+	lfc_ctl->prewarm_canceled = false;
+	lfc_ctl->prewarm_batch = prewarm_batch;
+	memset(lfc_ctl->prewarm_workers, 0, n_workers*sizeof(PrewarmWorkerState));
+
+	LWLockRelease(lfc_lock);
+
+	/* Calculate total number of pages to be prewarmed */
+	lfc_ctl->total_prewarm_pages = fcs->n_pages;
+
+	seg = dsm_create(fcs_size, 0);
+	memcpy(dsm_segment_address(seg), fcs, fcs_size);
+	lfc_ctl->prewarm_lfc_state_handle = dsm_segment_handle(seg);
+
+	/* Spawn background workers */
+	for (uint32 i = 0; i < n_workers; i++)
+	{
+		BackgroundWorker worker = {0};
+
+		worker.bgw_flags = BGWORKER_SHMEM_ACCESS;
+		worker.bgw_start_time = BgWorkerStart_ConsistentState;
+		worker.bgw_restart_time = BGW_NEVER_RESTART;
+		strcpy(worker.bgw_library_name, "neon");
+		strcpy(worker.bgw_function_name, "lfc_prewarm_main");
+		snprintf(worker.bgw_name, BGW_MAXLEN, "LFC prewarm worker %d", i+1);
+		strcpy(worker.bgw_type, "LFC prewarm worker");
+		worker.bgw_main_arg = Int32GetDatum(i);
+		/* must set notify PID to wait for shutdown */
+		worker.bgw_notify_pid = MyProcPid;
+
+		if (!RegisterDynamicBackgroundWorker(&worker, &bgw_handle[i]))
+		{
+			ereport(LOG,
+					(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
+					 errmsg("LFC: registering dynamic bgworker prewarm failed"),
+					 errhint("Consider increasing the configuration parameter \"%s\".", "max_worker_processes")));
+			n_workers = i;
+			lfc_ctl->prewarm_canceled = true;
+			break;
+		}
+	}
+
+	for (uint32 i = 0; i < n_workers; i++)
+	{
+		bool interrupted;
+		do
+		{
+			interrupted = false;
+			PG_TRY();
+			{
+				BgwHandleStatus status = WaitForBackgroundWorkerShutdown(bgw_handle[i]);
+				if (status != BGWH_STOPPED && status != BGWH_POSTMASTER_DIED)
+				{
+					elog(LOG, "LFC: Unexpected status of prewarm worker termination: %d", status);
+				}
+			}
+			PG_CATCH();
+			{
+				elog(LOG, "LFC: cancel prewarm");
+				lfc_ctl->prewarm_canceled = true;
+				interrupted = true;
+			}
+			PG_END_TRY();
+		} while (interrupted);
+
+		if (!lfc_ctl->prewarm_workers[i].completed)
+		{
+			/* Background worker doesn't set completion time: it means that it was abnormally terminated */
+			elog(LOG, "LFC: prewarm worker %d failed", i+1);
+			/* Set completion time to prevent get_prewarm_info from considering this worker as active */
+			lfc_ctl->prewarm_workers[i].completed = GetCurrentTimestamp();
+		}
+	}
+	dsm_detach(seg);
+
+	LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
+	lfc_ctl->prewarm_active = false;
+	LWLockRelease(lfc_lock);
+}
+
+void
+lfc_prewarm_main(Datum main_arg)
+{
+	size_t snd_idx = 0, rcv_idx = 0;
+	size_t n_sent = 0, n_received = 0;
+	size_t fcs_chunk_size_log;
+	size_t max_prefetch_pages;
+	size_t prewarm_batch;
+	size_t n_workers;
+	dsm_segment *seg;
+	FileCacheState* fcs;
+	uint8* bitmap;
+	BufferTag tag;
+	PrewarmWorkerState* ws;
+	uint32 worker_id = DatumGetInt32(main_arg);
+
+	pqsignal(SIGTERM, die);
+	BackgroundWorkerUnblockSignals();
+
+	seg = dsm_attach(lfc_ctl->prewarm_lfc_state_handle);
+	if (seg == NULL)
+		ereport(ERROR,
+				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				 errmsg("could not map dynamic shared memory segment")));
+
+	fcs = (FileCacheState*) dsm_segment_address(seg);
+	prewarm_batch = lfc_ctl->prewarm_batch;
+	fcs_chunk_size_log = fcs->chunk_size_log;
+	n_workers = lfc_ctl->n_prewarm_workers;
+	max_prefetch_pages = lfc_ctl->n_prewarm_entries << fcs_chunk_size_log;
+	ws = &lfc_ctl->prewarm_workers[worker_id];
+	bitmap = FILE_CACHE_STATE_BITMAP(fcs);
+
+	/* enable prefetch in LFC */
+	lfc_store_prefetch_result = true;
+	lfc_do_prewarm = true; /* Flag for lfc_prefetch preventing replacement of existed entries if LFC cache is full */
+
+	elog(LOG, "LFC: worker %d start prewarming", worker_id);
+	while (!lfc_ctl->prewarm_canceled)
+	{
+		if (snd_idx < max_prefetch_pages)
+		{
+			if ((snd_idx >> fcs_chunk_size_log) % n_workers != worker_id)
+			{
+				/* If there are multiple workers, split chunks between them */
+				snd_idx += 1 << fcs_chunk_size_log;
+			}
+			else
+			{
+				if (BITMAP_ISSET(bitmap, snd_idx))
+				{
+					tag = fcs->chunks[snd_idx >> fcs_chunk_size_log];
+					tag.blockNum += snd_idx & ((1 << fcs_chunk_size_log) - 1);
+					if (!lfc_cache_contains(BufTagGetNRelFileInfo(tag), tag.forkNum, tag.blockNum))
+					{
+						(void)communicator_prefetch_register_bufferv(tag, NULL, 1, NULL);
+						n_sent += 1;
+					}
+					else
+					{
+						ws->skipped_pages += 1;
+						BITMAP_CLR(bitmap, snd_idx);
+					}
+				}
+				snd_idx += 1;
+			}
+		}
+		if (n_sent >= n_received + prewarm_batch || snd_idx == max_prefetch_pages)
+		{
+			if (n_received == n_sent && snd_idx == max_prefetch_pages)
+			{
+				break;
+			}
+			if ((rcv_idx >> fcs_chunk_size_log) % n_workers != worker_id)
+			{
+				/* Skip chunks processed by other workers */
+				rcv_idx += 1 << fcs_chunk_size_log;
+				continue;
+			}
+
+			/* Locate next block to prefetch */
+			while (!BITMAP_ISSET(bitmap, rcv_idx))
+			{
+				rcv_idx += 1;
+			}
+			tag = fcs->chunks[rcv_idx >> fcs_chunk_size_log];
+			tag.blockNum += rcv_idx & ((1 << fcs_chunk_size_log) - 1);
+			if (communicator_prefetch_receive(tag))
+			{
+				ws->prewarmed_pages += 1;
+			}
+			else
+			{
+				ws->skipped_pages += 1;
+			}
+			rcv_idx += 1;
+			n_received += 1;
+		}
+	}
+	/* No need to perform prefetch cleanup here because prewarm worker will be terminated and
+	 * connection to PS dropped just after return from this function.
+	 */
+	Assert(n_sent == n_received || lfc_ctl->prewarm_canceled);
+	elog(LOG, "LFC: worker %d complete prewarming: loaded %ld pages", worker_id, (long)n_received);
+	lfc_ctl->prewarm_workers[worker_id].completed = GetCurrentTimestamp();
+}
+
+
 /*
 * Check if page is present in the cache.
 * Returns true if page is found in local cache.
@@ -1001,8 +1375,11 @@ lfc_init_new_entry(FileCacheEntry* entry, uint32 hash)
 	 * If we can't (e.g. because all other slots are being accessed)
 	 * then we will remove this entry from the hash and continue
 	 * on to the next chunk, as we may not exceed the limit.
+	 *
+	 * While prewarming LFC we do not want to replace existed entries,
+	 * so we just stop prewarm is LFC cache is full.
 	 */
-	else if (!dlist_is_empty(&lfc_ctl->lru))
+	else if (!dlist_is_empty(&lfc_ctl->lru) && !lfc_do_prewarm)
 	{
 		/* Cache overflow: evict least recently used chunk */
 		FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node,
@@ -1026,6 +1403,7 @@ lfc_init_new_entry(FileCacheEntry* entry, uint32 hash)
 		/* Can't add this chunk - we don't have the space for it */
 		hash_search_with_hash_value(lfc_hash, &entry->key, hash,
 									HASH_REMOVE, NULL);
+		lfc_ctl->prewarm_canceled = true; /* cancel prewarm if LFC limit is reached */
 		return false;
 	}

@@ -1112,9 +1490,11 @@ lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,

 	entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_ENTER, &found);

-	tag.blockNum = blkno;
-	addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag)));
-
+	if (lfc_prewarm_update_ws_estimation)
+	{
+		tag.blockNum = blkno;
+		addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag)));
+	}
 	if (found)
 	{
 		state = GET_STATE(entry, chunk_offs);
@@ -1748,3 +2128,82 @@ approximate_working_set_size(PG_FUNCTION_ARGS)
 	}
 	PG_RETURN_NULL();
 }
+
+PG_FUNCTION_INFO_V1(get_local_cache_state);
+
+Datum
+get_local_cache_state(PG_FUNCTION_ARGS)
+{
+	size_t max_entries = PG_ARGISNULL(0) ? lfc_prewarm_limit : PG_GETARG_INT32(0);
+	FileCacheState* fcs = lfc_get_state(max_entries);
+	if (fcs != NULL)
+		PG_RETURN_BYTEA_P((bytea*)fcs);
+	else
+		PG_RETURN_NULL();
+}
+
+PG_FUNCTION_INFO_V1(prewarm_local_cache);
+
+Datum
+prewarm_local_cache(PG_FUNCTION_ARGS)
+{
+	bytea* state = PG_GETARG_BYTEA_PP(0);
+	uint32 n_workers =  PG_GETARG_INT32(1);
+	FileCacheState* fcs = (FileCacheState*)state;
+
+	lfc_prewarm(fcs, n_workers);
+
+	PG_RETURN_NULL();
+}
+
+PG_FUNCTION_INFO_V1(get_prewarm_info);
+
+Datum
+get_prewarm_info(PG_FUNCTION_ARGS)
+{
+	Datum		values[4];
+	bool		nulls[4];
+	TupleDesc	tupdesc;
+	uint32 prewarmed_pages = 0;
+	uint32 skipped_pages = 0;
+	uint32 active_workers = 0;
+	uint32 total_pages;
+	size_t n_workers;
+
+	if (lfc_size_limit == 0)
+		PG_RETURN_NULL();
+
+	LWLockAcquire(lfc_lock, LW_SHARED);
+	if (!lfc_ctl || lfc_ctl->n_prewarm_workers == 0)
+	{
+		LWLockRelease(lfc_lock);
+		PG_RETURN_NULL();
+	}
+	n_workers = lfc_ctl->n_prewarm_workers;
+	total_pages = lfc_ctl->total_prewarm_pages;
+	for (size_t i = 0; i < n_workers; i++)
+	{
+		PrewarmWorkerState* ws = &lfc_ctl->prewarm_workers[i];
+		prewarmed_pages += ws->prewarmed_pages;
+		skipped_pages += ws->skipped_pages;
+		active_workers += ws->completed != 0;
+	}
+	LWLockRelease(lfc_lock);
+
+	tupdesc = CreateTemplateTupleDesc(4);
+	TupleDescInitEntry(tupdesc, (AttrNumber) 1, "total_pages", INT4OID, -1, 0);
+	TupleDescInitEntry(tupdesc, (AttrNumber) 2, "prewarmed_pages", INT4OID, -1, 0);
+	TupleDescInitEntry(tupdesc, (AttrNumber) 3, "skipped_pages", INT4OID, -1, 0);
+	TupleDescInitEntry(tupdesc, (AttrNumber) 4, "active_workers", INT4OID, -1, 0);
+	tupdesc = BlessTupleDesc(tupdesc);
+
+	MemSet(nulls, 0, sizeof(nulls));
+
+	values[0] = Int32GetDatum(total_pages);
+	values[1] = Int32GetDatum(prewarmed_pages);
+	values[2] = Int32GetDatum(skipped_pages);
+	values[3] = Int32GetDatum(active_workers);
+
+	PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls)));
+}
+
--- a/pgxn/neon/file_cache.h
+++ b/pgxn/neon/file_cache.h
@@ -13,6 +13,17 @@

 #include "neon_pgversioncompat.h"

+typedef struct FileCacheState
+{
+	int32		vl_len_;		/* varlena header (do not touch directly!) */
+	uint32		magic;
+	uint32		n_chunks;
+	uint32		n_pages;
+	uint16		chunk_size_log;
+	BufferTag	chunks[FLEXIBLE_ARRAY_MEMBER];
+	/* followed by bitmap */
+} FileCacheState;
+
 /* GUCs */
 extern bool lfc_store_prefetch_result;

@@ -32,7 +43,10 @@ extern int lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum,
 extern void lfc_init(void);
 extern bool lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
 						 const void* buffer, XLogRecPtr lsn);
+extern FileCacheState* lfc_get_state(size_t max_entries);
+extern void lfc_prewarm(FileCacheState* fcs, uint32 n_workers);

+PGDLLEXPORT void lfc_prewarm_main(Datum main_arg);

 static inline bool
 lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -26,6 +26,7 @@
 #include "portability/instr_time.h"
 #include "postmaster/interrupt.h"
 #include "storage/buf_internals.h"
+#include "storage/fd.h"
 #include "storage/ipc.h"
 #include "storage/lwlock.h"
 #include "storage/pg_shmem.h"
@@ -79,6 +80,7 @@ int         neon_protocol_version = 3;
 static int	neon_compute_mode = 0;
 static int	max_reconnect_attempts = 60;
 static int	stripe_size;
+static int	max_sockets;

 static int pageserver_response_log_timeout = 10000;
 /* 2.5 minutes. A bit higher than highest default TCP retransmission timeout */
@@ -336,6 +338,13 @@ load_shard_map(shardno_t shard_no, char *connstr_p, shardno_t *num_shards_p)
 				pageserver_disconnect(i);
 		}
 		pagestore_local_counter = end_update_counter;
+
+        /* Reserve file descriptors for sockets */
+		while (max_sockets < num_shards)
+		{
+			max_sockets += 1;
+			ReserveExternalFD();
+		}
 	}

 	if (num_shards_p)
@@ -736,8 +745,8 @@ pageserver_connect(shardno_t shard_no, int elevel)
 	default:
 		neon_shard_log(shard_no, ERROR, "libpagestore: invalid connection state %d", shard->state);
 	}
-	/* This shouldn't be hit */
-	Assert(false);
+
+	pg_unreachable();
 }

 static void
@@ -877,6 +886,7 @@ retry:
 			int			port;
 			int			sndbuf;
 			int			recvbuf;
+			uint64*		max_wait;

 			get_local_port(PQsocket(pageserver_conn), &port);
 			get_socket_stats(PQsocket(pageserver_conn), &sndbuf, &recvbuf);
@@ -887,7 +897,10 @@ retry:
 						   shard->nrequests_sent, shard->nresponses_received, port, sndbuf, recvbuf,
 				           pageserver_conn->inStart, pageserver_conn->inEnd);
 			shard->receive_last_log_time = now;
+			MyNeonCounters->compute_getpage_stuck_requests_total += !shard->receive_logged;
 			shard->receive_logged = true;
+			max_wait = &MyNeonCounters->compute_getpage_max_inflight_stuck_time_ms;
+			*max_wait = Max(*max_wait, INSTR_TIME_GET_MILLISEC(since_start));
 		}

 		/*
@@ -910,6 +923,7 @@ retry:
 			get_local_port(PQsocket(pageserver_conn), &port);
 			neon_shard_log(shard_no, LOG, "no response from pageserver for %0.3f s, disconnecting (socket port=%d)",
 					   INSTR_TIME_GET_DOUBLE(since_start), port);
+			MyNeonCounters->compute_getpage_max_inflight_stuck_time_ms = 0;
 			pageserver_disconnect(shard_no);
 			return -1;
 		}
@@ -933,6 +947,7 @@ retry:
 	INSTR_TIME_SET_ZERO(shard->receive_start_time);
 	INSTR_TIME_SET_ZERO(shard->receive_last_log_time);
 	shard->receive_logged = false;
+	MyNeonCounters->compute_getpage_max_inflight_stuck_time_ms = 0;

 	return ret;
 }
--- a/pgxn/neon/libpqwalproposer.h
+++ b/pgxn/neon/libpqwalproposer.h
@@ -86,7 +86,7 @@ typedef struct WalProposerConn
 								 * walprop_async_read */
 } WalProposerConn;

-extern WalProposerConn *libpqwp_connect_start(char *conninfo);
+extern WalProposerConn *libpqwp_connect_start(const char *conninfo);
 extern bool libpqwp_send_query(WalProposerConn *conn, char *query);
 extern WalProposerExecStatusType libpqwp_get_query_result(WalProposerConn *conn);
 extern PGAsyncReadResult libpqwp_async_read(WalProposerConn *conn, char **buf, int *amount);
--- a/pgxn/neon/neon--1.5--1.6.sql
+++ b/pgxn/neon/neon--1.5--1.6.sql
@@ -0,0 +1,22 @@
+\echo Use "ALTER EXTENSION neon UPDATE TO '1.6'" to load this file. \quit
+
+CREATE FUNCTION get_prewarm_info(out total_pages integer, out prewarmed_pages integer, out skipped_pages integer, out active_workers integer)
+RETURNS record
+AS 'MODULE_PATHNAME', 'get_prewarm_info'
+LANGUAGE C STRICT
+PARALLEL SAFE;
+
+CREATE FUNCTION get_local_cache_state(max_chunks integer default null)
+RETURNS bytea
+AS 'MODULE_PATHNAME', 'get_local_cache_state'
+LANGUAGE C
+PARALLEL UNSAFE;
+
+CREATE FUNCTION prewarm_local_cache(state bytea, n_workers integer default 1)
+RETURNS void
+AS 'MODULE_PATHNAME', 'prewarm_local_cache'
+LANGUAGE C STRICT
+PARALLEL UNSAFE;
+
+
+
--- a/pgxn/neon/neon--1.6--1.5.sql
+++ b/pgxn/neon/neon--1.6--1.5.sql
@@ -0,0 +1,7 @@
+DROP FUNCTION IF EXISTS get_prewarm_info(out total_pages integer, out prewarmed_pages integer, out skipped_pages integer, out active_workers integer);
+
+DROP FUNCTION IF EXISTS get_local_cache_state(max_chunks integer);
+
+DROP FUNCTION IF EXISTS prewarm_local_cache(state bytea, n_workers integer default 1);
+
+
--- a/pgxn/neon/neon_lwlsncache.c
+++ b/pgxn/neon/neon_lwlsncache.c
@@ -4,6 +4,7 @@

 #include "miscadmin.h"
 #include "access/xlog.h"
+#include "access/xlog_internal.h"
 #include "storage/ipc.h"
 #include "storage/shmem.h"
 #include "storage/buf_internals.h"
@@ -396,9 +397,10 @@ SetLastWrittenLSNForBlockRangeInternal(XLogRecPtr lsn,
 XLogRecPtr
 neon_set_lwlsn_block_range(XLogRecPtr lsn, NRelFileInfo rlocator, ForkNumber forknum, BlockNumber from, BlockNumber n_blocks)
 {
-	if (lsn < FirstNormalUnloggedLSN || n_blocks == 0 || LwLsnCache->lastWrittenLsnCacheSize == 0)
+	if (lsn == InvalidXLogRecPtr || n_blocks == 0 || LwLsnCache->lastWrittenLsnCacheSize == 0)
 		return lsn;

+	Assert(lsn >= WalSegMinSize);
 	LWLockAcquire(LastWrittenLsnLock, LW_EXCLUSIVE);
 	lsn = SetLastWrittenLSNForBlockRangeInternal(lsn, rlocator, forknum, from, n_blocks);
 	LWLockRelease(LastWrittenLsnLock);
@@ -435,7 +437,6 @@ neon_set_lwlsn_block_v(const XLogRecPtr *lsns, NRelFileInfo relfilenode,
 		NInfoGetRelNumber(relfilenode) == InvalidOid)
 		return InvalidXLogRecPtr;

-	
 	BufTagInit(key,  relNumber, forknum, blockno, spcOid, dbOid);

 	LWLockAcquire(LastWrittenLsnLock, LW_EXCLUSIVE);
@@ -444,6 +445,10 @@ neon_set_lwlsn_block_v(const XLogRecPtr *lsns, NRelFileInfo relfilenode,
 	{
 		XLogRecPtr	lsn = lsns[i];

+		if (lsn == InvalidXLogRecPtr)
+			continue;
+
+		Assert(lsn >= WalSegMinSize);
 		key.blockNum = blockno + i;
 		entry = hash_search(lastWrittenLsnCache, &key, HASH_ENTER, &found);
 		if (found)
--- a/pgxn/neon/neon_perf_counters.c
+++ b/pgxn/neon/neon_perf_counters.c
@@ -148,7 +148,7 @@ histogram_to_metrics(IOHistogram histogram,
 static metric_t *
 neon_perf_counters_to_metrics(neon_per_backend_counters *counters)
 {
-#define NUM_METRICS ((2 + NUM_IO_WAIT_BUCKETS) * 3 + 10)
+#define NUM_METRICS ((2 + NUM_IO_WAIT_BUCKETS) * 3 + 12)
 	metric_t   *metrics = palloc((NUM_METRICS + 1) * sizeof(metric_t));
 	int			i = 0;

@@ -166,6 +166,8 @@ neon_perf_counters_to_metrics(neon_per_backend_counters *counters)

 	APPEND_METRIC(getpage_prefetch_requests_total);
 	APPEND_METRIC(getpage_sync_requests_total);
+	APPEND_METRIC(compute_getpage_stuck_requests_total);
+	APPEND_METRIC(compute_getpage_max_inflight_stuck_time_ms);
 	APPEND_METRIC(getpage_prefetch_misses_total);
 	APPEND_METRIC(getpage_prefetch_discards_total);
 	APPEND_METRIC(pageserver_requests_sent_total);
@@ -294,6 +296,11 @@ neon_get_perf_counters(PG_FUNCTION_ARGS)
 		totals.file_cache_hits_total += counters->file_cache_hits_total;
 		histogram_merge_into(&totals.file_cache_read_hist, &counters->file_cache_read_hist);
 		histogram_merge_into(&totals.file_cache_write_hist, &counters->file_cache_write_hist);
+
+		totals.compute_getpage_stuck_requests_total += counters->compute_getpage_stuck_requests_total;
+		totals.compute_getpage_max_inflight_stuck_time_ms = Max(
+			totals.compute_getpage_max_inflight_stuck_time_ms,
+			counters->compute_getpage_max_inflight_stuck_time_ms);
 	}

 	metrics = neon_perf_counters_to_metrics(&totals);
--- a/pgxn/neon/neon_perf_counters.h
+++ b/pgxn/neon/neon_perf_counters.h
@@ -57,6 +57,18 @@ typedef struct
 	uint64		getpage_prefetch_requests_total;
 	uint64		getpage_sync_requests_total;

+	/* 
+	 * Total number of Getpage requests left without an answer for more than
+	 * pageserver_response_log_timeout but less than pageserver_response_disconnect_timeout
+	 */
+	uint64 compute_getpage_stuck_requests_total;
+
+	/* 
+	 * Longest waiting time for active stuck requests. If a stuck request gets a
+	 * response or disconnects, this metric is updated
+	 */
+	uint64 compute_getpage_max_inflight_stuck_time_ms;
+
 	/*
 	 * Total number of readahead misses; consisting of either prefetches that
 	 * don't satisfy the LSN bounds, or cases where no readahead was issued
--- a/pgxn/neon/neon_walreader.c
+++ b/pgxn/neon/neon_walreader.c
@@ -150,7 +150,7 @@ NeonWALReaderFree(NeonWALReader *state)
 * fetched from timeline 'tli'.
 *
 * Returns NEON_WALREAD_SUCCESS if succeeded, NEON_WALREAD_ERROR if an error
- * occurs, in which case 'err' has the desciption. Error always closes remote
+ * occurs, in which case 'err' has the description. Error always closes remote
 * connection, if there was any, so socket subscription should be removed.
 *
 * NEON_WALREAD_WOULDBLOCK means caller should obtain socket to wait for with
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -1989,8 +1989,14 @@ neon_start_unlogged_build(SMgrRelation reln)
 			neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
 	}

+#if PG_MAJORVERSION_NUM >= 17
+	/*
+	 * We have to disable this check for pg14-16 because sorted build of GIST index requires
+	 * to perform unlogged build several times
+	 */
 	if (smgrnblocks(reln, MAIN_FORKNUM) != 0)
 		neon_log(ERROR, "cannot perform unlogged index build, index is not empty ");
+#endif

 	unlogged_build_rel = reln;
 	unlogged_build_phase = UNLOGGED_BUILD_PHASE_1;
--- a/pgxn/neon/walproposer.c
+++ b/pgxn/neon/walproposer.c
@@ -35,9 +35,11 @@
 *
 *-------------------------------------------------------------------------
 */
+#include <string.h>
 #include <sys/resource.h>

 #include "postgres.h"
+#include "libpq-fe.h"
 #include "libpq/pqformat.h"
 #include "neon.h"
 #include "walproposer.h"
@@ -90,9 +92,8 @@ static void MembershipConfigurationFree(MembershipConfiguration *mconf);
 WalProposer *
 WalProposerCreate(WalProposerConfig *config, walproposer_api api)
 {
-	char	   *host;
+	char	   *connstring;
 	char	   *sep;
-	char	   *port;
 	WalProposer *wp;

 	wp = palloc0(sizeof(WalProposer));
@@ -103,71 +104,122 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
 	wp->mconf.members.len = 0;
 	wp->mconf.new_members.len = 0;

-	wp_log(LOG, "neon.safekeepers=%s", wp->config->safekeepers_list);
+	wp_log(LOG, "neon.safekeeper_connstrings=%s", wp->config->safekeeper_connstrings);

 	/*
 	 * If safekeepers list starts with g# parse generation number followed by
 	 * :
 	 */
-	if (strncmp(wp->config->safekeepers_list, "g#", 2) == 0)
+	if (strncmp(wp->config->safekeeper_connstrings, "g#", 2) == 0)
 	{
 		char	   *endptr;

 		errno = 0;
-		wp->safekeepers_generation = strtoul(wp->config->safekeepers_list + 2, &endptr, 10);
+		wp->safekeepers_generation = strtoul(wp->config->safekeeper_connstrings + 2, &endptr, 10);
 		if (errno != 0)
 		{
-			wp_log(FATAL, "failed to parse neon.safekeepers generation number: %m");
+			wp_log(FATAL, "failed to parse neon.safekeeper_connstrings generation number: %m");
 		}
 		/* Skip past : to the first hostname. */
-		host = endptr + 1;
+		connstring = endptr + 1;
 	}
 	else
 	{
-		host = wp->config->safekeepers_list;
+		wp->safekeepers_generation = INVALID_GENERATION;
+		connstring = wp->config->safekeeper_connstrings;
 	}
 	wp_log(LOG, "safekeepers_generation=%u", wp->safekeepers_generation);

-	for (; host != NULL && *host != '\0'; host = sep)
+	for (; connstring != NULL && *connstring != '\0'; connstring = sep)
 	{
-		port = strchr(host, ':');
-		if (port == NULL)
-		{
-			wp_log(FATAL, "port is not specified");
-		}
-		*port++ = '\0';
-		sep = strchr(port, ',');
+		char	   *port;
+		char	   *errmsg;
+		Safekeeper *sk;
+		PQconninfoOption *conninfo_options, *option;
+
+		sk = &wp->safekeeper[wp->n_safekeepers];
+
+		sep = strchr(connstring, ',');
 		if (sep != NULL)
 			*sep++ = '\0';
 		if (wp->n_safekeepers + 1 >= MAX_SAFEKEEPERS)
 		{
 			wp_log(FATAL, "too many safekeepers");
 		}
-		wp->safekeeper[wp->n_safekeepers].host = host;
-		wp->safekeeper[wp->n_safekeepers].port = port;
-		wp->safekeeper[wp->n_safekeepers].state = SS_OFFLINE;
-		wp->safekeeper[wp->n_safekeepers].active_state = SS_ACTIVE_SEND;
-		wp->safekeeper[wp->n_safekeepers].wp = wp;
+
+		/* TODO(tristan957): Remove this compatibility code and only keep the
+		 * else branch.
+		 *
+		 * Check if we have a connection string formatted as host:port, and use
+		 * the old parsing code.
+		 */
+		port = strchr(connstring, ':');
+		if (port)
+		{
+			sk->host = connstring;
+			*port++ = '\0';
+			sk->port = port;
+		}
+		else
+		{
+			conninfo_options = PQconninfoParse(connstring, &errmsg);
+			if (!conninfo_options)
+				wp_log(FATAL, "invalid safekeeper connection string: %s", errmsg);
+
+			// Save off the host and port for identification purposes
+			option = conninfo_options;
+			while (option)
+			{
+				if (!option->keyword)
+					break;
+
+				if (strcmp(option->keyword, "host") == 0)
+				{
+					sk->host = pstrdup(option->val);
+					goto end;
+				}
+
+				if (strcmp(option->keyword, "port") == 0)
+				{
+					sk->port = pstrdup(option->val);
+					goto end;
+				}
+
+			end:
+				// We've saved both the host and port, so we can skip iterating
+				// the rest of the list
+				if (sk->host && sk->port)
+					break;
+
+				option++;
+			}
+
+			PQconninfoFree(conninfo_options);
+			conninfo_options = option = NULL;
+		}
+
+		sk->state = SS_OFFLINE;
+		sk->active_state = SS_ACTIVE_SEND;
+		sk->wp = wp;

 		{
-			Safekeeper *sk = &wp->safekeeper[wp->n_safekeepers];
 			int			written = 0;

-			written = snprintf((char *) &sk->conninfo, MAXCONNINFO,
+			written = snprintf(sk->conninfo, sizeof(sk->conninfo),
 							   "host=%s port=%s dbname=replication options='-c timeline_id=%s tenant_id=%s'",
 							   sk->host, sk->port, wp->config->neon_timeline, wp->config->neon_tenant);
-			if (written > MAXCONNINFO || written < 0)
+			if (written > sizeof(sk->conninfo) || written < 0)
 				wp_log(FATAL, "could not create connection string for safekeeper %s:%s", sk->host, sk->port);
 		}

-		initStringInfo(&wp->safekeeper[wp->n_safekeepers].outbuf);
-		wp->safekeeper[wp->n_safekeepers].startStreamingAt = InvalidXLogRecPtr;
-		wp->safekeeper[wp->n_safekeepers].streamingAt = InvalidXLogRecPtr;
+		initStringInfo(&sk->outbuf);
+		sk->startStreamingAt = InvalidXLogRecPtr;
+		sk->streamingAt = InvalidXLogRecPtr;
 		wp->n_safekeepers += 1;
 	}
 	if (wp->n_safekeepers < 1)
 	{
-		wp_log(FATAL, "safekeepers addresses are not specified");
+		wp_log(FATAL, "safekeepers connection strings are not specified");
 	}
 	wp->quorum = wp->n_safekeepers / 2 + 1;

@@ -756,7 +808,7 @@ UpdateMemberSafekeeperPtr(WalProposer *wp, Safekeeper *sk)
 	{
 		SafekeeperId *sk_id = &wp->mconf.members.m[i];

-		if (wp->mconf.members.m[i].node_id == sk->greetResponse.nodeId)
+		if (sk_id->node_id == sk->greetResponse.nodeId)
 		{
 			/*
 			 * If mconf or list of safekeepers to connect to changed (the
@@ -781,7 +833,7 @@ UpdateMemberSafekeeperPtr(WalProposer *wp, Safekeeper *sk)
 	{
 		SafekeeperId *sk_id = &wp->mconf.new_members.m[i];

-		if (wp->mconf.new_members.m[i].node_id == sk->greetResponse.nodeId)
+		if (sk_id->node_id == sk->greetResponse.nodeId)
 		{
 			if (wp->new_members_safekeepers[i] != NULL && wp->new_members_safekeepers[i] != sk)
 			{
@@ -836,7 +888,7 @@ TermsCollectedMset(WalProposer *wp, MemberSet *mset, Safekeeper **msk, StringInf
 {
 	uint32		n_greeted = 0;

-	for (uint32 i = 0; i < wp->mconf.members.len; i++)
+	for (uint32 i = 0; i < mset->len; i++)
 	{
 		Safekeeper *sk = msk[i];

@@ -1071,7 +1123,6 @@ RecvVoteResponse(Safekeeper *sk)
 	/* ready for elected message */
 	sk->state = SS_WAIT_ELECTED;

-	wp->n_votes++;
 	/* Are we already elected? */
 	if (wp->state == WPS_CAMPAIGN)
 	{
@@ -1106,7 +1157,7 @@ VotesCollectedMset(WalProposer *wp, MemberSet *mset, Safekeeper **msk, StringInf
 {
 	uint32		n_votes = 0;

-	for (uint32 i = 0; i < wp->mconf.members.len; i++)
+	for (uint32 i = 0; i < mset->len; i++)
 	{
 		Safekeeper *sk = msk[i];

--- a/pgxn/neon/walproposer.h
+++ b/pgxn/neon/walproposer.h
@@ -707,12 +707,11 @@ typedef struct WalProposerConfig
 	char	   *neon_timeline;

 	/*
-	 * Comma-separated list of safekeepers, in the following format:
-	 * host1:port1,host2:port2,host3:port3
+	 * Comma-separated list of safekeeper connection strings
 	 *
 	 * This cstr should be editable.
 	 */
-	char	   *safekeepers_list;
+	char	   *safekeeper_connstrings;

 	/*
 	 * WalProposer reconnects to offline safekeepers once in this interval.
@@ -788,14 +787,15 @@ typedef struct WalProposer
 	/*
 	 * Generation of the membership conf of which safekeepers[] are presumably
 	 * members. To make cplane life a bit easier and have more control in
-	 * tests with which sks walproposer gets connected neon.safekeepers GUC
-	 * doesn't provide full mconf, only the list of endpoints to connect to.
-	 * We still would like to know generation associated with it because 1) we
-	 * need some handle to enforce using generations in walproposer, and
-	 * non-zero value of this serves the purpose; 2) currently we don't do
-	 * that, but in theory walproposer can update list of safekeepers to
-	 * connect to upon receiving mconf from safekeepers, and generation number
-	 * must be checked to see which list is newer.
+	 * tests with which sks walproposer gets connected
+	 * neon.safekeeper_connstrings GUC doesn't provide full mconf, only the list
+	 * of endpoints to connect to. We still would like to know generation
+	 * associated with it because 1) we need some handle to enforce using
+	 * generations in walproposer, and non-zero value of this serves the
+	 * purpose; 2) currently we don't do that, but in theory walproposer can
+	 * update list of safekeepers to connect to upon receiving mconf from
+	 * safekeepers, and generation number must be checked to see which list is
+	 * newer.
 	 */
 	Generation	safekeepers_generation;
 	/* Number of occupied slots in safekeepers[] */
@@ -845,9 +845,6 @@ typedef struct WalProposer
 	/* timeline globally starts at this LSN */
 	XLogRecPtr	timelineStartLsn;

-	/* number of votes collected from safekeepers */
-	int			n_votes;
-
 	/* number of successful connections over the lifetime of walproposer */
 	int			n_connected;

--- a/pgxn/neon/walproposer_pg.c
+++ b/pgxn/neon/walproposer_pg.c
@@ -63,7 +63,8 @@
 char	   *wal_acceptors_list = "";
 int			wal_acceptor_reconnect_timeout = 1000;
 int			wal_acceptor_connection_timeout = 10000;
-int			safekeeper_proto_version = 2;
+char	   *safekeeper_connstrings = "";
+int			safekeeper_proto_version = 3;

 /* Set to true in the walproposer bgw. */
 static bool am_walproposer;
@@ -81,6 +82,7 @@ static HotStandbyFeedback agg_hs_feedback;
 static void nwp_shmem_startup_hook(void);
 static void nwp_register_gucs(void);
 static void assign_neon_safekeepers(const char *newval, void *extra);
+static void assign_neon_safekeeper_connstrings(const char *newval, void *extra);
 static void nwp_prepare_shmem(void);
 static uint64 backpressure_lag_impl(void);
 static uint64 startup_backpressure_wrap(void);
@@ -117,8 +119,11 @@ init_walprop_config(bool syncSafekeepers)
 {
 	walprop_config.neon_tenant = neon_tenant;
 	walprop_config.neon_timeline = neon_timeline;
+	/* TODO(tristan957): Remove this compatibility code after the control plane
+	 * is updated to pass neon.safekeeper_connstrings
+	 */
 	/* WalProposerCreate scribbles directly on it, so pstrdup */
-	walprop_config.safekeepers_list = pstrdup(wal_acceptors_list);
+	walprop_config.safekeeper_connstrings = pstrdup(wal_acceptors_list[0] == '\0' ? safekeeper_connstrings : wal_acceptors_list);
 	walprop_config.safekeeper_reconnect_timeout = wal_acceptor_reconnect_timeout;
 	walprop_config.safekeeper_connection_timeout = wal_acceptor_connection_timeout;
 	walprop_config.wal_segment_size = wal_segment_size;
@@ -203,6 +208,15 @@ nwp_register_gucs(void)
 												 * GUC_LIST_QUOTE */
 							   NULL, assign_neon_safekeepers, NULL);

+	DefineCustomStringVariable(
+							   "neon.safekeeper_connstrings",
+							   "Comma-separated list of safekeeper connection strings with an optional generation prefix of the form g#X:",
+							   NULL,
+							   &safekeeper_connstrings,
+							   "",
+							   PGC_SIGHUP,
+							   GUC_LIST_INPUT,
+							   NULL, assign_neon_safekeeper_connstrings, NULL);
 	DefineCustomIntVariable(
 							"neon.safekeeper_reconnect_timeout",
 							"Walproposer reconnects to offline safekeepers once in this interval.",
@@ -228,7 +242,7 @@ nwp_register_gucs(void)
 							"Version of compute <-> safekeeper protocol.",
 							"Used while migrating from 2 to 3.",
 							&safekeeper_proto_version,
-							2, 0, INT_MAX,
+							3, 0, INT_MAX,
 							PGC_POSTMASTER,
 							0,
 							NULL, NULL, NULL);
@@ -236,24 +250,24 @@ nwp_register_gucs(void)


 static int
-split_safekeepers_list(char *safekeepers_list, char *safekeepers[])
+split_safekeeper_connstrings(char *connstrings, char *safekeepers[])
 {
 	int			n_safekeepers = 0;
-	char	   *curr_sk = safekeepers_list;
+	char	   *curr_sk = connstrings;

-	for (char *coma = safekeepers_list; coma != NULL && *coma != '\0'; curr_sk = coma)
+	for (char *comma = connstrings; comma != NULL && *comma != '\0'; curr_sk = comma)
 	{
 		if (++n_safekeepers >= MAX_SAFEKEEPERS)
 		{
 			wpg_log(FATAL, "too many safekeepers");
 		}

-		coma = strchr(coma, ',');
+		comma = strchr(comma, ',');
 		safekeepers[n_safekeepers - 1] = curr_sk;

-		if (coma != NULL)
+		if (comma != NULL)
 		{
-			*coma++ = '\0';
+			*comma++ = '\0';
 		}
 	}

@@ -261,19 +275,19 @@ split_safekeepers_list(char *safekeepers_list, char *safekeepers[])
 }

 /*
- * Accept two coma-separated strings with list of safekeeper host:port addresses.
+ * Accept two comma-separated strings with list of safekeeper host:port addresses.
 * Split them into arrays and return false if two sets do not match, ignoring the order.
 */
 static bool
-safekeepers_cmp(char *old, char *new)
+safekeeper_connstrings_cmp(char *old, char *new)
 {
 	char	   *safekeepers_old[MAX_SAFEKEEPERS];
 	char	   *safekeepers_new[MAX_SAFEKEEPERS];
 	int			len_old = 0;
 	int			len_new = 0;

-	len_old = split_safekeepers_list(old, safekeepers_old);
-	len_new = split_safekeepers_list(new, safekeepers_new);
+	len_old = split_safekeeper_connstrings(old, safekeepers_old);
+	len_new = split_safekeeper_connstrings(new, safekeepers_new);

 	if (len_old != len_new)
 	{
@@ -294,6 +308,44 @@ safekeepers_cmp(char *old, char *new)
 	return true;
 }

+/*
+ * GUC assign_hook for neon.safekeeper_connstrings. Restarts walproposer through
+ * FATAL if the list changed.
+ */
+static void
+assign_neon_safekeeper_connstrings(const char *newval, void *extra)
+{
+	char	   *newval_copy;
+	char	   *oldval;
+
+	if (!am_walproposer)
+		return;
+
+	if (!newval)
+	{
+		/* should never happen */
+		wpg_log(FATAL, "neon.safekeeper_connstrings is empty");
+	}
+
+	/* Copy values because we will modify them in split_safekeeper_connstrings() */
+	newval_copy = pstrdup(newval);
+	oldval = pstrdup(safekeeper_connstrings);
+
+	/*
+	 * TODO: restarting through FATAL is stupid and introduces 1s delay before
+	 * next bgw start. We should refactor walproposer to allow graceful exit
+	 * and thus remove this delay. XXX: If you change anything here, sync with
+	 * test_safekeepers_reconfigure_reorder.
+	 */
+	if (!safekeeper_connstrings_cmp(oldval, newval_copy))
+	{
+		wpg_log(FATAL, "restarting walproposer to change safekeeper list from \"%s\" to \"%s\"",
+				safekeeper_connstrings, newval);
+	}
+	pfree(newval_copy);
+	pfree(oldval);
+}
+
 /*
 * GUC assign_hook for neon.safekeepers. Restarts walproposer through FATAL if
 * the list changed.
@@ -323,7 +375,7 @@ assign_neon_safekeepers(const char *newval, void *extra)
 	 * and thus remove this delay. XXX: If you change anything here, sync with
 	 * test_safekeepers_reconfigure_reorder.
 	 */
-	if (!safekeepers_cmp(oldval, newval_copy))
+	if (!safekeeper_connstrings_cmp(oldval, newval_copy))
 	{
 		wpg_log(FATAL, "restarting walproposer to change safekeeper list from %s to %s",
 				wal_acceptors_list, newval);
@@ -500,8 +552,8 @@ walprop_register_bgworker(void)
 {
 	BackgroundWorker bgw;

-	/* If no wal acceptors are specified, don't start the background worker. */
-	if (*wal_acceptors_list == '\0')
+	/* If no safekeepers are specified, don't start the background worker. */
+	if (*safekeeper_connstrings == '\0')
 		return;

 	memset(&bgw, 0, sizeof(bgw));
@@ -841,7 +893,7 @@ walprop_status(Safekeeper *sk)
 }

 WalProposerConn *
-libpqwp_connect_start(char *conninfo)
+libpqwp_connect_start(const char *conninfo)
 {

 	PGconn	   *pg_conn;
--- a/proxy/README.md
+++ b/proxy/README.md
@@ -32,7 +32,7 @@ To play with it locally one may start proxy over a local postgres installation
 (see end of this page on how to generate certs with openssl):

 ```
-./target/debug/proxy -c server.crt -k server.key --auth-backend=postgres --auth-endpoint=postgres://stas@127.0.0.1:5432/stas --wss 0.0.0.0:4444
+LOGFMT=text ./target/debug/proxy -c server.crt -k server.key --auth-backend=postgres --auth-endpoint=postgres://stas@127.0.0.1:5432/stas --wss 0.0.0.0:4444
 ```

 If both postgres and proxy are running you may send a SQL query:
@@ -130,7 +130,7 @@ openssl req -new -x509 -days 365 -nodes -text -out server.crt -keyout server.key

 Then we need to build proxy with 'testing' feature and run, e.g.:
 ```sh
-RUST_LOG=proxy cargo run -p proxy --bin proxy --features testing -- --auth-backend postgres --auth-endpoint 'postgresql://postgres:proxy-postgres@127.0.0.1:5432/postgres' -c server.crt -k server.key
+RUST_LOG=proxy LOGFMT=text cargo run -p proxy --bin proxy --features testing -- --auth-backend postgres --auth-endpoint 'postgresql://postgres:proxy-postgres@127.0.0.1:5432/postgres' -c server.crt -k server.key
 ```

 Now from client you can start a new session:
--- a/proxy/src/auth/backend/jwt.rs
+++ b/proxy/src/auth/backend/jwt.rs
@@ -409,14 +409,22 @@ impl JwkCacheEntryLock {

        if let Some(exp) = payload.expiration {
            if now >= exp + CLOCK_SKEW_LEEWAY {
-                return Err(JwtError::InvalidClaims(JwtClaimsError::JwtTokenHasExpired));
+                return Err(JwtError::InvalidClaims(JwtClaimsError::JwtTokenHasExpired(
+                    exp.duration_since(SystemTime::UNIX_EPOCH)
+                        .unwrap_or_default()
+                        .as_secs(),
+                )));
            }
        }

        if let Some(nbf) = payload.not_before {
            if nbf >= now + CLOCK_SKEW_LEEWAY {
                return Err(JwtError::InvalidClaims(
-                    JwtClaimsError::JwtTokenNotYetReadyToUse,
+                    JwtClaimsError::JwtTokenNotYetReadyToUse(
+                        nbf.duration_since(SystemTime::UNIX_EPOCH)
+                            .unwrap_or_default()
+                            .as_secs(),
+                    ),
                ));
            }
        }
@@ -534,10 +542,10 @@ struct JwtPayload<'a> {
    #[serde(rename = "aud", default)]
    audience: OneOrMany,
    /// Expiration - Time after which the JWT expires
-    #[serde(deserialize_with = "numeric_date_opt", rename = "exp", default)]
+    #[serde(rename = "exp", deserialize_with = "numeric_date_opt", default)]
    expiration: Option<SystemTime>,
-    /// Not before - Time after which the JWT expires
-    #[serde(deserialize_with = "numeric_date_opt", rename = "nbf", default)]
+    /// Not before - Time before which the JWT is not valid
+    #[serde(rename = "nbf", deserialize_with = "numeric_date_opt", default)]
    not_before: Option<SystemTime>,

    // the following entries are only extracted for the sake of debug logging.
@@ -609,8 +617,15 @@ impl<'de> Deserialize<'de> for OneOrMany {
 }

 fn numeric_date_opt<'de, D: Deserializer<'de>>(d: D) -> Result<Option<SystemTime>, D::Error> {
-    let d = <Option<u64>>::deserialize(d)?;
-    Ok(d.map(|n| SystemTime::UNIX_EPOCH + Duration::from_secs(n)))
+    <Option<u64>>::deserialize(d)?
+        .map(|t| {
+            SystemTime::UNIX_EPOCH
+                .checked_add(Duration::from_secs(t))
+                .ok_or_else(|| {
+                    serde::de::Error::custom(format_args!("timestamp out of bounds: {t}"))
+                })
+        })
+        .transpose()
 }

 struct JwkRenewalPermit<'a> {
@@ -746,11 +761,11 @@ pub enum JwtClaimsError {
    #[error("invalid JWT token audience")]
    InvalidJwtTokenAudience,

-    #[error("JWT token has expired")]
-    JwtTokenHasExpired,
+    #[error("JWT token has expired (exp={0})")]
+    JwtTokenHasExpired(u64),

-    #[error("JWT token is not yet ready to use")]
-    JwtTokenNotYetReadyToUse,
+    #[error("JWT token is not yet ready to use (nbf={0})")]
+    JwtTokenNotYetReadyToUse(u64),
 }

 #[allow(dead_code, reason = "Debug use only")]
@@ -1233,14 +1248,14 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL
                    "nbf": now + 60,
                    "aud": "neon",
                }},
-                error: JwtClaimsError::JwtTokenNotYetReadyToUse,
+                error: JwtClaimsError::JwtTokenNotYetReadyToUse(now + 60),
            },
            Test {
                body: json! {{
                    "exp": now - 60,
                    "aud": ["neon"],
                }},
-                error: JwtClaimsError::JwtTokenHasExpired,
+                error: JwtClaimsError::JwtTokenHasExpired(now - 60),
            },
            Test {
                body: json! {{
--- a/proxy/src/auth/credentials.rs
+++ b/proxy/src/auth/credentials.rs
@@ -32,12 +32,6 @@ pub(crate) enum ComputeUserInfoParseError {
        option: EndpointId,
    },

-    #[error(
-        "Common name inferred from SNI ('{}') is not known",
-        .cn,
-    )]
-    UnknownCommonName { cn: String },
-
    #[error("Project name ('{0}') must contain only alphanumeric characters and hyphen.")]
    MalformedProjectName(EndpointId),
 }
@@ -66,22 +60,15 @@ impl ComputeUserInfoMaybeEndpoint {
    }
 }

-pub(crate) fn endpoint_sni(
-    sni: &str,
-    common_names: &HashSet<String>,
-) -> Result<Option<EndpointId>, ComputeUserInfoParseError> {
-    let Some((subdomain, common_name)) = sni.split_once('.') else {
-        return Err(ComputeUserInfoParseError::UnknownCommonName { cn: sni.into() });
-    };
+pub(crate) fn endpoint_sni(sni: &str, common_names: &HashSet<String>) -> Option<EndpointId> {
+    let (subdomain, common_name) = sni.split_once('.')?;
    if !common_names.contains(common_name) {
-        return Err(ComputeUserInfoParseError::UnknownCommonName {
-            cn: common_name.into(),
-        });
+        return None;
    }
    if subdomain == SERVERLESS_DRIVER_SNI {
-        return Ok(None);
+        return None;
    }
-    Ok(Some(EndpointId::from(subdomain)))
+    Some(EndpointId::from(subdomain))
 }

 impl ComputeUserInfoMaybeEndpoint {
@@ -113,15 +100,8 @@ impl ComputeUserInfoMaybeEndpoint {
            })
            .map(|name| name.into());

-        let endpoint_from_domain = if let Some(sni_str) = sni {
-            if let Some(cn) = common_names {
-                endpoint_sni(sni_str, cn)?
-            } else {
-                None
-            }
-        } else {
-            None
-        };
+        let endpoint_from_domain =
+            sni.and_then(|sni_str| common_names.and_then(|cn| endpoint_sni(sni_str, cn)));

        let endpoint = match (endpoint_option, endpoint_from_domain) {
            // Invariant: if we have both project name variants, they should match.
@@ -424,21 +404,34 @@ mod tests {
    }

    #[test]
-    fn parse_inconsistent_sni() {
+    fn parse_unknown_sni() {
        let options = StartupMessageParams::new([("user", "john_doe")]);

        let sni = Some("project.localhost");
        let common_names = Some(["example.com".into()].into());

        let ctx = RequestContext::test();
-        let err = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())
-            .expect_err("should fail");
-        match err {
-            UnknownCommonName { cn } => {
-                assert_eq!(cn, "localhost");
-            }
-            _ => panic!("bad error: {err:?}"),
-        }
+        let info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())
+            .unwrap();
+
+        assert!(info.endpoint_id.is_none());
+    }
+
+    #[test]
+    fn parse_unknown_sni_with_options() {
+        let options = StartupMessageParams::new([
+            ("user", "john_doe"),
+            ("options", "endpoint=foo-bar-baz-1234"),
+        ]);
+
+        let sni = Some("project.localhost");
+        let common_names = Some(["example.com".into()].into());
+
+        let ctx = RequestContext::test();
+        let info = ComputeUserInfoMaybeEndpoint::parse(&ctx, &options, sni, common_names.as_ref())
+            .unwrap();
+
+        assert_eq!(info.endpoint_id.as_deref(), Some("foo-bar-baz-1234"));
    }

    #[test]
--- a/proxy/src/logging.rs
+++ b/proxy/src/logging.rs
@@ -132,11 +132,10 @@ impl Drop for LoggingGuard {
    }
 }

-// TODO: make JSON the default
 #[derive(Copy, Clone, PartialEq, Eq, Default, Debug)]
 enum LogFormat {
+    Text,
    #[default]
-    Text = 1,
    Json,
 }

--- a/proxy/src/proxy/handshake.rs
+++ b/proxy/src/proxy/handshake.rs
@@ -24,9 +24,6 @@ pub(crate) enum HandshakeError {
    #[error("protocol violation")]
    ProtocolViolation,

-    #[error("missing certificate")]
-    MissingCertificate,
-
    #[error("{0}")]
    StreamUpgradeError(#[from] StreamUpgradeError),

@@ -42,10 +39,6 @@ impl ReportableError for HandshakeError {
        match self {
            HandshakeError::EarlyData => crate::error::ErrorKind::User,
            HandshakeError::ProtocolViolation => crate::error::ErrorKind::User,
-            // This error should not happen, but will if we have no default certificate and
-            // the client sends no SNI extension.
-            // If they provide SNI then we can be sure there is a certificate that matches.
-            HandshakeError::MissingCertificate => crate::error::ErrorKind::Service,
            HandshakeError::StreamUpgradeError(upgrade) => match upgrade {
                StreamUpgradeError::AlreadyTls => crate::error::ErrorKind::Service,
                StreamUpgradeError::Io(_) => crate::error::ErrorKind::ClientDisconnect,
@@ -146,7 +139,7 @@ pub(crate) async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
                        // try parse endpoint
                        let ep = conn_info
                            .server_name()
-                            .and_then(|sni| endpoint_sni(sni, &tls.common_names).ok().flatten());
+                            .and_then(|sni| endpoint_sni(sni, &tls.common_names));
                        if let Some(ep) = ep {
                            ctx.set_endpoint_id(ep);
                        }
@@ -161,10 +154,8 @@ pub(crate) async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
                            }
                        }

-                        let (_, tls_server_end_point) = tls
-                            .cert_resolver
-                            .resolve(conn_info.server_name())
-                            .ok_or(HandshakeError::MissingCertificate)?;
+                        let (_, tls_server_end_point) =
+                            tls.cert_resolver.resolve(conn_info.server_name());

                        stream = PqStream {
                            framed: Framed {
--- a/proxy/src/proxy/tests/mod.rs
+++ b/proxy/src/proxy/tests/mod.rs
@@ -98,8 +98,7 @@ fn generate_tls_config<'a>(
                .with_no_client_auth()
                .with_single_cert(vec![cert.clone()], key.clone_key())?;

-        let mut cert_resolver = CertResolver::new();
-        cert_resolver.add_cert(key, vec![cert], true)?;
+        let cert_resolver = CertResolver::new(key, vec![cert])?;

        let common_names = cert_resolver.get_common_names();

--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -199,8 +199,7 @@ fn get_conn_info(
    let endpoint = match connection_url.host() {
        Some(url::Host::Domain(hostname)) => {
            if let Some(tls) = tls {
-                endpoint_sni(hostname, &tls.common_names)?
-                    .ok_or(ConnInfoError::MalformedEndpoint)?
+                endpoint_sni(hostname, &tls.common_names).ok_or(ConnInfoError::MalformedEndpoint)?
            } else {
                hostname
                    .split_once('.')
--- a/proxy/src/tls/server_config.rs
+++ b/proxy/src/tls/server_config.rs
@@ -5,6 +5,7 @@ use anyhow::{Context, bail};
 use itertools::Itertools;
 use rustls::crypto::ring::{self, sign};
 use rustls::pki_types::{CertificateDer, PrivateKeyDer};
+use rustls::sign::CertifiedKey;
 use x509_cert::der::{Reader, SliceReader};

 use super::{PG_ALPN_PROTOCOL, TlsServerEndPoint};
@@ -25,10 +26,8 @@ pub fn configure_tls(
    certs_dir: Option<&String>,
    allow_tls_keylogfile: bool,
 ) -> anyhow::Result<TlsConfig> {
-    let mut cert_resolver = CertResolver::new();
-
    // add default certificate
-    cert_resolver.add_cert_path(key_path, cert_path, true)?;
+    let mut cert_resolver = CertResolver::parse_new(key_path, cert_path)?;

    // add extra certificates
    if let Some(certs_dir) = certs_dir {
@@ -40,11 +39,8 @@ pub fn configure_tls(
                let key_path = path.join("tls.key");
                let cert_path = path.join("tls.crt");
                if key_path.exists() && cert_path.exists() {
-                    cert_resolver.add_cert_path(
-                        &key_path.to_string_lossy(),
-                        &cert_path.to_string_lossy(),
-                        false,
-                    )?;
+                    cert_resolver
+                        .add_cert_path(&key_path.to_string_lossy(), &cert_path.to_string_lossy())?;
                }
            }
        }
@@ -83,92 +79,42 @@ pub fn configure_tls(
    })
 }

-#[derive(Default, Debug)]
+#[derive(Debug)]
 pub struct CertResolver {
    certs: HashMap<String, (Arc<rustls::sign::CertifiedKey>, TlsServerEndPoint)>,
-    default: Option<(Arc<rustls::sign::CertifiedKey>, TlsServerEndPoint)>,
+    default: (Arc<rustls::sign::CertifiedKey>, TlsServerEndPoint),
 }

 impl CertResolver {
-    pub fn new() -> Self {
-        Self::default()
+    fn parse_new(key_path: &str, cert_path: &str) -> anyhow::Result<Self> {
+        let (priv_key, cert_chain) = parse_key_cert(key_path, cert_path)?;
+        Self::new(priv_key, cert_chain)
    }

-    fn add_cert_path(
-        &mut self,
-        key_path: &str,
-        cert_path: &str,
-        is_default: bool,
-    ) -> anyhow::Result<()> {
-        let priv_key = {
-            let key_bytes = std::fs::read(key_path)
-                .with_context(|| format!("Failed to read TLS keys at '{key_path}'"))?;
-            rustls_pemfile::private_key(&mut &key_bytes[..])
-                .with_context(|| format!("Failed to parse TLS keys at '{key_path}'"))?
-                .with_context(|| format!("Failed to parse TLS keys at '{key_path}'"))?
-        };
+    pub fn new(
+        priv_key: PrivateKeyDer<'static>,
+        cert_chain: Vec<CertificateDer<'static>>,
+    ) -> anyhow::Result<Self> {
+        let (common_name, cert, tls_server_end_point) = process_key_cert(priv_key, cert_chain)?;

-        let cert_chain_bytes = std::fs::read(cert_path)
-            .context(format!("Failed to read TLS cert file at '{cert_path}.'"))?;
-
-        let cert_chain = {
-            rustls_pemfile::certs(&mut &cert_chain_bytes[..])
-                .try_collect()
-                .with_context(|| {
-                    format!("Failed to read TLS certificate chain from bytes from file at '{cert_path}'.")
-                })?
-        };
-
-        self.add_cert(priv_key, cert_chain, is_default)
+        let mut certs = HashMap::new();
+        let default = (cert.clone(), tls_server_end_point);
+        certs.insert(common_name, (cert, tls_server_end_point));
+        Ok(Self { certs, default })
    }

-    pub fn add_cert(
+    fn add_cert_path(&mut self, key_path: &str, cert_path: &str) -> anyhow::Result<()> {
+        let (priv_key, cert_chain) = parse_key_cert(key_path, cert_path)?;
+        self.add_cert(priv_key, cert_chain)
+    }
+
+    fn add_cert(
        &mut self,
        priv_key: PrivateKeyDer<'static>,
        cert_chain: Vec<CertificateDer<'static>>,
-        is_default: bool,
    ) -> anyhow::Result<()> {
-        let key = sign::any_supported_type(&priv_key).context("invalid private key")?;
-
-        let first_cert = &cert_chain[0];
-        let tls_server_end_point = TlsServerEndPoint::new(first_cert)?;
-
-        let certificate = SliceReader::new(first_cert)
-            .context("Failed to parse cerficiate")?
-            .decode::<x509_cert::Certificate>()
-            .context("Failed to parse cerficiate")?;
-
-        let common_name = certificate.tbs_certificate.subject.to_string();
-
-        // We need to get the canonical name for this certificate so we can match them against any domain names
-        // seen within the proxy codebase.
-        //
-        // In scram-proxy we use wildcard certificates only, with the database endpoint as the wildcard subdomain, taken from SNI.
-        // We need to remove the wildcard prefix for the purposes of certificate selection.
-        //
-        // auth-broker does not use SNI and instead uses the Neon-Connection-String header.
-        // Auth broker has the subdomain `apiauth` we need to remove for the purposes of validating the Neon-Connection-String.
-        //
-        // Console Redirect proxy does not use any wildcard domains and does not need any certificate selection or conn string
-        // validation, so let's we can continue with any common-name
-        let common_name = if let Some(s) = common_name.strip_prefix("CN=*.") {
-            s.to_string()
-        } else if let Some(s) = common_name.strip_prefix("CN=apiauth.") {
-            s.to_string()
-        } else if let Some(s) = common_name.strip_prefix("CN=") {
-            s.to_string()
-        } else {
-            bail!("Failed to parse common name from certificate")
-        };
-
-        let cert = Arc::new(rustls::sign::CertifiedKey::new(cert_chain, key));
-
-        if is_default {
-            self.default = Some((cert.clone(), tls_server_end_point));
-        }
-
+        let (common_name, cert, tls_server_end_point) = process_key_cert(priv_key, cert_chain)?;
        self.certs.insert(common_name, (cert, tls_server_end_point));
-
        Ok(())
    }

@@ -177,12 +123,82 @@ impl CertResolver {
    }
 }

+fn parse_key_cert(
+    key_path: &str,
+    cert_path: &str,
+) -> anyhow::Result<(PrivateKeyDer<'static>, Vec<CertificateDer<'static>>)> {
+    let priv_key = {
+        let key_bytes = std::fs::read(key_path)
+            .with_context(|| format!("Failed to read TLS keys at '{key_path}'"))?;
+        rustls_pemfile::private_key(&mut &key_bytes[..])
+            .with_context(|| format!("Failed to parse TLS keys at '{key_path}'"))?
+            .with_context(|| format!("Failed to parse TLS keys at '{key_path}'"))?
+    };
+
+    let cert_chain_bytes = std::fs::read(cert_path)
+        .context(format!("Failed to read TLS cert file at '{cert_path}.'"))?;
+
+    let cert_chain = {
+        rustls_pemfile::certs(&mut &cert_chain_bytes[..])
+            .try_collect()
+            .with_context(|| {
+                format!(
+                    "Failed to read TLS certificate chain from bytes from file at '{cert_path}'."
+                )
+            })?
+    };
+
+    Ok((priv_key, cert_chain))
+}
+
+fn process_key_cert(
+    priv_key: PrivateKeyDer<'static>,
+    cert_chain: Vec<CertificateDer<'static>>,
+) -> anyhow::Result<(String, Arc<CertifiedKey>, TlsServerEndPoint)> {
+    let key = sign::any_supported_type(&priv_key).context("invalid private key")?;
+
+    let first_cert = &cert_chain[0];
+    let tls_server_end_point = TlsServerEndPoint::new(first_cert)?;
+
+    let certificate = SliceReader::new(first_cert)
+        .context("Failed to parse cerficiate")?
+        .decode::<x509_cert::Certificate>()
+        .context("Failed to parse cerficiate")?;
+
+    let common_name = certificate.tbs_certificate.subject.to_string();
+
+    // We need to get the canonical name for this certificate so we can match them against any domain names
+    // seen within the proxy codebase.
+    //
+    // In scram-proxy we use wildcard certificates only, with the database endpoint as the wildcard subdomain, taken from SNI.
+    // We need to remove the wildcard prefix for the purposes of certificate selection.
+    //
+    // auth-broker does not use SNI and instead uses the Neon-Connection-String header.
+    // Auth broker has the subdomain `apiauth` we need to remove for the purposes of validating the Neon-Connection-String.
+    //
+    // Console Redirect proxy does not use any wildcard domains and does not need any certificate selection or conn string
+    // validation, so let's we can continue with any common-name
+    let common_name = if let Some(s) = common_name.strip_prefix("CN=*.") {
+        s.to_string()
+    } else if let Some(s) = common_name.strip_prefix("CN=apiauth.") {
+        s.to_string()
+    } else if let Some(s) = common_name.strip_prefix("CN=") {
+        s.to_string()
+    } else {
+        bail!("Failed to parse common name from certificate")
+    };
+
+    let cert = Arc::new(rustls::sign::CertifiedKey::new(cert_chain, key));
+
+    Ok((common_name, cert, tls_server_end_point))
+}
+
 impl rustls::server::ResolvesServerCert for CertResolver {
    fn resolve(
        &self,
        client_hello: rustls::server::ClientHello<'_>,
    ) -> Option<Arc<rustls::sign::CertifiedKey>> {
-        self.resolve(client_hello.server_name()).map(|x| x.0)
+        Some(self.resolve(client_hello.server_name()).0)
    }
 }

@@ -190,7 +206,7 @@ impl CertResolver {
    pub fn resolve(
        &self,
        server_name: Option<&str>,
-    ) -> Option<(Arc<rustls::sign::CertifiedKey>, TlsServerEndPoint)> {
+    ) -> (Arc<rustls::sign::CertifiedKey>, TlsServerEndPoint) {
        // loop here and cut off more and more subdomains until we find
        // a match to get a proper wildcard support. OTOH, we now do not
        // use nested domains, so keep this simple for now.
@@ -200,12 +216,17 @@ impl CertResolver {
        if let Some(mut sni_name) = server_name {
            loop {
                if let Some(cert) = self.certs.get(sni_name) {
-                    return Some(cert.clone());
+                    return cert.clone();
                }
                if let Some((_, rest)) = sni_name.split_once('.') {
                    sni_name = rest;
                } else {
-                    return None;
+                    // The customer has some custom DNS mapping - just return
+                    // a default certificate.
+                    //
+                    // This will error if the customer uses anything stronger
+                    // than sslmode=require. That's a choice they can make.
+                    return self.default.clone();
                }
            }
        } else {
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -1,7 +1,6 @@
 //
 // Main entry point for the safekeeper executable
 //
-use std::env::{VarError, var};
 use std::fs::{self, File};
 use std::io::{ErrorKind, Write};
 use std::str::FromStr;
@@ -354,29 +353,13 @@ async fn main() -> anyhow::Result<()> {
    };

    // Load JWT auth token to connect to other safekeepers for pull_timeline.
-    // First check if the env var is present, then check the arg with the path.
-    // We want to deprecate and remove the env var method in the future.
-    let sk_auth_token = match var("SAFEKEEPER_AUTH_TOKEN") {
-        Ok(v) => {
-            info!("loaded JWT token for authentication with safekeepers");
-            Some(SecretString::from(v))
-        }
-        Err(VarError::NotPresent) => {
-            if let Some(auth_token_path) = args.auth_token_path.as_ref() {
-                info!(
-                    "loading JWT token for authentication with safekeepers from {auth_token_path}"
-                );
-                let auth_token = tokio::fs::read_to_string(auth_token_path).await?;
-                Some(SecretString::from(auth_token.trim().to_owned()))
-            } else {
-                info!("no JWT token for authentication with safekeepers detected");
-                None
-            }
-        }
-        Err(_) => {
-            warn!("JWT token for authentication with safekeepers is not unicode");
-            None
-        }
+    let sk_auth_token = if let Some(auth_token_path) = args.auth_token_path.as_ref() {
+        info!("loading JWT token for authentication with safekeepers from {auth_token_path}");
+        let auth_token = tokio::fs::read_to_string(auth_token_path).await?;
+        Some(SecretString::from(auth_token.trim().to_owned()))
+    } else {
+        info!("no JWT token for authentication with safekeepers detected");
+        None
    };

    let ssl_ca_certs = match args.ssl_ca_file.as_ref() {
--- a/safekeeper/src/pull_timeline.rs
+++ b/safekeeper/src/pull_timeline.rs
@@ -401,7 +401,10 @@ pub async fn handle_request(
        request.timeline_id,
    ));
    if existing_tli.is_ok() {
-        bail!("Timeline {} already exists", request.timeline_id);
+        info!("Timeline {} already exists", request.timeline_id);
+        return Ok(PullTimelineResponse {
+            safekeeper_host: None,
+        });
    }

    let mut http_client = reqwest::Client::builder();
@@ -425,8 +428,25 @@ pub async fn handle_request(

    let mut statuses = Vec::new();
    for (i, response) in responses.into_iter().enumerate() {
-        let status = response.context(format!("fetching status from {}", http_hosts[i]))?;
-        statuses.push((status, i));
+        match response {
+            Ok(status) => {
+                statuses.push((status, i));
+            }
+            Err(e) => {
+                info!("error fetching status from {}: {e}", http_hosts[i]);
+            }
+        }
+    }
+
+    // Allow missing responses from up to one safekeeper (say due to downtime)
+    // e.g. if we created a timeline on PS A and B, with C being offline. Then B goes
+    // offline and C comes online. Then we want a pull on C with A and B as hosts to work.
+    let min_required_successful = (http_hosts.len() - 1).max(1);
+    if statuses.len() < min_required_successful {
+        bail!(
+            "only got {} successful status responses. required: {min_required_successful}",
+            statuses.len()
+        )
    }

    // Find the most advanced safekeeper
@@ -536,6 +556,6 @@ async fn pull_timeline(
        .await?;

    Ok(PullTimelineResponse {
-        safekeeper_host: host,
+        safekeeper_host: Some(host),
    })
 }
--- a/safekeeper/src/receive_wal.rs
+++ b/safekeeper/src/receive_wal.rs
@@ -32,7 +32,7 @@ use crate::metrics::{
    WAL_RECEIVERS,
 };
 use crate::safekeeper::{AcceptorProposerMessage, ProposerAcceptorMessage};
-use crate::timeline::WalResidentTimeline;
+use crate::timeline::{TimelineError, WalResidentTimeline};

 const DEFAULT_FEEDBACK_CAPACITY: usize = 8;

@@ -357,9 +357,14 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'_, IO> {
                        .await
                        .context("create timeline")?
                } else {
-                    self.global_timelines
-                        .get(self.ttid)
-                        .context("get timeline")?
+                    let timeline_res = self.global_timelines.get(self.ttid);
+                    match timeline_res {
+                        Ok(tl) => tl,
+                        Err(TimelineError::NotFound(_)) => {
+                            return Err(CopyStreamHandlerEnd::TimelineNoCreate);
+                        }
+                        other => other.context("get_timeline")?,
+                    }
                };
                tli.wal_residence_guard().await?
            }
--- a/safekeeper/tests/walproposer_sim/simulation.rs
+++ b/safekeeper/tests/walproposer_sim/simulation.rs
@@ -86,7 +86,7 @@ impl WalProposer {

        let config = Config {
            ttid,
-            safekeepers_list: addrs,
+            safekeeper_connstrings: addrs,
            safekeeper_reconnect_timeout: 1000,
            safekeeper_connection_timeout: 5000,
            sync_safekeepers,
--- a/safekeeper/tests/walproposer_sim/walproposer_api.rs
+++ b/safekeeper/tests/walproposer_sim/walproposer_api.rs
@@ -207,7 +207,7 @@ impl SimulationApi {
        // initialize connection state for each safekeeper
        let sk_conns = args
            .config
-            .safekeepers_list
+            .safekeeper_connstrings
            .iter()
            .map(|s| {
                SafekeeperConn::new(
--- a/storage_controller/src/main.rs
+++ b/storage_controller/src/main.rs
@@ -19,7 +19,8 @@ use storage_controller::service::chaos_injector::ChaosInjector;
 use storage_controller::service::{
    Config, HEARTBEAT_INTERVAL_DEFAULT, LONG_RECONCILE_THRESHOLD_DEFAULT,
    MAX_OFFLINE_INTERVAL_DEFAULT, MAX_WARMING_UP_INTERVAL_DEFAULT,
-    PRIORITY_RECONCILER_CONCURRENCY_DEFAULT, RECONCILER_CONCURRENCY_DEFAULT, Service,
+    PRIORITY_RECONCILER_CONCURRENCY_DEFAULT, RECONCILER_CONCURRENCY_DEFAULT,
+    SAFEKEEPER_RECONCILER_CONCURRENCY_DEFAULT, Service,
 };
 use tokio::signal::unix::SignalKind;
 use tokio_util::sync::CancellationToken;
@@ -132,6 +133,10 @@ struct Cli {
    #[arg(long)]
    priority_reconciler_concurrency: Option<usize>,

+    /// Maximum number of safekeeper reconciliations that may run in parallel (per safekeeper)
+    #[arg(long)]
+    safekeeper_reconciler_concurrency: Option<usize>,
+
    /// Tenant API rate limit, as requests per second per tenant.
    #[arg(long, default_value = "10")]
    tenant_rate_limit: NonZeroU32,
@@ -403,6 +408,9 @@ async fn async_main() -> anyhow::Result<()> {
        priority_reconciler_concurrency: args
            .priority_reconciler_concurrency
            .unwrap_or(PRIORITY_RECONCILER_CONCURRENCY_DEFAULT),
+        safekeeper_reconciler_concurrency: args
+            .safekeeper_reconciler_concurrency
+            .unwrap_or(SAFEKEEPER_RECONCILER_CONCURRENCY_DEFAULT),
        tenant_rate_limit: args.tenant_rate_limit,
        split_threshold: args.split_threshold,
        max_split_shards: args.max_split_shards,
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -194,6 +194,7 @@ pub(crate) enum LeadershipStatus {

 pub const RECONCILER_CONCURRENCY_DEFAULT: usize = 128;
 pub const PRIORITY_RECONCILER_CONCURRENCY_DEFAULT: usize = 256;
+pub const SAFEKEEPER_RECONCILER_CONCURRENCY_DEFAULT: usize = 32;

 // Depth of the channel used to enqueue shards for reconciliation when they can't do it immediately.
 // This channel is finite-size to avoid using excessive memory if we get into a state where reconciles are finishing more slowly
@@ -382,6 +383,9 @@ pub struct Config {
    /// How many high-priority Reconcilers may be spawned concurrently
    pub priority_reconciler_concurrency: usize,

+    /// How many safekeeper reconciles may happen concurrently (per safekeeper)
+    pub safekeeper_reconciler_concurrency: usize,
+
    /// How many API requests per second to allow per tenant, across all
    /// tenant-scoped API endpoints. Further API requests queue until ready.
    pub tenant_rate_limit: NonZeroU32,
@@ -3659,7 +3663,7 @@ impl Service {
                locations: ShardMutationLocations,
                http_client: reqwest::Client,
                jwt: Option<String>,
-                create_req: TimelineCreateRequest,
+                mut create_req: TimelineCreateRequest,
            ) -> Result<TimelineInfo, ApiError> {
                let latest = locations.latest.node;

@@ -3678,6 +3682,15 @@ impl Service {
                    .await
                    .map_err(|e| passthrough_api_error(&latest, e))?;

+                // If we are going to create the timeline on some stale locations for shard 0, then ask them to re-use
+                // the initdb generated by the latest location, rather than generating their own.  This avoids racing uploads
+                // of initdb to S3 which might not be binary-identical if different pageservers have different postgres binaries.
+                if tenant_shard_id.is_shard_zero() {
+                    if let models::TimelineCreateRequestMode::Bootstrap { existing_initdb_timeline_id, .. } = &mut create_req.mode {
+                        *existing_initdb_timeline_id = Some(create_req.new_timeline_id);
+                    }
+                }
+
                // We propagate timeline creations to all attached locations such that a compute
                // for the new timeline is able to start regardless of the current state of the
                // tenant shard reconciliation.
@@ -3720,6 +3733,10 @@ impl Service {
            // Because the caller might not provide an explicit LSN, we must do the creation first on a single shard, and then
            // use whatever LSN that shard picked when creating on subsequent shards.  We arbitrarily use shard zero as the shard
            // that will get the first creation request, and propagate the LSN to all the >0 shards.
+            //
+            // This also enables non-zero shards to use the initdb that shard 0 generated and uploaded to S3, rather than
+            // independently generating their own initdb.  This guarantees that shards cannot end up with different initial
+            // states if e.g. they have different postgres binary versions.
            let timeline_info = create_one(
                shard_zero_tid,
                shard_zero_locations,
@@ -3729,11 +3746,16 @@ impl Service {
            )
            .await?;

-            // Propagate the LSN that shard zero picked, if caller didn't provide one
+            // Update the create request for shards >= 0
            match &mut create_req.mode {
                models::TimelineCreateRequestMode::Branch { ancestor_start_lsn, .. } if ancestor_start_lsn.is_none() => {
+                    // Propagate the LSN that shard zero picked, if caller didn't provide one
                    *ancestor_start_lsn = timeline_info.ancestor_lsn;
                },
+                models::TimelineCreateRequestMode::Bootstrap { existing_initdb_timeline_id, .. } => {
+                    // For shards >= 0, do not run initdb: use the one that shard 0 uploaded to S3
+                    *existing_initdb_timeline_id = Some(create_req.new_timeline_id)
+                }
                _ => {}
            }

@@ -5159,7 +5181,8 @@ impl Service {
            }

            // We don't expect any new_shard_count shards to exist here, but drop them just in case
-            tenants.retain(|_id, s| s.shard.count != *new_shard_count);
+            tenants
+                .retain(|id, s| !(id.tenant_id == *tenant_id && s.shard.count == *new_shard_count));

            detach_locations
        };
--- a/storage_controller/src/service/safekeeper_reconciler.rs
+++ b/storage_controller/src/service/safekeeper_reconciler.rs
@@ -3,7 +3,10 @@ use std::{collections::HashMap, str::FromStr, sync::Arc, time::Duration};
 use clashmap::{ClashMap, Entry};
 use safekeeper_api::models::PullTimelineRequest;
 use safekeeper_client::mgmt_api;
-use tokio::sync::mpsc::{self, UnboundedReceiver, UnboundedSender};
+use tokio::sync::{
+    Semaphore,
+    mpsc::{self, UnboundedReceiver, UnboundedSender},
+};
 use tokio_util::sync::CancellationToken;
 use tracing::Instrument;
 use utils::{
@@ -206,18 +209,27 @@ impl ReconcilerHandle {
 }

 pub(crate) struct SafekeeperReconciler {
-    service: Arc<Service>,
+    inner: SafekeeperReconcilerInner,
+    concurrency_limiter: Arc<Semaphore>,
    rx: UnboundedReceiver<(ScheduleRequest, CancellationToken)>,
    cancel: CancellationToken,
 }

+/// Thin wrapper over `Service` to not clutter its inherent functions
+#[derive(Clone)]
+struct SafekeeperReconcilerInner {
+    service: Arc<Service>,
+}
+
 impl SafekeeperReconciler {
    fn spawn(cancel: CancellationToken, service: Arc<Service>) -> ReconcilerHandle {
        // We hold the ServiceInner lock so we don't want to make sending to the reconciler channel to be blocking.
        let (tx, rx) = mpsc::unbounded_channel();
+        let concurrency = service.config.safekeeper_reconciler_concurrency;
        let mut reconciler = SafekeeperReconciler {
-            service,
+            inner: SafekeeperReconcilerInner { service },
            rx,
+            concurrency_limiter: Arc::new(Semaphore::new(concurrency)),
            cancel: cancel.clone(),
        };
        let handle = ReconcilerHandle {
@@ -230,31 +242,44 @@ impl SafekeeperReconciler {
    }
    async fn run(&mut self) {
        loop {
-            // TODO add parallelism with semaphore here
            let req = tokio::select! {
                req = self.rx.recv() => req,
                _ = self.cancel.cancelled() => break,
            };
            let Some((req, req_cancel)) = req else { break };
+
+            let permit_res = tokio::select! {
+                req = self.concurrency_limiter.clone().acquire_owned() => req,
+                _ = self.cancel.cancelled() => break,
+            };
+            let Ok(_permit) = permit_res else { return };
+
+            let inner = self.inner.clone();
            if req_cancel.is_cancelled() {
                continue;
            }

-            let kind = req.kind;
-            let tenant_id = req.tenant_id;
-            let timeline_id = req.timeline_id;
-            let node_id = req.safekeeper.skp.id;
-            self.reconcile_one(req, req_cancel)
-                .instrument(tracing::info_span!(
-                    "reconcile_one",
-                    ?kind,
-                    %tenant_id,
-                    ?timeline_id,
-                    %node_id,
-                ))
-                .await;
+            tokio::task::spawn(async move {
+                let kind = req.kind;
+                let tenant_id = req.tenant_id;
+                let timeline_id = req.timeline_id;
+                let node_id = req.safekeeper.skp.id;
+                inner
+                    .reconcile_one(req, req_cancel)
+                    .instrument(tracing::info_span!(
+                        "reconcile_one",
+                        ?kind,
+                        %tenant_id,
+                        ?timeline_id,
+                        %node_id,
+                    ))
+                    .await;
+            });
        }
    }
+}
+
+impl SafekeeperReconcilerInner {
    async fn reconcile_one(&self, req: ScheduleRequest, req_cancel: CancellationToken) {
        let req_host = req.safekeeper.skp.host.clone();
        match req.kind {
@@ -281,10 +306,11 @@ impl SafekeeperReconciler {
                    req,
                    async |client| client.pull_timeline(&pull_req).await,
                    |resp| {
-                        tracing::info!(
-                            "pulled timeline from {} onto {req_host}",
-                            resp.safekeeper_host,
-                        );
+                        if let Some(host) = resp.safekeeper_host {
+                            tracing::info!("pulled timeline from {host} onto {req_host}");
+                        } else {
+                            tracing::info!("timeline already present on safekeeper on {req_host}");
+                        }
                    },
                    req_cancel,
                )
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1194,8 +1194,7 @@ class NeonEnv:
        else:
            cfg["broker"]["listen_addr"] = self.broker.listen_addr()

-        if self.control_plane_api is not None:
-            cfg["control_plane_api"] = self.control_plane_api
+        cfg["control_plane_api"] = self.control_plane_api

        if self.control_plane_hooks_api is not None:
            cfg["control_plane_hooks_api"] = self.control_plane_hooks_api
@@ -1280,7 +1279,8 @@ class NeonEnv:
                )

            tenant_config = ps_cfg.setdefault("tenant_config", {})
-            tenant_config["rel_size_v2_enabled"] = True  # Enable relsize_v2 by default in tests
+            # This feature is pending rollout.
+            # tenant_config["rel_size_v2_enabled"] = True

            if self.pageserver_remote_storage is not None:
                ps_cfg["remote_storage"] = remote_storage_to_toml_dict(
--- a/test_runner/performance/test_cumulative_statistics_persistence.py
+++ b/test_runner/performance/test_cumulative_statistics_persistence.py
@@ -1,4 +1,5 @@
 import math  # Add this import
+import os
 import time
 import traceback
 from pathlib import Path
@@ -87,7 +88,10 @@ def test_cumulative_statistics_persistence(
    - insert additional tuples that by itself are not enough to trigger auto-vacuum but in combination with the previous tuples are
    - verify that autovacuum is triggered by the combination of tuples inserted before and after endpoint suspension
    """
-    project = neon_api.create_project(pg_version)
+    project = neon_api.create_project(
+        pg_version,
+        f"Test cumulative statistics persistence, GITHUB_RUN_ID={os.getenv('GITHUB_RUN_ID')}",
+    )
    project_id = project["project"]["id"]
    neon_api.wait_for_operation_to_finish(project_id)
    endpoint_id = project["endpoints"][0]["id"]
--- a/test_runner/performance/test_lfc_prefetch.py
+++ b/test_runner/performance/test_lfc_prefetch.py
@@ -1,156 +0,0 @@
-from __future__ import annotations
-
-import random
-import threading
-import time
-import timeit
-from typing import TYPE_CHECKING
-
-import pytest
-from fixtures.log_helper import log
-
-if TYPE_CHECKING:
-    from fixtures.neon_fixtures import NeonEnv
-from fixtures.utils import USE_LFC
-
-
-@pytest.mark.remote_cluster
-@pytest.mark.timeout(100000)
-@pytest.mark.parametrize("n_readers", [1, 2, 4, 8])
-@pytest.mark.parametrize("n_writers", [0, 1, 2, 4, 8])
-@pytest.mark.parametrize("chunk_size", [1, 8, 16])
-@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping")
-def test_lfc_prefetch(neon_simple_env: NeonEnv, n_readers: int, n_writers: int, chunk_size: int):
-    """
-    Test prefetch under different kinds of workload
-    """
-    env = neon_simple_env
-    endpoint = env.endpoints.create_start(
-        "main",
-        config_lines=[
-            "neon.max_file_cache_size=100MB",
-            "neon.file_cache_size_limit=100MB",
-            "effective_io_concurrency=100",
-            "shared_buffers=128MB",
-            "enable_bitmapscan=off",
-            "enable_seqscan=off",
-            f"neon.file_cache_chunk_size={chunk_size}",
-            "neon.store_prefetch_result_in_lfc=on",
-        ],
-    )
-    n_records = 100000  # 800Mb table
-    top_n = n_records // 4  # 200Mb - should be larger than LFC size
-    test_time = 100.0  # seconds
-
-    conn = endpoint.connect()
-    cur = conn.cursor()
-    cur.execute(
-        "create table account(id integer primary key, balance integer default 0, filler text default repeat('?',1000)) with (fillfactor=10)"
-    )
-    cur.execute(f"insert into account values (generate_series(1,{n_records}))")
-    cur.execute("vacuum account")
-
-    def reader():
-        conn = endpoint.connect()
-        cur = conn.cursor()
-        i = 0
-        cur.execute("set statement_timeout=0")
-        while running:
-            cur.execute(
-                f"select sum(balance) from (select balance from account order by id limit {top_n}) s"
-            )
-            sum = cur.fetchall()[0][0]
-            assert sum == 0  # check consistency
-            i += 1
-        log.info(f"Did {i} index scans")
-
-    def writer():
-        conn = endpoint.connect()
-        cur = conn.cursor()
-        i = 0
-        cur.execute("set statement_timeout=0")
-        while running:
-            r1 = random.randint(1, top_n)
-            r2 = random.randint(1, top_n)
-            # avoid deadlock by ordering src and dst
-            src = min(r1, r2)
-            dst = max(r1, r2)
-            cur.execute(
-                f"update account set balance=balance-1 where id={src}; update account set balance=balance+1 where id={dst}"
-            )
-            i += 1
-        log.info(f"Did {i} updates")
-
-    readers = [threading.Thread(target=reader) for _ in range(n_readers)]
-    writers = [threading.Thread(target=writer) for _ in range(n_writers)]
-
-    running = True
-    for t in readers:
-        t.start()
-    for t in writers:
-        t.start()
-
-    time.sleep(test_time)
-    running = False
-    for t in readers:
-        t.join()
-    for t in writers:
-        t.join()
-
-
-@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping")
-def test_lfc_async_prefetch_performance(neon_simple_env: NeonEnv, zenbenchmark):
-    """
-    Demonstrate performance advantages of storing prefetch results in LFC
-    """
-    env = neon_simple_env
-    endpoint = env.endpoints.create_start(
-        "main",
-        config_lines=[
-            "neon.max_file_cache_size=100MB",
-            "neon.file_cache_size_limit=100MB",
-            "effective_io_concurrency=100",
-            "shared_buffers=1MB",
-            "enable_bitmapscan=off",
-            "enable_seqscan=off",
-            "autovacuum=off",
-        ],
-    )
-    n_records = 100000  # 800Mb table
-    n_iterations = 1000
-
-    conn = endpoint.connect()
-    cur = conn.cursor()
-    cur.execute(
-        "create table account(id integer primary key, balance integer default 0, filler text default repeat('?',1000)) with (fillfactor=10)"
-    )
-    cur.execute(f"insert into account values (generate_series(1,{n_records}))")
-    cur.execute("vacuum account")
-
-    start = timeit.default_timer()
-    with zenbenchmark.record_duration("do_not_store_prefetch_results"):
-        cur.execute("set neon.store_prefetch_result_in_lfc=off")
-        for _ in range(n_iterations):
-            cur.execute(
-                "select sum(balance) from (select balance from account where id between 1000 and 2000 limit 100) s"
-            )
-            cur.execute(
-                "select sum(balance) from (select balance from account where id between 6000 and 7000 limit 100) s"
-            )
-    end = timeit.default_timer()
-    do_not_store_prefetch_results_duration = end - start
-
-    start = timeit.default_timer()
-    with zenbenchmark.record_duration("store_prefetch_results"):
-        cur.execute("set neon.store_prefetch_result_in_lfc=on")
-        for _ in range(n_iterations):
-            cur.execute(
-                "select sum(balance) from (select balance from account where id between 1000 and 2000 limit 100) s"
-            )
-            cur.execute(
-                "select sum(balance) from (select balance from account where id between 6000 and 7000 limit 100) s"
-            )
-    end = timeit.default_timer()
-    store_prefetch_results_duration = end - start
-
-    assert do_not_store_prefetch_results_duration >= store_prefetch_results_duration
--- a/test_runner/performance/test_physical_replication.py
+++ b/test_runner/performance/test_physical_replication.py
@@ -62,7 +62,9 @@ def test_ro_replica_lag(

    pgbench_duration = f"-T{test_duration_min * 60 * 2}"

-    project = neon_api.create_project(pg_version)
+    project = neon_api.create_project(
+        pg_version, f"Test readonly replica lag, GITHUB_RUN_ID={os.getenv('GITHUB_RUN_ID')}"
+    )
    project_id = project["project"]["id"]
    log.info("Project ID: %s", project_id)
    log.info("Primary endpoint ID: %s", project["endpoints"][0]["id"])
@@ -195,7 +197,9 @@ def test_replication_start_stop(
    pgbench_duration = f"-T{2**num_replicas * configuration_test_time_sec}"
    error_occurred = False

-    project = neon_api.create_project(pg_version)
+    project = neon_api.create_project(
+        pg_version, f"Test replication start stop, GITHUB_RUN_ID={os.getenv('GITHUB_RUN_ID')}"
+    )
    project_id = project["project"]["id"]
    log.info("Project ID: %s", project_id)
    log.info("Primary endpoint ID: %s", project["endpoints"][0]["id"])
--- a/test_runner/random_ops/test_random_ops.py
+++ b/test_runner/random_ops/test_random_ops.py
@@ -206,7 +206,7 @@ class NeonProject:
        self.neon_api = neon_api
        self.pg_bin = pg_bin
        proj = self.neon_api.create_project(
-            pg_version, f"Automatic random API test {os.getenv('GITHUB_RUN_ID')}"
+            pg_version, f"Automatic random API test GITHUB_RUN_ID={os.getenv('GITHUB_RUN_ID')}"
        )
        self.id: str = proj["project"]["id"]
        self.name: str = proj["project"]["name"]
--- a/Show More
+++ b/Show More