Merge pull request #5375 from neondatabase/releases/2023-09-26

Release 2023-09-26
page cache: metrics: add page content kind dimension (#5373)
2026-07-07 14:10:43 +00:00 · 2023-09-26 15:19:45 +03:00 · 2023-09-26 10:01:09 +03:00 · 2023-09-26 09:20:56 +03:00 · 2023-09-25 19:49:07 +01:00 · 2023-09-25 17:16:36 +00:00
164 changed files with 8445 additions and 3097 deletions
--- a/.dockerignore
+++ b/.dockerignore
@@ -19,6 +19,7 @@
 !trace/
 !vendor/postgres-v14/
 !vendor/postgres-v15/
+!vendor/postgres-v16/
 !workspace_hack/
 !neon_local/
 !scripts/ninstall.sh
--- a/.github/actionlint.yml
+++ b/.github/actionlint.yml
@@ -0,0 +1,8 @@
+self-hosted-runner:
+  labels:
+    - gen3
+    - large
+    - small
+    - us-east-2
+config-variables:
+  - SLACK_UPCOMING_RELEASE_CHANNEL_ID
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -70,6 +70,9 @@ runs:
        name: compatibility-snapshot-${{ inputs.build_type }}-pg${{ inputs.pg_version }}
        path: /tmp/compatibility_snapshot_pg${{ inputs.pg_version }}
        prefix: latest
+        # The lack of a compatibility snapshot (for example, for a new Postgres version)
+        # shouldn't fail the whole job. Only the relevant tests should fail.
+        skip-if-does-not-exist: true

    - name: Checkout
      if: inputs.needs_postgres_source == 'true'
--- a/.github/workflows/actionlint.yml
+++ b/.github/workflows/actionlint.yml
@@ -0,0 +1,31 @@
+name: Lint GitHub Workflows
+
+on:
+  push:
+    branches:
+      - main
+      - release
+    paths:
+      - '.github/workflows/*.ya?ml'
+  pull_request:
+    paths:
+      - '.github/workflows/*.ya?ml'
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: ${{ github.event_name == 'pull_request' }}
+
+jobs:
+  actionlint:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: reviewdog/action-actionlint@v1
+        env:
+          # SC2046 - Quote this to prevent word splitting. - https://www.shellcheck.net/wiki/SC2046
+          # SC2086 - Double quote to prevent globbing and word splitting. - https://www.shellcheck.net/wiki/SC2086
+          SHELLCHECK_OPTS: --exclude=SC2046,SC2086
+        with:
+          fail_on_error: true
+          filter_mode: nofilter
+          level: error
--- a/.github/workflows/approved-for-ci-run.yml
+++ b/.github/workflows/approved-for-ci-run.yml
@@ -16,20 +16,29 @@ on:
      # Actual magic happens here:
      - labeled

+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number }}
+
 env:
  GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
  PR_NUMBER: ${{ github.event.pull_request.number }}
+  BRANCH: "ci-run/pr-${{ github.event.pull_request.number }}"

-permissions: write-all
+# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
+permissions: {}

-concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number }}
+defaults:
+  run:
+    shell: bash -euo pipefail {0}

 jobs:
  remove-label:
    # Remove `approved-for-ci-run` label if the workflow is triggered by changes in a PR.
    # The PR should be reviewed and labelled manually again.

+    permissions:
+      pull-requests: write # For `gh pr edit`
+
    if: |
      contains(fromJSON('["opened", "synchronize", "reopened", "closed"]'), github.event.action) &&
      contains(github.event.pull_request.labels.*.name, 'approved-for-ci-run')
@@ -42,6 +51,10 @@ jobs:
  create-or-update-pr-for-ci-run:
    # Create local PR for an `approved-for-ci-run` labelled PR to run CI pipeline in it.

+    permissions:
+      pull-requests: write # for `gh pr edit`
+      # For `git push` and `gh pr create` we use CI_ACCESS_TOKEN
+
    if: |
      github.event.action == 'labeled' &&
      contains(github.event.pull_request.labels.*.name, 'approved-for-ci-run')
@@ -54,25 +67,52 @@ jobs:
      - uses: actions/checkout@v3
        with:
          ref: main
+          token: ${{ secrets.CI_ACCESS_TOKEN }}

      - run: gh pr checkout "${PR_NUMBER}"

-      - run: git checkout -b "ci-run/pr-${PR_NUMBER}"
+      - run: git checkout -b "${BRANCH}"

-      - run: git push --force origin "ci-run/pr-${PR_NUMBER}"
+      - run: git push --force origin "${BRANCH}"

      - name: Create a Pull Request for CI run (if required)
        env:
          GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
        run: |
-          HEAD="ci-run/pr-${PR_NUMBER}"
-          BODY="This Pull Request was create automatically to run CI pipeline for #${PR_NUMBER}.\n\nPlease do not alter or merge/close it.\n\nFeel free to comment the original PR."
+          cat << EOF > body.md
+            This Pull Request is created automatically to run the CI pipeline for #${PR_NUMBER}

-          ALREADY_CREATED=$(gh pr --repo "${GITHUB_REPOSITORY}" list --head "${HEAD}" --base "main" --json "number" --jq '.[].number')
+            Please do not alter or merge/close it.
+
+            Feel free to review/comment/discuss the original PR #${PR_NUMBER}.
+          EOF
+
+          ALREADY_CREATED="$(gh pr --repo ${GITHUB_REPOSITORY} list --head ${BRANCH} --base main --json number --jq '.[].number')"
          if [ -z "${ALREADY_CREATED}" ]; then
-            gh pr --repo "${GITHUB_REPOSITORY}" create  --title "CI run for PR #${PR_NUMBER}" \
-                                                        --body "${BODY}" \
-                                                        --head "${HEAD}" \
-                                                        --base "main" \
-                                                        --draft
+            gh pr --repo "${GITHUB_REPOSITORY}" create --title "CI run for PR #${PR_NUMBER}" \
+                                                       --body-file "body.md" \
+                                                       --head "${BRANCH}" \
+                                                       --base "main" \
+                                                       --draft
+          fi
+
+  cleanup:
+    # Close PRs and delete branches if the original PR is closed.
+
+    permissions:
+      contents: write # for `--delete-branch` flag in `gh pr close`
+      pull-requests: write # for `gh pr close`
+
+    if: |
+      github.event.action == 'closed' &&
+      github.event.pull_request.head.repo.full_name != github.repository
+
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Close PR and delete `ci-run/pr-${{ env.PR_NUMBER }}` branch
+        run: |
+          CLOSED="$(gh pr --repo ${GITHUB_REPOSITORY} list --head ${BRANCH} --json 'closed' --jq '.[].closed')"
+          if [ "${CLOSED}" == "false" ]; then
+            gh pr --repo "${GITHUB_REPOSITORY}" close "${BRANCH}" --delete-branch
          fi
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -137,11 +137,11 @@ jobs:
        }'

        if [ "$(date +%A)" = "Saturday" ]; then
-          matrix=$(echo $matrix | jq '.include += [{ "platform": "rds-postgres", "db_size": "10gb"},
+          matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "db_size": "10gb"},
                                                   { "platform": "rds-aurora",   "db_size": "50gb"}]')
        fi

-        echo "matrix=$(echo $matrix | jq --compact-output '.')" >> $GITHUB_OUTPUT
+        echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT

    - name: Generate matrix for OLAP benchmarks
      id: olap-compare-matrix
@@ -153,11 +153,11 @@ jobs:
        }'

        if [ "$(date +%A)" = "Saturday" ]; then
-          matrix=$(echo $matrix | jq '.include += [{ "platform": "rds-postgres" },
+          matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres" },
                                                   { "platform": "rds-aurora"   }]')
        fi

-        echo "matrix=$(echo $matrix | jq --compact-output '.')" >> $GITHUB_OUTPUT
+        echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT

    - name: Generate matrix for TPC-H benchmarks
      id: tpch-compare-matrix
@@ -172,11 +172,11 @@ jobs:
        }'

        if [ "$(date +%A)" = "Saturday" ]; then
-          matrix=$(echo $matrix | jq '.include += [{ "platform": "rds-postgres", "scale": "10" },
+          matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "scale": "10" },
                                                   { "platform": "rds-aurora",   "scale": "10" }]')
        fi

-        echo "matrix=$(echo $matrix | jq --compact-output '.')" >> $GITHUB_OUTPUT
+        echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT

  pgbench-compare:
    needs: [ generate-matrices ]
@@ -254,7 +254,7 @@ jobs:
        echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

        QUERY="SELECT version();"
-        if [ "${PLATFORM}" = "neon"* ]; then
+        if [[ "${PLATFORM}" = "neon"* ]]; then
          QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
        fi
        psql ${CONNSTR} -c "${QUERY}"
@@ -383,7 +383,7 @@ jobs:
        echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

        QUERY="SELECT version();"
-        if [ "${PLATFORM}" = "neon"* ]; then
+        if [[ "${PLATFORM}" = "neon"* ]]; then
          QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
        fi
        psql ${CONNSTR} -c "${QUERY}"
@@ -476,7 +476,7 @@ jobs:
            ;;
        esac

-        CONNSTR_SECRET_NAME="BENCHMARK_${ENV_PLATFORM}_S${SCALE}_CONNSTR"
+        CONNSTR_SECRET_NAME="BENCHMARK_${ENV_PLATFORM}_S${TEST_OLAP_SCALE}_CONNSTR"
        echo "CONNSTR_SECRET_NAME=${CONNSTR_SECRET_NAME}" >> $GITHUB_ENV

    - name: Set up Connection String
@@ -487,7 +487,7 @@ jobs:
        echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

        QUERY="SELECT version();"
-        if [ "${PLATFORM}" = "neon"* ]; then
+        if [[ "${PLATFORM}" = "neon"* ]]; then
          QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
        fi
        psql ${CONNSTR} -c "${QUERY}"
@@ -577,7 +577,7 @@ jobs:
        echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

        QUERY="SELECT version();"
-        if [ "${PLATFORM}" = "neon"* ]; then
+        if [[ "${PLATFORM}" = "neon"* ]]; then
          QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
        fi
        psql ${CONNSTR} -c "${QUERY}"
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -23,7 +23,30 @@ env:
  AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}

 jobs:
+  check-permissions:
+    runs-on: ubuntu-latest
+
+    steps:
+    - name: Disallow PRs from forks
+      if: |
+        github.event_name == 'pull_request' &&
+        github.event.pull_request.head.repo.full_name != github.repository
+
+      run: |
+        if [ "${{ contains(fromJSON('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event.pull_request.author_association) }}" = "true" ]; then
+          MESSAGE="Please create a PR from a branch of ${GITHUB_REPOSITORY} instead of a fork"
+        else
+          MESSAGE="The PR should be reviewed and labelled with 'approved-for-ci-run' to trigger a CI run"
+        fi
+
+        echo >&2 "We don't run CI for PRs from forks"
+        echo >&2 "${MESSAGE}"
+
+        exit 1
+
+
  tag:
+    needs: [ check-permissions ]
    runs-on: [ self-hosted, gen3, small ]
    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
    outputs:
@@ -52,6 +75,7 @@ jobs:
        id: build-tag

  check-codestyle-python:
+    needs: [ check-permissions ]
    runs-on: [ self-hosted, gen3, small ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
@@ -84,6 +108,7 @@ jobs:
        run: poetry run mypy .

  check-codestyle-rust:
+    needs: [ check-permissions ]
    runs-on: [ self-hosted, gen3, large ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
@@ -150,6 +175,7 @@ jobs:
        run: cargo deny check

  build-neon:
+    needs: [ check-permissions ]
    runs-on: [ self-hosted, gen3, large ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
@@ -186,7 +212,7 @@ jobs:
          # Eventually it will be replaced by a regression test https://github.com/neondatabase/neon/pull/4603

          FAILED=false
-          for postgres in postgres-v14 postgres-v15; do
+          for postgres in postgres-v14 postgres-v15 postgres-v16; do
            expected=$(cat vendor/revisions.json | jq --raw-output '."'"${postgres}"'"')
            actual=$(git rev-parse "HEAD:vendor/${postgres}")
            if [ "${expected}" != "${actual}" ]; then
@@ -208,6 +234,10 @@ jobs:
        id: pg_v15_rev
        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT

+      - name: Set pg 16 revision for caching
+        id: pg_v16_rev
+        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT
+
      # Set some environment variables used by all the steps.
      #
      # CARGO_FLAGS is extra options to pass to "cargo build", "cargo test" etc.
@@ -228,10 +258,12 @@ jobs:
            cov_prefix=""
            CARGO_FLAGS="--locked --release"
          fi
-          echo "cov_prefix=${cov_prefix}" >> $GITHUB_ENV
-          echo "CARGO_FEATURES=${CARGO_FEATURES}" >> $GITHUB_ENV
-          echo "CARGO_FLAGS=${CARGO_FLAGS}" >> $GITHUB_ENV
-          echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo" >> $GITHUB_ENV
+          {
+            echo "cov_prefix=${cov_prefix}"
+            echo "CARGO_FEATURES=${CARGO_FEATURES}"
+            echo "CARGO_FLAGS=${CARGO_FLAGS}"
+            echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo"
+          } >> $GITHUB_ENV

      # Disabled for now
      # Don't include the ~/.cargo/registry/src directory. It contains just
@@ -266,6 +298,13 @@ jobs:
          path: pg_install/v15
          key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}

+      - name: Cache postgres v16 build
+        id: cache_pg_16
+        uses: actions/cache@v3
+        with:
+          path: pg_install/v16
+          key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
+
      - name: Build postgres v14
        if: steps.cache_pg_14.outputs.cache-hit != 'true'
        run: mold -run make postgres-v14 -j$(nproc)
@@ -274,6 +313,10 @@ jobs:
        if: steps.cache_pg_15.outputs.cache-hit != 'true'
        run: mold -run make postgres-v15 -j$(nproc)

+      - name: Build postgres v16
+        if: steps.cache_pg_16.outputs.cache-hit != 'true'
+        run: mold -run make postgres-v16 -j$(nproc)
+
      - name: Build neon extensions
        run: mold -run make neon-pg-ext -j$(nproc)

@@ -347,17 +390,17 @@ jobs:
        uses: ./.github/actions/save-coverage-data

  regress-tests:
+    needs: [ check-permissions, build-neon ]
    runs-on: [ self-hosted, gen3, large ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      # Default shared memory is 64mb
      options: --init --shm-size=512mb
-    needs: [ build-neon ]
    strategy:
      fail-fast: false
      matrix:
        build_type: [ debug, release ]
-        pg_version: [ v14, v15 ]
+        pg_version: [ v14, v15, v16 ]
    steps:
      - name: Checkout
        uses: actions/checkout@v3
@@ -385,12 +428,12 @@ jobs:
        uses: ./.github/actions/save-coverage-data

  benchmarks:
+    needs: [ check-permissions, build-neon ]
    runs-on: [ self-hosted, gen3, small ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      # Default shared memory is 64mb
      options: --init --shm-size=512mb
-    needs: [ build-neon ]
    if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
    strategy:
      fail-fast: false
@@ -417,12 +460,13 @@ jobs:
      # while coverage is currently collected for the debug ones

  create-test-report:
+    needs: [ check-permissions, regress-tests, coverage-report, benchmarks ]
+    if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }}
+
    runs-on: [ self-hosted, gen3, small ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init
-    needs: [ regress-tests, coverage-report, benchmarks ]
-    if: ${{ !cancelled() }}

    steps:
      - uses: actions/checkout@v3
@@ -463,11 +507,12 @@ jobs:
            })

  coverage-report:
+    needs: [ check-permissions, regress-tests ]
+
    runs-on: [ self-hosted, gen3, small ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init
-    needs: [ regress-tests ]
    strategy:
      fail-fast: false
      matrix:
@@ -582,11 +627,11 @@ jobs:
            })

  trigger-e2e-tests:
+    needs: [ check-permissions, promote-images, tag ]
    runs-on: [ self-hosted, gen3, small ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
      options: --init
-    needs: [ promote-images, tag ]
    steps:
      - name: Set PR's status to pending and request a remote CI test
        run: |
@@ -627,8 +672,8 @@ jobs:
            }"

  neon-image:
+    needs: [ check-permissions, tag ]
    runs-on: [ self-hosted, gen3, large ]
-    needs: [ tag ]
    container: gcr.io/kaniko-project/executor:v1.9.2-debug
    defaults:
      run:
@@ -675,7 +720,7 @@ jobs:

  compute-tools-image:
    runs-on: [ self-hosted, gen3, large ]
-    needs: [ tag ]
+    needs: [ check-permissions, tag ]
    container: gcr.io/kaniko-project/executor:v1.9.2-debug
    defaults:
      run:
@@ -720,17 +765,17 @@ jobs:
        run: rm -rf ~/.ecr

  compute-node-image:
+    needs: [ check-permissions, tag ]
    runs-on: [ self-hosted, gen3, large ]
    container:
      image: gcr.io/kaniko-project/executor:v1.9.2-debug
      # Workaround for "Resolving download.osgeo.org (download.osgeo.org)... failed: Temporary failure in name resolution.""
      # Should be prevented by https://github.com/neondatabase/neon/issues/4281
      options: --add-host=download.osgeo.org:140.211.15.30
-    needs: [ tag ]
    strategy:
      fail-fast: false
      matrix:
-        version: [ v14, v15 ]
+        version: [ v14, v15, v16 ]
    defaults:
      run:
        shell: sh -eu {0}
@@ -779,17 +824,17 @@ jobs:
        run: rm -rf ~/.ecr

  vm-compute-node-image:
+    needs: [ check-permissions, tag, compute-node-image ]
    runs-on: [ self-hosted, gen3, large ]
-    needs: [ tag, compute-node-image ]
    strategy:
      fail-fast: false
      matrix:
-        version: [ v14, v15 ]
+        version: [ v14, v15, v16 ]
    defaults:
      run:
        shell: sh -eu {0}
    env:
-      VM_BUILDER_VERSION: v0.17.5
+      VM_BUILDER_VERSION: v0.17.11

    steps:
      - name: Checkout
@@ -821,7 +866,7 @@ jobs:
          docker push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}

  test-images:
-    needs: [ tag, neon-image, compute-node-image, compute-tools-image ]
+    needs: [ check-permissions, tag, neon-image, compute-node-image, compute-tools-image ]
    runs-on: [ self-hosted, gen3, small ]

    steps:
@@ -864,8 +909,8 @@ jobs:
          docker compose -f ./docker-compose/docker-compose.yml down

  promote-images:
+    needs: [ check-permissions, tag, test-images, vm-compute-node-image ]
    runs-on: [ self-hosted, gen3, small ]
-    needs: [ tag, test-images, vm-compute-node-image ]
    container: golang:1.19-bullseye
    # Don't add if-condition here.
    # The job should always be run because we have dependant other jobs that shouldn't be skipped
@@ -885,6 +930,7 @@ jobs:
        run: |
          crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} vm-compute-node-v14
          crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} vm-compute-node-v15
+          crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} vm-compute-node-v16

      - name: Add latest tag to images
        if: |
@@ -897,6 +943,8 @@ jobs:
          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
+          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:${{needs.tag.outputs.build-tag}} latest
+          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} latest

      - name: Push images to production ECR
        if: |
@@ -909,6 +957,8 @@ jobs:
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:latest
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:latest
+          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:latest
+          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:latest

      - name: Configure Docker Hub login
        run: |
@@ -920,6 +970,7 @@ jobs:
        run: |
          crane push vm-compute-node-v14 neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}}
          crane push vm-compute-node-v15 neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}}
+          crane push vm-compute-node-v16 neondatabase/vm-compute-node-v16:${{needs.tag.outputs.build-tag}}

      - name: Push latest tags to Docker Hub
        if: |
@@ -932,13 +983,15 @@ jobs:
          crane tag neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
          crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
          crane tag neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
+          crane tag neondatabase/compute-node-v16:${{needs.tag.outputs.build-tag}} latest
+          crane tag neondatabase/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} latest

      - name: Cleanup ECR folder
        run: rm -rf ~/.ecr

  trigger-custom-extensions-build-and-wait:
+    needs: [ check-permissions, tag ]
    runs-on: ubuntu-latest
-    needs: [ tag ]
    steps:
      - name: Set PR's status to pending and request a remote CI test
        run: |
@@ -981,7 +1034,7 @@ jobs:

          last_status="" # a variable to carry the last status of the "build-and-upload-extensions" context

-          for ((i=0; i <= $TIMEOUT; i+=$INTERVAL)); do
+          for ((i=0; i <= TIMEOUT; i+=INTERVAL)); do
            sleep $INTERVAL

            # Get statuses for the latest commit in the PR / branch
@@ -1011,10 +1064,11 @@ jobs:
          exit 1

  deploy:
+    needs: [ check-permissions, promote-images, tag, regress-tests, trigger-custom-extensions-build-and-wait ]
+    if: ( github.ref_name == 'main' || github.ref_name == 'release' ) && github.event_name != 'workflow_dispatch'
+
    runs-on: [ self-hosted, gen3, small ]
    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
-    needs: [ promote-images, tag, regress-tests, trigger-custom-extensions-build-and-wait ]
-    if: ( github.ref_name == 'main' || github.ref_name == 'release' ) && github.event_name != 'workflow_dispatch'
    steps:
      - name: Fix git ownership
        run: |
@@ -1037,8 +1091,9 @@ jobs:
          GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
        run: |
          if [[ "$GITHUB_REF_NAME" == "main" ]]; then
-            gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}}
+            gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false
          elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
+            gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=true
            gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f disclamerAcknowledged=true
          else
            echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
@@ -1052,20 +1107,35 @@ jobs:
          # Retry script for 5XX server errors: https://github.com/actions/github-script#retries
          retries: 5
          script: |
-            github.rest.git.createRef({
+            await github.rest.git.createRef({
              owner: context.repo.owner,
              repo: context.repo.repo,
              ref: "refs/tags/${{ needs.tag.outputs.build-tag }}",
              sha: context.sha,
            })

+      - name: Create GitHub release
+        if: github.ref_name == 'release'
+        uses: actions/github-script@v6
+        with:
+          # Retry script for 5XX server errors: https://github.com/actions/github-script#retries
+          retries: 5
+          script: |
+            await github.rest.repos.createRelease({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              tag_name: "${{ needs.tag.outputs.build-tag }}",
+              generate_release_notes: true,
+            })
+
  promote-compatibility-data:
+    needs: [ check-permissions, promote-images, tag, regress-tests ]
+    if: github.ref_name == 'release'
+
    runs-on: [ self-hosted, gen3, small ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
      options: --init
-    needs: [ promote-images, tag, regress-tests ]
-    if: github.ref_name == 'release' && github.event_name != 'workflow_dispatch'
    steps:
      - name: Promote compatibility snapshot for the release
        env:
@@ -1073,7 +1143,7 @@ jobs:
          PREFIX: artifacts/latest
        run: |
          # Update compatibility snapshot for the release
-          for pg_version in v14 v15; do
+          for pg_version in v14 v15 v16; do
            for build_type in debug release; do
              OLD_FILENAME=compatibility-snapshot-${build_type}-pg${pg_version}-${GITHUB_RUN_ID}.tar.zst
              NEW_FILENAME=compatibility-snapshot-${build_type}-pg${pg_version}.tar.zst
--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -38,7 +38,7 @@ jobs:
          fetch-depth: 1

      - name: Install macOS postgres dependencies
-        run: brew install flex bison openssl protobuf
+        run: brew install flex bison openssl protobuf icu4c pkg-config

      - name: Set pg 14 revision for caching
        id: pg_v14_rev
@@ -48,6 +48,10 @@ jobs:
        id: pg_v15_rev
        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT

+      - name: Set pg 16 revision for caching
+        id: pg_v16_rev
+        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT
+
      - name: Cache postgres v14 build
        id: cache_pg_14
        uses: actions/cache@v3
@@ -62,6 +66,13 @@ jobs:
          path: pg_install/v15
          key: v1-${{ runner.os }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}

+      - name: Cache postgres v16 build
+        id: cache_pg_16
+        uses: actions/cache@v3
+        with:
+          path: pg_install/v16
+          key: v1-${{ runner.os }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
+
      - name: Set extra env for macOS
        run: |
          echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV
@@ -85,6 +96,10 @@ jobs:
        if: steps.cache_pg_15.outputs.cache-hit != 'true'
        run: make postgres-v15 -j$(nproc)

+      - name: Build postgres v16
+        if: steps.cache_pg_16.outputs.cache-hit != 'true'
+        run: make postgres-v16 -j$(nproc)
+
      - name: Build neon extensions
        run: make neon-pg-ext -j$(nproc)

--- a/.github/workflows/release-notify.yml
+++ b/.github/workflows/release-notify.yml
@@ -0,0 +1,29 @@
+name: Notify Slack channel about upcoming release
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.number }}
+  cancel-in-progress: true
+
+on:
+  pull_request:
+    branches:
+      - release
+    types:
+      # Default types that trigger a workflow:
+      # - https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#pull_request
+      - opened
+      - synchronize
+      - reopened
+      # Additional types that we want to handle:
+      - closed
+
+jobs:
+  notify:
+    runs-on: [ ubuntu-latest ]
+
+    steps:
+      - uses: neondatabase/dev-actions/release-pr-notify@main
+        with:
+          slack-token: ${{ secrets.SLACK_BOT_TOKEN }}
+          slack-channel-id: ${{ vars.SLACK_UPCOMING_RELEASE_CHANNEL_ID || 'C05QQ9J1BRC' }} # if not set, then `#test-release-notifications`
+          github-token: ${{ secrets.GITHUB_TOKEN }}
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -7,11 +7,14 @@ on:

 jobs:
  create_release_branch:
-    runs-on: [ubuntu-latest]
+    runs-on: [ ubuntu-latest ]
+
+    permissions:
+      contents: write # for `git push`

    steps:
    - name: Check out code
-      uses: actions/checkout@v3
+      uses: actions/checkout@v4
      with:
        ref: main

@@ -26,9 +29,16 @@ jobs:
      run: git push origin releases/${{ steps.date.outputs.date }}

    - name: Create pull request into release
-      uses: thomaseizinger/create-pull-request@e3972219c86a56550fb70708d96800d8e24ba862 # 1.3.0
-      with:
-        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        head: releases/${{ steps.date.outputs.date }}
-        base: release
-        title: Release ${{ steps.date.outputs.date }}
+      env:
+        GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
+      run: |
+        cat << EOF > body.md
+          ## Release ${{ steps.date.outputs.date }}
+
+          **Please merge this PR using 'Create a merge commit'!**
+        EOF
+
+        gh pr create --title "Release ${{ steps.date.outputs.date }}" \
+                     --body-file "body.md" \
+                     --head "releases/${{ steps.date.outputs.date }}" \
+                     --base "release"
--- a/.gitmodules
+++ b/.gitmodules
@@ -6,3 +6,7 @@
 	path = vendor/postgres-v15
 	url = https://github.com/neondatabase/postgres.git
 	branch = REL_15_STABLE_neon
+[submodule "vendor/postgres-v16"]
+	path = vendor/postgres-v16
+	url = https://github.com/neondatabase/postgres.git
+	branch = REL_16_STABLE_neon
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -27,3 +27,28 @@ your patch's fault. Help to fix the root cause if something else has
 broken the CI, before pushing.

 *Happy Hacking!*
+
+# How to run a CI pipeline on Pull Requests from external contributors
+_Instructions for maintainers_
+
+## TL;DR:
+- Review the PR
+- If and only if it looks **safe** (i.e. it doesn't contain any malicious code which could expose secrets or harm the CI), then:
+    - Press the "Approve and run" button in GitHub UI
+    - Add the `approved-for-ci-run` label to the PR
+
+Repeat all steps after any change to the PR.
+- When the changes are ready to get merged — merge the original PR (not the internal one)
+
+## Longer version:
+
+GitHub Actions workflows triggered by the `pull_request` event don't share repository secrets with forks (for security reasons),
+so the CI pipeline cannot pass as-is on Pull Requests from external contributors.
+
+We're using the following approach to make it work:
+- After the review, assign the `approved-for-ci-run` label to the PR if changes look safe
+- A GitHub Action will create an internal branch and a new PR with the same changes (for example, for a PR `#1234`, it'll be a branch `ci-run/pr-1234`)
+- Because the PR is created from the internal branch, it is able to access repository secrets (that's why it's crucial to make sure that the PR doesn't contain any malicious code that could expose our secrets or intentionally harm the CI)
+- The label gets removed automatically, so to run CI again with new changes, the label should be added again (after the review)
+
+For details see [`approved-for-ci-run.yml`](.github/workflows/approved-for-ci-run.yml)
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -636,7 +636,7 @@ dependencies = [
 "sha1",
 "sync_wrapper",
 "tokio",
- "tokio-tungstenite 0.20.0",
+ "tokio-tungstenite",
 "tower",
 "tower-layer",
 "tower-service",
@@ -1941,15 +1941,15 @@ dependencies = [

 [[package]]
 name = "hyper-tungstenite"
-version = "0.9.0"
+version = "0.11.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "880b8b1c98a5ec2a505c7c90db6d3f6f1f480af5655d9c5b55facc9382a5a5b5"
+checksum = "7cc7dcb1ab67cd336f468a12491765672e61a3b6b148634dbfe2fe8acd3fe7d9"
 dependencies = [
 "hyper",
- "pin-project",
+ "pin-project-lite",
 "tokio",
- "tokio-tungstenite 0.18.0",
- "tungstenite 0.18.0",
+ "tokio-tungstenite",
+ "tungstenite",
 ]

 [[package]]
@@ -2908,9 +2908,9 @@ dependencies = [

 [[package]]
 name = "pin-project-lite"
-version = "0.2.9"
+version = "0.2.13"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e0a7ae3ac2f1173085d398531c705756c94a4c56843785df85a60c1a0afac116"
+checksum = "8afb450f006bf6385ca15ef45d71d2288452bc3683ce2e2cacc0d18e4be60b58"

 [[package]]
 name = "pin-utils"
@@ -4641,18 +4641,6 @@ dependencies = [
 "xattr",
 ]

-[[package]]
-name = "tokio-tungstenite"
-version = "0.18.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "54319c93411147bced34cb5609a80e0a8e44c5999c93903a81cd866630ec0bfd"
-dependencies = [
- "futures-util",
- "log",
- "tokio",
- "tungstenite 0.18.0",
-]
-
 [[package]]
 name = "tokio-tungstenite"
 version = "0.20.0"
@@ -4662,7 +4650,7 @@ dependencies = [
 "futures-util",
 "log",
 "tokio",
- "tungstenite 0.20.0",
+ "tungstenite",
 ]

 [[package]]
@@ -4977,28 +4965,9 @@ checksum = "3528ecfd12c466c6f163363caf2d02a71161dd5e1cc6ae7b34207ea2d42d81ed"

 [[package]]
 name = "tungstenite"
-version = "0.18.0"
+version = "0.20.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "30ee6ab729cd4cf0fd55218530c4522ed30b7b6081752839b68fcec8d0960788"
-dependencies = [
- "base64 0.13.1",
- "byteorder",
- "bytes",
- "http",
- "httparse",
- "log",
- "rand",
- "sha1",
- "thiserror",
- "url",
- "utf-8",
-]
-
-[[package]]
-name = "tungstenite"
-version = "0.20.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e862a1c4128df0112ab625f55cd5c934bcb4312ba80b39ae4b4835a3fd58e649"
+checksum = "9e3dac10fd62eaf6617d3a904ae222845979aec67c615d1c842b4002c7666fb9"
 dependencies = [
 "byteorder",
 "bytes",
@@ -5648,6 +5617,7 @@ dependencies = [
 "tower",
 "tracing",
 "tracing-core",
+ "tungstenite",
 "url",
 "uuid",
 ]
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,4 +1,5 @@
 [workspace]
+resolver = "2"
 members = [
    "compute_tools",
    "control_plane",
@@ -77,7 +78,7 @@ hostname = "0.3.1"
 humantime = "2.1"
 humantime-serde = "1.1.1"
 hyper = "0.14"
-hyper-tungstenite = "0.9"
+hyper-tungstenite = "0.11"
 inotify = "0.10.2"
 itertools = "0.10"
 jsonwebtoken = "8"
--- a/Dockerfile
+++ b/Dockerfile
@@ -12,6 +12,7 @@ WORKDIR /home/nonroot

 COPY --chown=nonroot vendor/postgres-v14 vendor/postgres-v14
 COPY --chown=nonroot vendor/postgres-v15 vendor/postgres-v15
+COPY --chown=nonroot vendor/postgres-v16 vendor/postgres-v16
 COPY --chown=nonroot pgxn pgxn
 COPY --chown=nonroot Makefile Makefile
 COPY --chown=nonroot scripts/ninstall.sh scripts/ninstall.sh
@@ -39,6 +40,7 @@ ARG CACHEPOT_BUCKET=neon-github-dev

 COPY --from=pg-build /home/nonroot/pg_install/v14/include/postgresql/server pg_install/v14/include/postgresql/server
 COPY --from=pg-build /home/nonroot/pg_install/v15/include/postgresql/server pg_install/v15/include/postgresql/server
+COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_install/v16/include/postgresql/server
 COPY --chown=nonroot . .

 # Show build caching stats to check if it was used in the end.
@@ -65,6 +67,7 @@ RUN set -e \
    && apt install -y \
        libreadline-dev \
        libseccomp-dev \
+        libicu67 \
        openssl \
        ca-certificates \
    && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \
@@ -81,6 +84,7 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local

 COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/
 COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/
+COPY --from=pg-build /home/nonroot/pg_install/v16 /usr/local/v16/
 COPY --from=pg-build /home/nonroot/postgres_install.tar.gz /data/

 # By default, pageserver uses `.neon/` working directory in WORKDIR, so create one and fill it with the dummy config.
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -74,8 +74,8 @@ RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar

 ENV PATH "/usr/local/pgsql/bin:$PATH"

-RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.2.tar.gz -O postgis.tar.gz && \
-    echo "9a2a219da005a1730a39d1959a1c7cec619b1efb009b65be80ffc25bad299068 postgis.tar.gz" | sha256sum --check && \
+RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.3.tar.gz -O postgis.tar.gz && \
+    echo "74eb356e3f85f14233791013360881b6748f78081cc688ff9d6f0f673a762d13 postgis.tar.gz" | sha256sum --check && \
    mkdir postgis-src && cd postgis-src && tar xvzf ../postgis.tar.gz --strip-components=1 -C . && \
    find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
    ./autogen.sh && \
@@ -124,8 +124,21 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 RUN apt update && \
    apt install -y ninja-build python3-dev libncurses5 binutils clang

-RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.5.tar.gz -O plv8.tar.gz && \
-    echo "1e108d5df639e4c189e1c5bdfa2432a521c126ca89e7e5a969d46899ca7bf106 plv8.tar.gz" | sha256sum --check && \
+RUN case "${PG_VERSION}" in \
+      "v14" | "v15") \
+        export PLV8_VERSION=3.1.5 \
+        export PLV8_CHECKSUM=1e108d5df639e4c189e1c5bdfa2432a521c126ca89e7e5a969d46899ca7bf106 \
+        ;; \
+      "v16") \
+        export PLV8_VERSION=3.1.8 \
+        export PLV8_CHECKSUM=92b10c7db39afdae97ff748c9ec54713826af222c459084ad002571b79eb3f49 \
+        ;; \
+      *) \
+        echo "Export the valid PG_VERSION variable" && exit 1 \
+        ;; \
+    esac && \
+    wget https://github.com/plv8/plv8/archive/refs/tags/v${PLV8_VERSION}.tar.gz -O plv8.tar.gz && \
+    echo "${PLV8_CHECKSUM} plv8.tar.gz" | sha256sum --check && \
    mkdir plv8-src && cd plv8-src && tar xvzf ../plv8.tar.gz --strip-components=1 -C . && \
    export PATH="/usr/local/pgsql/bin:$PATH" && \
    make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \
@@ -172,8 +185,8 @@ RUN wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz
    cp -R /h3/usr / && \
    rm -rf build

-RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.2.tar.gz -O h3-pg.tar.gz && \
-    echo "c135aa45999b2ad1326d2537c1cadef96d52660838e4ca371706c08fdea1a956 h3-pg.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3-pg.tar.gz && \
+    echo "5c17f09a820859ffe949f847bebf1be98511fb8f1bd86f94932512c00479e324 h3-pg.tar.gz" | sha256sum --check && \
    mkdir h3-pg-src && cd h3-pg-src && tar xvzf ../h3-pg.tar.gz --strip-components=1 -C . && \
    export PATH="/usr/local/pgsql/bin:$PATH" && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
@@ -243,8 +256,8 @@ RUN wget https://github.com/michelp/pgjwt/archive/9742dab1b2f297ad3811120db7b214
 FROM build-deps AS hypopg-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.3.1.tar.gz -O hypopg.tar.gz && \
-    echo "e7f01ee0259dc1713f318a108f987663d60f3041948c2ada57a94b469565ca8e hypopg.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.0.tar.gz -O hypopg.tar.gz && \
+    echo "0821011743083226fc9b813c1f2ef5897a91901b57b6bea85a78e466187c6819 hypopg.tar.gz" | sha256sum --check && \
    mkdir hypopg-src && cd hypopg-src && tar xvzf ../hypopg.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -307,8 +320,8 @@ RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.2.0.tar.gz -O pgta
 FROM build-deps AS ip4r-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.1.tar.gz -O ip4r.tar.gz && \
-    echo "78b9f0c1ae45c22182768fe892a32d533c82281035e10914111400bf6301c726 ip4r.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O ip4r.tar.gz && \
+    echo "0f7b1f159974f49a47842a8ab6751aecca1ed1142b6d5e38d81b064b2ead1b4b ip4r.tar.gz" | sha256sum --check && \
    mkdir ip4r-src && cd ip4r-src && tar xvzf ../ip4r.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -323,8 +336,8 @@ RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.1.tar.gz -O i
 FROM build-deps AS prefix-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.9.tar.gz -O prefix.tar.gz && \
-    echo "38d30a08d0241a8bbb8e1eb8f0152b385051665a8e621c8899e7c5068f8b511e prefix.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O prefix.tar.gz && \
+    echo "4342f251432a5f6fb05b8597139d3ccde8dcf87e8ca1498e7ee931ca057a8575 prefix.tar.gz" | sha256sum --check && \
    mkdir prefix-src && cd prefix-src && tar xvzf ../prefix.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -339,8 +352,8 @@ RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.9.tar.gz -O pr
 FROM build-deps AS hll-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.17.tar.gz -O hll.tar.gz && \
-    echo "9a18288e884f197196b0d29b9f178ba595b0dfc21fbf7a8699380e77fa04c1e9 hll.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar.gz -O hll.tar.gz && \
+    echo "e2f55a6f4c4ab95ee4f1b4a2b73280258c5136b161fe9d059559556079694f0e hll.tar.gz" | sha256sum --check && \
    mkdir hll-src && cd hll-src && tar xvzf ../hll.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -355,8 +368,8 @@ RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.17.tar
 FROM build-deps AS plpgsql-check-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.3.2.tar.gz -O plpgsql_check.tar.gz && \
-    echo "9d81167c4bbeb74eebf7d60147b21961506161addc2aee537f95ad8efeae427b plpgsql_check.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.4.0.tar.gz -O plpgsql_check.tar.gz && \
+    echo "9ba58387a279b35a3bfa39ee611e5684e6cddb2ba046ddb2c5190b3bd2ca254a plpgsql_check.tar.gz" | sha256sum --check && \
    mkdir plpgsql_check-src && cd plpgsql_check-src && tar xvzf ../plpgsql_check.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
@@ -371,12 +384,21 @@ RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.3.2.tar.gz
 FROM build-deps AS timescaledb-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

+ARG PG_VERSION
 ENV PATH "/usr/local/pgsql/bin:$PATH"

-RUN apt-get update && \
+RUN case "${PG_VERSION}" in \
+      "v14" | "v15") \
+        export TIMESCALEDB_VERSION=2.10.1 \
+        export TIMESCALEDB_CHECKSUM=6fca72a6ed0f6d32d2b3523951ede73dc5f9b0077b38450a029a5f411fdb8c73 \
+        ;; \
+      *) \
+        echo "TimescaleDB not supported on this PostgreSQL version. See https://github.com/timescale/timescaledb/issues/5752" && exit 0;; \
+    esac && \
+    apt-get update && \
    apt-get install -y cmake && \
-    wget https://github.com/timescale/timescaledb/archive/refs/tags/2.10.1.tar.gz -O timescaledb.tar.gz && \
-    echo "6fca72a6ed0f6d32d2b3523951ede73dc5f9b0077b38450a029a5f411fdb8c73 timescaledb.tar.gz" | sha256sum --check && \
+    wget https://github.com/timescale/timescaledb/archive/refs/tags/${TIMESCALEDB_VERSION}.tar.gz -O timescaledb.tar.gz && \
+    echo "${TIMESCALEDB_CHECKSUM} timescaledb.tar.gz" | sha256sum --check && \
    mkdir timescaledb-src && cd timescaledb-src && tar xvzf ../timescaledb.tar.gz --strip-components=1 -C . && \
    ./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOOL=OFF -DAPACHE_ONLY:BOOL=ON -DCMAKE_BUILD_TYPE=Release && \
    cd build && \
@@ -405,6 +427,10 @@ RUN case "${PG_VERSION}" in \
        export PG_HINT_PLAN_VERSION=15_1_5_0 \
        export PG_HINT_PLAN_CHECKSUM=564cbbf4820973ffece63fbf76e3c0af62c4ab23543142c7caaa682bc48918be \
        ;; \
+      "v16") \
+        export PG_HINT_PLAN_VERSION=16_1_6_0 \
+        export PG_HINT_PLAN_CHECKSUM=fc85a9212e7d2819d4ae4ac75817481101833c3cfa9f0fe1f980984e12347d00 \
+        ;; \
      *) \
        echo "Export the valid PG_HINT_PLAN_VERSION variable" && exit 1 \
        ;; \
@@ -452,8 +478,8 @@ FROM build-deps AS pg-cron-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 ENV PATH "/usr/local/pgsql/bin/:$PATH"
-RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.5.2.tar.gz -O pg_cron.tar.gz && \
-    echo "6f7f0980c03f1e2a6a747060e67bf4a303ca2a50e941e2c19daeed2b44dec744 pg_cron.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.0.tar.gz -O pg_cron.tar.gz && \
+    echo "383a627867d730222c272bfd25cd5e151c578d73f696d32910c7db8c665cc7db pg_cron.tar.gz" | sha256sum --check && \
    mkdir pg_cron-src && cd pg_cron-src && tar xvzf ../pg_cron.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
@@ -479,8 +505,8 @@ RUN apt-get update && \
        libfreetype6-dev

 ENV PATH "/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH"
-RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_1.tar.gz -O rdkit.tar.gz && \
-    echo "db346afbd0ba52c843926a2a62f8a38c7b774ffab37eaf382d789a824f21996c rdkit.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.gz -O rdkit.tar.gz && \
+    echo "bdbf9a2e6988526bfeb8c56ce3cdfe2998d60ac289078e2215374288185e8c8d rdkit.tar.gz" | sha256sum --check && \
    mkdir rdkit-src && cd rdkit-src && tar xvzf ../rdkit.tar.gz --strip-components=1 -C . && \
    cmake \
        -D RDK_BUILD_CAIRO_SUPPORT=OFF \
@@ -551,12 +577,19 @@ FROM build-deps AS pg-embedding-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 ENV PATH "/usr/local/pgsql/bin/:$PATH"
-RUN wget https://github.com/neondatabase/pg_embedding/archive/refs/tags/0.3.5.tar.gz -O pg_embedding.tar.gz && \
-    echo "0e95b27b8b6196e2cf0a0c9ec143fe2219b82e54c5bb4ee064e76398cbe69ae9 pg_embedding.tar.gz" | sha256sum --check && \
+RUN case "${PG_VERSION}" in \
+      "v14" | "v15") \
+        export PG_EMBEDDING_VERSION=0.3.5 \
+        export PG_EMBEDDING_CHECKSUM=0e95b27b8b6196e2cf0a0c9ec143fe2219b82e54c5bb4ee064e76398cbe69ae9 \
+        ;; \
+      *) \
+        echo "pg_embedding not supported on this PostgreSQL version. Use pgvector instead." && exit 0;; \
+    esac && \
+    wget https://github.com/neondatabase/pg_embedding/archive/refs/tags/${PG_EMBEDDING_VERSION}.tar.gz -O pg_embedding.tar.gz && \
+    echo "${PG_EMBEDDING_CHECKSUM} pg_embedding.tar.gz" | sha256sum --check && \
    mkdir pg_embedding-src && cd pg_embedding-src && tar xvzf ../pg_embedding.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
-    make -j $(getconf _NPROCESSORS_ONLN) install && \
-    echo 'trusted = true' >> /usr/local/pgsql/share/extension/embedding.control
+    make -j $(getconf _NPROCESSORS_ONLN) install

 #########################################################################################
 #
@@ -584,6 +617,10 @@ RUN wget https://gitlab.com/dalibo/postgresql_anonymizer/-/archive/1.1.0/postgre
 # Layer "rust extensions"
 # This layer is used to build `pgx` deps
 #
+# FIXME: This needs to be updated to the latest version of 'pgrx' (it was renamed from
+# 'pgx' to 'pgrx') for PostgreSQL 16. That in turn requires bumping the pgx
+# dependency in all the Rust extensions that depend on it, too.
+#
 #########################################################################################
 FROM build-deps AS rust-extensions-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
@@ -598,7 +635,17 @@ USER nonroot
 WORKDIR /home/nonroot
 ARG PG_VERSION

-RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \
+RUN case "${PG_VERSION}" in \
+      "v14" | "v15") \
+        ;; \
+      "v16") \
+        echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \
+	;; \
+      *) \
+        echo "unexpected PostgreSQL version ${PG_VERSION}" && exit 1 \
+        ;; \
+    esac && \
+    curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \
    chmod +x rustup-init && \
    ./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \
    rm rustup-init && \
@@ -615,10 +662,21 @@ USER root
 #########################################################################################

 FROM rust-extensions-build AS pg-jsonschema-pg-build
+ARG PG_VERSION

 # caeab60d70b2fd3ae421ec66466a3abbb37b7ee6 made on 06/03/2023
 # there is no release tag yet, but we need it due to the superuser fix in the control file, switch to git tag after release >= 0.1.5
-RUN wget https://github.com/supabase/pg_jsonschema/archive/caeab60d70b2fd3ae421ec66466a3abbb37b7ee6.tar.gz -O pg_jsonschema.tar.gz && \
+RUN case "${PG_VERSION}" in \
+      "v14" | "v15") \
+        ;; \
+      "v16") \
+        echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \
+	;; \
+      *) \
+        echo "unexpected PostgreSQL version \"${PG_VERSION}\"" && exit 1 \
+        ;; \
+    esac && \
+    wget https://github.com/supabase/pg_jsonschema/archive/caeab60d70b2fd3ae421ec66466a3abbb37b7ee6.tar.gz -O pg_jsonschema.tar.gz && \
    echo "54129ce2e7ee7a585648dbb4cef6d73f795d94fe72f248ac01119992518469a4 pg_jsonschema.tar.gz" | sha256sum --check && \
    mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xvzf ../pg_jsonschema.tar.gz --strip-components=1 -C . && \
    sed -i 's/pgx = "0.7.1"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
@@ -633,12 +691,23 @@ RUN wget https://github.com/supabase/pg_jsonschema/archive/caeab60d70b2fd3ae421e
 #########################################################################################

 FROM rust-extensions-build AS pg-graphql-pg-build
+ARG PG_VERSION

 # b4988843647450a153439be367168ed09971af85 made on 22/02/2023 (from remove-pgx-contrib-spiext branch)
 # Currently pgx version bump to >= 0.7.2  causes "call to unsafe function" compliation errors in
 # pgx-contrib-spiext. There is a branch that removes that dependency, so use it. It is on the
 # same 1.1 version we've used before.
-RUN wget https://github.com/yrashk/pg_graphql/archive/b4988843647450a153439be367168ed09971af85.tar.gz -O pg_graphql.tar.gz && \
+RUN case "${PG_VERSION}" in \
+      "v14" | "v15") \
+        ;; \
+      "v16") \
+        echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \
+	;; \
+      *) \
+        echo "unexpected PostgreSQL version" && exit 1 \
+        ;; \
+    esac && \
+    wget https://github.com/yrashk/pg_graphql/archive/b4988843647450a153439be367168ed09971af85.tar.gz -O pg_graphql.tar.gz && \
    echo "0c7b0e746441b2ec24187d0e03555faf935c2159e2839bddd14df6dafbc8c9bd pg_graphql.tar.gz" | sha256sum --check && \
    mkdir pg_graphql-src && cd pg_graphql-src && tar xvzf ../pg_graphql.tar.gz --strip-components=1 -C . && \
    sed -i 's/pgx = "~0.7.1"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
@@ -656,9 +725,20 @@ RUN wget https://github.com/yrashk/pg_graphql/archive/b4988843647450a153439be367
 #########################################################################################

 FROM rust-extensions-build AS pg-tiktoken-pg-build
+ARG PG_VERSION

 # 801f84f08c6881c8aa30f405fafbf00eec386a72 made on 10/03/2023
-RUN wget https://github.com/kelvich/pg_tiktoken/archive/801f84f08c6881c8aa30f405fafbf00eec386a72.tar.gz -O pg_tiktoken.tar.gz && \
+RUN case "${PG_VERSION}" in \
+      "v14" | "v15") \
+        ;; \
+      "v16") \
+        echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \
+	;; \
+      *) \
+        echo "unexpected PostgreSQL version" && exit 1 \
+        ;; \
+    esac && \
+    wget https://github.com/kelvich/pg_tiktoken/archive/801f84f08c6881c8aa30f405fafbf00eec386a72.tar.gz -O pg_tiktoken.tar.gz && \
    echo "52f60ac800993a49aa8c609961842b611b6b1949717b69ce2ec9117117e16e4a pg_tiktoken.tar.gz" | sha256sum --check && \
    mkdir pg_tiktoken-src && cd pg_tiktoken-src && tar xvzf ../pg_tiktoken.tar.gz --strip-components=1 -C . && \
    cargo pgx install --release && \
@@ -672,8 +752,19 @@ RUN wget https://github.com/kelvich/pg_tiktoken/archive/801f84f08c6881c8aa30f405
 #########################################################################################

 FROM rust-extensions-build AS pg-pgx-ulid-build
+ARG PG_VERSION

-RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.0.tar.gz -O pgx_ulid.tar.gz && \
+RUN case "${PG_VERSION}" in \
+      "v14" | "v15") \
+        ;; \
+      "v16") \
+        echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \
+	;; \
+      *) \
+        echo "unexpected PostgreSQL version" && exit 1 \
+        ;; \
+    esac && \
+    wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.0.tar.gz -O pgx_ulid.tar.gz && \
    echo "908b7358e6f846e87db508ae5349fb56a88ee6305519074b12f3d5b0ff09f791 pgx_ulid.tar.gz" | sha256sum --check && \
    mkdir pgx_ulid-src && cd pgx_ulid-src && tar xvzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \
    sed -i 's/pgx        = "=0.7.3"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
@@ -726,6 +817,20 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \
        PG_CONFIG=/usr/local/pgsql/bin/pg_config \
        -C pgxn/neon_utils \
        -s install && \
+    make -j $(getconf _NPROCESSORS_ONLN) \
+        PG_CONFIG=/usr/local/pgsql/bin/pg_config \
+        -C pgxn/neon_rmgr \
+        -s install && \
+    case "${PG_VERSION}" in \
+        "v14" | "v15") \
+        ;; \
+        "v16") \
+            echo "Skipping HNSW for PostgreSQL 16" && exit 0 \
+        ;; \
+        *) \
+            echo "unexpected PostgreSQL version" && exit 1 \
+        ;; \
+        esac && \
    make -j $(getconf _NPROCESSORS_ONLN) \
        PG_CONFIG=/usr/local/pgsql/bin/pg_config \
        -C pgxn/hnsw \
--- a/Makefile
+++ b/Makefile
@@ -29,6 +29,7 @@ else ifeq ($(UNAME_S),Darwin)
 	# It can be configured with OPENSSL_PREFIX variable
 	OPENSSL_PREFIX ?= $(shell brew --prefix openssl@3)
 	PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib
+	PG_CONFIGURE_OPTS += PKG_CONFIG_PATH=$(shell brew --prefix icu4c)/lib/pkgconfig
 	# macOS already has bison and flex in the system, but they are old and result in postgres-v14 target failure
 	# brew formulae are keg-only and not symlinked into HOMEBREW_PREFIX, force their usage
 	EXTRA_PATH_OVERRIDES += $(shell brew --prefix bison)/bin/:$(shell brew --prefix flex)/bin/:
@@ -83,6 +84,8 @@ $(POSTGRES_INSTALL_DIR)/build/%/config.status:
 # I'm not sure why it wouldn't work, but this is the only place (apart from
 # the "build-all-versions" entry points) where direct mention of PostgreSQL
 # versions is used.
+.PHONY: postgres-configure-v16
+postgres-configure-v16: $(POSTGRES_INSTALL_DIR)/build/v16/config.status
 .PHONY: postgres-configure-v15
 postgres-configure-v15: $(POSTGRES_INSTALL_DIR)/build/v15/config.status
 .PHONY: postgres-configure-v14
@@ -118,6 +121,10 @@ postgres-clean-%:
 	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pageinspect clean
 	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/src/interfaces/libpq clean

+.PHONY: postgres-check-%
+postgres-check-%: postgres-%
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$* MAKELEVEL=0 check
+
 .PHONY: neon-pg-ext-%
 neon-pg-ext-%: postgres-%
 	+@echo "Compiling neon $*"
@@ -130,6 +137,11 @@ neon-pg-ext-%: postgres-%
 	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
 		-C $(POSTGRES_INSTALL_DIR)/build/neon-walredo-$* \
 		-f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile install
+	+@echo "Compiling neon_rmgr $*"
+	mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-rmgr-$*
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
+		-C $(POSTGRES_INSTALL_DIR)/build/neon-rmgr-$* \
+		-f $(ROOT_PROJECT_DIR)/pgxn/neon_rmgr/Makefile install
 	+@echo "Compiling neon_test_utils $*"
 	mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-$*
 	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
@@ -140,11 +152,6 @@ neon-pg-ext-%: postgres-%
 	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
 		-C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \
 		-f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile install
-	+@echo "Compiling hnsw $*"
-	mkdir -p $(POSTGRES_INSTALL_DIR)/build/hnsw-$*
-	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-		-C $(POSTGRES_INSTALL_DIR)/build/hnsw-$* \
-		-f $(ROOT_PROJECT_DIR)/pgxn/hnsw/Makefile install

 .PHONY: neon-pg-ext-clean-%
 neon-pg-ext-clean-%:
@@ -160,35 +167,43 @@ neon-pg-ext-clean-%:
 	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \
 	-C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \
 	-f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile clean
-	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \
-	-C $(POSTGRES_INSTALL_DIR)/build/hnsw-$* \
-	-f $(ROOT_PROJECT_DIR)/pgxn/hnsw/Makefile clean

 .PHONY: neon-pg-ext
 neon-pg-ext: \
 	neon-pg-ext-v14 \
-	neon-pg-ext-v15
+	neon-pg-ext-v15 \
+	neon-pg-ext-v16

 .PHONY: neon-pg-ext-clean
 neon-pg-ext-clean: \
 	neon-pg-ext-clean-v14 \
-	neon-pg-ext-clean-v15
+	neon-pg-ext-clean-v15 \
+	neon-pg-ext-clean-v16

 # shorthand to build all Postgres versions
 .PHONY: postgres
 postgres: \
 	postgres-v14 \
-	postgres-v15
+	postgres-v15 \
+	postgres-v16

 .PHONY: postgres-headers
 postgres-headers: \
 	postgres-headers-v14 \
-	postgres-headers-v15
+	postgres-headers-v15 \
+	postgres-headers-v16

 .PHONY: postgres-clean
 postgres-clean: \
 	postgres-clean-v14 \
-	postgres-clean-v15
+	postgres-clean-v15 \
+	postgres-clean-v16
+
+.PHONY: postgres-check
+postgres-check: \
+	postgres-check-v14 \
+	postgres-check-v15 \
+	postgres-check-v16

 # This doesn't remove the effects of 'configure'.
 .PHONY: clean
--- a/README.md
+++ b/README.md
@@ -29,18 +29,18 @@ See developer documentation in [SUMMARY.md](/docs/SUMMARY.md) for more informati
 ```bash
 apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \
 libssl-dev clang pkg-config libpq-dev cmake postgresql-client protobuf-compiler \
-libcurl4-openssl-dev openssl python-poetry
+libcurl4-openssl-dev openssl python-poetry lsof libicu-dev
 ```
 * On Fedora, these packages are needed:
 ```bash
 dnf install flex bison readline-devel zlib-devel openssl-devel \
  libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler \
-  protobuf-devel libcurl-devel openssl poetry
+  protobuf-devel libcurl-devel openssl poetry lsof libicu-devel
 ```
 * On Arch based systems, these packages are needed:
 ```bash
 pacman -S base-devel readline zlib libseccomp openssl clang \
-postgresql-libs cmake postgresql protobuf curl
+postgresql-libs cmake postgresql protobuf curl lsof
 ```

 Building Neon requires 3.15+ version of `protoc` (protobuf-compiler). If your distribution provides an older version, you can install a newer version from [here](https://github.com/protocolbuffers/protobuf/releases).
@@ -55,7 +55,7 @@ curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
 1. Install XCode and dependencies
 ```
 xcode-select --install
-brew install protobuf openssl flex bison
+brew install protobuf openssl flex bison icu4c pkg-config

 # add openssl to PATH, required for ed25519 keys generation in neon_local
 echo 'export PATH="$(brew --prefix openssl)/bin:$PATH"' >> ~/.zshrc
--- a/clippy.toml
+++ b/clippy.toml
@@ -0,0 +1,5 @@
+disallowed-methods = [
+    "tokio::task::block_in_place",
+    # Allow this for now, to deny it later once we stop using Handle::block_on completely
+    # "tokio::runtime::Handle::block_on",
+]
--- a/compute_tools/src/checker.rs
+++ b/compute_tools/src/checker.rs
@@ -1,7 +1,7 @@
 use anyhow::{anyhow, Ok, Result};
 use postgres::Client;
 use tokio_postgres::NoTls;
-use tracing::{error, instrument};
+use tracing::{error, instrument, warn};

 use crate::compute::ComputeNode;

@@ -55,13 +55,24 @@ pub async fn check_writability(compute: &ComputeNode) -> Result<()> {
        ON CONFLICT (id) DO UPDATE
         SET updated_at = now();";

-    let result = client.simple_query(query).await?;
-
-    if result.len() != 1 {
-        return Err(anyhow::format_err!(
-            "expected 1 query result, but got {}",
-            result.len()
-        ));
+    match client.simple_query(query).await {
+        Result::Ok(result) => {
+            if result.len() != 1 {
+                return Err(anyhow::anyhow!(
+                    "expected 1 query result, but got {}",
+                    result.len()
+                ));
+            }
+        }
+        Err(err) => {
+            // DISK_FULL is special-cased: it is logged as a warning and the
+            // check still reports success; any other error is propagated.
+            if err.code() == Some(&tokio_postgres::error::SqlState::DISK_FULL) {
+                warn!("Tenant disk is full");
+                return Ok(());
+            }
+            return Err(err.into());
+        }
    }

    Ok(())
--- a/compute_tools/src/config.rs
+++ b/compute_tools/src/config.rs
@@ -46,8 +46,6 @@ pub fn write_postgres_conf(
        writeln!(file, "{}", conf)?;
    }

-    write!(file, "{}", &spec.cluster.settings.as_pg_settings())?;
-
    // Add options for connecting to storage
    writeln!(file, "# Neon storage settings")?;
    if let Some(s) = &spec.pageserver_connstring {
--- a/compute_tools/src/extension_server.rs
+++ b/compute_tools/src/extension_server.rs
@@ -74,6 +74,7 @@ More specifically, here is an example ext_index.json
 use anyhow::Context;
 use anyhow::{self, Result};
 use compute_api::spec::RemoteExtSpec;
+use regex::Regex;
 use remote_storage::*;
 use serde_json;
 use std::io::Read;
@@ -106,16 +107,71 @@ fn get_pg_config(argument: &str, pgbin: &str) -> String {

 pub fn get_pg_version(pgbin: &str) -> String {
    // pg_config --version returns a (platform specific) human readable string
-    // such as "PostgreSQL 15.4". We parse this to v14/v15
+    // such as "PostgreSQL 15.4". We parse this to v14/v15/v16 etc.
    let human_version = get_pg_config("--version", pgbin);
-    if human_version.contains("15") {
-        return "v15".to_string();
-    } else if human_version.contains("14") {
-        return "v14".to_string();
+    return parse_pg_version(&human_version).to_string();
+}
+
+fn parse_pg_version(human_version: &str) -> &str {
+    // Normal releases have version strings like "PostgreSQL 15.4". But there
+    // are also pre-release versions like "PostgreSQL 17devel" or "PostgreSQL
+    // 16beta2" or "PostgreSQL 17rc1". And with the --with-extra-version
+    // configure option, you can tack any string to the version number,
+    // e.g. "PostgreSQL 15.4foobar".
+    match Regex::new(r"^PostgreSQL (?<major>\d+).+")
+        .unwrap()
+        .captures(human_version)
+    {
+        Some(captures) if captures.len() == 2 => match &captures["major"] {
+            "14" => return "v14",
+            "15" => return "v15",
+            "16" => return "v16",
+            _ => {}
+        },
+        _ => {}
    }
    panic!("Unsuported postgres version {human_version}");
 }

+#[cfg(test)]
+mod tests {
+    use super::parse_pg_version;
+
+    #[test]
+    fn test_parse_pg_version() {
+        assert_eq!(parse_pg_version("PostgreSQL 15.4"), "v15");
+        assert_eq!(parse_pg_version("PostgreSQL 15.14"), "v15");
+        assert_eq!(
+            parse_pg_version("PostgreSQL 15.4 (Ubuntu 15.4-0ubuntu0.23.04.1)"),
+            "v15"
+        );
+
+        assert_eq!(parse_pg_version("PostgreSQL 14.15"), "v14");
+        assert_eq!(parse_pg_version("PostgreSQL 14.0"), "v14");
+        assert_eq!(
+            parse_pg_version("PostgreSQL 14.9 (Debian 14.9-1.pgdg120+1"),
+            "v14"
+        );
+
+        assert_eq!(parse_pg_version("PostgreSQL 16devel"), "v16");
+        assert_eq!(parse_pg_version("PostgreSQL 16beta1"), "v16");
+        assert_eq!(parse_pg_version("PostgreSQL 16rc2"), "v16");
+        assert_eq!(parse_pg_version("PostgreSQL 16extra"), "v16");
+    }
+
+    #[test]
+    #[should_panic]
+    fn test_parse_pg_unsupported_version() {
+        parse_pg_version("PostgreSQL 13.14");
+    }
+
+    #[test]
+    #[should_panic]
+    fn test_parse_pg_incorrect_version_format() {
+        parse_pg_version("PostgreSQL 14");
+    }
+}
+
 // download the archive for a given extension,
 // unzip it, and place files in the appropriate locations (share/lib)
 pub async fn download_extension(
--- a/control_plane/simple.conf
+++ b/control_plane/simple.conf
@@ -1,6 +1,7 @@
 # Minimal neon environment with one safekeeper. This is equivalent to the built-in
 # defaults that you get with no --config
-[pageserver]
+[[pageservers]]
+id=1
 listen_pg_addr = '127.0.0.1:64000'
 listen_http_addr = '127.0.0.1:9898'
 pg_auth_type = 'Trust'
--- a/control_plane/src/attachment_service.rs
+++ b/control_plane/src/attachment_service.rs
@@ -32,7 +32,7 @@ impl AttachmentService {

        // Makes no sense to construct this if pageservers aren't going to use it: assume
        // pageservers have control plane API set
-        let listen_url = env.pageserver.control_plane_api.clone().unwrap();
+        let listen_url = env.control_plane_api.clone().unwrap();

        let listen = format!(
            "{}:{}",
@@ -80,7 +80,6 @@ impl AttachmentService {

        let url = self
            .env
-            .pageserver
            .control_plane_api
            .clone()
            .unwrap()
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -50,16 +50,17 @@ fn default_conf() -> String {
    format!(
        r#"
 # Default built-in configuration, defined in main.rs
+control_plane_api = '{DEFAULT_PAGESERVER_CONTROL_PLANE_API}'
+
 [broker]
 listen_addr = '{DEFAULT_BROKER_ADDR}'

-[pageserver]
+[[pageservers]]
 id = {DEFAULT_PAGESERVER_ID}
 listen_pg_addr = '{DEFAULT_PAGESERVER_PG_ADDR}'
 listen_http_addr = '{DEFAULT_PAGESERVER_HTTP_ADDR}'
 pg_auth_type = '{trust_auth}'
 http_auth_type = '{trust_auth}'
-control_plane_api = '{DEFAULT_PAGESERVER_CONTROL_PLANE_API}'

 [[safekeepers]]
 id = {DEFAULT_SAFEKEEPER_ID}
@@ -258,7 +259,7 @@ fn get_timeline_infos(
    env: &local_env::LocalEnv,
    tenant_id: &TenantId,
 ) -> Result<HashMap<TimelineId, TimelineInfo>> {
-    Ok(PageServerNode::from_env(env)
+    Ok(get_default_pageserver(env)
        .timeline_list(tenant_id)?
        .into_iter()
        .map(|timeline_info| (timeline_info.timeline_id, timeline_info))
@@ -319,17 +320,30 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
        .context("Failed to initialize neon repository")?;

    // Initialize pageserver, create initial tenant and timeline.
-    let pageserver = PageServerNode::from_env(&env);
-    pageserver
-        .initialize(&pageserver_config_overrides(init_match))
-        .unwrap_or_else(|e| {
-            eprintln!("pageserver init failed: {e:?}");
-            exit(1);
-        });
+    for ps_conf in &env.pageservers {
+        PageServerNode::from_env(&env, ps_conf)
+            .initialize(&pageserver_config_overrides(init_match))
+            .unwrap_or_else(|e| {
+                eprintln!("pageserver init failed: {e:?}");
+                exit(1);
+            });
+    }

    Ok(env)
 }

+/// The default pageserver is the one where CLI tenant/timeline operations are sent by default.
+/// For typical interactive use, one would just run with a single pageserver.  Scenarios with
+/// tenant/timeline placement across multiple pageservers are managed by python test code rather
+/// than this CLI.
+fn get_default_pageserver(env: &local_env::LocalEnv) -> PageServerNode {
+    let ps_conf = env
+        .pageservers
+        .first()
+        .expect("Config is validated to contain at least one pageserver");
+    PageServerNode::from_env(env, ps_conf)
+}
+
 fn pageserver_config_overrides(init_match: &ArgMatches) -> Vec<&str> {
    init_match
        .get_many::<String>("pageserver-config-override")
@@ -340,7 +354,7 @@ fn pageserver_config_overrides(init_match: &ArgMatches) -> Vec<&str> {
 }

 fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> anyhow::Result<()> {
-    let pageserver = PageServerNode::from_env(env);
+    let pageserver = get_default_pageserver(env);
    match tenant_match.subcommand() {
        Some(("list", _)) => {
            for t in pageserver.tenant_list()? {
@@ -356,11 +370,11 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
            // If tenant ID was not specified, generate one
            let tenant_id = parse_tenant_id(create_match)?.unwrap_or_else(TenantId::generate);

-            let generation = if env.pageserver.control_plane_api.is_some() {
+            let generation = if env.control_plane_api.is_some() {
                // We must register the tenant with the attachment service, so
                // that when the pageserver restarts, it will be re-attached.
                let attachment_service = AttachmentService::from_env(env);
-                attachment_service.attach_hook(tenant_id, env.pageserver.id)?
+                attachment_service.attach_hook(tenant_id, pageserver.conf.id)?
            } else {
                None
            };
@@ -425,7 +439,7 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
 }

 fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Result<()> {
-    let pageserver = PageServerNode::from_env(env);
+    let pageserver = get_default_pageserver(env);

    match timeline_match.subcommand() {
        Some(("list", list_match)) => {
@@ -502,6 +516,7 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
                None,
                pg_version,
                ComputeMode::Primary,
+                pageserver.conf.id,
            )?;
            println!("Done");
        }
@@ -555,7 +570,6 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
        Some(ep_subcommand_data) => ep_subcommand_data,
        None => bail!("no endpoint subcommand provided"),
    };
-
    let mut cplane = ComputeControlPlane::load(env.clone())?;

    // All subcommands take an optional --tenant-id option
@@ -652,6 +666,13 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                .copied()
                .unwrap_or(false);

+            let pageserver_id =
+                if let Some(id_str) = sub_args.get_one::<String>("endpoint-pageserver-id") {
+                    NodeId(id_str.parse().context("while parsing pageserver id")?)
+                } else {
+                    DEFAULT_PAGESERVER_ID
+                };
+
            let mode = match (lsn, hot_standby) {
                (Some(lsn), false) => ComputeMode::Static(lsn),
                (None, true) => ComputeMode::Replica,
@@ -667,6 +688,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                http_port,
                pg_version,
                mode,
+                pageserver_id,
            )?;
        }
        "start" => {
@@ -676,6 +698,13 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                .get_one::<String>("endpoint_id")
                .ok_or_else(|| anyhow!("No endpoint ID was provided to start"))?;

+            let pageserver_id =
+                if let Some(id_str) = sub_args.get_one::<String>("endpoint-pageserver-id") {
+                    NodeId(id_str.parse().context("while parsing pageserver id")?)
+                } else {
+                    DEFAULT_PAGESERVER_ID
+                };
+
            let remote_ext_config = sub_args.get_one::<String>("remote-ext-config");

            // If --safekeepers argument is given, use only the listed safekeeper nodes.
@@ -695,7 +724,8 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(

            let endpoint = cplane.endpoints.get(endpoint_id.as_str());

-            let auth_token = if matches!(env.pageserver.pg_auth_type, AuthType::NeonJWT) {
+            let ps_conf = env.get_pageserver_conf(pageserver_id)?;
+            let auth_token = if matches!(ps_conf.pg_auth_type, AuthType::NeonJWT) {
                let claims = Claims::new(Some(tenant_id), Scope::Tenant);

                Some(env.generate_auth_token(&claims)?)
@@ -762,6 +792,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                    http_port,
                    pg_version,
                    mode,
+                    pageserver_id,
                )?;
                ep.start(&auth_token, safekeepers, remote_ext_config)?;
            }
@@ -786,48 +817,64 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
 }

 fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
-    let pageserver = PageServerNode::from_env(env);
+    fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result<PageServerNode> {
+        let node_id = if let Some(id_str) = args.get_one::<String>("pageserver-id") {
+            NodeId(id_str.parse().context("while parsing pageserver id")?)
+        } else {
+            DEFAULT_PAGESERVER_ID
+        };
+
+        Ok(PageServerNode::from_env(
+            env,
+            env.get_pageserver_conf(node_id)?,
+        ))
+    }

    match sub_match.subcommand() {
-        Some(("start", start_match)) => {
-            if let Err(e) = pageserver.start(&pageserver_config_overrides(start_match)) {
+        Some(("start", subcommand_args)) => {
+            if let Err(e) = get_pageserver(env, subcommand_args)?
+                .start(&pageserver_config_overrides(subcommand_args))
+            {
                eprintln!("pageserver start failed: {e}");
                exit(1);
            }
        }

-        Some(("stop", stop_match)) => {
-            let immediate = stop_match
+        Some(("stop", subcommand_args)) => {
+            let immediate = subcommand_args
                .get_one::<String>("stop-mode")
                .map(|s| s.as_str())
                == Some("immediate");

-            if let Err(e) = pageserver.stop(immediate) {
+            if let Err(e) = get_pageserver(env, subcommand_args)?.stop(immediate) {
                eprintln!("pageserver stop failed: {}", e);
                exit(1);
            }
        }

-        Some(("restart", restart_match)) => {
+        Some(("restart", subcommand_args)) => {
+            let pageserver = get_pageserver(env, subcommand_args)?;
            //TODO what shutdown strategy should we use here?
            if let Err(e) = pageserver.stop(false) {
                eprintln!("pageserver stop failed: {}", e);
                exit(1);
            }

-            if let Err(e) = pageserver.start(&pageserver_config_overrides(restart_match)) {
+            if let Err(e) = pageserver.start(&pageserver_config_overrides(subcommand_args)) {
                eprintln!("pageserver start failed: {e}");
                exit(1);
            }
        }

-        Some(("status", _)) => match PageServerNode::from_env(env).check_status() {
-            Ok(_) => println!("Page server is up and running"),
-            Err(err) => {
-                eprintln!("Page server is not available: {}", err);
-                exit(1);
+        Some(("status", subcommand_args)) => {
+            match get_pageserver(env, subcommand_args)?.check_status() {
+                Ok(_) => println!("Page server is up and running"),
+                Err(err) => {
+                    eprintln!("Page server is not available: {}", err);
+                    exit(1);
+                }
            }
-        },
+        }

        Some((sub_name, _)) => bail!("Unexpected pageserver subcommand '{}'", sub_name),
        None => bail!("no pageserver subcommand provided"),
@@ -943,7 +990,7 @@ fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow
    broker::start_broker_process(env)?;

    // Only start the attachment service if the pageserver is configured to need it
-    if env.pageserver.control_plane_api.is_some() {
+    if env.control_plane_api.is_some() {
        let attachment_service = AttachmentService::from_env(env);
        if let Err(e) = attachment_service.start() {
            eprintln!("attachment_service start failed: {:#}", e);
@@ -952,11 +999,13 @@ fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow
        }
    }

-    let pageserver = PageServerNode::from_env(env);
-    if let Err(e) = pageserver.start(&pageserver_config_overrides(sub_match)) {
-        eprintln!("pageserver {} start failed: {:#}", env.pageserver.id, e);
-        try_stop_all(env, true);
-        exit(1);
+    for ps_conf in &env.pageservers {
+        let pageserver = PageServerNode::from_env(env, ps_conf);
+        if let Err(e) = pageserver.start(&pageserver_config_overrides(sub_match)) {
+            eprintln!("pageserver {} start failed: {:#}", ps_conf.id, e);
+            try_stop_all(env, true);
+            exit(1);
+        }
    }

    for node in env.safekeepers.iter() {
@@ -980,8 +1029,6 @@ fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<
 }

 fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
-    let pageserver = PageServerNode::from_env(env);
-
    // Stop all endpoints
    match ComputeControlPlane::load(env.clone()) {
        Ok(cplane) => {
@@ -996,8 +1043,11 @@ fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
        }
    }

-    if let Err(e) = pageserver.stop(immediate) {
-        eprintln!("pageserver {} stop failed: {:#}", env.pageserver.id, e);
+    for ps_conf in &env.pageservers {
+        let pageserver = PageServerNode::from_env(env, ps_conf);
+        if let Err(e) = pageserver.stop(immediate) {
+            eprintln!("pageserver {} stop failed: {:#}", ps_conf.id, e);
+        }
    }

    for node in env.safekeepers.iter() {
@@ -1011,7 +1061,7 @@ fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
        eprintln!("neon broker stop failed: {e:#}");
    }

-    if env.pageserver.control_plane_api.is_some() {
+    if env.control_plane_api.is_some() {
        let attachment_service = AttachmentService::from_env(env);
        if let Err(e) = attachment_service.stop(immediate) {
            eprintln!("attachment service stop failed: {e:#}");
@@ -1031,6 +1081,16 @@ fn cli() -> Command {

    let safekeeper_id_arg = Arg::new("id").help("safekeeper id").required(false);

+    // --id, when using a pageserver command
+    let pageserver_id_arg = Arg::new("pageserver-id")
+        .long("id")
+        .help("pageserver id")
+        .required(false);
+    // --pageserver-id when using a non-pageserver command
+    let endpoint_pageserver_id_arg = Arg::new("endpoint-pageserver-id")
+        .long("pageserver-id")
+        .required(false);
+
    let safekeeper_extra_opt_arg = Arg::new("safekeeper-extra-opt")
        .short('e')
        .long("safekeeper-extra-opt")
@@ -1195,10 +1255,16 @@ fn cli() -> Command {
                .arg_required_else_help(true)
                .about("Manage pageserver")
                .subcommand(Command::new("status"))
-                .subcommand(Command::new("start").about("Start local pageserver").arg(pageserver_config_args.clone()))
+                .arg(pageserver_id_arg.clone())
+                .subcommand(Command::new("start").about("Start local pageserver")
+                .arg(pageserver_id_arg.clone())
+                .arg(pageserver_config_args.clone()))
                .subcommand(Command::new("stop").about("Stop local pageserver")
+                .arg(pageserver_id_arg.clone())
                            .arg(stop_mode_arg.clone()))
-                .subcommand(Command::new("restart").about("Restart local pageserver").arg(pageserver_config_args.clone()))
+                .subcommand(Command::new("restart").about("Restart local pageserver")
+                .arg(pageserver_id_arg.clone())
+                .arg(pageserver_config_args.clone()))
        )
        .subcommand(
            Command::new("attachment_service")
@@ -1242,6 +1308,7 @@ fn cli() -> Command {
                    .arg(lsn_arg.clone())
                    .arg(pg_port_arg.clone())
                    .arg(http_port_arg.clone())
+                    .arg(endpoint_pageserver_id_arg.clone())
                    .arg(
                        Arg::new("config-only")
                            .help("Don't do basebackup, create endpoint directory with only config files")
@@ -1259,6 +1326,7 @@ fn cli() -> Command {
                    .arg(lsn_arg)
                    .arg(pg_port_arg)
                    .arg(http_port_arg)
+                    .arg(endpoint_pageserver_id_arg.clone())
                    .arg(pg_version_arg)
                    .arg(hot_standby_arg)
                    .arg(safekeepers_arg)
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -70,6 +70,7 @@ pub struct EndpointConf {
    http_port: u16,
    pg_version: u32,
    skip_pg_catalog_updates: bool,
+    pageserver_id: NodeId,
 }

 //
@@ -82,19 +83,16 @@ pub struct ComputeControlPlane {
    pub endpoints: BTreeMap<String, Arc<Endpoint>>,

    env: LocalEnv,
-    pageserver: Arc<PageServerNode>,
 }

 impl ComputeControlPlane {
    // Load current endpoints from the endpoints/ subdirectories
    pub fn load(env: LocalEnv) -> Result<ComputeControlPlane> {
-        let pageserver = Arc::new(PageServerNode::from_env(&env));
-
        let mut endpoints = BTreeMap::default();
        for endpoint_dir in std::fs::read_dir(env.endpoints_path())
            .with_context(|| format!("failed to list {}", env.endpoints_path().display()))?
        {
-            let ep = Endpoint::from_dir_entry(endpoint_dir?, &env, &pageserver)?;
+            let ep = Endpoint::from_dir_entry(endpoint_dir?, &env)?;
            endpoints.insert(ep.endpoint_id.clone(), Arc::new(ep));
        }

@@ -102,7 +100,6 @@ impl ComputeControlPlane {
            base_port: 55431,
            endpoints,
            env,
-            pageserver,
        })
    }

@@ -125,15 +122,18 @@ impl ComputeControlPlane {
        http_port: Option<u16>,
        pg_version: u32,
        mode: ComputeMode,
+        pageserver_id: NodeId,
    ) -> Result<Arc<Endpoint>> {
        let pg_port = pg_port.unwrap_or_else(|| self.get_port());
        let http_port = http_port.unwrap_or_else(|| self.get_port() + 1);
+        let pageserver =
+            PageServerNode::from_env(&self.env, self.env.get_pageserver_conf(pageserver_id)?);
        let ep = Arc::new(Endpoint {
            endpoint_id: endpoint_id.to_owned(),
            pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), pg_port),
            http_address: SocketAddr::new("127.0.0.1".parse().unwrap(), http_port),
            env: self.env.clone(),
-            pageserver: Arc::clone(&self.pageserver),
+            pageserver,
            timeline_id,
            mode,
            tenant_id,
@@ -159,6 +159,7 @@ impl ComputeControlPlane {
                pg_port,
                pg_version,
                skip_pg_catalog_updates: true,
+                pageserver_id,
            })?,
        )?;
        std::fs::write(
@@ -193,18 +194,14 @@ pub struct Endpoint {
    // These are not part of the endpoint as such, but the environment
    // the endpoint runs in.
    pub env: LocalEnv,
-    pageserver: Arc<PageServerNode>,
+    pageserver: PageServerNode,

    // Optimizations
    skip_pg_catalog_updates: bool,
 }

 impl Endpoint {
-    fn from_dir_entry(
-        entry: std::fs::DirEntry,
-        env: &LocalEnv,
-        pageserver: &Arc<PageServerNode>,
-    ) -> Result<Endpoint> {
+    fn from_dir_entry(entry: std::fs::DirEntry, env: &LocalEnv) -> Result<Endpoint> {
        if !entry.file_type()?.is_dir() {
            anyhow::bail!(
                "Endpoint::from_dir_entry failed: '{}' is not a directory",
@@ -220,12 +217,15 @@ impl Endpoint {
        let conf: EndpointConf =
            serde_json::from_slice(&std::fs::read(entry.path().join("endpoint.json"))?)?;

+        let pageserver =
+            PageServerNode::from_env(env, env.get_pageserver_conf(conf.pageserver_id)?);
+
        Ok(Endpoint {
            pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.pg_port),
            http_address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.http_port),
            endpoint_id,
            env: env.clone(),
-            pageserver: Arc::clone(pageserver),
+            pageserver,
            timeline_id: conf.timeline_id,
            mode: conf.mode,
            tenant_id: conf.tenant_id,
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -68,11 +68,17 @@ pub struct LocalEnv {

    pub broker: NeonBroker,

-    pub pageserver: PageServerConf,
+    /// This Vec must always contain at least one pageserver
+    pub pageservers: Vec<PageServerConf>,

    #[serde(default)]
    pub safekeepers: Vec<SafekeeperConf>,

+    // Control plane location: if None, we will not run attachment_service.  If set, this will
+    // be propagated into each pageserver's configuration.
+    #[serde(default)]
+    pub control_plane_api: Option<Url>,
+
    /// Keep human-readable aliases in memory (and persist them to config), to hide ZId hex strings from the user.
    #[serde(default)]
    // A `HashMap<String, HashMap<TenantId, TimelineId>>` would be more appropriate here,
@@ -118,9 +124,6 @@ pub struct PageServerConf {
    // auth type used for the PG and HTTP ports
    pub pg_auth_type: AuthType,
    pub http_auth_type: AuthType,
-
-    // Control plane location
-    pub control_plane_api: Option<Url>,
 }

 impl Default for PageServerConf {
@@ -131,7 +134,6 @@ impl Default for PageServerConf {
            listen_http_addr: String::new(),
            pg_auth_type: AuthType::Trust,
            http_auth_type: AuthType::Trust,
-            control_plane_api: None,
        }
    }
 }
@@ -180,26 +182,18 @@ impl LocalEnv {
    pub fn pg_distrib_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
        let path = self.pg_distrib_dir.clone();

+        #[allow(clippy::manual_range_patterns)]
        match pg_version {
-            14 => Ok(path.join(format!("v{pg_version}"))),
-            15 => Ok(path.join(format!("v{pg_version}"))),
+            14 | 15 | 16 => Ok(path.join(format!("v{pg_version}"))),
            _ => bail!("Unsupported postgres version: {}", pg_version),
        }
    }

    pub fn pg_bin_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
-        match pg_version {
-            14 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")),
-            15 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")),
-            _ => bail!("Unsupported postgres version: {}", pg_version),
-        }
+        Ok(self.pg_distrib_dir(pg_version)?.join("bin"))
    }
    pub fn pg_lib_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
-        match pg_version {
-            14 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")),
-            15 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")),
-            _ => bail!("Unsupported postgres version: {}", pg_version),
-        }
+        Ok(self.pg_distrib_dir(pg_version)?.join("lib"))
    }

    pub fn pageserver_bin(&self) -> PathBuf {
@@ -222,15 +216,23 @@ impl LocalEnv {
        self.base_data_dir.join("endpoints")
    }

-    // TODO: move pageserver files into ./pageserver
-    pub fn pageserver_data_dir(&self) -> PathBuf {
-        self.base_data_dir.clone()
+    pub fn pageserver_data_dir(&self, pageserver_id: NodeId) -> PathBuf {
+        self.base_data_dir
+            .join(format!("pageserver_{pageserver_id}"))
    }

    pub fn safekeeper_data_dir(&self, data_dir_name: &str) -> PathBuf {
        self.base_data_dir.join("safekeepers").join(data_dir_name)
    }

+    pub fn get_pageserver_conf(&self, id: NodeId) -> anyhow::Result<&PageServerConf> {
+        if let Some(conf) = self.pageservers.iter().find(|node| node.id == id) {
+            Ok(conf)
+        } else {
+            bail!("could not find pageserver {id}")
+        }
+    }
+
    pub fn register_branch_mapping(
        &mut self,
        branch_name: String,
@@ -307,6 +309,10 @@ impl LocalEnv {
            env.neon_distrib_dir = env::current_exe()?.parent().unwrap().to_owned();
        }

+        if env.pageservers.is_empty() {
+            anyhow::bail!("Configuration must contain at least one pageserver");
+        }
+
        env.base_data_dir = base_path();

        Ok(env)
@@ -339,7 +345,7 @@ impl LocalEnv {
        // We read that in, in `create_config`, and fill any missing defaults. Then it's saved
        // to .neon/config. TODO: We lose any formatting and comments along the way, which is
        // a bit sad.
-        let mut conf_content = r#"# This file describes a locale deployment of the page server
-# and safekeeeper node. It is read by the 'neon_local' command-line
+        let mut conf_content = r#"# This file describes a local deployment of the page server
+# and safekeeper node. It is read by the 'neon_local' command-line
 # utility.
 "#
@@ -469,9 +475,9 @@ impl LocalEnv {
    }

    fn auth_keys_needed(&self) -> bool {
-        self.pageserver.pg_auth_type == AuthType::NeonJWT
-            || self.pageserver.http_auth_type == AuthType::NeonJWT
-            || self.safekeepers.iter().any(|sk| sk.auth_enabled)
+        self.pageservers.iter().any(|ps| {
+            ps.pg_auth_type == AuthType::NeonJWT || ps.http_auth_type == AuthType::NeonJWT
+        }) || self.safekeepers.iter().any(|sk| sk.auth_enabled)
    }
 }

--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -27,6 +27,7 @@ use utils::{
    lsn::Lsn,
 };

+use crate::local_env::PageServerConf;
 use crate::{background_process, local_env::LocalEnv};

 #[derive(Error, Debug)]
@@ -76,43 +77,40 @@ impl ResponseErrorMessageExt for Response {
 #[derive(Debug)]
 pub struct PageServerNode {
    pub pg_connection_config: PgConnectionConfig,
+    pub conf: PageServerConf,
    pub env: LocalEnv,
    pub http_client: Client,
    pub http_base_url: String,
 }

 impl PageServerNode {
-    pub fn from_env(env: &LocalEnv) -> PageServerNode {
-        let (host, port) = parse_host_port(&env.pageserver.listen_pg_addr)
-            .expect("Unable to parse listen_pg_addr");
+    pub fn from_env(env: &LocalEnv, conf: &PageServerConf) -> PageServerNode {
+        let (host, port) =
+            parse_host_port(&conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
        let port = port.unwrap_or(5432);
        Self {
            pg_connection_config: PgConnectionConfig::new_host_port(host, port),
+            conf: conf.clone(),
            env: env.clone(),
            http_client: Client::new(),
-            http_base_url: format!("http://{}/v1", env.pageserver.listen_http_addr),
+            http_base_url: format!("http://{}/v1", conf.listen_http_addr),
        }
    }

    // pageserver conf overrides defined by neon_local configuration.
    fn neon_local_overrides(&self) -> Vec<String> {
-        let id = format!("id={}", self.env.pageserver.id);
+        let id = format!("id={}", self.conf.id);
        // FIXME: the paths should be shell-escaped to handle paths with spaces, quotas etc.
        let pg_distrib_dir_param = format!(
            "pg_distrib_dir='{}'",
            self.env.pg_distrib_dir_raw().display()
        );

-        let http_auth_type_param =
-            format!("http_auth_type='{}'", self.env.pageserver.http_auth_type);
-        let listen_http_addr_param = format!(
-            "listen_http_addr='{}'",
-            self.env.pageserver.listen_http_addr
-        );
+        let http_auth_type_param = format!("http_auth_type='{}'", self.conf.http_auth_type);
+        let listen_http_addr_param = format!("listen_http_addr='{}'", self.conf.listen_http_addr);

-        let pg_auth_type_param = format!("pg_auth_type='{}'", self.env.pageserver.pg_auth_type);
-        let listen_pg_addr_param =
-            format!("listen_pg_addr='{}'", self.env.pageserver.listen_pg_addr);
+        let pg_auth_type_param = format!("pg_auth_type='{}'", self.conf.pg_auth_type);
+        let listen_pg_addr_param = format!("listen_pg_addr='{}'", self.conf.listen_pg_addr);

        let broker_endpoint_param = format!("broker_endpoint='{}'", self.env.broker.client_url());

@@ -126,17 +124,18 @@ impl PageServerNode {
            broker_endpoint_param,
        ];

-        if let Some(control_plane_api) = &self.env.pageserver.control_plane_api {
+        if let Some(control_plane_api) = &self.env.control_plane_api {
            overrides.push(format!(
                "control_plane_api='{}'",
                control_plane_api.as_str()
            ));
        }

-        if self.env.pageserver.http_auth_type != AuthType::Trust
-            || self.env.pageserver.pg_auth_type != AuthType::Trust
+        if self.conf.http_auth_type != AuthType::Trust || self.conf.pg_auth_type != AuthType::Trust
        {
-            overrides.push("auth_validation_public_key_path='auth_public_key.pem'".to_owned());
+            // Keys are generated in the toplevel repo dir, pageservers' workdirs
+            // are one level below that, so refer to keys with ../
+            overrides.push("auth_validation_public_key_path='../auth_public_key.pem'".to_owned());
        }
        overrides
    }
@@ -144,16 +143,12 @@ impl PageServerNode {
    /// Initializes a pageserver node by creating its config with the overrides provided.
    pub fn initialize(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
        // First, run `pageserver --init` and wait for it to write a config into FS and exit.
-        self.pageserver_init(config_overrides).with_context(|| {
-            format!(
-                "Failed to run init for pageserver node {}",
-                self.env.pageserver.id,
-            )
-        })
+        self.pageserver_init(config_overrides)
+            .with_context(|| format!("Failed to run init for pageserver node {}", self.conf.id,))
    }

    pub fn repo_path(&self) -> PathBuf {
-        self.env.pageserver_data_dir()
+        self.env.pageserver_data_dir(self.conf.id)
    }

    /// The pid file is created by the pageserver process, with its pid stored inside.
@@ -169,7 +164,7 @@ impl PageServerNode {

    fn pageserver_init(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
        let datadir = self.repo_path();
-        let node_id = self.env.pageserver.id;
+        let node_id = self.conf.id;
        println!(
            "Initializing pageserver node {} at '{}' in {:?}",
            node_id,
@@ -178,6 +173,10 @@ impl PageServerNode {
        );
        io::stdout().flush()?;

+        if !datadir.exists() {
+            std::fs::create_dir(&datadir)?;
+        }
+
        let datadir_path_str = datadir.to_str().with_context(|| {
            format!("Cannot start pageserver node {node_id} in path that has no string representation: {datadir:?}")
        })?;
@@ -208,7 +207,7 @@ impl PageServerNode {
        let datadir = self.repo_path();
        print!(
            "Starting pageserver node {} at '{}' in {:?}",
-            self.env.pageserver.id,
+            self.conf.id,
            self.pg_connection_config.raw_address(),
            datadir
        );
@@ -217,7 +216,7 @@ impl PageServerNode {
        let datadir_path_str = datadir.to_str().with_context(|| {
            format!(
                "Cannot start pageserver node {} in path that has no string representation: {:?}",
-                self.env.pageserver.id, datadir,
+                self.conf.id, datadir,
            )
        })?;
        let mut args = self.pageserver_basic_args(config_overrides, datadir_path_str);
@@ -261,7 +260,7 @@ impl PageServerNode {
        // FIXME: why is this tied to pageserver's auth type? Whether or not the safekeeper
        // needs a token, and how to generate that token, seems independent to whether
        // the pageserver requires a token in incoming requests.
-        Ok(if self.env.pageserver.http_auth_type != AuthType::Trust {
+        Ok(if self.conf.http_auth_type != AuthType::Trust {
            // Generate a token to connect from the pageserver to a safekeeper
            let token = self
                .env
@@ -286,7 +285,7 @@ impl PageServerNode {

    pub fn page_server_psql_client(&self) -> anyhow::Result<postgres::Client> {
        let mut config = self.pg_connection_config.clone();
-        if self.env.pageserver.pg_auth_type == AuthType::NeonJWT {
+        if self.conf.pg_auth_type == AuthType::NeonJWT {
            let token = self
                .env
                .generate_auth_token(&Claims::new(None, Scope::PageServerApi))?;
@@ -297,7 +296,7 @@ impl PageServerNode {

    fn http_request<U: IntoUrl>(&self, method: Method, url: U) -> anyhow::Result<RequestBuilder> {
        let mut builder = self.http_client.request(method, url);
-        if self.env.pageserver.http_auth_type == AuthType::NeonJWT {
+        if self.conf.http_auth_type == AuthType::NeonJWT {
            let token = self
                .env
                .generate_auth_token(&Claims::new(None, Scope::PageServerApi))?;
--- a/docker-compose/docker_compose_test.sh
+++ b/docker-compose/docker_compose_test.sh
@@ -30,7 +30,7 @@ cleanup() {
 echo "clean up containers if exists"
 cleanup

-for pg_version in 14 15; do
+for pg_version in 14 15 16; do
    echo "start containers (pg_version=$pg_version)."
    PG_VERSION=$pg_version docker compose -f $COMPOSE_FILE up --build -d

--- a/libs/consumption_metrics/src/lib.rs
+++ b/libs/consumption_metrics/src/lib.rs
@@ -3,9 +3,9 @@
 //!
 use chrono::{DateTime, Utc};
 use rand::Rng;
-use serde::Serialize;
+use serde::{Deserialize, Serialize};

-#[derive(Serialize, Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)]
+#[derive(Serialize, serde::Deserialize, Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)]
 #[serde(tag = "type")]
 pub enum EventType {
    #[serde(rename = "absolute")]
@@ -27,7 +27,8 @@ impl EventType {
    }

    pub fn incremental_timerange(&self) -> Option<std::ops::Range<&DateTime<Utc>>> {
-        // these can most likely be thought of as Range or RangeFull
+        // these can most likely be thought of as Range or RangeFull, at least pageserver creates
+        // incremental ranges where the stop and next start are equal.
        use EventType::*;
        match self {
            Incremental {
@@ -41,15 +42,25 @@ impl EventType {
    pub fn is_incremental(&self) -> bool {
        matches!(self, EventType::Incremental { .. })
    }
+
+    /// Returns the absolute time, or for incremental ranges, the stop time.
+    pub fn recorded_at(&self) -> &DateTime<Utc> {
+        use EventType::*;
+
+        match self {
+            Absolute { time } => time,
+            Incremental { stop_time, .. } => stop_time,
+        }
+    }
 }

-#[derive(Serialize, Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
-pub struct Event<Extra> {
+#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
+pub struct Event<Extra, Metric> {
    #[serde(flatten)]
    #[serde(rename = "type")]
    pub kind: EventType,

-    pub metric: &'static str,
+    pub metric: Metric,
    pub idempotency_key: String,
    pub value: u64,

@@ -58,12 +69,38 @@ pub struct Event<Extra> {
 }

 pub fn idempotency_key(node_id: &str) -> String {
-    format!(
-        "{}-{}-{:04}",
-        Utc::now(),
-        node_id,
-        rand::thread_rng().gen_range(0..=9999)
-    )
+    IdempotencyKey::generate(node_id).to_string()
+}
+
+/// Downstream users will use these to detect upload retries.
+pub struct IdempotencyKey<'a> {
+    now: chrono::DateTime<Utc>,
+    node_id: &'a str,
+    nonce: u16,
+}
+
+impl std::fmt::Display for IdempotencyKey<'_> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}-{}-{:04}", self.now, self.node_id, self.nonce)
+    }
+}
+
+impl<'a> IdempotencyKey<'a> {
+    pub fn generate(node_id: &'a str) -> Self {
+        IdempotencyKey {
+            now: Utc::now(),
+            node_id,
+            nonce: rand::thread_rng().gen_range(0..=9999),
+        }
+    }
+
+    pub fn for_tests(now: DateTime<Utc>, node_id: &'a str, nonce: u16) -> Self {
+        IdempotencyKey {
+            now,
+            node_id,
+            nonce,
+        }
+    }
 }

 pub const CHUNK_SIZE: usize = 1000;
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -381,6 +381,8 @@ pub struct TimelineInfo {
    pub pg_version: u32,

    pub state: TimelineState,
+
+    pub walreceiver_status: String,
 }

 #[derive(Debug, Clone, Serialize)]
--- a/libs/postgres_ffi/README.md
+++ b/libs/postgres_ffi/README.md
@@ -10,9 +10,11 @@ should be auto-generated too, but that's a TODO.
 The PostgreSQL on-disk file format is not portable across different
 CPU architectures and operating systems. It is also subject to change
 in each major PostgreSQL version. Currently, this module supports
-PostgreSQL v14 and v15: bindings and code that depends on them are version-specific.
-This code is organized in modules: `postgres_ffi::v14` and `postgres_ffi::v15`
-Version independend code is explicitly exported into shared `postgres_ffi`.
+PostgreSQL v14, v15 and v16: bindings and code that depends on them are
+version-specific.
+This code is organized in modules `postgres_ffi::v14`, `postgres_ffi::v15` and
+`postgres_ffi::v16`. Version independent code is explicitly exported into
+shared `postgres_ffi`.


 TODO: Currently, there is also some code that deals with WAL records
--- a/libs/postgres_ffi/build.rs
+++ b/libs/postgres_ffi/build.rs
@@ -56,7 +56,7 @@ fn main() -> anyhow::Result<()> {
        PathBuf::from("pg_install")
    };

-    for pg_version in &["v14", "v15"] {
+    for pg_version in &["v14", "v15", "v16"] {
        let mut pg_install_dir_versioned = pg_install_dir.join(pg_version);
        if pg_install_dir_versioned.is_relative() {
            let cwd = env::current_dir().context("Failed to get current_dir")?;
@@ -125,6 +125,7 @@ fn main() -> anyhow::Result<()> {
            .allowlist_var("PG_CONTROLFILEDATA_OFFSETOF_CRC")
            .allowlist_type("PageHeaderData")
            .allowlist_type("DBState")
+            .allowlist_type("RelMapFile")
            // Because structs are used for serialization, tell bindgen to emit
            // explicit padding fields.
            .explicit_padding(true)
--- a/libs/postgres_ffi/src/lib.rs
+++ b/libs/postgres_ffi/src/lib.rs
@@ -51,11 +51,59 @@ macro_rules! for_all_postgres_versions {
    ($macro:tt) => {
        $macro!(v14);
        $macro!(v15);
+        $macro!(v16);
    };
 }

 for_all_postgres_versions! { postgres_ffi }

+/// dispatch_pgversion
+///
+/// Run a code block in a context where the postgres_ffi bindings for a
+/// specific (supported) PostgreSQL version are `use`-ed in scope under the pgv
+/// identifier.
+/// If the provided pg_version is not supported, we panic!(), unless the
+/// optional third argument was provided (in which case that code will provide
+/// the default handling instead).
+///
+/// Use like
+///
+/// dispatch_pgversion!(my_pgversion, { pgv::constants::XLOG_DBASE_CREATE })
+/// dispatch_pgversion!(my_pgversion, pgv::constants::XLOG_DBASE_CREATE)
+///
+/// Other uses are for macro-internal purposes only and strictly unsupported.
+///
+#[macro_export]
+macro_rules! dispatch_pgversion {
+    ($version:expr, $code:expr) => {
+        dispatch_pgversion!($version, $code, panic!("Unknown PostgreSQL version {}", $version))
+    };
+    ($version:expr, $code:expr, $invalid_pgver_handling:expr) => {
+        dispatch_pgversion!(
+            $version => $code,
+            default = $invalid_pgver_handling,
+            pgversions = [
+                14 : v14,
+                15 : v15,
+                16 : v16,
+            ]
+        )
+    };
+    ($pgversion:expr => $code:expr,
+     default = $default:expr,
+     pgversions = [$($sv:literal : $vsv:ident),+ $(,)?]) => {
+        match ($pgversion) {
+            $($sv => {
+                use $crate::$vsv as pgv;
+                $code
+            },)+
+            _ => {
+                $default
+            }
+        }
+    };
+}
+
 pub mod pg_constants;
 pub mod relfile_utils;

@@ -90,13 +138,7 @@ pub use v14::xlog_utils::XLogFileName;
 pub use v14::bindings::DBState_DB_SHUTDOWNED;

 pub fn bkpimage_is_compressed(bimg_info: u8, version: u32) -> anyhow::Result<bool> {
-    match version {
-        14 => Ok(bimg_info & v14::bindings::BKPIMAGE_IS_COMPRESSED != 0),
-        15 => Ok(bimg_info & v15::bindings::BKPIMAGE_COMPRESS_PGLZ != 0
-            || bimg_info & v15::bindings::BKPIMAGE_COMPRESS_LZ4 != 0
-            || bimg_info & v15::bindings::BKPIMAGE_COMPRESS_ZSTD != 0),
-        _ => anyhow::bail!("Unknown version {}", version),
-    }
+    dispatch_pgversion!(version, Ok(pgv::bindings::bkpimg_is_compressed(bimg_info)), anyhow::bail!("Unknown version {}", version)) // unknown version => Err (as before), not panic
 }

 pub fn generate_wal_segment(
@@ -107,11 +149,11 @@ pub fn generate_wal_segment(
 ) -> Result<Bytes, SerializeError> {
    assert_eq!(segno, lsn.segment_number(WAL_SEGMENT_SIZE));

-    match pg_version {
-        14 => v14::xlog_utils::generate_wal_segment(segno, system_id, lsn),
-        15 => v15::xlog_utils::generate_wal_segment(segno, system_id, lsn),
-        _ => Err(SerializeError::BadInput),
-    }
+    dispatch_pgversion!(
+        pg_version,
+        pgv::xlog_utils::generate_wal_segment(segno, system_id, lsn),
+        Err(SerializeError::BadInput)
+    )
 }

 pub fn generate_pg_control(
@@ -120,11 +162,11 @@ pub fn generate_pg_control(
    lsn: Lsn,
    pg_version: u32,
 ) -> anyhow::Result<(Bytes, u64)> {
-    match pg_version {
-        14 => v14::xlog_utils::generate_pg_control(pg_control_bytes, checkpoint_bytes, lsn),
-        15 => v15::xlog_utils::generate_pg_control(pg_control_bytes, checkpoint_bytes, lsn),
-        _ => anyhow::bail!("Unknown version {}", pg_version),
-    }
+    dispatch_pgversion!(
+        pg_version,
+        pgv::xlog_utils::generate_pg_control(pg_control_bytes, checkpoint_bytes, lsn),
+        anyhow::bail!("Unknown version {}", pg_version)
+    )
 }

 // PG timeline is always 1, changing it doesn't have any useful meaning in Neon.
@@ -196,8 +238,6 @@ pub fn fsm_logical_to_physical(addr: BlockNumber) -> BlockNumber {
 }

 pub mod waldecoder {
-
-    use crate::{v14, v15};
    use bytes::{Buf, Bytes, BytesMut};
    use std::num::NonZeroU32;
    use thiserror::Error;
@@ -248,22 +288,17 @@ pub mod waldecoder {
        }

        pub fn poll_decode(&mut self) -> Result<Option<(Lsn, Bytes)>, WalDecodeError> {
-            match self.pg_version {
-                // This is a trick to support both versions simultaneously.
-                // See WalStreamDecoderHandler comments.
-                14 => {
-                    use self::v14::waldecoder_handler::WalStreamDecoderHandler;
+            dispatch_pgversion!(
+                self.pg_version,
+                {
+                    use pgv::waldecoder_handler::WalStreamDecoderHandler;
                    self.poll_decode_internal()
-                }
-                15 => {
-                    use self::v15::waldecoder_handler::WalStreamDecoderHandler;
-                    self.poll_decode_internal()
-                }
-                _ => Err(WalDecodeError {
+                },
+                Err(WalDecodeError {
                    msg: format!("Unknown version {}", self.pg_version),
                    lsn: self.lsn,
-                }),
-            }
+                })
+            )
        }
    }
 }
--- a/libs/postgres_ffi/src/pg_constants.rs
+++ b/libs/postgres_ffi/src/pg_constants.rs
@@ -137,9 +137,12 @@ pub const XLOG_HEAP_INSERT: u8 = 0x00;
 pub const XLOG_HEAP_DELETE: u8 = 0x10;
 pub const XLOG_HEAP_UPDATE: u8 = 0x20;
 pub const XLOG_HEAP_HOT_UPDATE: u8 = 0x40;
+pub const XLOG_HEAP_LOCK: u8 = 0x60;
 pub const XLOG_HEAP_INIT_PAGE: u8 = 0x80;
 pub const XLOG_HEAP2_VISIBLE: u8 = 0x40;
 pub const XLOG_HEAP2_MULTI_INSERT: u8 = 0x50;
+pub const XLOG_HEAP2_LOCK_UPDATED: u8 = 0x60;
+pub const XLH_LOCK_ALL_FROZEN_CLEARED: u8 = 0x01;
 pub const XLH_INSERT_ALL_FROZEN_SET: u8 = (1 << 5) as u8;
 pub const XLH_INSERT_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8;
 pub const XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8;
@@ -163,6 +166,20 @@ pub const RM_HEAP2_ID: u8 = 9;
 pub const RM_HEAP_ID: u8 = 10;
 pub const RM_LOGICALMSG_ID: u8 = 21;

+// from neon_rmgr.h
+pub const RM_NEON_ID: u8 = 134;
+
+pub const XLOG_NEON_HEAP_INIT_PAGE: u8 = 0x80;
+
+pub const XLOG_NEON_HEAP_INSERT: u8 = 0x00;
+pub const XLOG_NEON_HEAP_DELETE: u8 = 0x10;
+pub const XLOG_NEON_HEAP_UPDATE: u8 = 0x20;
+pub const XLOG_NEON_HEAP_HOT_UPDATE: u8 = 0x30;
+pub const XLOG_NEON_HEAP_LOCK: u8 = 0x40;
+pub const XLOG_NEON_HEAP_MULTI_INSERT: u8 = 0x50;
+
+pub const XLOG_NEON_HEAP_VISIBLE: u8 = 0x40;
+
 // from xlogreader.h
 pub const XLR_INFO_MASK: u8 = 0x0F;
 pub const XLR_RMGR_INFO_MASK: u8 = 0xF0;
--- a/libs/postgres_ffi/src/pg_constants_v14.rs
+++ b/libs/postgres_ffi/src/pg_constants_v14.rs
@@ -3,3 +3,8 @@ pub const XLOG_DBASE_DROP: u8 = 0x10;

 pub const BKPIMAGE_IS_COMPRESSED: u8 = 0x02; /* page image is compressed */
 pub const BKPIMAGE_APPLY: u8 = 0x04; /* page image should be restored during replay */
+pub const SIZEOF_RELMAPFILE: usize = 512; /* sizeof(RelMapFile) in relmapper.c */
+
+pub fn bkpimg_is_compressed(bimg_info: u8) -> bool {
+    (bimg_info & BKPIMAGE_IS_COMPRESSED) != 0
+}
--- a/libs/postgres_ffi/src/pg_constants_v15.rs
+++ b/libs/postgres_ffi/src/pg_constants_v15.rs
@@ -1,10 +1,18 @@
 pub const XACT_XINFO_HAS_DROPPED_STATS: u32 = 1u32 << 8;

 pub const XLOG_DBASE_CREATE_FILE_COPY: u8 = 0x00;
-pub const XLOG_DBASE_CREATE_WAL_LOG: u8 = 0x00;
+pub const XLOG_DBASE_CREATE_WAL_LOG: u8 = 0x10;
 pub const XLOG_DBASE_DROP: u8 = 0x20;

 pub const BKPIMAGE_APPLY: u8 = 0x02; /* page image should be restored during replay */
 pub const BKPIMAGE_COMPRESS_PGLZ: u8 = 0x04; /* page image is compressed */
 pub const BKPIMAGE_COMPRESS_LZ4: u8 = 0x08; /* page image is compressed */
 pub const BKPIMAGE_COMPRESS_ZSTD: u8 = 0x10; /* page image is compressed */
+
+pub const SIZEOF_RELMAPFILE: usize = 512; /* sizeof(RelMapFile) in relmapper.c */
+
+pub fn bkpimg_is_compressed(bimg_info: u8) -> bool {
+    const ANY_COMPRESS_FLAG: u8 = BKPIMAGE_COMPRESS_PGLZ | BKPIMAGE_COMPRESS_LZ4 | BKPIMAGE_COMPRESS_ZSTD;
+
+    (bimg_info & ANY_COMPRESS_FLAG) != 0
+}
--- a/libs/postgres_ffi/src/pg_constants_v16.rs
+++ b/libs/postgres_ffi/src/pg_constants_v16.rs
@@ -0,0 +1,18 @@
+pub const XACT_XINFO_HAS_DROPPED_STATS: u32 = 1u32 << 8;
+
+pub const XLOG_DBASE_CREATE_FILE_COPY: u8 = 0x00;
+pub const XLOG_DBASE_CREATE_WAL_LOG: u8 = 0x10;
+pub const XLOG_DBASE_DROP: u8 = 0x20;
+
+pub const BKPIMAGE_APPLY: u8 = 0x02; /* page image should be restored during replay */
+pub const BKPIMAGE_COMPRESS_PGLZ: u8 = 0x04; /* page image is compressed */
+pub const BKPIMAGE_COMPRESS_LZ4: u8 = 0x08; /* page image is compressed */
+pub const BKPIMAGE_COMPRESS_ZSTD: u8 = 0x10; /* page image is compressed */
+
+pub const SIZEOF_RELMAPFILE: usize = 524; /* sizeof(RelMapFile) in relmapper.c */
+
+pub fn bkpimg_is_compressed(bimg_info: u8) -> bool {
+    const ANY_COMPRESS_FLAG: u8 = BKPIMAGE_COMPRESS_PGLZ | BKPIMAGE_COMPRESS_LZ4 | BKPIMAGE_COMPRESS_ZSTD;
+
+    (bimg_info & ANY_COMPRESS_FLAG) != 0
+}
--- a/libs/postgres_ffi/wal_craft/src/lib.rs
+++ b/libs/postgres_ffi/wal_craft/src/lib.rs
@@ -49,9 +49,9 @@ impl Conf {
    pub fn pg_distrib_dir(&self) -> anyhow::Result<PathBuf> {
        let path = self.pg_distrib_dir.clone();

+        #[allow(clippy::manual_range_patterns)]
        match self.pg_version {
-            14 => Ok(path.join(format!("v{}", self.pg_version))),
-            15 => Ok(path.join(format!("v{}", self.pg_version))),
+            14 | 15 | 16 => Ok(path.join(format!("v{}", self.pg_version))),
            _ => bail!("Unsupported postgres version: {}", self.pg_version),
        }
    }
@@ -250,11 +250,18 @@ fn craft_internal<C: postgres::GenericClient>(
    let (mut intermediate_lsns, last_lsn) = f(client, initial_lsn)?;
    let last_lsn = match last_lsn {
        None => client.pg_current_wal_insert_lsn()?,
-        Some(last_lsn) => match last_lsn.cmp(&client.pg_current_wal_insert_lsn()?) {
-            Ordering::Less => bail!("Some records were inserted after the crafted WAL"),
-            Ordering::Equal => last_lsn,
-            Ordering::Greater => bail!("Reported LSN is greater than insert_lsn"),
-        },
+        Some(last_lsn) => {
+            let insert_lsn = client.pg_current_wal_insert_lsn()?;
+            match last_lsn.cmp(&insert_lsn) {
+                Ordering::Less => bail!(
+                    "Some records were inserted after the crafted WAL: {} vs {}",
+                    last_lsn,
+                    insert_lsn
+                ),
+                Ordering::Equal => last_lsn,
+                Ordering::Greater => bail!("Reported LSN is greater than insert_lsn"),
+            }
+        }
    };
    if !intermediate_lsns.starts_with(&[initial_lsn]) {
        intermediate_lsns.insert(0, initial_lsn);
@@ -363,8 +370,9 @@ impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary {
        );
        ensure!(
            u64::from(after_xlog_switch) as usize % XLOG_BLCKSZ == XLOG_SIZE_OF_XLOG_SHORT_PHD,
-            "XLOG_SWITCH message ended not on page boundary: {}",
-            after_xlog_switch
+            "XLOG_SWITCH message ended not on page boundary: {}, offset = {}",
+            after_xlog_switch,
+            u64::from(after_xlog_switch) as usize % XLOG_BLCKSZ
        );
        Ok((vec![before_xlog_switch, after_xlog_switch], next_segment))
    }
--- a/libs/pq_proto/src/lib.rs
+++ b/libs/pq_proto/src/lib.rs
@@ -959,7 +959,7 @@ mod tests {
        let make_params = |options| StartupMessageParams::new([("options", options)]);

        let params = StartupMessageParams::new([]);
-        assert!(matches!(params.options_escaped(), None));
+        assert!(params.options_escaped().is_none());

        let params = make_params("");
        assert!(split_options(&params).is_empty());
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -573,7 +573,7 @@ mod tests {

    #[test]
    fn relative_path() {
-        let all_paths = vec!["", "some/path", "some/path/"];
+        let all_paths = ["", "some/path", "some/path/"];
        let all_paths: Vec<RemotePath> = all_paths
            .iter()
            .map(|x| RemotePath::new(Path::new(x)).expect("bad path"))
--- a/libs/utils/scripts/restore_from_wal.sh
+++ b/libs/utils/scripts/restore_from_wal.sh
@@ -9,11 +9,12 @@ PORT=$4
 SYSID=$(od -A n -j 24 -N 8 -t d8 "$WAL_PATH"/000000010000000000000002* | cut -c 3-)
 rm -fr "$DATA_DIR"
 env -i LD_LIBRARY_PATH="$PG_BIN"/../lib "$PG_BIN"/initdb -E utf8 -U cloud_admin -D "$DATA_DIR" --sysid="$SYSID"
-echo port="$PORT" >> "$DATA_DIR"/postgresql.conf
+echo "port=$PORT" >> "$DATA_DIR"/postgresql.conf
+echo "shared_preload_libraries='\$libdir/neon_rmgr.so'" >> "$DATA_DIR"/postgresql.conf
 REDO_POS=0x$("$PG_BIN"/pg_controldata -D "$DATA_DIR" | grep -F "REDO location"| cut -c 42-)
 declare -i WAL_SIZE=$REDO_POS+114
-"$PG_BIN"/pg_ctl -D "$DATA_DIR" -l logfile start
-"$PG_BIN"/pg_ctl -D "$DATA_DIR" -l logfile stop -m immediate
+"$PG_BIN"/pg_ctl -D "$DATA_DIR" -l "$DATA_DIR/logfile.log" start
+"$PG_BIN"/pg_ctl -D "$DATA_DIR" -l "$DATA_DIR/logfile.log" stop -m immediate
 cp "$DATA_DIR"/pg_wal/000000010000000000000001 .
 cp "$WAL_PATH"/* "$DATA_DIR"/pg_wal/
 for partial in "$DATA_DIR"/pg_wal/*.partial ; do mv "$partial" "${partial%.partial}" ; done
--- a/libs/vm_monitor/src/cgroup.rs
+++ b/libs/vm_monitor/src/cgroup.rs
@@ -315,12 +315,8 @@ impl CgroupWatcher {
    where
        E: Stream<Item = Sequenced<u64>>,
    {
-        // There are several actions might do when receiving a `memory.high`,
-        // such as freezing the cgroup, or increasing its `memory.high`. We don't
-        // want to do these things too often (because postgres needs to run, and
-        // we only have so much memory). These timers serve as rate limits for this.
        let mut wait_to_freeze = pin!(tokio::time::sleep(Duration::ZERO));
-        let mut wait_to_increase_memory_high = pin!(tokio::time::sleep(Duration::ZERO));
+        let mut last_memory_high_increase_at: Option<Instant> = None;
        let mut events = pin!(events);

        // Are we waiting to be upscaled? Could be true if we request upscale due
@@ -332,6 +328,8 @@ impl CgroupWatcher {
                upscale = upscales.recv() => {
                    let Sequenced { seqnum, data } = upscale
                        .context("failed to listen on upscale notification channel")?;
+                    waiting_on_upscale = false;
+                    last_memory_high_increase_at = None;
                    self.last_upscale_seqnum.store(seqnum, Ordering::Release);
                    info!(cpu = data.cpu, mem_bytes = data.mem, "received upscale");
                }
@@ -396,12 +394,17 @@ impl CgroupWatcher {
                            .send(())
                            .await
                            .context("failed to request upscale")?;
+                        waiting_on_upscale = true;
                        continue;
                    }

                    // Shoot, we can't freeze and we're still waiting on upscale,
                    // increase memory.high to reduce throttling
-                    if wait_to_increase_memory_high.is_elapsed() {
+                    let can_increase_memory_high = match last_memory_high_increase_at {
+                        None => true,
+                        Some(t) => t.elapsed() > self.config.memory_high_increase_every,
+                    };
+                    if can_increase_memory_high {
                        info!(
                            "received memory.high event, \
                            but too soon to refreeze and already requested upscale \
@@ -437,12 +440,11 @@ impl CgroupWatcher {
                        );
                        self.set_high_bytes(new_high)
                            .context("failed to set memory.high")?;
-                        wait_to_increase_memory_high
-                            .as_mut()
-                            .reset(Instant::now() + self.config.memory_high_increase_every)
+                        last_memory_high_increase_at = Some(Instant::now());
+                        continue;
                    }

-                    // we can't do anything
+                    info!("received memory.high event, but can't do anything");
                }
            };
        }
@@ -559,14 +561,7 @@ impl CgroupWatcher {
 /// Setting these values also affects the thresholds for receiving usage alerts.
 #[derive(Debug)]
 pub struct MemoryLimits {
-    high: u64,
-    max: u64,
-}
-
-impl MemoryLimits {
-    pub fn new(high: u64, max: u64) -> Self {
-        Self { max, high }
-    }
+    pub high: u64,
 }

 // Methods for manipulating the actual cgroup
@@ -643,12 +638,7 @@ impl CgroupWatcher {

    /// Set cgroup memory.high. `max` is passed as `None`, i.e. memory.max is not set here.
    pub fn set_limits(&self, limits: &MemoryLimits) -> anyhow::Result<()> {
-        info!(
-            limits.high,
-            limits.max,
-            path = self.path(),
-            "writing new memory limits",
-        );
+        info!(limits.high, path = self.path(), "writing new memory limits",);
        self.memory()
            .context("failed to get memory subsystem while setting memory limits")?
            .set_mem(cgroups_rs::memory::SetMemory {
@@ -657,7 +647,7 @@ impl CgroupWatcher {
                high: Some(MaxValue::Value(
                    u64::min(limits.high, i64::MAX as u64) as i64
                )),
-                max: Some(MaxValue::Value(u64::min(limits.max, i64::MAX as u64) as i64)),
+                max: None,
            })
            .context("failed to set memory limits")
    }
@@ -665,7 +655,7 @@ impl CgroupWatcher {
    /// Given some amount of available memory, set the desired cgroup memory limits
    pub fn set_memory_limits(&mut self, available_memory: u64) -> anyhow::Result<()> {
        let new_high = self.config.calculate_memory_high_value(available_memory);
-        let limits = MemoryLimits::new(new_high, available_memory);
+        let limits = MemoryLimits { high: new_high };
        info!(
            path = self.path(),
            memory = ?limits,
--- a/libs/vm_monitor/src/lib.rs
+++ b/libs/vm_monitor/src/lib.rs
@@ -178,14 +178,17 @@ pub async fn ws_handler(

 /// Starts the monitor. If startup fails or the monitor exits, an error will
 /// be logged and our internal state will be reset to allow for new connections.
-#[tracing::instrument(skip_all, fields(?args))]
+#[tracing::instrument(skip_all)]
 async fn start_monitor(
    ws: WebSocket,
    args: &Args,
    kill: broadcast::Receiver<()>,
    token: CancellationToken,
 ) {
-    info!("accepted new websocket connection -> starting monitor");
+    info!(
+        ?args,
+        "accepted new websocket connection -> starting monitor"
+    );
    let timeout = Duration::from_secs(4);
    let monitor = tokio::time::timeout(
        timeout,
--- a/libs/vm_monitor/src/runner.rs
+++ b/libs/vm_monitor/src/runner.rs
@@ -5,6 +5,7 @@
 //! all functionality.

 use std::sync::Arc;
+use std::time::{Duration, Instant};
 use std::{fmt::Debug, mem};

 use anyhow::{bail, Context};
@@ -36,6 +37,8 @@ pub struct Runner {
    /// by us vs the autoscaler-agent.
    counter: usize,

+    last_upscale_request_at: Option<Instant>,
+
    /// A signal to kill the main thread produced by `self.run()`. This is triggered
    /// when the server receives a new connection. When the thread receives the
    /// signal off this channel, it will gracefully shutdown.
@@ -99,6 +102,7 @@ impl Runner {
            cgroup: None,
            dispatcher,
            counter: 1, // NB: must be odd, see the comment about the field for more.
+            last_upscale_request_at: None,
            kill,
        };

@@ -253,12 +257,11 @@ impl Runner {
                new_cgroup_mem_high = cgroup.config.calculate_memory_high_value(available_memory);
            }

-            let limits = MemoryLimits::new(
+            let limits = MemoryLimits {
                // new_cgroup_mem_high is initialized to 0 but it is guaranteed not to be 0 here
                // since it is properly initialized in the previous cgroup if let block
-                new_cgroup_mem_high,
-                available_memory,
-            );
+                high: new_cgroup_mem_high,
+            };
            cgroup
                .set_limits(&limits)
                .context("failed to set cgroup memory limits")?;
@@ -324,7 +327,9 @@ impl Runner {
                name = cgroup.path(),
                "updating cgroup memory.high",
            );
-            let limits = MemoryLimits::new(new_cgroup_mem_high, available_memory);
+            let limits = MemoryLimits {
+                high: new_cgroup_mem_high,
+            };
            cgroup
                .set_limits(&limits)
                .context("failed to set file cache size")?;
@@ -397,6 +402,20 @@ impl Runner {
                    if request.is_none() {
                        bail!("failed to listen for upscale event from cgroup")
                    }
+
+                    // If it's been less than 1 second since the last time we requested upscaling,
+                    // ignore the event, to avoid spamming the agent (otherwise, this can happen
+                    // ~1k times per second).
+                    if let Some(t) = self.last_upscale_request_at {
+                        let elapsed = t.elapsed();
+                        if elapsed < Duration::from_secs(1) {
+                            info!(elapsed_millis = elapsed.as_millis(), "cgroup asked for upscale but too soon to forward the request, ignoring");
+                            continue;
+                        }
+                    }
+
+                    self.last_upscale_request_at = Some(Instant::now());
+
                    info!("cgroup asking for upscale; forwarding request");
                    self.counter += 2; // Increment, preserving parity (i.e. keep the
                                       // counter odd). See the field comment for more.
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -80,11 +80,11 @@ enum-map.workspace = true
 enumset.workspace = true
 strum.workspace = true
 strum_macros.workspace = true
+tempfile.workspace = true

 [dev-dependencies]
 criterion.workspace = true
 hex-literal.workspace = true
-tempfile.workspace = true
 tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time", "test-util"] }

 [[bench]]
--- a/pageserver/ctl/src/layer_map_analyzer.rs
+++ b/pageserver/ctl/src/layer_map_analyzer.rs
@@ -3,6 +3,8 @@
 //! Currently it only analyzes holes, which are regions within the layer range that the layer contains no updates for. In the future it might do more analysis (maybe key quantiles?) but it should never return sensitive data.

 use anyhow::Result;
+use pageserver::context::{DownloadBehavior, RequestContext};
+use pageserver::task_mgr::TaskKind;
 use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
 use std::cmp::Ordering;
 use std::collections::BinaryHeap;
@@ -96,9 +98,9 @@ pub(crate) fn parse_filename(name: &str) -> Option<LayerFile> {
 }

 // Finds the max_holes largest holes, ignoring any that are smaller than MIN_HOLE_LENGTH"
-async fn get_holes(path: &Path, max_holes: usize) -> Result<Vec<Hole>> {
+async fn get_holes(path: &Path, max_holes: usize, ctx: &RequestContext) -> Result<Vec<Hole>> {
    let file = FileBlockReader::new(VirtualFile::open(path).await?);
-    let summary_blk = file.read_blk(0).await?;
+    let summary_blk = file.read_blk(0, ctx).await?;
    let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
    let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
        actual_summary.index_start_blk,
@@ -125,6 +127,7 @@ async fn get_holes(path: &Path, max_holes: usize) -> Result<Vec<Hole>> {
                prev_key = Some(curr.next());
                true
            },
+            ctx,
        )
        .await?;
    let mut holes = heap.into_vec();
@@ -135,6 +138,7 @@ async fn get_holes(path: &Path, max_holes: usize) -> Result<Vec<Hole>> {
 pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
    let storage_path = &cmd.path;
    let max_holes = cmd.max_holes.unwrap_or(DEFAULT_MAX_HOLES);
+    let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);

    // Initialize virtual_file (file desriptor cache) and page cache which are needed to access layer persistent B-Tree.
    pageserver::virtual_file::init(10);
@@ -163,7 +167,7 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
                    parse_filename(&layer.file_name().into_string().unwrap())
                {
                    if layer_file.is_delta {
-                        layer_file.holes = get_holes(&layer.path(), max_holes).await?;
+                        layer_file.holes = get_holes(&layer.path(), max_holes, &ctx).await?;
                        n_deltas += 1;
                    }
                    layers.push(layer_file);
--- a/pageserver/ctl/src/layers.rs
+++ b/pageserver/ctl/src/layers.rs
@@ -2,6 +2,8 @@ use std::path::{Path, PathBuf};

 use anyhow::Result;
 use clap::Subcommand;
+use pageserver::context::{DownloadBehavior, RequestContext};
+use pageserver::task_mgr::TaskKind;
 use pageserver::tenant::block_io::BlockCursor;
 use pageserver::tenant::disk_btree::DiskBtreeReader;
 use pageserver::tenant::storage_layer::delta_layer::{BlobRef, Summary};
@@ -44,12 +46,12 @@ pub(crate) enum LayerCmd {
    },
 }

-async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
+async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result<()> {
    let path = path.as_ref();
    virtual_file::init(10);
    page_cache::init(100);
    let file = FileBlockReader::new(VirtualFile::open(path).await?);
-    let summary_blk = file.read_blk(0).await?;
+    let summary_blk = file.read_blk(0, ctx).await?;
    let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
    let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
        actual_summary.index_start_blk,
@@ -67,11 +69,12 @@ async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
                all.push((curr, BlobRef(value_offset)));
                true
            },
+            ctx,
        )
        .await?;
    let cursor = BlockCursor::new_fileblockreader(&file);
    for (k, v) in all {
-        let value = cursor.read_blob(v.pos()).await?;
+        let value = cursor.read_blob(v.pos(), ctx).await?;
        println!("key:{} value_len:{}", k, value.len());
    }
    // TODO(chi): special handling for last key?
@@ -79,6 +82,7 @@ async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
 }

 pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
+    let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
    match cmd {
        LayerCmd::List { path } => {
            for tenant in fs::read_dir(path.join(TENANTS_SEGMENT_NAME))? {
@@ -153,7 +157,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
                        );

                        if layer_file.is_delta {
-                            read_delta_file(layer.path()).await?;
+                            read_delta_file(layer.path(), &ctx).await?;
                        } else {
                            anyhow::bail!("not supported yet :(");
                        }
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -25,6 +25,7 @@ use crate::context::RequestContext;
 use crate::tenant::Timeline;
 use pageserver_api::reltag::{RelTag, SlruKind};

+use postgres_ffi::dispatch_pgversion;
 use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID};
 use postgres_ffi::pg_constants::{PGDATA_SPECIAL_FILES, PGDATA_SUBDIRS, PG_HBA};
 use postgres_ffi::relfile_utils::{INIT_FORKNUM, MAIN_FORKNUM};
@@ -323,14 +324,25 @@ where
                .timeline
                .get_relmap_file(spcnode, dbnode, self.lsn, self.ctx)
                .await?;
-            ensure!(img.len() == 512);
+
+            ensure!(
+                img.len()
+                    == dispatch_pgversion!(
+                        self.timeline.pg_version,
+                        pgv::bindings::SIZEOF_RELMAPFILE
+                    )
+            );
+
            Some(img)
        } else {
            None
        };

        if spcnode == GLOBALTABLESPACE_OID {
-            let pg_version_str = self.timeline.pg_version.to_string();
+            let pg_version_str = match self.timeline.pg_version {
+                14 | 15 => self.timeline.pg_version.to_string(),
+                ver => format!("{ver}\x0A"),
+            };
            let header = new_tar_header("PG_VERSION", pg_version_str.len() as u64)?;
            self.ar.append(&header, pg_version_str.as_bytes()).await?;

@@ -374,7 +386,10 @@ where
            if let Some(img) = relmap_img {
                let dst_path = format!("base/{}/PG_VERSION", dbnode);

-                let pg_version_str = self.timeline.pg_version.to_string();
+                let pg_version_str = match self.timeline.pg_version {
+                    14 | 15 => self.timeline.pg_version.to_string(),
+                    ver => format!("{ver}\x0A"),
+                };
                let header = new_tar_header(&dst_path, pg_version_str.len() as u64)?;
                self.ar.append(&header, pg_version_str.as_bytes()).await?;

--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -477,16 +477,19 @@ fn start_pageserver(
    {
        let _rt_guard = MGMT_REQUEST_RUNTIME.enter();

-        let router = http::make_router(
-            conf,
-            launch_ts,
-            http_auth,
-            broker_client.clone(),
-            remote_storage,
-            disk_usage_eviction_state,
-        )?
-        .build()
-        .map_err(|err| anyhow!(err))?;
+        let router_state = Arc::new(
+            http::routes::State::new(
+                conf,
+                http_auth.clone(),
+                remote_storage,
+                broker_client.clone(),
+                disk_usage_eviction_state,
+            )
+            .context("Failed to initialize router state")?,
+        );
+        let router = http::make_router(router_state, launch_ts, http_auth.clone())?
+            .build()
+            .map_err(|err| anyhow!(err))?;
        let service = utils::http::RouterService::new(router).unwrap();
        let server = hyper::Server::from_tcp(http_listener)?
            .serve(service)
@@ -515,6 +518,9 @@ fn start_pageserver(
            // creates a child context with the right DownloadBehavior.
            DownloadBehavior::Error,
        );
+
+        let local_disk_storage = conf.workdir.join("last_consumption_metrics.json");
+
        task_mgr::spawn(
            crate::BACKGROUND_RUNTIME.handle(),
            TaskKind::MetricsCollection,
@@ -541,6 +547,7 @@ fn start_pageserver(
                    conf.cached_metric_collection_interval,
                    conf.synthetic_size_calculation_interval,
                    conf.id,
+                    local_disk_storage,
                    metrics_ctx,
                )
                .instrument(info_span!("metrics_collection"))
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -64,7 +64,7 @@ pub mod defaults {
        super::ConfigurableSemaphore::DEFAULT_INITIAL.get();

    pub const DEFAULT_METRIC_COLLECTION_INTERVAL: &str = "10 min";
-    pub const DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL: &str = "1 hour";
+    pub const DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL: &str = "0s";
    pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option<reqwest::Url> = None;
    pub const DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL: &str = "10 min";
    pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s";
@@ -73,7 +73,7 @@ pub mod defaults {
    /// Default built-in configuration file.
    ///
    pub const DEFAULT_CONFIG_FILE: &str = formatcp!(
-        r###"
+        r#"
 # Initial configuration file created by 'pageserver --init'
 #listen_pg_addr = '{DEFAULT_PG_LISTEN_ADDR}'
 #listen_http_addr = '{DEFAULT_HTTP_LISTEN_ADDR}'
@@ -118,7 +118,7 @@ pub mod defaults {

 [remote_storage]

-"###
+"#
    );
 }

@@ -668,26 +668,18 @@ impl PageServerConf {
    pub fn pg_distrib_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
        let path = self.pg_distrib_dir.clone();

+        #[allow(clippy::manual_range_patterns)]
        match pg_version {
-            14 => Ok(path.join(format!("v{pg_version}"))),
-            15 => Ok(path.join(format!("v{pg_version}"))),
+            14 | 15 | 16 => Ok(path.join(format!("v{pg_version}"))),
            _ => bail!("Unsupported postgres version: {}", pg_version),
        }
    }

    pub fn pg_bin_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
-        match pg_version {
-            14 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")),
-            15 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")),
-            _ => bail!("Unsupported postgres version: {}", pg_version),
-        }
+        Ok(self.pg_distrib_dir(pg_version)?.join("bin"))
    }
    pub fn pg_lib_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
-        match pg_version {
-            14 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")),
-            15 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")),
-            _ => bail!("Unsupported postgres version: {}", pg_version),
-        }
+        Ok(self.pg_distrib_dir(pg_version)?.join("lib"))
    }

    /// Parse a configuration file (pageserver.toml) into a PageServerConf struct,
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -1,188 +1,54 @@
-//!
 //! Periodically collect consumption metrics for all active tenants
 //! and push them to a HTTP endpoint.
-//! Cache metrics to send only the updated ones.
-//!
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
 use crate::tenant::{mgr, LogicalSizeCalculationCause};
-use anyhow;
-use chrono::{DateTime, Utc};
-use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE};
+use consumption_metrics::EventType;
 use pageserver_api::models::TenantState;
 use reqwest::Url;
-use serde::Serialize;
-use serde_with::{serde_as, DisplayFromStr};
 use std::collections::HashMap;
+use std::path::PathBuf;
 use std::sync::Arc;
 use std::time::{Duration, SystemTime};
 use tracing::*;
-use utils::id::{NodeId, TenantId, TimelineId};
-use utils::lsn::Lsn;
+use utils::id::NodeId;
+
+mod metrics;
+use metrics::MetricsKey;
+mod disk_cache;
+mod upload;

 const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60);

-#[serde_as]
-#[derive(Serialize, Debug, Clone, Copy)]
-struct Ids {
-    #[serde_as(as = "DisplayFromStr")]
-    tenant_id: TenantId,
-    #[serde_as(as = "Option<DisplayFromStr>")]
-    #[serde(skip_serializing_if = "Option::is_none")]
-    timeline_id: Option<TimelineId>,
-}
+/// Basically a key-value pair, but usually in a Vec except for [`Cache`].
+///
+/// This is as opposed to `consumption_metrics::Event` which is the externally communicated form.
+/// Difference is basically the missing idempotency key, which lives only for the duration of
+/// upload attempts.
+type RawMetric = (MetricsKey, (EventType, u64));

-/// Key that uniquely identifies the object, this metric describes.
-#[derive(Debug, Clone, PartialEq, Eq, Hash)]
-struct MetricsKey {
-    tenant_id: TenantId,
-    timeline_id: Option<TimelineId>,
-    metric: &'static str,
-}
-
-impl MetricsKey {
-    const fn absolute_values(self) -> AbsoluteValueFactory {
-        AbsoluteValueFactory(self)
-    }
-    const fn incremental_values(self) -> IncrementalValueFactory {
-        IncrementalValueFactory(self)
-    }
-}
-
-/// Helper type which each individual metric kind can return to produce only absolute values.
-struct AbsoluteValueFactory(MetricsKey);
-
-impl AbsoluteValueFactory {
-    fn at(self, time: DateTime<Utc>, val: u64) -> (MetricsKey, (EventType, u64)) {
-        let key = self.0;
-        (key, (EventType::Absolute { time }, val))
-    }
-}
-
-/// Helper type which each individual metric kind can return to produce only incremental values.
-struct IncrementalValueFactory(MetricsKey);
-
-impl IncrementalValueFactory {
-    #[allow(clippy::wrong_self_convention)]
-    fn from_previous_up_to(
-        self,
-        prev_end: DateTime<Utc>,
-        up_to: DateTime<Utc>,
-        val: u64,
-    ) -> (MetricsKey, (EventType, u64)) {
-        let key = self.0;
-        // cannot assert prev_end < up_to because these are realtime clock based
-        (
-            key,
-            (
-                EventType::Incremental {
-                    start_time: prev_end,
-                    stop_time: up_to,
-                },
-                val,
-            ),
-        )
-    }
-
-    fn key(&self) -> &MetricsKey {
-        &self.0
-    }
-}
-
-// the static part of a MetricsKey
-impl MetricsKey {
-    /// Absolute value of [`Timeline::get_last_record_lsn`].
-    ///
-    /// [`Timeline::get_last_record_lsn`]: crate::tenant::Timeline::get_last_record_lsn
-    const fn written_size(tenant_id: TenantId, timeline_id: TimelineId) -> AbsoluteValueFactory {
-        MetricsKey {
-            tenant_id,
-            timeline_id: Some(timeline_id),
-            metric: "written_size",
-        }
-        .absolute_values()
-    }
-
-    /// Values will be the difference of the latest [`MetricsKey::written_size`] to what we
-    /// previously sent, starting from the previously sent incremental time range ending at the
-    /// latest absolute measurement.
-    const fn written_size_delta(
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
-    ) -> IncrementalValueFactory {
-        MetricsKey {
-            tenant_id,
-            timeline_id: Some(timeline_id),
-            // the name here is correctly about data not size, because that is what is wanted by
-            // downstream pipeline
-            metric: "written_data_bytes_delta",
-        }
-        .incremental_values()
-    }
-
-    /// Exact [`Timeline::get_current_logical_size`].
-    ///
-    /// [`Timeline::get_current_logical_size`]: crate::tenant::Timeline::get_current_logical_size
-    const fn timeline_logical_size(
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
-    ) -> AbsoluteValueFactory {
-        MetricsKey {
-            tenant_id,
-            timeline_id: Some(timeline_id),
-            metric: "timeline_logical_size",
-        }
-        .absolute_values()
-    }
-
-    /// [`Tenant::remote_size`]
-    ///
-    /// [`Tenant::remote_size`]: crate::tenant::Tenant::remote_size
-    const fn remote_storage_size(tenant_id: TenantId) -> AbsoluteValueFactory {
-        MetricsKey {
-            tenant_id,
-            timeline_id: None,
-            metric: "remote_storage_size",
-        }
-        .absolute_values()
-    }
-
-    /// Sum of [`Timeline::resident_physical_size`] for each `Tenant`.
-    ///
-    /// [`Timeline::resident_physical_size`]: crate::tenant::Timeline::resident_physical_size
-    const fn resident_size(tenant_id: TenantId) -> AbsoluteValueFactory {
-        MetricsKey {
-            tenant_id,
-            timeline_id: None,
-            metric: "resident_size",
-        }
-        .absolute_values()
-    }
-
-    /// [`Tenant::cached_synthetic_size`] as refreshed by [`calculate_synthetic_size_worker`].
-    ///
-    /// [`Tenant::cached_synthetic_size`]: crate::tenant::Tenant::cached_synthetic_size
-    const fn synthetic_size(tenant_id: TenantId) -> AbsoluteValueFactory {
-        MetricsKey {
-            tenant_id,
-            timeline_id: None,
-            metric: "synthetic_storage_size",
-        }
-        .absolute_values()
-    }
-}
+/// Caches the [`RawMetric`]s
+///
+/// In practice, during startup, last sent values are stored here to be used in calculating new
+/// ones. After a successful upload, the cache is updated with the uploaded values. This used to
+/// be used for deduplication, but that is no longer needed.
+type Cache = HashMap<MetricsKey, (EventType, u64)>;

 /// Main thread that serves metrics collection
 pub async fn collect_metrics(
    metric_collection_endpoint: &Url,
    metric_collection_interval: Duration,
-    cached_metric_collection_interval: Duration,
+    _cached_metric_collection_interval: Duration,
    synthetic_size_calculation_interval: Duration,
    node_id: NodeId,
+    local_disk_storage: PathBuf,
    ctx: RequestContext,
 ) -> anyhow::Result<()> {
-    let mut ticker = tokio::time::interval(metric_collection_interval);
-    info!("starting collect_metrics");
+    if _cached_metric_collection_interval != Duration::ZERO {
+        tracing::warn!(
+            "cached_metric_collection_interval is no longer used, please set it to zero."
+        )
+    }

    // spin up background worker that caclulates tenant sizes
    let worker_ctx =
@@ -202,543 +68,216 @@ pub async fn collect_metrics(
        },
    );

+    let path: Arc<PathBuf> = Arc::new(local_disk_storage);
+
+    let cancel = task_mgr::shutdown_token();
+
+    let restore_and_reschedule = restore_and_reschedule(&path, metric_collection_interval);
+
+    let mut cached_metrics = tokio::select! {
+        _ = cancel.cancelled() => return Ok(()),
+        ret = restore_and_reschedule => ret,
+    };
+
    // define client here to reuse it for all requests
    let client = reqwest::ClientBuilder::new()
        .timeout(DEFAULT_HTTP_REPORTING_TIMEOUT)
        .build()
        .expect("Failed to create http client with timeout");
-    let mut cached_metrics = HashMap::new();
-    let mut prev_iteration_time: std::time::Instant = std::time::Instant::now();
-
-    loop {
-        tokio::select! {
-            _ = task_mgr::shutdown_watcher() => {
-                info!("collect_metrics received cancellation request");
-                return Ok(());
-            },
-            tick_at = ticker.tick() => {
-
-                // send cached metrics every cached_metric_collection_interval
-                let send_cached = prev_iteration_time.elapsed() >= cached_metric_collection_interval;
-
-                if send_cached {
-                    prev_iteration_time = std::time::Instant::now();
-                }
-
-                collect_metrics_iteration(&client, &mut cached_metrics, metric_collection_endpoint, node_id, &ctx, send_cached).await;
-
-                crate::tenant::tasks::warn_when_period_overrun(
-                    tick_at.elapsed(),
-                    metric_collection_interval,
-                    "consumption_metrics_collect_metrics",
-                );
-            }
-        }
-    }
-}
-
-/// One iteration of metrics collection
-///
-/// Gather per-tenant and per-timeline metrics and send them to the `metric_collection_endpoint`.
-/// Cache metrics to avoid sending the same metrics multiple times.
-///
-/// This function handles all errors internally
-/// and doesn't break iteration if just one tenant fails.
-///
-/// TODO
-/// - refactor this function (chunking+sending part) to reuse it in proxy module;
-async fn collect_metrics_iteration(
-    client: &reqwest::Client,
-    cached_metrics: &mut HashMap<MetricsKey, (EventType, u64)>,
-    metric_collection_endpoint: &reqwest::Url,
-    node_id: NodeId,
-    ctx: &RequestContext,
-    send_cached: bool,
-) {
-    let mut current_metrics: Vec<(MetricsKey, (EventType, u64))> = Vec::new();
-    trace!(
-        "starting collect_metrics_iteration. metric_collection_endpoint: {}",
-        metric_collection_endpoint
-    );
-
-    // get list of tenants
-    let tenants = match mgr::list_tenants().await {
-        Ok(tenants) => tenants,
-        Err(err) => {
-            error!("failed to list tenants: {:?}", err);
-            return;
-        }
-    };
-
-    // iterate through list of Active tenants and collect metrics
-    for (tenant_id, tenant_state) in tenants {
-        if tenant_state != TenantState::Active {
-            continue;
-        }
-
-        let tenant = match mgr::get_tenant(tenant_id, true).await {
-            Ok(tenant) => tenant,
-            Err(err) => {
-                // It is possible that tenant was deleted between
-                // `list_tenants` and `get_tenant`, so just warn about it.
-                warn!("failed to get tenant {tenant_id:?}: {err:?}");
-                continue;
-            }
-        };
-
-        let mut tenant_resident_size = 0;
-
-        // iterate through list of timelines in tenant
-        for timeline in tenant.list_timelines() {
-            // collect per-timeline metrics only for active timelines
-
-            let timeline_id = timeline.timeline_id;
-
-            match TimelineSnapshot::collect(&timeline, ctx) {
-                Ok(Some(snap)) => {
-                    snap.to_metrics(
-                        tenant_id,
-                        timeline_id,
-                        Utc::now(),
-                        &mut current_metrics,
-                        cached_metrics,
-                    );
-                }
-                Ok(None) => {}
-                Err(e) => {
-                    error!(
-                        "failed to get metrics values for tenant {tenant_id} timeline {}: {e:#?}",
-                        timeline.timeline_id
-                    );
-                    continue;
-                }
-            }
-
-            tenant_resident_size += timeline.resident_physical_size();
-        }
-
-        current_metrics
-            .push(MetricsKey::remote_storage_size(tenant_id).at(Utc::now(), tenant.remote_size()));
-
-        current_metrics
-            .push(MetricsKey::resident_size(tenant_id).at(Utc::now(), tenant_resident_size));
-
-        // Note that this metric is calculated in a separate bgworker
-        // Here we only use cached value, which may lag behind the real latest one
-        let synthetic_size = tenant.cached_synthetic_size();
-
-        if synthetic_size != 0 {
-            // only send non-zeroes because otherwise these show up as errors in logs
-            current_metrics
-                .push(MetricsKey::synthetic_size(tenant_id).at(Utc::now(), synthetic_size));
-        }
-    }
-
-    // Filter metrics, unless we want to send all metrics, including cached ones.
-    // See: https://github.com/neondatabase/neon/issues/3485
-    if !send_cached {
-        current_metrics.retain(|(curr_key, (kind, curr_val))| {
-            if kind.is_incremental() {
-                // incremental values (currently only written_size_delta) should not get any cache
-                // deduplication because they will be used by upstream for "is still alive."
-                true
-            } else {
-                match cached_metrics.get(curr_key) {
-                    Some((_, val)) => val != curr_val,
-                    None => true,
-                }
-            }
-        });
-    }
-
-    if current_metrics.is_empty() {
-        trace!("no new metrics to send");
-        return;
-    }
-
-    // Send metrics.
-    // Split into chunks of 1000 metrics to avoid exceeding the max request size
-    let chunks = current_metrics.chunks(CHUNK_SIZE);
-
-    let mut chunk_to_send: Vec<Event<Ids>> = Vec::with_capacity(CHUNK_SIZE);

    let node_id = node_id.to_string();

-    for chunk in chunks {
-        chunk_to_send.clear();
+    // reminder: ticker is ready immediately
+    let mut ticker = tokio::time::interval(metric_collection_interval);

-        // enrich metrics with type,timestamp and idempotency key before sending
-        chunk_to_send.extend(chunk.iter().map(|(curr_key, (when, curr_val))| Event {
-            kind: *when,
-            metric: curr_key.metric,
-            idempotency_key: idempotency_key(&node_id),
-            value: *curr_val,
-            extra: Ids {
-                tenant_id: curr_key.tenant_id,
-                timeline_id: curr_key.timeline_id,
-            },
-        }));
+    loop {
+        let tick_at = tokio::select! {
+            _ = cancel.cancelled() => return Ok(()),
+            tick_at = ticker.tick() => tick_at,
+        };

-        const MAX_RETRIES: u32 = 3;
+        // these are point in time, with variable "now"
+        let metrics = metrics::collect_all_metrics(&cached_metrics, &ctx).await;

-        for attempt in 0..MAX_RETRIES {
-            let res = client
-                .post(metric_collection_endpoint.clone())
-                .json(&EventChunk {
-                    events: (&chunk_to_send).into(),
-                })
-                .send()
-                .await;
+        if metrics.is_empty() {
+            continue;
+        }

-            match res {
-                Ok(res) => {
-                    if res.status().is_success() {
-                        // update cached metrics after they were sent successfully
-                        for (curr_key, curr_val) in chunk.iter() {
-                            cached_metrics.insert(curr_key.clone(), *curr_val);
-                        }
-                    } else {
-                        error!("metrics endpoint refused the sent metrics: {:?}", res);
-                        for metric in chunk_to_send
-                            .iter()
-                            .filter(|metric| metric.value > (1u64 << 40))
-                        {
-                            // Report if the metric value is suspiciously large
-                            error!("potentially abnormal metric value: {:?}", metric);
-                        }
-                    }
-                    break;
+        let metrics = Arc::new(metrics);
+
+        // why not race cancellation here? because we are one of the last tasks, and if we are
+        // already here, better to try to flush the new values.
+
+        let flush = async {
+            match disk_cache::flush_metrics_to_disk(&metrics, &path).await {
+                Ok(()) => {
+                    tracing::debug!("flushed metrics to disk");
                }
-                Err(err) if err.is_timeout() => {
-                    error!(attempt, "timeout sending metrics, retrying immediately");
-                    continue;
-                }
-                Err(err) => {
-                    error!(attempt, ?err, "failed to send metrics");
-                    break;
+                Err(e) => {
+                    // idea here is that if someone creates a directory as our path, then they
+                    // might notice it from the logs before shutdown and remove it
+                    tracing::error!("failed to persist metrics to {path:?}: {e:#}");
                }
            }
+        };
+
+        let upload = async {
+            let res = upload::upload_metrics(
+                &client,
+                metric_collection_endpoint,
+                &cancel,
+                &node_id,
+                &metrics,
+                &mut cached_metrics,
+            )
+            .await;
+            if let Err(e) = res {
+                // serialization error which should never happen
+                tracing::error!("failed to upload due to {e:#}");
+            }
+        };
+
+        // let these run concurrently
+        let (_, _) = tokio::join!(flush, upload);
+
+        crate::tenant::tasks::warn_when_period_overrun(
+            tick_at.elapsed(),
+            metric_collection_interval,
+            "consumption_metrics_collect_metrics",
+        );
+    }
+}
+
+/// Called on the first iteration in an attempt to join the metric uploading schedule from previous
+/// pageserver session. Pageserver is supposed to upload at intervals regardless of restarts.
+///
+/// Cancellation safe.
+async fn restore_and_reschedule(
+    path: &Arc<PathBuf>,
+    metric_collection_interval: Duration,
+) -> Cache {
+    let (cached, earlier_metric_at) = match disk_cache::read_metrics_from_disk(path.clone()).await {
+        Ok(found_some) => {
+            // there is no min needed because we write these sequentially in
+            // collect_all_metrics
+            let earlier_metric_at = found_some
+                .iter()
+                .map(|(_, (et, _))| et.recorded_at())
+                .copied()
+                .next();
+
+            let cached = found_some.into_iter().collect::<Cache>();
+
+            (cached, earlier_metric_at)
+        }
+        Err(e) => {
+            use std::io::{Error, ErrorKind};
+
+            let root = e.root_cause();
+            let maybe_ioerr = root.downcast_ref::<Error>();
+            let is_not_found = maybe_ioerr.is_some_and(|e| e.kind() == ErrorKind::NotFound);
+
+            if !is_not_found {
+                tracing::info!("failed to read any previous metrics from {path:?}: {e:#}");
+            }
+
+            (HashMap::new(), None)
+        }
+    };
+
+    if let Some(earlier_metric_at) = earlier_metric_at {
+        let earlier_metric_at: SystemTime = earlier_metric_at.into();
+
+        let error = reschedule(earlier_metric_at, metric_collection_interval).await;
+
+        if let Some(error) = error {
+            if error.as_secs() >= 60 {
+                tracing::info!(
+                    error_ms = error.as_millis(),
+                    "startup scheduling error due to restart"
+                )
+            }
        }
    }
+
+    cached
 }

-/// Internal type to make timeline metric production testable.
-///
-/// As this value type contains all of the information needed from a timeline to produce the
-/// metrics, it can easily be created with different values in test.
-struct TimelineSnapshot {
-    loaded_at: (Lsn, SystemTime),
-    last_record_lsn: Lsn,
-    current_exact_logical_size: Option<u64>,
-}
+async fn reschedule(
+    earlier_metric_at: SystemTime,
+    metric_collection_interval: Duration,
+) -> Option<Duration> {
+    let now = SystemTime::now();
+    match now.duration_since(earlier_metric_at) {
+        Ok(from_last_send) if from_last_send < metric_collection_interval => {
+            let sleep_for = metric_collection_interval - from_last_send;

-impl TimelineSnapshot {
-    /// Collect the metrics from an actual timeline.
-    ///
-    /// Fails currently only when [`Timeline::get_current_logical_size`] fails.
-    ///
-    /// [`Timeline::get_current_logical_size`]: crate::tenant::Timeline::get_current_logical_size
-    fn collect(
-        t: &Arc<crate::tenant::Timeline>,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<Option<Self>> {
-        use anyhow::Context;
+            let deadline = std::time::Instant::now() + sleep_for;

-        if !t.is_active() {
-            // no collection for broken or stopping needed, we will still keep the cached values
-            // though at the caller.
-            Ok(None)
-        } else {
-            let loaded_at = t.loaded_at;
-            let last_record_lsn = t.get_last_record_lsn();
+            tokio::time::sleep_until(deadline.into()).await;

-            let current_exact_logical_size = {
-                let span = info_span!("collect_metrics_iteration", tenant_id = %t.tenant_id, timeline_id = %t.timeline_id);
-                let res = span
-                    .in_scope(|| t.get_current_logical_size(ctx))
-                    .context("get_current_logical_size");
-                match res? {
-                    // Only send timeline logical size when it is fully calculated.
-                    (size, is_exact) if is_exact => Some(size),
-                    (_, _) => None,
-                }
-            };
+            let now = std::time::Instant::now();

-            Ok(Some(TimelineSnapshot {
-                loaded_at,
-                last_record_lsn,
-                current_exact_logical_size,
-            }))
-        }
-    }
-
-    /// Produce the timeline consumption metrics into the `metrics` argument.
-    fn to_metrics(
-        &self,
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
-        now: DateTime<Utc>,
-        metrics: &mut Vec<(MetricsKey, (EventType, u64))>,
-        cache: &HashMap<MetricsKey, (EventType, u64)>,
-    ) {
-        let timeline_written_size = u64::from(self.last_record_lsn);
-
-        let (key, written_size_now) =
-            MetricsKey::written_size(tenant_id, timeline_id).at(now, timeline_written_size);
-
-        // last_record_lsn can only go up, right now at least, TODO: #2592 or related
-        // features might change this.
-
-        let written_size_delta_key = MetricsKey::written_size_delta(tenant_id, timeline_id);
-
-        // use this when available, because in a stream of incremental values, it will be
-        // accurate where as when last_record_lsn stops moving, we will only cache the last
-        // one of those.
-        let last_stop_time = cache
-            .get(written_size_delta_key.key())
-            .map(|(until, _val)| {
-                until
-                    .incremental_timerange()
-                    .expect("never create EventType::Absolute for written_size_delta")
-                    .end
-            });
-
-        // by default, use the last sent written_size as the basis for
-        // calculating the delta. if we don't yet have one, use the load time value.
-        let prev = cache
-            .get(&key)
-            .map(|(prev_at, prev)| {
-                // use the prev time from our last incremental update, or default to latest
-                // absolute update on the first round.
-                let prev_at = prev_at
-                    .absolute_time()
-                    .expect("never create EventType::Incremental for written_size");
-                let prev_at = last_stop_time.unwrap_or(prev_at);
-                (*prev_at, *prev)
-            })
-            .unwrap_or_else(|| {
-                // if we don't have a previous point of comparison, compare to the load time
-                // lsn.
-                let (disk_consistent_lsn, loaded_at) = &self.loaded_at;
-                (DateTime::from(*loaded_at), disk_consistent_lsn.0)
-            });
-
-        // written_size_bytes_delta
-        metrics.extend(
-            if let Some(delta) = written_size_now.1.checked_sub(prev.1) {
-                let up_to = written_size_now
-                    .0
-                    .absolute_time()
-                    .expect("never create EventType::Incremental for written_size");
-                let key_value = written_size_delta_key.from_previous_up_to(prev.0, *up_to, delta);
-                Some(key_value)
+            // executor threads might be busy: measure how far off the deadline we actually woke up
+            Some(if now < deadline {
+                deadline - now
            } else {
-                None
-            },
-        );
-
-        // written_size
-        metrics.push((key, written_size_now));
-
-        if let Some(size) = self.current_exact_logical_size {
-            metrics.push(MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, size));
+                now - deadline
+            })
+        }
+        Ok(from_last_send) => Some(from_last_send.saturating_sub(metric_collection_interval)),
+        Err(_) => {
+            tracing::warn!(
+                ?now,
+                ?earlier_metric_at,
+                "oldest recorded metric is in future; first values will come out with inconsistent timestamps"
+            );
+            earlier_metric_at.duration_since(now).ok()
        }
    }
 }

 /// Caclculate synthetic size for each active tenant
-pub async fn calculate_synthetic_size_worker(
+async fn calculate_synthetic_size_worker(
    synthetic_size_calculation_interval: Duration,
    ctx: &RequestContext,
 ) -> anyhow::Result<()> {
    info!("starting calculate_synthetic_size_worker");

+    // reminder: ticker is ready immediately
    let mut ticker = tokio::time::interval(synthetic_size_calculation_interval);
+    let cause = LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize;

    loop {
-        tokio::select! {
-            _ = task_mgr::shutdown_watcher() => {
-                return Ok(());
-            },
-            tick_at = ticker.tick() => {
+        let tick_at = tokio::select! {
+            _ = task_mgr::shutdown_watcher() => return Ok(()),
+            tick_at = ticker.tick() => tick_at,
+        };

-                let tenants = match mgr::list_tenants().await {
-                    Ok(tenants) => tenants,
-                    Err(e) => {
-                        warn!("cannot get tenant list: {e:#}");
-                        continue;
-                    }
-                };
-                // iterate through list of Active tenants and collect metrics
-                for (tenant_id, tenant_state) in tenants {
+        let tenants = match mgr::list_tenants().await {
+            Ok(tenants) => tenants,
+            Err(e) => {
+                warn!("cannot get tenant list: {e:#}");
+                continue;
+            }
+        };

-                    if tenant_state != TenantState::Active {
-                        continue;
-                    }
-
-                    if let Ok(tenant) = mgr::get_tenant(tenant_id, true).await
-                    {
-                        if let Err(e) = tenant.calculate_synthetic_size(
-                            LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize,
-                            ctx).await {
-                            error!("failed to calculate synthetic size for tenant {}: {}", tenant_id, e);
-                        }
-                    }
+        for (tenant_id, tenant_state) in tenants {
+            if tenant_state != TenantState::Active {
+                continue;
+            }

+            if let Ok(tenant) = mgr::get_tenant(tenant_id, true).await {
+                if let Err(e) = tenant.calculate_synthetic_size(cause, ctx).await {
+                    error!("failed to calculate synthetic size for tenant {tenant_id}: {e:#}");
                }
-
-                crate::tenant::tasks::warn_when_period_overrun(
-                    tick_at.elapsed(),
-                    synthetic_size_calculation_interval,
-                    "consumption_metrics_synthetic_size_worker",
-                );
            }
        }
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use std::collections::HashMap;
-
-    use std::time::SystemTime;
-    use utils::{
-        id::{TenantId, TimelineId},
-        lsn::Lsn,
-    };
-
-    use crate::consumption_metrics::MetricsKey;
-
-    use super::TimelineSnapshot;
-    use chrono::{DateTime, Utc};
-
-    #[test]
-    fn startup_collected_timeline_metrics_before_advancing() {
-        let tenant_id = TenantId::generate();
-        let timeline_id = TimelineId::generate();
-
-        let mut metrics = Vec::new();
-        let cache = HashMap::new();
-
-        let initdb_lsn = Lsn(0x10000);
-        let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
-
-        let snap = TimelineSnapshot {
-            loaded_at: (disk_consistent_lsn, SystemTime::now()),
-            last_record_lsn: disk_consistent_lsn,
-            current_exact_logical_size: Some(0x42000),
-        };
-
-        let now = DateTime::<Utc>::from(SystemTime::now());
-
-        snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
-
-        assert_eq!(
-            metrics,
-            &[
-                MetricsKey::written_size_delta(tenant_id, timeline_id).from_previous_up_to(
-                    snap.loaded_at.1.into(),
-                    now,
-                    0
-                ),
-                MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
-                MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
-            ]
-        );
-    }
-
-    #[test]
-    fn startup_collected_timeline_metrics_second_round() {
-        let tenant_id = TenantId::generate();
-        let timeline_id = TimelineId::generate();
-
-        let [now, before, init] = time_backwards();
-
-        let now = DateTime::<Utc>::from(now);
-        let before = DateTime::<Utc>::from(before);
-
-        let initdb_lsn = Lsn(0x10000);
-        let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
-
-        let mut metrics = Vec::new();
-        let cache = HashMap::from([
-            MetricsKey::written_size(tenant_id, timeline_id).at(before, disk_consistent_lsn.0)
-        ]);
-
-        let snap = TimelineSnapshot {
-            loaded_at: (disk_consistent_lsn, init),
-            last_record_lsn: disk_consistent_lsn,
-            current_exact_logical_size: Some(0x42000),
-        };
-
-        snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
-
-        assert_eq!(
-            metrics,
-            &[
-                MetricsKey::written_size_delta(tenant_id, timeline_id)
-                    .from_previous_up_to(before, now, 0),
-                MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
-                MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
-            ]
-        );
-    }
-
-    #[test]
-    fn startup_collected_timeline_metrics_nth_round_at_same_lsn() {
-        let tenant_id = TenantId::generate();
-        let timeline_id = TimelineId::generate();
-
-        let [now, just_before, before, init] = time_backwards();
-
-        let now = DateTime::<Utc>::from(now);
-        let just_before = DateTime::<Utc>::from(just_before);
-        let before = DateTime::<Utc>::from(before);
-
-        let initdb_lsn = Lsn(0x10000);
-        let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
-
-        let mut metrics = Vec::new();
-        let cache = HashMap::from([
-            // at t=before was the last time the last_record_lsn changed
-            MetricsKey::written_size(tenant_id, timeline_id).at(before, disk_consistent_lsn.0),
-            // end time of this event is used for the next ones
-            MetricsKey::written_size_delta(tenant_id, timeline_id).from_previous_up_to(
-                before,
-                just_before,
-                0,
-            ),
-        ]);
-
-        let snap = TimelineSnapshot {
-            loaded_at: (disk_consistent_lsn, init),
-            last_record_lsn: disk_consistent_lsn,
-            current_exact_logical_size: Some(0x42000),
-        };
-
-        snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
-
-        assert_eq!(
-            metrics,
-            &[
-                MetricsKey::written_size_delta(tenant_id, timeline_id).from_previous_up_to(
-                    just_before,
-                    now,
-                    0
-                ),
-                MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
-                MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
-            ]
-        );
-    }
-
-    fn time_backwards<const N: usize>() -> [std::time::SystemTime; N] {
-        let mut times = [std::time::SystemTime::UNIX_EPOCH; N];
-        times[0] = std::time::SystemTime::now();
-        for behind in 1..N {
-            times[behind] = times[0] - std::time::Duration::from_secs(behind as u64);
-        }
-
-        times
+
+        crate::tenant::tasks::warn_when_period_overrun(
+            tick_at.elapsed(),
+            synthetic_size_calculation_interval,
+            "consumption_metrics_synthetic_size_worker",
+        );
    }
 }
--- a/pageserver/src/consumption_metrics/disk_cache.rs
+++ b/pageserver/src/consumption_metrics/disk_cache.rs
@@ -0,0 +1,117 @@
+use anyhow::Context;
+use std::path::PathBuf;
+use std::sync::Arc;
+
+use super::RawMetric;
+
+/// Reads the metrics persisted by an earlier flush from `path` on a blocking thread.
+pub(super) async fn read_metrics_from_disk(path: Arc<PathBuf>) -> anyhow::Result<Vec<RawMetric>> {
+    // errors carry no per-step context on purpose: the callsite logs them with the full path
+    let span = tracing::Span::current();
+    let joined = tokio::task::spawn_blocking(move || {
+        let _entered = span.entered();
+        // best-effort: a failed cleanup is only logged and never prevents the read below
+        if let Some(parent) = path.parent() {
+            if let Err(e) = scan_and_delete_with_same_prefix(&path) {
+                tracing::info!("failed to cleanup temporary files in {parent:?}: {e:#}");
+            }
+        }
+        let mut file = std::fs::File::open(&*path)?;
+        let reader = std::io::BufReader::new(&mut file);
+        let metrics: Vec<RawMetric> = serde_json::from_reader(reader)?;
+        anyhow::Ok(metrics)
+    })
+    .await;
+    joined.context("read metrics join error")?
+}
+
+/// Removes leftover files next to `path` whose name starts with `path`'s file name.
+///
+/// `path` itself is never removed. A failed removal is logged and the scan goes on, whereas
+/// errors from listing the directory or reading metadata end the scan and are returned.
+fn scan_and_delete_with_same_prefix(path: &std::path::Path) -> std::io::Result<()> {
+    use std::io::{Error, ErrorKind};
+    // the visible caller checks only `parent()`: report a bad path instead of panicking
+    let (Some(parent), Some(own_name)) = (path.parent(), path.file_name()) else {
+        return Err(Error::new(ErrorKind::InvalidInput, "path has no parent or file name"));
+    };
+    let prefix = own_name.to_string_lossy();
+
+    for entry in std::fs::read_dir(parent)? {
+        let entry = entry?;
+        if !entry.metadata()?.is_file() {
+            continue;
+        }
+        let file_name = entry.file_name();
+        if own_name == file_name {
+            // do not remove our actual file
+            continue;
+        }
+        let file_name = file_name.to_string_lossy();
+        if !file_name.starts_with(&*prefix) {
+            continue;
+        }
+        if let Err(e) = std::fs::remove_file(entry.path()) {
+            tracing::warn!("cleaning up old tempfile {file_name:?} failed: {e:#}");
+        } else {
+            tracing::info!("cleaned up old tempfile {file_name:?}");
+        }
+    }
+    Ok(())
+}
+
+/// Persists `current_metrics` as JSON at `path`, going through a tempfile in the same directory.
+///
+/// The tempfile (prefixed with `path`'s file name, suffixed with `.tmp`) is written and synced,
+/// persisted over `path`, and then the directory is synced as well.
+/// Fails if `path` has no parent or file name, or if any of the steps above fails.
+pub(super) async fn flush_metrics_to_disk(
+    current_metrics: &Arc<Vec<RawMetric>>,
+    path: &Arc<PathBuf>,
+) -> anyhow::Result<()> {
+    use std::io::Write;
+
+    anyhow::ensure!(path.parent().is_some(), "path must have parent: {path:?}");
+    anyhow::ensure!(
+        path.file_name().is_some(),
+        "path must have filename: {path:?}"
+    );
+
+    let span = tracing::Span::current();
+    tokio::task::spawn_blocking({
+        let current_metrics = current_metrics.clone();
+        let path = path.clone();
+        move || {
+            let _e = span.entered();
+            let parent = path.parent().expect("existence checked");
+            let file_name = path.file_name().expect("existence checked");
+            let mut tempfile = tempfile::Builder::new()
+                .prefix(file_name)
+                .suffix(".tmp")
+                .tempfile_in(parent)?;
+            tracing::debug!("using tempfile {:?}", tempfile.path());
+            // write out all of the raw metrics, to be read out later on restart as cached values
+            {
+                let mut writer = std::io::BufWriter::new(&mut tempfile);
+                serde_json::to_writer(&mut writer, &*current_metrics)
+                    .context("serialize metrics")?;
+                // keep the underlying io error as the cause instead of discarding it
+                writer.into_inner().map_err(|e| {
+                    anyhow::Error::new(e.into_error()).context("flushing metrics failed")
+                })?;
+            }
+            tempfile.flush()?;
+            tempfile.as_file().sync_all()?;
+
+            fail::fail_point!("before-persist-last-metrics-collected");
+
+            drop(tempfile.persist(&*path).map_err(|e| e.error)?);
+            // sync the directory as well, now that the persist above has changed its entries
+            std::fs::File::open(parent)?.sync_all()?;
+            anyhow::Ok(())
+        }
+    })
+    .await
+    .with_context(|| format!("write metrics to {path:?} join error"))
+    .and_then(|x| x.with_context(|| format!("write metrics to {path:?}")))
+}
--- a/pageserver/src/consumption_metrics/metrics.rs
+++ b/pageserver/src/consumption_metrics/metrics.rs
@@ -0,0 +1,455 @@
+use crate::context::RequestContext;
+use anyhow::Context;
+use chrono::{DateTime, Utc};
+use consumption_metrics::EventType;
+use futures::stream::StreamExt;
+use serde_with::serde_as;
+use std::{sync::Arc, time::SystemTime};
+use utils::{
+    id::{TenantId, TimelineId},
+    lsn::Lsn,
+};
+
+use super::{Cache, RawMetric};
+
+/// Name of the metric, used by `MetricsKey` factory methods and `deserialize_cached_events`
+/// instead of static str.
+// Do not rename any of these without first consulting with the data team and partner
+// management: the `serde(rename)` strings below are the serialized names.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)]
+pub(super) enum Name {
+    /// Timeline last_record_lsn as an absolute value, see `MetricsKey::written_size`
+    #[serde(rename = "written_size")]
+    WrittenSize,
+    /// Change of timeline last_record_lsn over a time range, see `MetricsKey::written_size_delta`
+    #[serde(rename = "written_data_bytes_delta")]
+    WrittenSizeDelta,
+    /// Timeline logical size, see `MetricsKey::timeline_logical_size`
+    #[serde(rename = "timeline_logical_size")]
+    LogicalSize,
+    /// Tenant remote size, see `MetricsKey::remote_storage_size`
+    #[serde(rename = "remote_storage_size")]
+    RemoteSize,
+    /// Tenant resident size, see `MetricsKey::resident_size`
+    #[serde(rename = "resident_size")]
+    ResidentSize,
+    /// Tenant synthetic size, see `MetricsKey::synthetic_size`
+    #[serde(rename = "synthetic_storage_size")]
+    SyntheticSize,
+}
+
+/// Key that uniquely identifies the object this metric describes.
+///
+/// This is a denormalization done at the MetricsKey const methods; these should not be constructed
+/// elsewhere.
+#[serde_with::serde_as]
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)]
+pub(crate) struct MetricsKey {
+    #[serde_as(as = "serde_with::DisplayFromStr")]
+    pub(super) tenant_id: TenantId,
+    // `None` for the tenant level metrics, in which case the field is left out when serializing
+    #[serde_as(as = "Option<serde_with::DisplayFromStr>")]
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub(super) timeline_id: Option<TimelineId>,
+    // which one of the metrics this key is for
+    pub(super) metric: Name,
+}
+
+impl MetricsKey { // private conversions, used by the per-metric constructors of `MetricsKey`
+    const fn absolute_values(self) -> AbsoluteValueFactory {
+        AbsoluteValueFactory(self) // values for this key are then made only with `at`
+    }
+    const fn incremental_values(self) -> IncrementalValueFactory {
+        IncrementalValueFactory(self) // values for this key are then made only with `from_until`
+    }
+}
+
+/// Helper type which each individual metric kind can return to produce only absolute values.
+struct AbsoluteValueFactory(MetricsKey);
+
+impl AbsoluteValueFactory {
+    const fn at(self, time: DateTime<Utc>, val: u64) -> RawMetric {
+        let key = self.0;
+        (key, (EventType::Absolute { time }, val)) // `val` as measured at `time`
+    }
+    /// The key which `at` pairs its values with, for example for looking up a cached value.
+    fn key(&self) -> &MetricsKey {
+        &self.0
+    }
+}
+
+/// Helper type which each individual metric kind can return to produce only incremental values.
+struct IncrementalValueFactory(MetricsKey);
+
+impl IncrementalValueFactory {
+    #[allow(clippy::wrong_self_convention)]
+    const fn from_until(
+        self,
+        prev_end: DateTime<Utc>, // becomes the `start_time` of the produced time range
+        up_to: DateTime<Utc>, // becomes the `stop_time` of the produced time range
+        val: u64,
+    ) -> RawMetric {
+        let key = self.0;
+        // cannot assert prev_end < up_to because these are realtime clock based
+        let when = EventType::Incremental {
+            start_time: prev_end,
+            stop_time: up_to,
+        };
+        (key, (when, val))
+    }
+    /// The key which `from_until` pairs its values with, for example for looking up a cached value.
+    fn key(&self) -> &MetricsKey {
+        &self.0
+    }
+}
+
+// the static part of a MetricsKey: one const constructor per `Name` variant
+impl MetricsKey {
+    /// Absolute value of [`Timeline::get_last_record_lsn`].
+    ///
+    /// [`Timeline::get_last_record_lsn`]: crate::tenant::Timeline::get_last_record_lsn
+    const fn written_size(tenant_id: TenantId, timeline_id: TimelineId) -> AbsoluteValueFactory {
+        MetricsKey {
+            tenant_id,
+            timeline_id: Some(timeline_id),
+            metric: Name::WrittenSize,
+        }
+        .absolute_values()
+    }
+
+    /// Values will be the difference of the latest [`MetricsKey::written_size`] to what we
+    /// previously sent. The time range starts where the previously sent incremental time range
+    /// ended, and ends at the latest absolute measurement.
+    const fn written_size_delta(
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+    ) -> IncrementalValueFactory {
+        MetricsKey {
+            tenant_id,
+            timeline_id: Some(timeline_id),
+            metric: Name::WrittenSizeDelta,
+        }
+        .incremental_values()
+    }
+
+    /// Exact [`Timeline::get_current_logical_size`].
+    ///
+    /// [`Timeline::get_current_logical_size`]: crate::tenant::Timeline::get_current_logical_size
+    const fn timeline_logical_size(
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+    ) -> AbsoluteValueFactory {
+        MetricsKey {
+            tenant_id,
+            timeline_id: Some(timeline_id),
+            metric: Name::LogicalSize,
+        }
+        .absolute_values()
+    }
+
+    /// [`Tenant::remote_size`]
+    ///
+    /// [`Tenant::remote_size`]: crate::tenant::Tenant::remote_size
+    const fn remote_storage_size(tenant_id: TenantId) -> AbsoluteValueFactory {
+        MetricsKey {
+            tenant_id,
+            timeline_id: None, // tenant level metric
+            metric: Name::RemoteSize,
+        }
+        .absolute_values()
+    }
+
+    /// Sum of [`Timeline::resident_physical_size`] for each `Tenant`.
+    ///
+    /// [`Timeline::resident_physical_size`]: crate::tenant::Timeline::resident_physical_size
+    const fn resident_size(tenant_id: TenantId) -> AbsoluteValueFactory {
+        MetricsKey {
+            tenant_id,
+            timeline_id: None, // tenant level metric
+            metric: Name::ResidentSize,
+        }
+        .absolute_values()
+    }
+
+    /// [`Tenant::cached_synthetic_size`] as refreshed by [`calculate_synthetic_size_worker`].
+    ///
+    /// [`Tenant::cached_synthetic_size`]: crate::tenant::Tenant::cached_synthetic_size
+    /// [`calculate_synthetic_size_worker`]: super::calculate_synthetic_size_worker
+    const fn synthetic_size(tenant_id: TenantId) -> AbsoluteValueFactory {
+        MetricsKey {
+            tenant_id,
+            timeline_id: None, // tenant level metric
+            metric: Name::SyntheticSize,
+        }
+        .absolute_values()
+    }
+}
+
+/// Collects the current metrics of every `Active` tenant, passing `cached_metrics` to `collect`.
+///
+/// Returns an empty `Vec` if the tenants cannot be listed.
+pub(super) async fn collect_all_metrics(
+    cached_metrics: &Cache,
+    ctx: &RequestContext,
+) -> Vec<RawMetric> {
+    use crate::tenant::mgr;
+    use pageserver_api::models::TenantState;
+
+    let started_at = std::time::Instant::now();
+    let listed = match mgr::list_tenants().await {
+        Ok(listed) => listed,
+        Err(err) => {
+            tracing::error!("failed to list tenants: {:?}", err);
+            return Vec::new();
+        }
+    };
+
+    // tenants which are not active, or which can no longer be looked up, are left out
+    let active = futures::stream::iter(listed).filter_map(|(id, state)| async move {
+        if state == TenantState::Active {
+            let tenant = mgr::get_tenant(id, true).await.ok()?;
+            Some((id, tenant))
+        } else {
+            None
+        }
+    });
+
+    let collected = collect(active, cached_metrics, ctx).await;
+    tracing::info!(
+        elapsed_ms = started_at.elapsed().as_millis(),
+        total = collected.len(),
+        "collected metrics"
+    );
+    collected
+}
+
+/// Produces the timeline and tenant level metrics for each tenant yielded by `tenants`.
+///
+/// A timeline whose snapshot cannot be collected is logged and left out, which also leaves it
+/// out of the tenant's resident size sum.
+async fn collect<S>(tenants: S, cache: &Cache, ctx: &RequestContext) -> Vec<RawMetric>
+where
+    S: futures::stream::Stream<Item = (TenantId, Arc<crate::tenant::Tenant>)>,
+{
+    let mut out: Vec<RawMetric> = Vec::new();
+    let mut tenants = std::pin::pin!(tenants);
+
+    while let Some((tenant_id, tenant)) = tenants.next().await {
+        let mut resident_sum = 0;
+
+        for timeline in tenant.list_timelines() {
+            let snapshot = match TimelineSnapshot::collect(&timeline, ctx) {
+                Ok(maybe_snapshot) => maybe_snapshot,
+                Err(e) => {
+                    tracing::error!(
+                        "failed to get metrics values for tenant {tenant_id} timeline {}: {e:#?}",
+                        timeline.timeline_id
+                    );
+                    continue;
+                }
+            };
+
+            // timelines which are not active yield no snapshot and thus no timeline metrics,
+            // but they still count towards the resident size below
+            if let Some(snapshot) = snapshot {
+                let now = Utc::now();
+                snapshot.to_metrics(tenant_id, timeline.timeline_id, now, &mut out, cache);
+            }
+
+            resident_sum += timeline.resident_physical_size();
+        }
+
+        // the tenant level metrics come after all of the tenant's timeline metrics
+        let snapshot = TenantSnapshot::collect(&tenant, resident_sum);
+        snapshot.to_metrics(tenant_id, Utc::now(), cache, &mut out);
+    }
+
+    out
+}
+
+/// Plain values of a tenant from which its metrics are made; allows testing the metrics
+/// without an actual `Tenant`.
+struct TenantSnapshot {
+    /// Sum handed in by the caller of [`TenantSnapshot::collect`]
+    resident_size: u64,
+    /// Value of `Tenant::remote_size` at collection time
+    remote_size: u64,
+    /// Value of `Tenant::cached_synthetic_size` at collection time
+    synthetic_size: u64,
+}
+
+impl TenantSnapshot {
+    /// Captures the values of `t` needed for its metrics.
+    ///
+    /// `resident_size` comes from the caller, which sums it over the timelines it had access to
+    /// while producing the other metrics; listing the timelines again here would not do.
+    fn collect(t: &Arc<crate::tenant::Tenant>, resident_size: u64) -> Self {
+        let remote_size = t.remote_size();
+        // The synthetic size is calculated by a separate background worker; only the cached
+        // value is read here, and it may lag behind the real latest one.
+        let synthetic_size = t.cached_synthetic_size();
+
+        Self {
+            resident_size,
+            remote_size,
+            synthetic_size,
+        }
+    }
+
+    /// Appends the remote, resident and (when non-zero) synthetic size metrics to `metrics`.
+    fn to_metrics(
+        &self,
+        tenant_id: TenantId,
+        now: DateTime<Utc>,
+        cached: &Cache,
+        metrics: &mut Vec<RawMetric>,
+    ) {
+        metrics.push(MetricsKey::remote_storage_size(tenant_id).at(now, self.remote_size));
+        metrics.push(MetricsKey::resident_size(tenant_id).at(now, self.resident_size));
+
+        let factory = MetricsKey::synthetic_size(tenant_id);
+        let synthetic_size = if self.synthetic_size != 0 {
+            self.synthetic_size
+        } else {
+            // nothing calculated in this session: use the latest value from previous session,
+            // or stay at zero when nothing was cached either
+            cached
+                .get(factory.key())
+                .map(|(_, value)| *value)
+                .unwrap_or(0)
+        };
+
+        // only send non-zeroes because otherwise these show up as errors in logs
+        if synthetic_size != 0 {
+            let synthetic_size = factory.at(now, synthetic_size);
+            metrics.push(synthetic_size);
+        }
+    }
+}
+
+/// Internal type to make timeline metric production testable.
+///
+/// As this value type contains all of the information needed from a timeline to produce the
+/// metrics, it can easily be created with different values in test.
+struct TimelineSnapshot {
+    loaded_at: (Lsn, SystemTime), // (lsn, time) at load: basis of the delta when nothing is cached
+    last_record_lsn: Lsn, // reported as the written_size
+    current_exact_logical_size: Option<u64>, // `None` unless the logical size is fully calculated
+}
+
+impl TimelineSnapshot {
+    /// Collect the metrics from an actual timeline.
+    ///
+    /// Fails currently only when [`Timeline::get_current_logical_size`] fails.
+    ///
+    /// [`Timeline::get_current_logical_size`]: crate::tenant::Timeline::get_current_logical_size
+    fn collect(
+        t: &Arc<crate::tenant::Timeline>,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<Option<Self>> {
+        if !t.is_active() {
+            // no collection for broken or stopping needed, we will still keep the cached values
+            // though at the caller.
+            Ok(None)
+        } else {
+            let loaded_at = t.loaded_at;
+            let last_record_lsn = t.get_last_record_lsn();
+
+            let current_exact_logical_size = {
+                let span = tracing::info_span!("collect_metrics_iteration", tenant_id = %t.tenant_id, timeline_id = %t.timeline_id);
+                let res = span
+                    .in_scope(|| t.get_current_logical_size(ctx))
+                    .context("get_current_logical_size");
+                match res? {
+                    // Only send timeline logical size when it is fully calculated.
+                    (size, is_exact) if is_exact => Some(size),
+                    (_, _) => None,
+                }
+            };
+
+            Ok(Some(TimelineSnapshot {
+                loaded_at,
+                last_record_lsn,
+                current_exact_logical_size,
+            }))
+        }
+    }
+
+    /// Produce the timeline consumption metrics into the `metrics` argument.
+    fn to_metrics(
+        &self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        now: DateTime<Utc>,
+        metrics: &mut Vec<RawMetric>,
+        cache: &Cache,
+    ) {
+        let timeline_written_size = u64::from(self.last_record_lsn);
+
+        let written_size_delta_key = MetricsKey::written_size_delta(tenant_id, timeline_id);
+        // end of the previously produced delta's time range, if cached: preferred start of the next
+        let last_stop_time = cache
+            .get(written_size_delta_key.key())
+            .map(|(until, _val)| {
+                until
+                    .incremental_timerange()
+                    .expect("never create EventType::Absolute for written_size_delta")
+                    .end
+            });
+
+        let (key, written_size_now) =
+            MetricsKey::written_size(tenant_id, timeline_id).at(now, timeline_written_size);
+
+        // by default, use the last sent written_size as the basis for
+        // calculating the delta. if we don't yet have one, use the load time value.
+        let prev = cache
+            .get(&key)
+            .map(|(prev_at, prev)| {
+                // use the prev time from our last incremental update, or default to latest
+                // absolute update on the first round.
+                let prev_at = prev_at
+                    .absolute_time()
+                    .expect("never create EventType::Incremental for written_size");
+                let prev_at = last_stop_time.unwrap_or(prev_at);
+                (*prev_at, *prev)
+            })
+            .unwrap_or_else(|| {
+                // if we don't have a previous point of comparison, compare to the load time
+                // lsn.
+                let (disk_consistent_lsn, loaded_at) = &self.loaded_at;
+                (DateTime::from(*loaded_at), disk_consistent_lsn.0)
+            });
+        // the delta's time range ends at the same time as the absolute measurement is taken
+        let up_to = now;
+
+        if let Some(delta) = written_size_now.1.checked_sub(prev.1) {
+            let key_value = written_size_delta_key.from_until(prev.0, up_to, delta);
+            // written_size_delta
+            metrics.push(key_value);
+            // written_size
+            metrics.push((key, written_size_now));
+        } else {
+            // the cached value was ahead of us, report zero until we've caught up
+            metrics.push(written_size_delta_key.from_until(prev.0, up_to, 0));
+            // the cached value was ahead of us, report the same until we've caught up
+            metrics.push((key, (written_size_now.0, prev.1)));
+        }
+        // logical size: fall back to the cached value when there is no exact current one
+        {
+            let factory = MetricsKey::timeline_logical_size(tenant_id, timeline_id);
+            let current_or_previous = self
+                .current_exact_logical_size
+                .or_else(|| cache.get(factory.key()).map(|(_, val)| *val));
+
+            if let Some(size) = current_or_previous {
+                metrics.push(factory.at(now, size));
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests;
+
+#[cfg(test)]
+pub(crate) use tests::metric_examples;
--- a/pageserver/src/consumption_metrics/metrics/tests.rs
+++ b/pageserver/src/consumption_metrics/metrics/tests.rs
@@ -0,0 +1,297 @@
+use super::*;
+use std::collections::HashMap;
+use std::time::SystemTime;
+use utils::lsn::Lsn;
+
+#[test]
+fn startup_collected_timeline_metrics_before_advancing() {
+    // Very first collection round after startup: the cache is empty and last_record_lsn still
+    // equals the lsn the timeline was loaded at, so the delta must be zero and start at the
+    // load time.
+    let tenant_id = TenantId::generate();
+    let timeline_id = TimelineId::generate();
+
+    let initdb_lsn = Lsn(0x10000);
+    let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
+
+    let snap = TimelineSnapshot {
+        loaded_at: (disk_consistent_lsn, SystemTime::now()),
+        last_record_lsn: disk_consistent_lsn,
+        current_exact_logical_size: Some(0x42000),
+    };
+
+    let now = DateTime::<Utc>::from(SystemTime::now());
+
+    let cache = HashMap::new();
+    let mut metrics = Vec::new();
+    snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
+
+    let loaded_at = DateTime::<Utc>::from(snap.loaded_at.1);
+    let expected = [
+        MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(loaded_at, now, 0),
+        MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
+        MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000),
+    ];
+
+    assert_eq!(metrics, &expected);
+}
+
+#[test]
+fn startup_collected_timeline_metrics_second_round() {
+    // Second collection round: only the absolute written_size of the first round is cached,
+    // so the delta is expected to start from that cached timestamp.
+    let tenant_id = TenantId::generate();
+    let timeline_id = TimelineId::generate();
+
+    let [now, before, init] = time_backwards();
+    let (now, before) = (DateTime::<Utc>::from(now), DateTime::<Utc>::from(before));
+
+    let initdb_lsn = Lsn(0x10000);
+    let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
+
+    let previous_written_size =
+        MetricsKey::written_size(tenant_id, timeline_id).at(before, disk_consistent_lsn.0);
+    let cache = HashMap::from([previous_written_size]);
+
+    let snap = TimelineSnapshot {
+        loaded_at: (disk_consistent_lsn, init),
+        last_record_lsn: disk_consistent_lsn,
+        current_exact_logical_size: Some(0x42000),
+    };
+
+    let mut metrics = Vec::new();
+    snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
+
+    let expected = [
+        MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(before, now, 0),
+        MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
+        MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000),
+    ];
+
+    assert_eq!(metrics, &expected);
+}
+
+#[test]
+fn startup_collected_timeline_metrics_nth_round_at_same_lsn() {
+    // Later round without any lsn movement: the previous delta's end time, not the time of
+    // the cached absolute value, has to become the start of the new delta.
+    let tenant_id = TenantId::generate();
+    let timeline_id = TimelineId::generate();
+
+    let [now, just_before, before, init] = time_backwards();
+    let now = DateTime::<Utc>::from(now);
+    let just_before = DateTime::<Utc>::from(just_before);
+    let before = DateTime::<Utc>::from(before);
+
+    let initdb_lsn = Lsn(0x10000);
+    let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
+
+    // at t=before was the last time the last_record_lsn changed
+    let cached_written_size =
+        MetricsKey::written_size(tenant_id, timeline_id).at(before, disk_consistent_lsn.0);
+    // end time of this event is used for the next ones
+    let cached_delta =
+        MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(before, just_before, 0);
+    let cache = HashMap::from([cached_written_size, cached_delta]);
+
+    let snap = TimelineSnapshot {
+        loaded_at: (disk_consistent_lsn, init),
+        last_record_lsn: disk_consistent_lsn,
+        current_exact_logical_size: Some(0x42000),
+    };
+
+    let mut metrics = Vec::new();
+    snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
+
+    let expected = [
+        MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(just_before, now, 0),
+        MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
+        MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000),
+    ];
+
+    assert_eq!(metrics, &expected);
+}
+
+#[test]
+fn post_restart_written_sizes_with_rolled_back_last_record_lsn() {
+    // it can happen that we lose the inmemorylayer but have previously sent metrics and we
+    // should never go backwards
+
+    let tenant_id = TenantId::generate();
+    let timeline_id = TimelineId::generate();
+
+    let [later, now, at_restart] = time_backwards();
+
+    // FIXME: tests would be so much easier if we did not need to juggle back and forth
+    // SystemTime and DateTime::<Utc> ... Could do the conversion only at upload time?
+    let before_restart = at_restart - std::time::Duration::from_secs(5 * 60);
+    let way_before = before_restart - std::time::Duration::from_secs(10 * 60);
+
+    let later = DateTime::<Utc>::from(later);
+    let now = DateTime::<Utc>::from(now);
+    let before_restart = DateTime::<Utc>::from(before_restart);
+    let way_before = DateTime::<Utc>::from(way_before);
+
+    // the snapshot is behind the cached written_size of 100
+    let snap = TimelineSnapshot {
+        loaded_at: (Lsn(50), at_restart),
+        last_record_lsn: Lsn(50),
+        current_exact_logical_size: None,
+    };
+
+    let cached_written_size =
+        MetricsKey::written_size(tenant_id, timeline_id).at(before_restart, 100);
+    let cached_delta = MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(
+        way_before,
+        before_restart,
+        // not taken into account, but the timestamps are important
+        999_999_999,
+    );
+    let mut cache = HashMap::from([cached_written_size, cached_delta]);
+
+    let mut metrics = Vec::new();
+    snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
+
+    let first_round = [
+        MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(before_restart, now, 0),
+        MetricsKey::written_size(tenant_id, timeline_id).at(now, 100),
+    ];
+    assert_eq!(metrics, &first_round);
+
+    // now if we cache these metrics, and re-run while "still in recovery"
+    cache.extend(metrics.drain(..));
+
+    // "still in recovery", because our snapshot did not change
+    snap.to_metrics(tenant_id, timeline_id, later, &mut metrics, &cache);
+
+    let second_round = [
+        MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(now, later, 0),
+        MetricsKey::written_size(tenant_id, timeline_id).at(later, 100),
+    ];
+    assert_eq!(metrics, &second_round);
+}
+
+#[test]
+fn post_restart_current_exact_logical_size_uses_cached() {
+    // Without an exact logical size in the snapshot, the cached value has to be re-sent with
+    // the current timestamp.
+    let tenant_id = TenantId::generate();
+    let timeline_id = TimelineId::generate();
+
+    let [now, at_restart] = time_backwards();
+
+    let before_restart =
+        DateTime::<Utc>::from(at_restart - std::time::Duration::from_secs(5 * 60));
+    let now = DateTime::<Utc>::from(now);
+
+    let snap = TimelineSnapshot {
+        loaded_at: (Lsn(50), at_restart),
+        last_record_lsn: Lsn(50),
+        current_exact_logical_size: None,
+    };
+
+    let cached_logical_size =
+        MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(before_restart, 100);
+    let cache = HashMap::from([cached_logical_size]);
+
+    let mut metrics = Vec::new();
+    snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
+
+    // only the logical size is of interest here
+    metrics.retain(|(key, _)| key.metric == Name::LogicalSize);
+
+    let expected = [MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 100)];
+    assert_eq!(metrics, &expected);
+}
+
+#[test]
+fn post_restart_synthetic_size_uses_cached_if_available() {
+    // A synthetic size of zero in the snapshot together with a cached value must result in
+    // the cached value being sent with the current timestamp.
+    let tenant_id = TenantId::generate();
+
+    let snapshot = TenantSnapshot {
+        resident_size: 1000,
+        remote_size: 1000,
+        // not yet calculated
+        synthetic_size: 0,
+    };
+
+    let system_now = SystemTime::now();
+    let before_restart =
+        DateTime::<Utc>::from(system_now - std::time::Duration::from_secs(5 * 60));
+    let now = DateTime::<Utc>::from(system_now);
+
+    let cached_synthetic_size = MetricsKey::synthetic_size(tenant_id).at(before_restart, 1000);
+    let cached = HashMap::from([cached_synthetic_size]);
+
+    let mut metrics = Vec::new();
+    snapshot.to_metrics(tenant_id, now, &cached, &mut metrics);
+
+    let expected = [
+        MetricsKey::remote_storage_size(tenant_id).at(now, 1000),
+        MetricsKey::resident_size(tenant_id).at(now, 1000),
+        MetricsKey::synthetic_size(tenant_id).at(now, 1000),
+    ];
+    assert_eq!(metrics, &expected);
+}
+
+#[test]
+fn post_restart_synthetic_size_is_not_sent_when_not_cached() {
+    // A synthetic size of zero in the snapshot and nothing cached: no synthetic size metric
+    // is expected at all.
+    let tenant_id = TenantId::generate();
+
+    let snapshot = TenantSnapshot {
+        resident_size: 1000,
+        remote_size: 1000,
+        // not yet calculated
+        synthetic_size: 0,
+    };
+
+    let now = DateTime::<Utc>::from(SystemTime::now());
+
+    let cached = HashMap::new();
+
+    let mut metrics = Vec::new();
+    snapshot.to_metrics(tenant_id, now, &cached, &mut metrics);
+
+    let expected = [
+        MetricsKey::remote_storage_size(tenant_id).at(now, 1000),
+        MetricsKey::resident_size(tenant_id).at(now, 1000),
+        // no synthetic size here
+    ];
+    assert_eq!(metrics, &expected);
+}
+
+/// Returns `N` timestamps going backwards in time in one second steps.
+///
+/// Element 0 is the current system time and element `i` is `i` seconds before element 0.
+/// Works for any `N`, including zero, in which case an empty array is returned.
+fn time_backwards<const N: usize>() -> [std::time::SystemTime; N] {
+    let newest = std::time::SystemTime::now();
+
+    // index 0 subtracts a zero duration, which leaves `newest` as is
+    std::array::from_fn(|behind| newest - std::time::Duration::from_secs(behind as u64))
+}
+
+/// Builds one example of each metric kind constructed through `MetricsKey`.
+///
+/// Three of the examples are per-timeline (`written_size`, `written_size_delta`,
+/// `timeline_logical_size`) and three are per-tenant (`remote_storage_size`, `resident_size`,
+/// `synthetic_size`). All absolute values are stamped with `now`; the single delta spans from
+/// `before` until `now`.
+///
+/// The order of the returned array matters: the upload tests zip it with their list of
+/// expected serialized events.
+pub(crate) const fn metric_examples(
+    tenant_id: TenantId,
+    timeline_id: TimelineId,
+    now: DateTime<Utc>,
+    before: DateTime<Utc>,
+) -> [RawMetric; 6] {
+    [
+        MetricsKey::written_size(tenant_id, timeline_id).at(now, 0),
+        MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(before, now, 0),
+        MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0),
+        MetricsKey::remote_storage_size(tenant_id).at(now, 0),
+        MetricsKey::resident_size(tenant_id).at(now, 0),
+        // the only example with a non-zero value
+        MetricsKey::synthetic_size(tenant_id).at(now, 1),
+    ]
+}
--- a/pageserver/src/consumption_metrics/upload.rs
+++ b/pageserver/src/consumption_metrics/upload.rs
@@ -0,0 +1,443 @@
+use consumption_metrics::{Event, EventChunk, IdempotencyKey, CHUNK_SIZE};
+use serde_with::serde_as;
+use tokio_util::sync::CancellationToken;
+use tracing::Instrument;
+
+use super::{metrics::Name, Cache, MetricsKey, RawMetric};
+use utils::id::{TenantId, TimelineId};
+
+/// How the metrics from pageserver are identified.
+///
+/// This is what gets stored as the `extra` part of every `Event` created from a `RawMetric`.
+#[serde_with::serde_as]
+#[derive(serde::Serialize, serde::Deserialize, Debug, Clone, Copy, PartialEq)]
+struct Ids {
+    /// Tenant the metric belongs to; (de)serialized through its `Display`/`FromStr` form.
+    #[serde_as(as = "serde_with::DisplayFromStr")]
+    pub(super) tenant_id: TenantId,
+    /// Timeline the metric belongs to, if any. Left out of the serialized form entirely
+    /// when `None`.
+    #[serde_as(as = "Option<serde_with::DisplayFromStr>")]
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub(super) timeline_id: Option<TimelineId>,
+}
+
+/// Serializes `metrics` in chunks of `CHUNK_SIZE` and uploads the chunks one by one to
+/// `metric_collection_endpoint`.
+///
+/// Metrics of a successfully uploaded chunk are recorded into `cached_metrics`; metrics of a
+/// failed chunk are only counted. A failed chunk does not stop the remaining chunks from being
+/// attempted. The only error returned is a serialization failure.
+#[tracing::instrument(skip_all, fields(metrics_total = %metrics.len()))]
+pub(super) async fn upload_metrics(
+    client: &reqwest::Client,
+    metric_collection_endpoint: &reqwest::Url,
+    cancel: &CancellationToken,
+    node_id: &str,
+    metrics: &[RawMetric],
+    cached_metrics: &mut Cache,
+) -> anyhow::Result<()> {
+    let mut uploaded = 0;
+    let mut failed = 0;
+
+    let started_at = std::time::Instant::now();
+
+    let mut chunks = serialize_in_chunks(CHUNK_SIZE, metrics, node_id);
+
+    // not a `for` loop, because the remaining length is queried while iterating
+    while let Some(serialized) = chunks.next() {
+        let (chunk, body) = serialized?;
+
+        let event_bytes = body.len();
+        let is_last = chunks.len() == 0;
+
+        let span = tracing::info_span!(
+            "upload",
+            %event_bytes,
+            uploaded,
+            total = metrics.len(),
+        );
+
+        let outcome = upload(client, metric_collection_endpoint, body, cancel, is_last)
+            .instrument(span)
+            .await;
+
+        if outcome.is_ok() {
+            for (curr_key, curr_val) in chunk {
+                cached_metrics.insert(*curr_key, *curr_val);
+            }
+            uploaded += chunk.len();
+        } else {
+            // failure(s) have already been logged
+            //
+            // however this is an inconsistency: if we crash here, we will start with the
+            // values as uploaded. in practice, the rejections no longer happen.
+            failed += chunk.len();
+        }
+    }
+
+    let elapsed = started_at.elapsed();
+
+    tracing::info!(
+        uploaded,
+        failed,
+        elapsed_ms = elapsed.as_millis(),
+        "done sending metrics"
+    );
+
+    Ok(())
+}
+
+/// Splits `input` into chunks of at most `chunk_size` metrics and lazily serializes each
+/// chunk into a JSON `EventChunk` document.
+///
+/// Every yielded item pairs the slice of raw metrics with the serialized body produced from
+/// it, so the caller knows which metrics a body covers. `factory` is asked for a fresh
+/// idempotency key for every single event.
+///
+/// If serializing a chunk fails, the error is yielded and anything already written for that
+/// chunk is discarded, so a later chunk never starts with leftovers of a failed one.
+///
+/// The return type is quite ugly, but we gain testability in isolation
+fn serialize_in_chunks<'a, F>(
+    chunk_size: usize,
+    input: &'a [RawMetric],
+    factory: F,
+) -> impl ExactSizeIterator<Item = Result<(&'a [RawMetric], bytes::Bytes), serde_json::Error>> + 'a
+where
+    F: KeyGen<'a> + 'a,
+{
+    use bytes::BufMut;
+
+    struct Iter<'a, F> {
+        inner: std::slice::Chunks<'a, RawMetric>,
+        chunk_size: usize,
+
+        // write to a BytesMut so that we can cheaply clone the frozen Bytes for retries
+        buffer: bytes::BytesMut,
+        // chunk amount of events are reused to produce the serialized document
+        scratch: Vec<Event<Ids, Name>>,
+        factory: F,
+    }
+
+    impl<'a, F: KeyGen<'a>> Iterator for Iter<'a, F> {
+        type Item = Result<(&'a [RawMetric], bytes::Bytes), serde_json::Error>;
+
+        fn next(&mut self) -> Option<Self::Item> {
+            let chunk = self.inner.next()?;
+
+            if self.scratch.is_empty() {
+                // first round: create events with N strings
+                self.scratch.extend(
+                    chunk
+                        .iter()
+                        .map(|raw_metric| raw_metric.as_event(&self.factory.generate())),
+                );
+            } else {
+                // next rounds: update_in_place to reuse allocations
+                assert_eq!(self.scratch.len(), self.chunk_size);
+                self.scratch
+                    .iter_mut()
+                    .zip(chunk.iter())
+                    .for_each(|(slot, raw_metric)| {
+                        raw_metric.update_in_place(slot, &self.factory.generate())
+                    });
+            }
+
+            // the last chunk can be shorter than the scratch space, serialize only the
+            // events which were refreshed for this chunk
+            let res = serde_json::to_writer(
+                (&mut self.buffer).writer(),
+                &EventChunk {
+                    events: (&self.scratch[..chunk.len()]).into(),
+                },
+            );
+
+            match res {
+                Ok(()) => Some(Ok((chunk, self.buffer.split().freeze()))),
+                Err(e) => {
+                    // drop the partially written document, otherwise it would become the
+                    // prefix of the next chunk's body
+                    self.buffer.clear();
+                    Some(Err(e))
+                }
+            }
+        }
+
+        fn size_hint(&self) -> (usize, Option<usize>) {
+            self.inner.size_hint()
+        }
+    }
+
+    impl<'a, F: KeyGen<'a>> ExactSizeIterator for Iter<'a, F> {}
+
+    let buffer = bytes::BytesMut::new();
+    let inner = input.chunks(chunk_size);
+    let scratch = Vec::new();
+
+    Iter {
+        inner,
+        chunk_size,
+        buffer,
+        scratch,
+        factory,
+    }
+}
+
+/// Conversions from a `RawMetric` into the `Event` which gets serialized for upload.
+trait RawMetricExt {
+    /// Creates a new event out of this metric, using `key` as its idempotency key.
+    fn as_event(&self, key: &IdempotencyKey<'_>) -> Event<Ids, Name>;
+    /// Overwrites `event` with this metric and `key`, as an alternative to creating a new
+    /// event with [`RawMetricExt::as_event`].
+    fn update_in_place(&self, event: &mut Event<Ids, Name>, key: &IdempotencyKey<'_>);
+}
+
+impl RawMetricExt for RawMetric {
+    /// Builds a fresh event, allocating a new string for the idempotency key.
+    fn as_event(&self, key: &IdempotencyKey<'_>) -> Event<Ids, Name> {
+        let (
+            MetricsKey {
+                metric,
+                tenant_id,
+                timeline_id,
+            },
+            (kind, value),
+        ) = *self;
+
+        Event {
+            kind,
+            metric,
+            idempotency_key: key.to_string(),
+            value,
+            extra: Ids {
+                tenant_id,
+                timeline_id,
+            },
+        }
+    }
+
+    /// Replaces all of `event` with this metric while reusing the string already held by
+    /// `event.idempotency_key` for the new key.
+    fn update_in_place(&self, event: &mut Event<Ids, Name>, key: &IdempotencyKey<'_>) {
+        use std::fmt::Write;
+
+        let (
+            MetricsKey {
+                metric,
+                tenant_id,
+                timeline_id,
+            },
+            (kind, value),
+        ) = *self;
+
+        // take the old string out of the event and format the new key into it
+        let mut idempotency_key = std::mem::take(&mut event.idempotency_key);
+        idempotency_key.clear();
+        write!(idempotency_key, "{key}").unwrap();
+
+        *event = Event {
+            kind,
+            metric,
+            idempotency_key,
+            value,
+            extra: Ids {
+                tenant_id,
+                timeline_id,
+            },
+        };
+    }
+}
+
+/// Source of idempotency keys for the events being serialized.
+///
+/// `serialize_in_chunks` is generic over this trait, which lets the tests plug in a generator
+/// with fixed output instead of the `&str` based one.
+trait KeyGen<'a>: Copy {
+    /// Produces the idempotency key for the next event.
+    fn generate(&self) -> IdempotencyKey<'a>;
+}
+
+/// Key generation from a plain string; `upload_metrics` passes its `node_id` this way.
+impl<'a> KeyGen<'a> for &'a str {
+    fn generate(&self) -> IdempotencyKey<'a> {
+        // delegates to `IdempotencyKey::generate` with the string itself as the argument
+        IdempotencyKey::generate(self)
+    }
+}
+
+/// Ways in which uploading a single chunk can fail.
+enum UploadError {
+    /// The server answered with a client error (4xx) status; treated as permanent, see
+    /// [`UploadError::is_reject`].
+    Rejected(reqwest::StatusCode),
+    /// Any other failure reported by `reqwest`, including non-4xx error statuses.
+    Reqwest(reqwest::Error),
+    /// Produced through the cancellation token handed to the retry loop.
+    Cancelled,
+}
+
+/// `Debug` intentionally produces the same text as `Display`.
+impl std::fmt::Debug for UploadError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        // use same impl because backoff::retry will log this using both
+        std::fmt::Display::fmt(self, f)
+    }
+}
+
+/// Human readable, single line description of the failure.
+impl std::fmt::Display for UploadError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            Self::Rejected(code) => write!(f, "server rejected the metrics with {code}"),
+            Self::Reqwest(e) => write!(f, "request failed: {e}"),
+            Self::Cancelled => write!(f, "cancelled"),
+        }
+    }
+}
+
+impl UploadError {
+    /// True only for [`UploadError::Rejected`]. Passed to `backoff::retry` as the check for
+    /// permanent errors.
+    fn is_reject(&self) -> bool {
+        match self {
+            Self::Rejected(_) => true,
+            Self::Reqwest(_) | Self::Cancelled => false,
+        }
+    }
+}
+
+// Name of the request header telling whether a request carries the last chunk of a batch;
+// `upload` sets it to "true" or "false" on every request.
+//
+// this is consumed by the test verifiers
+static LAST_IN_BATCH: reqwest::header::HeaderName =
+    reqwest::header::HeaderName::from_static("pageserver-metrics-last-upload-in-batch");
+
+/// POSTs one serialized chunk to `metric_collection_endpoint`, retrying through
+/// `utils::backoff::retry`.
+///
+/// A client error (4xx) response is turned into `UploadError::Rejected`, which the retry loop
+/// is told to treat as permanent; every other failure becomes `UploadError::Reqwest`.
+/// `is_last` is only used for the value of the `LAST_IN_BATCH` header.
+async fn upload(
+    client: &reqwest::Client,
+    metric_collection_endpoint: &reqwest::Url,
+    body: bytes::Bytes,
+    cancel: &CancellationToken,
+    is_last: bool,
+) -> Result<(), UploadError> {
+    let warn_after = 3;
+    let max_attempts = 10;
+
+    let res = utils::backoff::retry(
+        move || {
+            // every attempt needs its own handle to the same bytes
+            let body = body.clone();
+            async move {
+                let sent = client
+                    .post(metric_collection_endpoint.clone())
+                    .header(reqwest::header::CONTENT_TYPE, "application/json")
+                    .header(
+                        LAST_IN_BATCH.clone(),
+                        if is_last { "true" } else { "false" },
+                    )
+                    .body(body)
+                    .send()
+                    .await;
+
+                // 10 redirects are normally allowed, so we don't need worry about 3xx
+                match sent.and_then(|response| response.error_for_status()) {
+                    Ok(_response) => Ok(()),
+                    Err(e) => match e.status() {
+                        // rejection used to be a thing when the server could reject a
+                        // whole batch of metrics if one metric was bad.
+                        Some(status) if status.is_client_error() => {
+                            Err(UploadError::Rejected(status))
+                        }
+                        _ => Err(UploadError::Reqwest(e)),
+                    },
+                }
+            }
+        },
+        UploadError::is_reject,
+        warn_after,
+        max_attempts,
+        "upload consumption_metrics",
+        utils::backoff::Cancel::new(cancel.clone(), || UploadError::Cancelled),
+    )
+    .await;
+
+    // all other outcomes are either fine or have been logged already
+    if let Err(e) = &res {
+        if e.is_reject() {
+            // permanent errors currently do not get logged by backoff::retry
+            // display alternate has no effect, but keeping it here for easier pattern matching.
+            tracing::error!("failed to upload metrics: {e:#}");
+        }
+    }
+
+    res
+}
+
+// Tests for the chunked serialization and for the stability of the serialized events.
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use chrono::{DateTime, Utc};
+    use once_cell::sync::Lazy;
+
+    /// Serializing in several smaller chunks must yield the same events as serializing
+    /// everything in a single chunk.
+    #[test]
+    fn chunked_serialization() {
+        let examples = metric_samples();
+        assert!(examples.len() > 1);
+
+        let factory = FixedGen::new(Utc::now(), "1", 42);
+
+        // need to use Event here because serde_json::Value uses default hashmap, not linked
+        // hashmap
+        #[derive(serde::Deserialize)]
+        struct EventChunk {
+            events: Vec<Event<Ids, Name>>,
+        }
+
+        // baseline: all examples in a single chunk
+        let correct = serialize_in_chunks(examples.len(), &examples, factory)
+            .map(|res| res.unwrap().1)
+            .flat_map(|body| serde_json::from_slice::<EventChunk>(&body).unwrap().events)
+            .collect::<Vec<_>>();
+
+        for chunk_size in 1..examples.len() {
+            let actual = serialize_in_chunks(chunk_size, &examples, factory)
+                .map(|res| res.unwrap().1)
+                .flat_map(|body| serde_json::from_slice::<EventChunk>(&body).unwrap().events)
+                .collect::<Vec<_>>();
+
+            // if these are equal, it means that multi-chunking version works as well
+            assert_eq!(correct, actual);
+        }
+    }
+
+    /// Key generator for tests: builds every key from the same stored time, node id and nonce.
+    #[derive(Clone, Copy)]
+    struct FixedGen<'a>(chrono::DateTime<chrono::Utc>, &'a str, u16);
+
+    impl<'a> FixedGen<'a> {
+        fn new(now: chrono::DateTime<chrono::Utc>, node_id: &'a str, nonce: u16) -> Self {
+            FixedGen(now, node_id, nonce)
+        }
+    }
+
+    impl<'a> KeyGen<'a> for FixedGen<'a> {
+        fn generate(&self) -> IdempotencyKey<'a> {
+            IdempotencyKey::for_tests(self.0, self.1, self.2)
+        }
+    }
+
+    /// The fixed "now" used by the sample metrics and the expected strings below.
+    static SAMPLES_NOW: Lazy<DateTime<Utc>> = Lazy::new(|| {
+        DateTime::parse_from_rfc3339("2023-09-15T00:00:00.123456789Z")
+            .unwrap()
+            .into()
+    });
+
+    /// Compares the serialized form of each sample metric against a golden string.
+    #[test]
+    fn metric_image_stability() {
+        // it is important that these strings stay as they are
+
+        // `line!()` is recorded next to each expected string so that a failing assertion can
+        // point at the example which did not match
+        let examples = [
+            (
+                line!(),
+                r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"written_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000","timeline_id":"ffffffffffffffffffffffffffffffff"}"#,
+            ),
+            (
+                line!(),
+                r#"{"type":"incremental","start_time":"2023-09-14T00:00:00.123456789Z","stop_time":"2023-09-15T00:00:00.123456789Z","metric":"written_data_bytes_delta","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000","timeline_id":"ffffffffffffffffffffffffffffffff"}"#,
+            ),
+            (
+                line!(),
+                r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"timeline_logical_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000","timeline_id":"ffffffffffffffffffffffffffffffff"}"#,
+            ),
+            (
+                line!(),
+                r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"remote_storage_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000"}"#,
+            ),
+            (
+                line!(),
+                r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"resident_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000"}"#,
+            ),
+            (
+                line!(),
+                r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"synthetic_storage_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":1,"tenant_id":"00000000000000000000000000000000"}"#,
+            ),
+        ];
+
+        let idempotency_key = consumption_metrics::IdempotencyKey::for_tests(*SAMPLES_NOW, "1", 0);
+        // pairs up by position, so the expected strings must be in the order of the samples
+        let examples = examples.into_iter().zip(metric_samples());
+
+        for ((line, expected), (key, (kind, value))) in examples {
+            let e = consumption_metrics::Event {
+                kind,
+                metric: key.metric,
+                idempotency_key: idempotency_key.to_string(),
+                value,
+                extra: Ids {
+                    tenant_id: key.tenant_id,
+                    timeline_id: key.timeline_id,
+                },
+            };
+            let actual = serde_json::to_string(&e).unwrap();
+            assert_eq!(expected, actual, "example for {kind:?} from line {line}");
+        }
+    }
+
+    /// Sample metrics with fixed ids and timestamps: an all-zeroes tenant id, an all-0xff
+    /// timeline id, `SAMPLES_NOW` as "now" and exactly one day earlier as "before".
+    fn metric_samples() -> [RawMetric; 6] {
+        let tenant_id = TenantId::from_array([0; 16]);
+        let timeline_id = TimelineId::from_array([0xff; 16]);
+
+        let before = DateTime::parse_from_rfc3339("2023-09-14T00:00:00.123456789Z")
+            .unwrap()
+            .into();
+        let [now, before] = [*SAMPLES_NOW, before];
+
+        super::super::metrics::metric_examples(tenant_id, timeline_id, now, before)
+    }
+}
--- a/pageserver/src/context.rs
+++ b/pageserver/src/context.rs
@@ -94,6 +94,18 @@ pub struct RequestContext {
    task_kind: TaskKind,
    download_behavior: DownloadBehavior,
    access_stats_behavior: AccessStatsBehavior,
+    page_content_kind: PageContentKind,
+}
+
+/// The kind of access to the page cache.
+///
+/// Carried inside `RequestContext`; a context created from scratch starts out as
+/// [`PageContentKind::Unknown`]. The variant name (through `IntoStaticStr`) is used as the
+/// `content_kind` label value of the page cache metrics.
+#[derive(Clone, Copy, PartialEq, Eq, Debug, enum_map::Enum, strum_macros::IntoStaticStr)]
+pub enum PageContentKind {
+    Unknown,
+    DeltaLayerBtreeNode,
+    DeltaLayerValue,
+    ImageLayerBtreeNode,
+    ImageLayerValue,
+    InMemoryLayer,
 }

 /// Desired behavior if the operation requires an on-demand download
@@ -137,6 +149,7 @@ impl RequestContextBuilder {
                task_kind,
                download_behavior: DownloadBehavior::Download,
                access_stats_behavior: AccessStatsBehavior::Update,
+                page_content_kind: PageContentKind::Unknown,
            },
        }
    }
@@ -149,6 +162,7 @@ impl RequestContextBuilder {
                task_kind: original.task_kind,
                download_behavior: original.download_behavior,
                access_stats_behavior: original.access_stats_behavior,
+                page_content_kind: original.page_content_kind,
            },
        }
    }
@@ -167,6 +181,11 @@ impl RequestContextBuilder {
        self
    }

+    /// Sets the [`PageContentKind`] of the context being built and returns the builder.
+    pub(crate) fn page_content_kind(mut self, k: PageContentKind) -> Self {
+        self.inner.page_content_kind = k;
+        self
+    }
+
    pub fn build(self) -> RequestContext {
        self.inner
    }
@@ -263,4 +282,8 @@ impl RequestContext {
    pub(crate) fn access_stats_behavior(&self) -> AccessStatsBehavior {
        self.access_stats_behavior
    }
+
+    /// Returns the [`PageContentKind`] stored in this context.
+    pub(crate) fn page_content_kind(&self) -> PageContentKind {
+        self.page_content_kind
+    }
 }
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -12,7 +12,6 @@ use pageserver_api::models::{
    DownloadRemoteLayersTaskSpawnRequest, TenantAttachRequest, TenantLoadRequest,
 };
 use remote_storage::GenericRemoteStorage;
-use storage_broker::BrokerClientChannel;
 use tenant_size_model::{SizeResult, StorageModel};
 use tokio_util::sync::CancellationToken;
 use tracing::*;
@@ -55,7 +54,7 @@ use utils::{
 // Imports only used for testing APIs
 use super::models::ConfigureFailpointsRequest;

-struct State {
+pub struct State {
    conf: &'static PageServerConf,
    auth: Option<Arc<JwtAuth>>,
    allowlist_routes: Vec<Uri>,
@@ -65,7 +64,7 @@ struct State {
 }

 impl State {
-    fn new(
+    pub fn new(
        conf: &'static PageServerConf,
        auth: Option<Arc<JwtAuth>>,
        remote_storage: Option<GenericRemoteStorage>,
@@ -286,6 +285,8 @@ async fn build_timeline_info_common(
    let state = timeline.current_state();
    let remote_consistent_lsn = timeline.get_remote_consistent_lsn().unwrap_or(Lsn(0));

+    let walreceiver_status = timeline.walreceiver_status();
+
    let info = TimelineInfo {
        tenant_id: timeline.tenant_id,
        timeline_id: timeline.timeline_id,
@@ -306,6 +307,8 @@ async fn build_timeline_info_common(
        pg_version: timeline.pg_version,

        state,
+
+        walreceiver_status,
    };
    Ok(info)
 }
@@ -1354,12 +1357,9 @@ where
 }

 pub fn make_router(
-    conf: &'static PageServerConf,
+    state: Arc<State>,
    launch_ts: &'static LaunchTimestamp,
    auth: Option<Arc<JwtAuth>>,
-    broker_client: BrokerClientChannel,
-    remote_storage: Option<GenericRemoteStorage>,
-    disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
 ) -> anyhow::Result<RouterBuilder<hyper::Body, ApiError>> {
    let spec = include_bytes!("openapi_spec.yml");
    let mut router = attach_openapi_ui(endpoint::make_router(), spec, "/swagger.yml", "/v1/doc");
@@ -1383,16 +1383,7 @@ pub fn make_router(
    );

    Ok(router
-        .data(Arc::new(
-            State::new(
-                conf,
-                auth,
-                remote_storage,
-                broker_client,
-                disk_usage_eviction_state,
-            )
-            .context("Failed to initialize router state")?,
-        ))
+        .data(state)
        .get("/v1/status", |r| api_handler(r, status_handler))
        .put("/v1/failpoints", |r| {
            testing_api_handler("manage failpoints", r, failpoints_handler)
--- a/pageserver/src/import_datadir.rs
+++ b/pageserver/src/import_datadir.rs
@@ -75,12 +75,12 @@ pub async fn import_timeline_from_postgres_datadir(
            {
                pg_control = Some(control_file);
            }
-            modification.flush().await?;
+            modification.flush(ctx).await?;
        }
    }

    // We're done importing all the data files.
-    modification.commit().await?;
+    modification.commit(ctx).await?;

    // We expect the Postgres server to be shut down cleanly.
    let pg_control = pg_control.context("pg_control file not found")?;
@@ -359,7 +359,7 @@ pub async fn import_basebackup_from_tar(
                    // We found the pg_control file.
                    pg_control = Some(res);
                }
-                modification.flush().await?;
+                modification.flush(ctx).await?;
            }
            tokio_tar::EntryType::Directory => {
                debug!("directory {:?}", file_path);
@@ -377,7 +377,7 @@ pub async fn import_basebackup_from_tar(
    // sanity check: ensure that pg_control is loaded
    let _pg_control = pg_control.context("pg_control file not found")?;

-    modification.commit().await?;
+    modification.commit(ctx).await?;
    Ok(())
 }

--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1,3 +1,4 @@
+use enum_map::EnumMap;
 use metrics::metric_vec_duration::DurationResultObserver;
 use metrics::{
    register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec,
@@ -127,22 +128,24 @@ pub(crate) static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub struct PageCacheMetrics {
+pub struct PageCacheMetricsForTaskKind {
    pub read_accesses_materialized_page: IntCounter,
-    pub read_accesses_ephemeral: IntCounter,
    pub read_accesses_immutable: IntCounter,

-    pub read_hits_ephemeral: IntCounter,
    pub read_hits_immutable: IntCounter,
    pub read_hits_materialized_page_exact: IntCounter,
    pub read_hits_materialized_page_older_lsn: IntCounter,
 }

+/// Page cache counters, with one [`PageCacheMetricsForTaskKind`] for every combination of
+/// `TaskKind` and `PageContentKind`.
+pub struct PageCacheMetrics {
+    // outer key: task kind, inner key: page content kind
+    map: EnumMap<TaskKind, EnumMap<PageContentKind, PageCacheMetricsForTaskKind>>,
+}
+
 static PAGE_CACHE_READ_HITS: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "pageserver_page_cache_read_hits_total",
        "Number of read accesses to the page cache that hit",
-        &["key_kind", "hit_kind"]
+        &["task_kind", "key_kind", "content_kind", "hit_kind"]
    )
    .expect("failed to define a metric")
 });
@@ -151,55 +154,73 @@ static PAGE_CACHE_READ_ACCESSES: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "pageserver_page_cache_read_accesses_total",
        "Number of read accesses to the page cache",
-        &["key_kind"]
+        &["task_kind", "key_kind", "content_kind"]
    )
    .expect("failed to define a metric")
 });

 pub static PAGE_CACHE: Lazy<PageCacheMetrics> = Lazy::new(|| PageCacheMetrics {
-    read_accesses_materialized_page: {
-        PAGE_CACHE_READ_ACCESSES
-            .get_metric_with_label_values(&["materialized_page"])
-            .unwrap()
-    },
+    map: EnumMap::from_array(std::array::from_fn(|task_kind| {
+        let task_kind = <TaskKind as enum_map::Enum>::from_usize(task_kind);
+        let task_kind: &'static str = task_kind.into();
+        EnumMap::from_array(std::array::from_fn(|content_kind| {
+            let content_kind = <PageContentKind as enum_map::Enum>::from_usize(content_kind);
+            let content_kind: &'static str = content_kind.into();
+            PageCacheMetricsForTaskKind {
+                read_accesses_materialized_page: {
+                    PAGE_CACHE_READ_ACCESSES
+                        .get_metric_with_label_values(&[
+                            task_kind,
+                            "materialized_page",
+                            content_kind,
+                        ])
+                        .unwrap()
+                },

-    read_accesses_ephemeral: {
-        PAGE_CACHE_READ_ACCESSES
-            .get_metric_with_label_values(&["ephemeral"])
-            .unwrap()
-    },
+                read_accesses_immutable: {
+                    PAGE_CACHE_READ_ACCESSES
+                        .get_metric_with_label_values(&[task_kind, "immutable", content_kind])
+                        .unwrap()
+                },

-    read_accesses_immutable: {
-        PAGE_CACHE_READ_ACCESSES
-            .get_metric_with_label_values(&["immutable"])
-            .unwrap()
-    },
+                read_hits_immutable: {
+                    PAGE_CACHE_READ_HITS
+                        .get_metric_with_label_values(&[task_kind, "immutable", content_kind, "-"])
+                        .unwrap()
+                },

-    read_hits_ephemeral: {
-        PAGE_CACHE_READ_HITS
-            .get_metric_with_label_values(&["ephemeral", "-"])
-            .unwrap()
-    },
+                read_hits_materialized_page_exact: {
+                    PAGE_CACHE_READ_HITS
+                        .get_metric_with_label_values(&[
+                            task_kind,
+                            "materialized_page",
+                            content_kind,
+                            "exact",
+                        ])
+                        .unwrap()
+                },

-    read_hits_immutable: {
-        PAGE_CACHE_READ_HITS
-            .get_metric_with_label_values(&["immutable", "-"])
-            .unwrap()
-    },
-
-    read_hits_materialized_page_exact: {
-        PAGE_CACHE_READ_HITS
-            .get_metric_with_label_values(&["materialized_page", "exact"])
-            .unwrap()
-    },
-
-    read_hits_materialized_page_older_lsn: {
-        PAGE_CACHE_READ_HITS
-            .get_metric_with_label_values(&["materialized_page", "older_lsn"])
-            .unwrap()
-    },
+                read_hits_materialized_page_older_lsn: {
+                    PAGE_CACHE_READ_HITS
+                        .get_metric_with_label_values(&[
+                            task_kind,
+                            "materialized_page",
+                            content_kind,
+                            "older_lsn",
+                        ])
+                        .unwrap()
+                },
+            }
+        }))
+    })),
 });

+impl PageCacheMetrics {
+    pub(crate) fn for_ctx(&self, ctx: &RequestContext) -> &PageCacheMetricsForTaskKind {
+        &self.map[ctx.task_kind()][ctx.page_content_kind()]
+    }
+}
+
 pub struct PageCacheSizeMetrics {
    pub max_bytes: UIntGauge,

@@ -537,7 +558,7 @@ const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
    30.000,   // 30000 ms
 ];

-/// Tracks time taken by fs operations near VirtualFile.
+/// VirtualFile fs operation variants.
 ///
 /// Operations:
 /// - open ([`std::fs::OpenOptions::open`])
@@ -548,15 +569,66 @@ const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
 /// - seek (modify internal position or file length query)
 /// - fsync ([`std::fs::File::sync_all`])
 /// - metadata ([`std::fs::File::metadata`])
-pub(crate) static STORAGE_IO_TIME: Lazy<HistogramVec> = Lazy::new(|| {
-    register_histogram_vec!(
-        "pageserver_io_operations_seconds",
-        "Time spent in IO operations",
-        &["operation"],
-        STORAGE_IO_TIME_BUCKETS.into()
-    )
-    .expect("failed to define a metric")
-});
+#[derive(
+    Debug, Clone, Copy, strum_macros::EnumCount, strum_macros::EnumIter, strum_macros::FromRepr,
+)]
+pub(crate) enum StorageIoOperation {
+    Open,
+    Close,
+    CloseByReplace,
+    Read,
+    Write,
+    Seek,
+    Fsync,
+    Metadata,
+}
+
+impl StorageIoOperation {
+    pub fn as_str(&self) -> &'static str {
+        match self {
+            StorageIoOperation::Open => "open",
+            StorageIoOperation::Close => "close",
+            StorageIoOperation::CloseByReplace => "close-by-replace",
+            StorageIoOperation::Read => "read",
+            StorageIoOperation::Write => "write",
+            StorageIoOperation::Seek => "seek",
+            StorageIoOperation::Fsync => "fsync",
+            StorageIoOperation::Metadata => "metadata",
+        }
+    }
+}
+
+/// Tracks time taken by fs operations near VirtualFile.
+#[derive(Debug)]
+pub(crate) struct StorageIoTime {
+    metrics: [Histogram; StorageIoOperation::COUNT],
+}
+
+impl StorageIoTime {
+    fn new() -> Self {
+        let storage_io_histogram_vec = register_histogram_vec!(
+            "pageserver_io_operations_seconds",
+            "Time spent in IO operations",
+            &["operation"],
+            STORAGE_IO_TIME_BUCKETS.into()
+        )
+        .expect("failed to define a metric");
+        let metrics = std::array::from_fn(|i| {
+            let op = StorageIoOperation::from_repr(i).unwrap();
+            let metric = storage_io_histogram_vec
+                .get_metric_with_label_values(&[op.as_str()])
+                .unwrap();
+            metric
+        });
+        Self { metrics }
+    }
+
+    pub(crate) fn get(&self, op: StorageIoOperation) -> &Histogram {
+        &self.metrics[op as usize]
+    }
+}
+
+pub(crate) static STORAGE_IO_TIME_METRIC: Lazy<StorageIoTime> = Lazy::new(StorageIoTime::new);

 const STORAGE_IO_SIZE_OPERATIONS: &[&str] = &["read", "write"];

@@ -1165,6 +1237,12 @@ impl TimelineMetrics {
            ),
        }
    }
+
+    pub fn record_new_file_metrics(&self, sz: u64) {
+        self.resident_physical_size_gauge.add(sz);
+        self.num_persistent_files_created.inc_by(1);
+        self.persistent_bytes_written.inc_by(sz);
+    }
 }

 impl Drop for TimelineMetrics {
@@ -1223,6 +1301,9 @@ use std::sync::{Arc, Mutex};
 use std::task::{Context, Poll};
 use std::time::{Duration, Instant};

+use crate::context::{PageContentKind, RequestContext};
+use crate::task_mgr::TaskKind;
+
 pub struct RemoteTimelineClientMetrics {
    tenant_id: String,
    timeline_id: String,
--- a/pageserver/src/page_cache.rs
+++ b/pageserver/src/page_cache.rs
@@ -85,7 +85,7 @@ use utils::{
    lsn::Lsn,
 };

-use crate::{metrics::PageCacheSizeMetrics, repository::Key};
+use crate::{context::RequestContext, metrics::PageCacheSizeMetrics, repository::Key};

 static PAGE_CACHE: OnceCell<PageCache> = OnceCell::new();
 const TEST_PAGE_CACHE_SIZE: usize = 50;
@@ -346,8 +346,10 @@ impl PageCache {
        timeline_id: TimelineId,
        key: &Key,
        lsn: Lsn,
+        ctx: &RequestContext,
    ) -> Option<(Lsn, PageReadGuard)> {
        crate::metrics::PAGE_CACHE
+            .for_ctx(ctx)
            .read_accesses_materialized_page
            .inc();

@@ -368,10 +370,12 @@ impl PageCache {
            {
                if available_lsn == lsn {
                    crate::metrics::PAGE_CACHE
+                        .for_ctx(ctx)
                        .read_hits_materialized_page_exact
                        .inc();
                } else {
                    crate::metrics::PAGE_CACHE
+                        .for_ctx(ctx)
                        .read_hits_materialized_page_older_lsn
                        .inc();
                }
@@ -426,10 +430,11 @@ impl PageCache {
        &self,
        file_id: FileId,
        blkno: u32,
+        ctx: &RequestContext,
    ) -> anyhow::Result<ReadBufResult> {
        let mut cache_key = CacheKey::ImmutableFilePage { file_id, blkno };

-        self.lock_for_read(&mut cache_key).await
+        self.lock_for_read(&mut cache_key, ctx).await
    }

    //
@@ -497,14 +502,20 @@ impl PageCache {
    /// }
    /// ```
    ///
-    async fn lock_for_read(&self, cache_key: &mut CacheKey) -> anyhow::Result<ReadBufResult> {
+    async fn lock_for_read(
+        &self,
+        cache_key: &mut CacheKey,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<ReadBufResult> {
        let (read_access, hit) = match cache_key {
            CacheKey::MaterializedPage { .. } => {
                unreachable!("Materialized pages use lookup_materialized_page")
            }
            CacheKey::ImmutableFilePage { .. } => (
-                &crate::metrics::PAGE_CACHE.read_accesses_immutable,
-                &crate::metrics::PAGE_CACHE.read_hits_immutable,
+                &crate::metrics::PAGE_CACHE
+                    .for_ctx(ctx)
+                    .read_accesses_immutable,
+                &crate::metrics::PAGE_CACHE.for_ctx(ctx).read_hits_immutable,
            ),
        };
        read_access.inc();
@@ -799,8 +810,9 @@ impl PageCache {
    fn new(num_pages: usize) -> Self {
        assert!(num_pages > 0, "page cache size must be > 0");

-        // We use Box::leak here and into_boxed_slice to avoid leaking uninitialized
-        // memory that Vec's might contain.
+        // We could use Vec::leak here, but that potentially also leaks
+        // uninitialized reserved capacity. With into_boxed_slice and Box::leak
+        // this is avoided.
        let page_buffer = Box::leak(vec![0u8; num_pages * PAGE_SZ].into_boxed_slice());

        let size_metrics = &crate::metrics::PAGE_CACHE_SIZE;
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -1138,7 +1138,7 @@ impl<'a> DatadirModification<'a> {
    /// retains all the metadata, but data pages are flushed. That's again OK
    /// for bulk import, where you are just loading data pages and won't try to
    /// modify the same pages twice.
-    pub async fn flush(&mut self) -> anyhow::Result<()> {
+    pub async fn flush(&mut self, ctx: &RequestContext) -> anyhow::Result<()> {
        // Unless we have accumulated a decent amount of changes, it's not worth it
        // to scan through the pending_updates list.
        let pending_nblocks = self.pending_nblocks;
@@ -1154,7 +1154,7 @@ impl<'a> DatadirModification<'a> {
            if is_rel_block_key(key) || is_slru_block_key(key) {
                // This bails out on first error without modifying pending_updates.
                // That's Ok, cf this function's doc comment.
-                writer.put(key, self.lsn, &value).await?;
+                writer.put(key, self.lsn, &value, ctx).await?;
            } else {
                retained_pending_updates.insert(key, value);
            }
@@ -1174,14 +1174,14 @@ impl<'a> DatadirModification<'a> {
    /// underlying timeline.
    /// All the modifications in this atomic update are stamped by the specified LSN.
    ///
-    pub async fn commit(&mut self) -> anyhow::Result<()> {
+    pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> {
        let writer = self.tline.writer().await;
        let lsn = self.lsn;
        let pending_nblocks = self.pending_nblocks;
        self.pending_nblocks = 0;

        for (key, value) in self.pending_updates.drain() {
-            writer.put(key, lsn, &value).await?;
+            writer.put(key, lsn, &value, ctx).await?;
        }
        for key_range in self.pending_deletions.drain(..) {
            writer.delete(key_range, lsn).await?;
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -187,6 +187,7 @@ task_local! {
    Debug,
    // NB: enumset::EnumSetType derives PartialEq, Eq, Clone, Copy
    enumset::EnumSetType,
+    enum_map::Enum,
    serde::Serialize,
    serde::Deserialize,
    strum_macros::IntoStaticStr,
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -1504,7 +1504,7 @@ impl Tenant {
            .init_empty_test_timeline()
            .context("init_empty_test_timeline")?;
        modification
-            .commit()
+            .commit(ctx)
            .await
            .context("commit init_empty_test_timeline modification")?;

@@ -3538,14 +3538,24 @@ mod tests {

        let writer = tline.writer().await;
        writer
-            .put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))
+            .put(
+                *TEST_KEY,
+                Lsn(0x10),
+                &Value::Image(TEST_IMG("foo at 0x10")),
+                &ctx,
+            )
            .await?;
        writer.finish_write(Lsn(0x10));
        drop(writer);

        let writer = tline.writer().await;
        writer
-            .put(*TEST_KEY, Lsn(0x20), &Value::Image(TEST_IMG("foo at 0x20")))
+            .put(
+                *TEST_KEY,
+                Lsn(0x20),
+                &Value::Image(TEST_IMG("foo at 0x20")),
+                &ctx,
+            )
            .await?;
        writer.finish_write(Lsn(0x20));
        drop(writer);
@@ -3619,19 +3629,19 @@ mod tests {

        // Insert a value on the timeline
        writer
-            .put(TEST_KEY_A, Lsn(0x20), &test_value("foo at 0x20"))
+            .put(TEST_KEY_A, Lsn(0x20), &test_value("foo at 0x20"), &ctx)
            .await?;
        writer
-            .put(TEST_KEY_B, Lsn(0x20), &test_value("foobar at 0x20"))
+            .put(TEST_KEY_B, Lsn(0x20), &test_value("foobar at 0x20"), &ctx)
            .await?;
        writer.finish_write(Lsn(0x20));

        writer
-            .put(TEST_KEY_A, Lsn(0x30), &test_value("foo at 0x30"))
+            .put(TEST_KEY_A, Lsn(0x30), &test_value("foo at 0x30"), &ctx)
            .await?;
        writer.finish_write(Lsn(0x30));
        writer
-            .put(TEST_KEY_A, Lsn(0x40), &test_value("foo at 0x40"))
+            .put(TEST_KEY_A, Lsn(0x40), &test_value("foo at 0x40"), &ctx)
            .await?;
        writer.finish_write(Lsn(0x40));

@@ -3646,7 +3656,7 @@ mod tests {
            .expect("Should have a local timeline");
        let new_writer = newtline.writer().await;
        new_writer
-            .put(TEST_KEY_A, Lsn(0x40), &test_value("bar at 0x40"))
+            .put(TEST_KEY_A, Lsn(0x40), &test_value("bar at 0x40"), &ctx)
            .await?;
        new_writer.finish_write(Lsn(0x40));

@@ -3669,7 +3679,11 @@ mod tests {
        Ok(())
    }

-    async fn make_some_layers(tline: &Timeline, start_lsn: Lsn) -> anyhow::Result<()> {
+    async fn make_some_layers(
+        tline: &Timeline,
+        start_lsn: Lsn,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<()> {
        let mut lsn = start_lsn;
        #[allow(non_snake_case)]
        {
@@ -3680,6 +3694,7 @@ mod tests {
                    *TEST_KEY,
                    lsn,
                    &Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
+                    ctx,
                )
                .await?;
            writer.finish_write(lsn);
@@ -3689,6 +3704,7 @@ mod tests {
                    *TEST_KEY,
                    lsn,
                    &Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
+                    ctx,
                )
                .await?;
            writer.finish_write(lsn);
@@ -3702,6 +3718,7 @@ mod tests {
                    *TEST_KEY,
                    lsn,
                    &Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
+                    ctx,
                )
                .await?;
            writer.finish_write(lsn);
@@ -3711,6 +3728,7 @@ mod tests {
                    *TEST_KEY,
                    lsn,
                    &Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
+                    ctx,
                )
                .await?;
            writer.finish_write(lsn);
@@ -3727,7 +3745,7 @@ mod tests {
        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
            .await?;
-        make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
+        make_some_layers(tline.as_ref(), Lsn(0x20), &ctx).await?;

        // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50
        // FIXME: this doesn't actually remove any layer currently, given how the flushing
@@ -3801,7 +3819,7 @@ mod tests {
            .load();

        let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?;
-        make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
+        make_some_layers(tline.as_ref(), Lsn(0x20), &ctx).await?;

        repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO)?;
        let latest_gc_cutoff_lsn = tline.get_latest_gc_cutoff_lsn();
@@ -3823,7 +3841,7 @@ mod tests {
        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
            .await?;
-        make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
+        make_some_layers(tline.as_ref(), Lsn(0x20), &ctx).await?;

        tenant
            .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(Lsn(0x40)), &ctx)
@@ -3832,7 +3850,7 @@ mod tests {
            .get_timeline(NEW_TIMELINE_ID, true)
            .expect("Should have a local timeline");

-        make_some_layers(newtline.as_ref(), Lsn(0x60)).await?;
+        make_some_layers(newtline.as_ref(), Lsn(0x60), &ctx).await?;

        tline.set_broken("test".to_owned());

@@ -3873,7 +3891,7 @@ mod tests {
        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
            .await?;
-        make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
+        make_some_layers(tline.as_ref(), Lsn(0x20), &ctx).await?;

        tenant
            .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(Lsn(0x40)), &ctx)
@@ -3898,7 +3916,7 @@ mod tests {
        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
            .await?;
-        make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
+        make_some_layers(tline.as_ref(), Lsn(0x20), &ctx).await?;

        tenant
            .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(Lsn(0x40)), &ctx)
@@ -3907,7 +3925,7 @@ mod tests {
            .get_timeline(NEW_TIMELINE_ID, true)
            .expect("Should have a local timeline");

-        make_some_layers(newtline.as_ref(), Lsn(0x60)).await?;
+        make_some_layers(newtline.as_ref(), Lsn(0x60), &ctx).await?;

        // run gc on parent
        tenant
@@ -3932,7 +3950,7 @@ mod tests {
            let tline = tenant
                .create_test_timeline(TIMELINE_ID, Lsn(0x7000), DEFAULT_PG_VERSION, &ctx)
                .await?;
-            make_some_layers(tline.as_ref(), Lsn(0x8000)).await?;
+            make_some_layers(tline.as_ref(), Lsn(0x8000), &ctx).await?;
            // so that all uploads finish & we can call harness.load() below again
            tenant
                .shutdown(Default::default(), true)
@@ -3961,7 +3979,7 @@ mod tests {
                .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
                .await?;

-            make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
+            make_some_layers(tline.as_ref(), Lsn(0x20), &ctx).await?;

            let child_tline = tenant
                .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(Lsn(0x40)), &ctx)
@@ -3972,7 +3990,7 @@ mod tests {
                .get_timeline(NEW_TIMELINE_ID, true)
                .expect("Should have a local timeline");

-            make_some_layers(newtline.as_ref(), Lsn(0x60)).await?;
+            make_some_layers(newtline.as_ref(), Lsn(0x60), &ctx).await?;

            // so that all uploads finish & we can call harness.load() below again
            tenant
@@ -4004,7 +4022,7 @@ mod tests {
        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
            .await?;
-        make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
+        make_some_layers(tline.as_ref(), Lsn(0x20), &ctx).await?;

        let layer_map = tline.layers.read().await;
        let level0_deltas = layer_map.layer_map().get_level0_deltas()?;
@@ -4087,7 +4105,12 @@ mod tests {

        let writer = tline.writer().await;
        writer
-            .put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))
+            .put(
+                *TEST_KEY,
+                Lsn(0x10),
+                &Value::Image(TEST_IMG("foo at 0x10")),
+                &ctx,
+            )
            .await?;
        writer.finish_write(Lsn(0x10));
        drop(writer);
@@ -4097,7 +4120,12 @@ mod tests {

        let writer = tline.writer().await;
        writer
-            .put(*TEST_KEY, Lsn(0x20), &Value::Image(TEST_IMG("foo at 0x20")))
+            .put(
+                *TEST_KEY,
+                Lsn(0x20),
+                &Value::Image(TEST_IMG("foo at 0x20")),
+                &ctx,
+            )
            .await?;
        writer.finish_write(Lsn(0x20));
        drop(writer);
@@ -4107,7 +4135,12 @@ mod tests {

        let writer = tline.writer().await;
        writer
-            .put(*TEST_KEY, Lsn(0x30), &Value::Image(TEST_IMG("foo at 0x30")))
+            .put(
+                *TEST_KEY,
+                Lsn(0x30),
+                &Value::Image(TEST_IMG("foo at 0x30")),
+                &ctx,
+            )
            .await?;
        writer.finish_write(Lsn(0x30));
        drop(writer);
@@ -4117,7 +4150,12 @@ mod tests {

        let writer = tline.writer().await;
        writer
-            .put(*TEST_KEY, Lsn(0x40), &Value::Image(TEST_IMG("foo at 0x40")))
+            .put(
+                *TEST_KEY,
+                Lsn(0x40),
+                &Value::Image(TEST_IMG("foo at 0x40")),
+                &ctx,
+            )
            .await?;
        writer.finish_write(Lsn(0x40));
        drop(writer);
@@ -4175,6 +4213,7 @@ mod tests {
                        test_key,
                        lsn,
                        &Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
+                        &ctx,
                    )
                    .await?;
                writer.finish_write(lsn);
@@ -4227,6 +4266,7 @@ mod tests {
                    test_key,
                    lsn,
                    &Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
+                    &ctx,
                )
                .await?;
            writer.finish_write(lsn);
@@ -4247,6 +4287,7 @@ mod tests {
                        test_key,
                        lsn,
                        &Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
+                        &ctx,
                    )
                    .await?;
                writer.finish_write(lsn);
@@ -4306,6 +4347,7 @@ mod tests {
                    test_key,
                    lsn,
                    &Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
+                    &ctx,
                )
                .await?;
            writer.finish_write(lsn);
@@ -4334,6 +4376,7 @@ mod tests {
                        test_key,
                        lsn,
                        &Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
+                        &ctx,
                    )
                    .await?;
                println!("updating {} at {}", blknum, lsn);
@@ -4402,6 +4445,7 @@ mod tests {
                        test_key,
                        lsn,
                        &Value::Image(TEST_IMG(&format!("{} {} at {}", idx, blknum, lsn))),
+                        &ctx,
                    )
                    .await?;
                println!("updating [{}][{}] at {}", idx, blknum, lsn);
@@ -4474,7 +4518,7 @@ mod tests {
            .init_empty_test_timeline()
            .context("init_empty_test_timeline")?;
        modification
-            .commit()
+            .commit(&ctx)
            .await
            .context("commit init_empty_test_timeline modification")?;

--- a/pageserver/src/tenant/blob_io.rs
+++ b/pageserver/src/tenant/blob_io.rs
@@ -11,6 +11,7 @@
 //! len <  128: 0XXXXXXX
 //! len >= 128: 1XXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX
 //!
+use crate::context::RequestContext;
 use crate::page_cache::PAGE_SZ;
 use crate::tenant::block_io::BlockCursor;
 use crate::virtual_file::VirtualFile;
@@ -19,9 +20,13 @@ use std::io::{Error, ErrorKind};

 impl<'a> BlockCursor<'a> {
    /// Read a blob into a new buffer.
-    pub async fn read_blob(&self, offset: u64) -> Result<Vec<u8>, std::io::Error> {
+    pub async fn read_blob(
+        &self,
+        offset: u64,
+        ctx: &RequestContext,
+    ) -> Result<Vec<u8>, std::io::Error> {
        let mut buf = Vec::new();
-        self.read_blob_into_buf(offset, &mut buf).await?;
+        self.read_blob_into_buf(offset, &mut buf, ctx).await?;
        Ok(buf)
    }
    /// Read blob into the given buffer. Any previous contents in the buffer
@@ -30,11 +35,12 @@ impl<'a> BlockCursor<'a> {
        &self,
        offset: u64,
        dstbuf: &mut Vec<u8>,
+        ctx: &RequestContext,
    ) -> Result<(), std::io::Error> {
        let mut blknum = (offset / PAGE_SZ as u64) as u32;
        let mut off = (offset % PAGE_SZ as u64) as usize;

-        let mut buf = self.read_blk(blknum).await?;
+        let mut buf = self.read_blk(blknum, ctx).await?;

        // peek at the first byte, to determine if it's a 1- or 4-byte length
        let first_len_byte = buf[off];
@@ -50,7 +56,7 @@ impl<'a> BlockCursor<'a> {
                // it is split across two pages
                len_buf[..thislen].copy_from_slice(&buf[off..PAGE_SZ]);
                blknum += 1;
-                buf = self.read_blk(blknum).await?;
+                buf = self.read_blk(blknum, ctx).await?;
                len_buf[thislen..].copy_from_slice(&buf[0..4 - thislen]);
                off = 4 - thislen;
            } else {
@@ -71,7 +77,7 @@ impl<'a> BlockCursor<'a> {
            if page_remain == 0 {
                // continue on next page
                blknum += 1;
-                buf = self.read_blk(blknum).await?;
+                buf = self.read_blk(blknum, ctx).await?;
                off = 0;
                page_remain = PAGE_SZ;
            }
@@ -228,12 +234,13 @@ impl BlobWriter<false> {
 #[cfg(test)]
 mod tests {
    use super::*;
-    use crate::tenant::block_io::BlockReaderRef;
+    use crate::{context::DownloadBehavior, task_mgr::TaskKind, tenant::block_io::BlockReaderRef};
    use rand::{Rng, SeedableRng};

    async fn round_trip_test<const BUFFERED: bool>(blobs: &[Vec<u8>]) -> Result<(), Error> {
        let temp_dir = tempfile::tempdir()?;
        let path = temp_dir.path().join("file");
+        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);

        // Write part (in block to drop the file)
        let mut offsets = Vec::new();
@@ -255,7 +262,7 @@ mod tests {
        let rdr = BlockReaderRef::VirtualFile(&file);
        let rdr = BlockCursor::new(rdr);
        for (idx, (blob, offset)) in blobs.iter().zip(offsets.iter()).enumerate() {
-            let blob_read = rdr.read_blob(*offset).await?;
+            let blob_read = rdr.read_blob(*offset, &ctx).await?;
            assert_eq!(
                blob, &blob_read,
                "mismatch for idx={idx} at offset={offset}"
--- a/pageserver/src/tenant/block_io.rs
+++ b/pageserver/src/tenant/block_io.rs
@@ -4,6 +4,7 @@

 use super::ephemeral_file::EphemeralFile;
 use super::storage_layer::delta_layer::{Adapter, DeltaLayerInner};
+use crate::context::RequestContext;
 use crate::page_cache::{self, PageReadGuard, ReadBufResult, PAGE_SZ};
 use crate::virtual_file::VirtualFile;
 use bytes::Bytes;
@@ -71,7 +72,7 @@ impl<'a> Deref for BlockLease<'a> {
 ///
 /// Unlike traits, we also support the read function to be async though.
 pub(crate) enum BlockReaderRef<'a> {
-    FileBlockReaderVirtual(&'a FileBlockReader),
+    FileBlockReader(&'a FileBlockReader),
    EphemeralFile(&'a EphemeralFile),
    Adapter(Adapter<&'a DeltaLayerInner>),
    #[cfg(test)]
@@ -82,12 +83,16 @@ pub(crate) enum BlockReaderRef<'a> {

 impl<'a> BlockReaderRef<'a> {
    #[inline(always)]
-    async fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
+    async fn read_blk(
+        &self,
+        blknum: u32,
+        ctx: &RequestContext,
+    ) -> Result<BlockLease, std::io::Error> {
        use BlockReaderRef::*;
        match self {
-            FileBlockReaderVirtual(r) => r.read_blk(blknum).await,
-            EphemeralFile(r) => r.read_blk(blknum).await,
-            Adapter(r) => r.read_blk(blknum).await,
+            FileBlockReader(r) => r.read_blk(blknum, ctx).await,
+            EphemeralFile(r) => r.read_blk(blknum, ctx).await,
+            Adapter(r) => r.read_blk(blknum, ctx).await,
            #[cfg(test)]
            TestDisk(r) => r.read_blk(blknum),
            #[cfg(test)]
@@ -105,11 +110,13 @@ impl<'a> BlockReaderRef<'a> {
 ///
 /// ```no_run
 /// # use pageserver::tenant::block_io::{BlockReader, FileBlockReader};
+/// # use pageserver::context::RequestContext;
 /// # let reader: FileBlockReader = unimplemented!("stub");
+/// # let ctx: RequestContext = unimplemented!("stub");
 /// let cursor = reader.block_cursor();
-/// let buf = cursor.read_blk(1);
+/// let buf = cursor.read_blk(1, &ctx);
 /// // do stuff with 'buf'
-/// let buf = cursor.read_blk(2);
+/// let buf = cursor.read_blk(2, &ctx);
 /// // do stuff with 'buf'
 /// ```
 ///
@@ -124,7 +131,7 @@ impl<'a> BlockCursor<'a> {
    // Needed by cli
    pub fn new_fileblockreader(reader: &'a FileBlockReader) -> Self {
        BlockCursor {
-            reader: BlockReaderRef::FileBlockReaderVirtual(reader),
+            reader: BlockReaderRef::FileBlockReader(reader),
        }
    }

@@ -134,8 +141,12 @@ impl<'a> BlockCursor<'a> {
    /// access to the contents of the page. (For the page cache, the
    /// lease object represents a lock on the buffer.)
    #[inline(always)]
-    pub async fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
-        self.reader.read_blk(blknum).await
+    pub async fn read_blk(
+        &self,
+        blknum: u32,
+        ctx: &RequestContext,
+    ) -> Result<BlockLease, std::io::Error> {
+        self.reader.read_blk(blknum, ctx).await
    }
 }

@@ -169,11 +180,15 @@ impl FileBlockReader {
    /// Returns a "lease" object that can be used to
    /// access to the contents of the page. (For the page cache, the
    /// lease object represents a lock on the buffer.)
-    pub async fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
+    pub async fn read_blk(
+        &self,
+        blknum: u32,
+        ctx: &RequestContext,
+    ) -> Result<BlockLease, std::io::Error> {
        let cache = page_cache::get();
        loop {
            match cache
-                .read_immutable_buf(self.file_id, blknum)
+                .read_immutable_buf(self.file_id, blknum, ctx)
                .await
                .map_err(|e| {
                    std::io::Error::new(
@@ -197,7 +212,7 @@ impl FileBlockReader {

 impl BlockReader for FileBlockReader {
    fn block_cursor(&self) -> BlockCursor<'_> {
-        BlockCursor::new(BlockReaderRef::FileBlockReaderVirtual(self))
+        BlockCursor::new(BlockReaderRef::FileBlockReader(self))
    }
 }

--- a/pageserver/src/tenant/disk_btree.rs
+++ b/pageserver/src/tenant/disk_btree.rs
@@ -26,7 +26,11 @@ use std::{cmp::Ordering, io, result};
 use thiserror::Error;
 use tracing::error;

-use crate::tenant::block_io::{BlockReader, BlockWriter};
+use crate::{
+    context::{DownloadBehavior, RequestContext},
+    task_mgr::TaskKind,
+    tenant::block_io::{BlockReader, BlockWriter},
+};

 // The maximum size of a value stored in the B-tree. 5 bytes is enough currently.
 pub const VALUE_SZ: usize = 5;
@@ -231,14 +235,19 @@ where
    ///
    /// Read the value for given key. Returns the value, or None if it doesn't exist.
    ///
-    pub async fn get(&self, search_key: &[u8; L]) -> Result<Option<u64>> {
+    pub async fn get(&self, search_key: &[u8; L], ctx: &RequestContext) -> Result<Option<u64>> {
        let mut result: Option<u64> = None;
-        self.visit(search_key, VisitDirection::Forwards, |key, value| {
-            if key == search_key {
-                result = Some(value);
-            }
-            false
-        })
+        self.visit(
+            search_key,
+            VisitDirection::Forwards,
+            |key, value| {
+                if key == search_key {
+                    result = Some(value);
+                }
+                false
+            },
+            ctx,
+        )
        .await?;
        Ok(result)
    }
@@ -253,6 +262,7 @@ where
        search_key: &[u8; L],
        dir: VisitDirection,
        mut visitor: V,
+        ctx: &RequestContext,
    ) -> Result<bool>
    where
        V: FnMut(&[u8], u64) -> bool,
@@ -262,7 +272,9 @@ where
        let block_cursor = self.reader.block_cursor();
        while let Some((node_blknum, opt_iter)) = stack.pop() {
            // Locate the node.
-            let node_buf = block_cursor.read_blk(self.start_blk + node_blknum).await?;
+            let node_buf = block_cursor
+                .read_blk(self.start_blk + node_blknum, ctx)
+                .await?;

            let node = OnDiskNode::deparse(node_buf.as_ref())?;
            let prefix_len = node.prefix_len as usize;
@@ -351,13 +363,14 @@ where
    #[allow(dead_code)]
    pub async fn dump(&self) -> Result<()> {
        let mut stack = Vec::new();
+        let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);

        stack.push((self.root_blk, String::new(), 0, 0, 0));

        let block_cursor = self.reader.block_cursor();

        while let Some((blknum, path, depth, child_idx, key_off)) = stack.pop() {
-            let blk = block_cursor.read_blk(self.start_blk + blknum).await?;
+            let blk = block_cursor.read_blk(self.start_blk + blknum, &ctx).await?;
            let buf: &[u8] = blk.as_ref();
            let node = OnDiskNode::<L>::deparse(buf)?;

@@ -688,6 +701,8 @@ impl<const L: usize> BuildNode<L> {
 #[cfg(test)]
 pub(crate) mod tests {
    use super::*;
+    use crate::context::DownloadBehavior;
+    use crate::task_mgr::TaskKind;
    use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReaderRef};
    use rand::Rng;
    use std::collections::BTreeMap;
@@ -725,6 +740,8 @@ pub(crate) mod tests {
        let mut disk = TestDisk::new();
        let mut writer = DiskBtreeBuilder::<_, 6>::new(&mut disk);

+        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
+
        let all_keys: Vec<&[u8; 6]> = vec![
            b"xaaaaa", b"xaaaba", b"xaaaca", b"xabaaa", b"xababa", b"xabaca", b"xabada", b"xabadb",
        ];
@@ -745,12 +762,12 @@ pub(crate) mod tests {

        // Test the `get` function on all the keys.
        for (key, val) in all_data.iter() {
-            assert_eq!(reader.get(key).await?, Some(*val));
+            assert_eq!(reader.get(key, &ctx).await?, Some(*val));
        }
        // And on some keys that don't exist
-        assert_eq!(reader.get(b"aaaaaa").await?, None);
-        assert_eq!(reader.get(b"zzzzzz").await?, None);
-        assert_eq!(reader.get(b"xaaabx").await?, None);
+        assert_eq!(reader.get(b"aaaaaa", &ctx).await?, None);
+        assert_eq!(reader.get(b"zzzzzz", &ctx).await?, None);
+        assert_eq!(reader.get(b"xaaabx", &ctx).await?, None);

        // Test search with `visit` function
        let search_key = b"xabaaa";
@@ -762,10 +779,15 @@ pub(crate) mod tests {

        let mut data = Vec::new();
        reader
-            .visit(search_key, VisitDirection::Forwards, |key, value| {
-                data.push((key.to_vec(), value));
-                true
-            })
+            .visit(
+                search_key,
+                VisitDirection::Forwards,
+                |key, value| {
+                    data.push((key.to_vec(), value));
+                    true
+                },
+                &ctx,
+            )
            .await?;
        assert_eq!(data, expected);

@@ -778,18 +800,28 @@ pub(crate) mod tests {
        expected.reverse();
        let mut data = Vec::new();
        reader
-            .visit(search_key, VisitDirection::Backwards, |key, value| {
-                data.push((key.to_vec(), value));
-                true
-            })
+            .visit(
+                search_key,
+                VisitDirection::Backwards,
+                |key, value| {
+                    data.push((key.to_vec(), value));
+                    true
+                },
+                &ctx,
+            )
            .await?;
        assert_eq!(data, expected);

        // Backward scan where nothing matches
        reader
-            .visit(b"aaaaaa", VisitDirection::Backwards, |key, value| {
-                panic!("found unexpected key {}: {}", hex::encode(key), value);
-            })
+            .visit(
+                b"aaaaaa",
+                VisitDirection::Backwards,
+                |key, value| {
+                    panic!("found unexpected key {}: {}", hex::encode(key), value);
+                },
+                &ctx,
+            )
            .await?;

        // Full scan
@@ -799,10 +831,15 @@ pub(crate) mod tests {
            .collect();
        let mut data = Vec::new();
        reader
-            .visit(&[0u8; 6], VisitDirection::Forwards, |key, value| {
-                data.push((key.to_vec(), value));
-                true
-            })
+            .visit(
+                &[0u8; 6],
+                VisitDirection::Forwards,
+                |key, value| {
+                    data.push((key.to_vec(), value));
+                    true
+                },
+                &ctx,
+            )
            .await?;
        assert_eq!(data, expected);

@@ -813,6 +850,7 @@ pub(crate) mod tests {
    async fn lots_of_keys() -> Result<()> {
        let mut disk = TestDisk::new();
        let mut writer = DiskBtreeBuilder::<_, 8>::new(&mut disk);
+        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);

        const NUM_KEYS: u64 = 1000;

@@ -851,14 +889,14 @@ pub(crate) mod tests {
        for search_key_int in 0..(NUM_KEYS * 2 + 10) {
            let search_key = u64::to_be_bytes(search_key_int);
            assert_eq!(
-                reader.get(&search_key).await?,
+                reader.get(&search_key, &ctx).await?,
                all_data.get(&search_key_int).cloned()
            );

            // Test a forward scan starting with this key
            result.lock().unwrap().clear();
            reader
-                .visit(&search_key, VisitDirection::Forwards, take_ten)
+                .visit(&search_key, VisitDirection::Forwards, take_ten, &ctx)
                .await?;
            let expected = all_data
                .range(search_key_int..)
@@ -870,7 +908,7 @@ pub(crate) mod tests {
            // And a backwards scan
            result.lock().unwrap().clear();
            reader
-                .visit(&search_key, VisitDirection::Backwards, take_ten)
+                .visit(&search_key, VisitDirection::Backwards, take_ten, &ctx)
                .await?;
            let expected = all_data
                .range(..=search_key_int)
@@ -886,7 +924,7 @@ pub(crate) mod tests {
        limit.store(usize::MAX, Ordering::Relaxed);
        result.lock().unwrap().clear();
        reader
-            .visit(&search_key, VisitDirection::Forwards, take_ten)
+            .visit(&search_key, VisitDirection::Forwards, take_ten, &ctx)
            .await?;
        let expected = all_data
            .iter()
@@ -899,7 +937,7 @@ pub(crate) mod tests {
        limit.store(usize::MAX, Ordering::Relaxed);
        result.lock().unwrap().clear();
        reader
-            .visit(&search_key, VisitDirection::Backwards, take_ten)
+            .visit(&search_key, VisitDirection::Backwards, take_ten, &ctx)
            .await?;
        let expected = all_data
            .iter()
@@ -913,6 +951,8 @@ pub(crate) mod tests {

    #[tokio::test]
    async fn random_data() -> Result<()> {
+        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
+
        // Generate random keys with exponential distribution, to
        // exercise the prefix compression
        const NUM_KEYS: usize = 100000;
@@ -939,22 +979,24 @@ pub(crate) mod tests {
        // Test get() operation on all the keys
        for (&key, &val) in all_data.iter() {
            let search_key = u128::to_be_bytes(key);
-            assert_eq!(reader.get(&search_key).await?, Some(val));
+            assert_eq!(reader.get(&search_key, &ctx).await?, Some(val));
        }

        // Test get() operations on random keys, most of which will not exist
        for _ in 0..100000 {
            let key_int = rand::thread_rng().gen::<u128>();
            let search_key = u128::to_be_bytes(key_int);
-            assert!(reader.get(&search_key).await? == all_data.get(&key_int).cloned());
+            assert!(reader.get(&search_key, &ctx).await? == all_data.get(&key_int).cloned());
        }

        // Test boundary cases
        assert!(
-            reader.get(&u128::to_be_bytes(u128::MIN)).await? == all_data.get(&u128::MIN).cloned()
+            reader.get(&u128::to_be_bytes(u128::MIN), &ctx).await?
+                == all_data.get(&u128::MIN).cloned()
        );
        assert!(
-            reader.get(&u128::to_be_bytes(u128::MAX)).await? == all_data.get(&u128::MAX).cloned()
+            reader.get(&u128::to_be_bytes(u128::MAX), &ctx).await?
+                == all_data.get(&u128::MAX).cloned()
        );

        Ok(())
@@ -985,6 +1027,7 @@ pub(crate) mod tests {
        // Build a tree from it
        let mut disk = TestDisk::new();
        let mut writer = DiskBtreeBuilder::<_, 26>::new(&mut disk);
+        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);

        for (key, val) in disk_btree_test_data::TEST_DATA {
            writer.append(&key, val)?;
@@ -997,16 +1040,21 @@ pub(crate) mod tests {

        // Test get() operation on all the keys
        for (key, val) in disk_btree_test_data::TEST_DATA {
-            assert_eq!(reader.get(&key).await?, Some(val));
+            assert_eq!(reader.get(&key, &ctx).await?, Some(val));
        }

        // Test full scan
        let mut count = 0;
        reader
-            .visit(&[0u8; 26], VisitDirection::Forwards, |_key, _value| {
-                count += 1;
-                true
-            })
+            .visit(
+                &[0u8; 26],
+                VisitDirection::Forwards,
+                |_key, _value| {
+                    count += 1;
+                    true
+                },
+                &ctx,
+            )
            .await?;
        assert_eq!(count, disk_btree_test_data::TEST_DATA.len());

--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -2,6 +2,7 @@
 //! used to keep in-memory layers spilled on disk.

 use crate::config::PageServerConf;
+use crate::context::RequestContext;
 use crate::page_cache::{self, PAGE_SZ};
 use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReader};
 use crate::virtual_file::VirtualFile;
@@ -61,13 +62,17 @@ impl EphemeralFile {
        self.len
    }

-    pub(crate) async fn read_blk(&self, blknum: u32) -> Result<BlockLease, io::Error> {
+    pub(crate) async fn read_blk(
+        &self,
+        blknum: u32,
+        ctx: &RequestContext,
+    ) -> Result<BlockLease, io::Error> {
        let flushed_blknums = 0..self.len / PAGE_SZ as u64;
        if flushed_blknums.contains(&(blknum as u64)) {
            let cache = page_cache::get();
            loop {
                match cache
-                    .read_immutable_buf(self.page_cache_file_id, blknum)
+                    .read_immutable_buf(self.page_cache_file_id, blknum, ctx)
                    .await
                    .map_err(|e| {
                        std::io::Error::new(
@@ -103,7 +108,11 @@ impl EphemeralFile {
        }
    }

-    pub(crate) async fn write_blob(&mut self, srcbuf: &[u8]) -> Result<u64, io::Error> {
+    pub(crate) async fn write_blob(
+        &mut self,
+        srcbuf: &[u8],
+        ctx: &RequestContext,
+    ) -> Result<u64, io::Error> {
        struct Writer<'a> {
            ephemeral_file: &'a mut EphemeralFile,
            /// The block to which the next [`push_bytes`] will write.
@@ -120,7 +129,11 @@ impl EphemeralFile {
                })
            }
            #[inline(always)]
-            async fn push_bytes(&mut self, src: &[u8]) -> Result<(), io::Error> {
+            async fn push_bytes(
+                &mut self,
+                src: &[u8],
+                ctx: &RequestContext,
+            ) -> Result<(), io::Error> {
                let mut src_remaining = src;
                while !src_remaining.is_empty() {
                    let dst_remaining = &mut self.ephemeral_file.mutable_tail[self.off..];
@@ -146,6 +159,7 @@ impl EphemeralFile {
                                    .read_immutable_buf(
                                        self.ephemeral_file.page_cache_file_id,
                                        self.blknum,
+                                        ctx,
                                    )
                                    .await
                                {
@@ -199,15 +213,15 @@ impl EphemeralFile {
        if srcbuf.len() < 0x80 {
            // short one-byte length header
            let len_buf = [srcbuf.len() as u8];
-            writer.push_bytes(&len_buf).await?;
+            writer.push_bytes(&len_buf, ctx).await?;
        } else {
            let mut len_buf = u32::to_be_bytes(srcbuf.len() as u32);
            len_buf[0] |= 0x80;
-            writer.push_bytes(&len_buf).await?;
+            writer.push_bytes(&len_buf, ctx).await?;
        }

        // Write the payload
-        writer.push_bytes(srcbuf).await?;
+        writer.push_bytes(srcbuf, ctx).await?;

        if srcbuf.len() < 0x80 {
            self.len += 1;
@@ -261,6 +275,8 @@ impl BlockReader for EphemeralFile {
 #[cfg(test)]
 mod tests {
    use super::*;
+    use crate::context::DownloadBehavior;
+    use crate::task_mgr::TaskKind;
    use crate::tenant::block_io::{BlockCursor, BlockReaderRef};
    use rand::{thread_rng, RngCore};
    use std::fs;
@@ -268,7 +284,15 @@ mod tests {

    fn harness(
        test_name: &str,
-    ) -> Result<(&'static PageServerConf, TenantId, TimelineId), io::Error> {
+    ) -> Result<
+        (
+            &'static PageServerConf,
+            TenantId,
+            TimelineId,
+            RequestContext,
+        ),
+        io::Error,
+    > {
        let repo_dir = PageServerConf::test_repo_dir(test_name);
        let _ = fs::remove_dir_all(&repo_dir);
        let conf = PageServerConf::dummy_conf(repo_dir);
@@ -280,46 +304,57 @@ mod tests {
        let timeline_id = TimelineId::from_str("22000000000000000000000000000000").unwrap();
        fs::create_dir_all(conf.timeline_path(&tenant_id, &timeline_id))?;

-        Ok((conf, tenant_id, timeline_id))
+        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
+
+        Ok((conf, tenant_id, timeline_id, ctx))
    }

    #[tokio::test]
    async fn test_ephemeral_blobs() -> Result<(), io::Error> {
-        let (conf, tenant_id, timeline_id) = harness("ephemeral_blobs")?;
+        let (conf, tenant_id, timeline_id, ctx) = harness("ephemeral_blobs")?;

        let mut file = EphemeralFile::create(conf, tenant_id, timeline_id).await?;

-        let pos_foo = file.write_blob(b"foo").await?;
+        let pos_foo = file.write_blob(b"foo", &ctx).await?;
        assert_eq!(
            b"foo",
-            file.block_cursor().read_blob(pos_foo).await?.as_slice()
+            file.block_cursor()
+                .read_blob(pos_foo, &ctx)
+                .await?
+                .as_slice()
        );
-        let pos_bar = file.write_blob(b"bar").await?;
+        let pos_bar = file.write_blob(b"bar", &ctx).await?;
        assert_eq!(
            b"foo",
-            file.block_cursor().read_blob(pos_foo).await?.as_slice()
+            file.block_cursor()
+                .read_blob(pos_foo, &ctx)
+                .await?
+                .as_slice()
        );
        assert_eq!(
            b"bar",
-            file.block_cursor().read_blob(pos_bar).await?.as_slice()
+            file.block_cursor()
+                .read_blob(pos_bar, &ctx)
+                .await?
+                .as_slice()
        );

        let mut blobs = Vec::new();
        for i in 0..10000 {
            let data = Vec::from(format!("blob{}", i).as_bytes());
-            let pos = file.write_blob(&data).await?;
+            let pos = file.write_blob(&data, &ctx).await?;
            blobs.push((pos, data));
        }
        // also test with a large blobs
        for i in 0..100 {
            let data = format!("blob{}", i).as_bytes().repeat(100);
-            let pos = file.write_blob(&data).await?;
+            let pos = file.write_blob(&data, &ctx).await?;
            blobs.push((pos, data));
        }

        let cursor = BlockCursor::new(BlockReaderRef::EphemeralFile(&file));
        for (pos, expected) in blobs {
-            let actual = cursor.read_blob(pos).await?;
+            let actual = cursor.read_blob(pos, &ctx).await?;
            assert_eq!(actual, expected);
        }

@@ -327,8 +362,8 @@ mod tests {
        let mut large_data = Vec::new();
        large_data.resize(20000, 0);
        thread_rng().fill_bytes(&mut large_data);
-        let pos_large = file.write_blob(&large_data).await?;
-        let result = file.block_cursor().read_blob(pos_large).await?;
+        let pos_large = file.write_blob(&large_data, &ctx).await?;
+        let result = file.block_cursor().read_blob(pos_large, &ctx).await?;
        assert_eq!(result, large_data);

        Ok(())
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -1,9 +1,10 @@
 //! This module acts as a switchboard to access different repositories managed by this
 //! page server.

+use rand::{distributions::Alphanumeric, Rng};
 use std::collections::{hash_map, HashMap};
 use std::ffi::OsStr;
-use std::path::Path;
+use std::path::{Path, PathBuf};
 use std::sync::Arc;
 use tokio::fs;

@@ -70,6 +71,11 @@ impl TenantsMap {
 /// This is pageserver-specific, as it relies on future processes after a crash to check
 /// for TEMP_FILE_SUFFIX when loading things.
 async fn safe_remove_tenant_dir_all(path: impl AsRef<Path>) -> std::io::Result<()> {
+    let tmp_path = safe_rename_tenant_dir(path).await?;
+    fs::remove_dir_all(tmp_path).await
+}
+
+async fn safe_rename_tenant_dir(path: impl AsRef<Path>) -> std::io::Result<PathBuf> {
    let parent = path
        .as_ref()
        .parent()
@@ -79,11 +85,16 @@ async fn safe_remove_tenant_dir_all(path: impl AsRef<Path>) -> std::io::Result<(
            std::io::ErrorKind::InvalidInput,
            "Path must be absolute",
        ))?;
-
-    let tmp_path = path_with_suffix_extension(&path, TEMP_FILE_SUFFIX);
+    let rand_suffix = rand::thread_rng()
+        .sample_iter(&Alphanumeric)
+        .take(8)
+        .map(char::from)
+        .collect::<String>()
+        + TEMP_FILE_SUFFIX;
+    let tmp_path = path_with_suffix_extension(&path, &rand_suffix);
    fs::rename(&path, &tmp_path).await?;
    fs::File::open(parent).await?.sync_all().await?;
-    fs::remove_dir_all(tmp_path).await
+    Ok(tmp_path)
 }

 static TENANTS: Lazy<RwLock<TenantsMap>> = Lazy::new(|| RwLock::new(TenantsMap::Initializing));
@@ -492,6 +503,8 @@ pub enum GetTenantError {

 /// Gets the tenant from the in-memory data, erroring if it's absent or is not fitting to the query.
 /// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants.
+///
+/// This method is cancel-safe.
 pub async fn get_tenant(
    tenant_id: TenantId,
    active_only: bool,
@@ -551,7 +564,24 @@ pub async fn detach_tenant(
    tenant_id: TenantId,
    detach_ignored: bool,
 ) -> Result<(), TenantStateError> {
-    detach_tenant0(conf, &TENANTS, tenant_id, detach_ignored).await
+    let tmp_path = detach_tenant0(conf, &TENANTS, tenant_id, detach_ignored).await?;
+    // Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory.
+    // After a tenant is detached, there are no more task_mgr tasks for that tenant_id.
+    let task_tenant_id = None;
+    task_mgr::spawn(
+        task_mgr::BACKGROUND_RUNTIME.handle(),
+        TaskKind::MgmtRequest,
+        task_tenant_id,
+        None,
+        "tenant_files_delete",
+        false,
+        async move {
+            fs::remove_dir_all(tmp_path.as_path())
+                .await
+                .with_context(|| format!("tenant directory {:?} deletion", tmp_path))
+        },
+    );
+    Ok(())
 }

 async fn detach_tenant0(
@@ -559,20 +589,16 @@ async fn detach_tenant0(
    tenants: &tokio::sync::RwLock<TenantsMap>,
    tenant_id: TenantId,
    detach_ignored: bool,
-) -> Result<(), TenantStateError> {
-    let local_files_cleanup_operation = |tenant_id_to_clean| async move {
+) -> Result<PathBuf, TenantStateError> {
+    let tenant_dir_rename_operation = |tenant_id_to_clean| async move {
        let local_tenant_directory = conf.tenant_path(&tenant_id_to_clean);
-        safe_remove_tenant_dir_all(&local_tenant_directory)
+        safe_rename_tenant_dir(&local_tenant_directory)
            .await
-            .with_context(|| {
-                format!("local tenant directory {local_tenant_directory:?} removal")
-            })?;
-        Ok(())
+            .with_context(|| format!("local tenant directory {local_tenant_directory:?} rename"))
    };

    let removal_result =
-        remove_tenant_from_memory(tenants, tenant_id, local_files_cleanup_operation(tenant_id))
-            .await;
+        remove_tenant_from_memory(tenants, tenant_id, tenant_dir_rename_operation(tenant_id)).await;

    // Ignored tenants are not present in memory and will bail the removal from memory operation.
    // Before returning the error, check for ignored tenant removal case — we only need to clean its local files then.
@@ -580,10 +606,10 @@ async fn detach_tenant0(
        let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_id);
        if tenant_ignore_mark.exists() {
            info!("Detaching an ignored tenant");
-            local_files_cleanup_operation(tenant_id)
+            let tmp_path = tenant_dir_rename_operation(tenant_id)
                .await
-                .with_context(|| format!("Ignored tenant {tenant_id} local files cleanup"))?;
-            return Ok(());
+                .with_context(|| format!("Ignored tenant {tenant_id} local directory rename"))?;
+            return Ok(tmp_path);
        }
    }

--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -28,7 +28,7 @@
 //! "values" part.
 //!
 use crate::config::PageServerConf;
-use crate::context::RequestContext;
+use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
 use crate::page_cache::PAGE_SZ;
 use crate::repository::{Key, Value, KEY_SIZE};
 use crate::tenant::blob_io::BlobWriter;
@@ -317,11 +317,11 @@ impl DeltaLayer {

        tree_reader.dump().await?;

-        let keys = DeltaLayerInner::load_keys(&inner).await?;
+        let keys = DeltaLayerInner::load_keys(&inner, ctx).await?;

        // A subroutine to dump a single blob
-        async fn dump_blob(val: ValueRef<'_>) -> Result<String> {
-            let buf = val.reader.read_blob(val.blob_ref.pos()).await?;
+        async fn dump_blob(val: ValueRef<'_>, ctx: &RequestContext) -> Result<String> {
+            let buf = val.reader.read_blob(val.blob_ref.pos(), ctx).await?;
            let val = Value::des(&buf)?;
            let desc = match val {
                Value::Image(img) => {
@@ -342,7 +342,7 @@ impl DeltaLayer {

        for entry in keys {
            let DeltaEntry { key, lsn, val, .. } = entry;
-            let desc = match dump_blob(val).await {
+            let desc = match dump_blob(val, ctx).await {
                Ok(desc) => desc,
                Err(err) => {
                    let err: anyhow::Error = err;
@@ -370,7 +370,7 @@ impl DeltaLayer {
            .load(LayerAccessKind::GetValueReconstructData, ctx)
            .await?;
        inner
-            .get_value_reconstruct_data(key, lsn_range, reconstruct_state)
+            .get_value_reconstruct_data(key, lsn_range, reconstruct_state, ctx)
            .await
    }

@@ -453,12 +453,12 @@ impl DeltaLayer {
        self.access_stats.record_access(access_kind, ctx);
        // Quick exit if already loaded
        self.inner
-            .get_or_try_init(|| self.load_inner())
+            .get_or_try_init(|| self.load_inner(ctx))
            .await
            .with_context(|| format!("Failed to load delta layer {}", self.path().display()))
    }

-    async fn load_inner(&self) -> Result<Arc<DeltaLayerInner>> {
+    async fn load_inner(&self, ctx: &RequestContext) -> Result<Arc<DeltaLayerInner>> {
        let path = self.path();

        let summary = match &self.path_or_conf {
@@ -466,7 +466,7 @@ impl DeltaLayer {
            PathOrConf::Path(_) => None,
        };

-        let loaded = DeltaLayerInner::load(&path, summary).await?;
+        let loaded = DeltaLayerInner::load(&path, summary, ctx).await?;

        if let PathOrConf::Path(ref path) = self.path_or_conf {
            // not production code
@@ -554,7 +554,7 @@ impl DeltaLayer {
            .load(LayerAccessKind::KeyIter, ctx)
            .await
            .context("load delta layer keys")?;
-        DeltaLayerInner::load_keys(inner)
+        DeltaLayerInner::load_keys(inner, ctx)
            .await
            .context("Layer index is corrupted")
    }
@@ -849,13 +849,14 @@ impl DeltaLayerInner {
    pub(super) async fn load(
        path: &std::path::Path,
        summary: Option<Summary>,
+        ctx: &RequestContext,
    ) -> anyhow::Result<Self> {
        let file = VirtualFile::open(path)
            .await
            .with_context(|| format!("Failed to open file '{}'", path.display()))?;
        let file = FileBlockReader::new(file);

-        let summary_blk = file.read_blk(0).await?;
+        let summary_blk = file.read_blk(0, ctx).await?;
        let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;

        if let Some(mut expected_summary) = summary {
@@ -883,6 +884,7 @@ impl DeltaLayerInner {
        key: Key,
        lsn_range: Range<Lsn>,
        reconstruct_state: &mut ValueReconstructState,
+        ctx: &RequestContext,
    ) -> anyhow::Result<ValueReconstructResult> {
        let mut need_image = true;
        // Scan the page versions backwards, starting from `lsn`.
@@ -897,27 +899,38 @@ impl DeltaLayerInner {
        let mut offsets: Vec<(Lsn, u64)> = Vec::new();

        tree_reader
-            .visit(&search_key.0, VisitDirection::Backwards, |key, value| {
-                let blob_ref = BlobRef(value);
-                if key[..KEY_SIZE] != search_key.0[..KEY_SIZE] {
-                    return false;
-                }
-                let entry_lsn = DeltaKey::extract_lsn_from_buf(key);
-                if entry_lsn < lsn_range.start {
-                    return false;
-                }
-                offsets.push((entry_lsn, blob_ref.pos()));
+            .visit(
+                &search_key.0,
+                VisitDirection::Backwards,
+                |key, value| {
+                    let blob_ref = BlobRef(value);
+                    if key[..KEY_SIZE] != search_key.0[..KEY_SIZE] {
+                        return false;
+                    }
+                    let entry_lsn = DeltaKey::extract_lsn_from_buf(key);
+                    if entry_lsn < lsn_range.start {
+                        return false;
+                    }
+                    offsets.push((entry_lsn, blob_ref.pos()));

-                !blob_ref.will_init()
-            })
+                    !blob_ref.will_init()
+                },
+                &RequestContextBuilder::extend(ctx)
+                    .page_content_kind(PageContentKind::DeltaLayerBtreeNode)
+                    .build(),
+            )
            .await?;

+        let ctx = &RequestContextBuilder::extend(ctx)
+            .page_content_kind(PageContentKind::DeltaLayerValue)
+            .build();
+
        // Ok, 'offsets' now contains the offsets of all the entries we need to read
        let cursor = file.block_cursor();
        let mut buf = Vec::new();
        for (entry_lsn, pos) in offsets {
            cursor
-                .read_blob_into_buf(pos, &mut buf)
+                .read_blob_into_buf(pos, &mut buf, ctx)
                .await
                .with_context(|| {
                    format!(
@@ -958,9 +971,10 @@ impl DeltaLayerInner {
        }
    }

-    pub(super) async fn load_keys<T: AsRef<DeltaLayerInner> + Clone>(
-        this: &T,
-    ) -> Result<Vec<DeltaEntry<'_>>> {
+    pub(super) async fn load_keys<'a, 'b, T: AsRef<DeltaLayerInner> + Clone>(
+        this: &'a T,
+        ctx: &'b RequestContext,
+    ) -> Result<Vec<DeltaEntry<'a>>> {
        let dl = this.as_ref();
        let file = &dl.file;

@@ -997,6 +1011,9 @@ impl DeltaLayerInner {
                    all_keys.push(entry);
                    true
                },
+                &RequestContextBuilder::extend(ctx)
+                    .page_content_kind(PageContentKind::DeltaLayerBtreeNode)
+                    .build(),
            )
            .await?;
        if let Some(last) = all_keys.last_mut() {
@@ -1026,9 +1043,9 @@ pub struct ValueRef<'a> {

 impl<'a> ValueRef<'a> {
    /// Loads the value from disk
-    pub async fn load(&self) -> Result<Value> {
+    pub async fn load(&self, ctx: &RequestContext) -> Result<Value> {
        // theoretically we *could* record an access time for each, but it does not really matter
-        let buf = self.reader.read_blob(self.blob_ref.pos()).await?;
+        let buf = self.reader.read_blob(self.blob_ref.pos(), ctx).await?;
        let val = Value::des(&buf)?;
        Ok(val)
    }
@@ -1037,7 +1054,11 @@ impl<'a> ValueRef<'a> {
 pub(crate) struct Adapter<T>(T);

 impl<T: AsRef<DeltaLayerInner>> Adapter<T> {
-    pub(crate) async fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
-        self.0.as_ref().file.read_blk(blknum).await
+    pub(crate) async fn read_blk(
+        &self,
+        blknum: u32,
+        ctx: &RequestContext,
+    ) -> Result<BlockLease, std::io::Error> {
+        self.0.as_ref().file.read_blk(blknum, ctx).await
    }
 }
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -24,7 +24,7 @@
 //! mapping from Key to an offset in the "values" part.  The
 //! actual page images are stored in the "values" part.
 use crate::config::PageServerConf;
-use crate::context::RequestContext;
+use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
 use crate::page_cache::PAGE_SZ;
 use crate::repository::{Key, KEY_SIZE};
 use crate::tenant::blob_io::BlobWriter;
@@ -237,10 +237,15 @@ impl ImageLayer {
        tree_reader.dump().await?;

        tree_reader
-            .visit(&[0u8; KEY_SIZE], VisitDirection::Forwards, |key, value| {
-                println!("key: {} offset {}", hex::encode(key), value);
-                true
-            })
+            .visit(
+                &[0u8; KEY_SIZE],
+                VisitDirection::Forwards,
+                |key, value| {
+                    println!("key: {} offset {}", hex::encode(key), value);
+                    true
+                },
+                ctx,
+            )
            .await?;

        Ok(())
@@ -261,7 +266,7 @@ impl ImageLayer {
            .load(LayerAccessKind::GetValueReconstructData, ctx)
            .await?;
        inner
-            .get_value_reconstruct_data(key, reconstruct_state)
+            .get_value_reconstruct_data(key, reconstruct_state, ctx)
            .await
            // FIXME: makes no sense to dump paths
            .with_context(|| format!("read {}", self.path().display()))
@@ -335,12 +340,12 @@ impl ImageLayer {
    ) -> Result<&ImageLayerInner> {
        self.access_stats.record_access(access_kind, ctx);
        self.inner
-            .get_or_try_init(|| self.load_inner())
+            .get_or_try_init(|| self.load_inner(ctx))
            .await
            .with_context(|| format!("Failed to load image layer {}", self.path().display()))
    }

-    async fn load_inner(&self) -> Result<ImageLayerInner> {
+    async fn load_inner(&self, ctx: &RequestContext) -> Result<ImageLayerInner> {
        let path = self.path();

        let expected_summary = match &self.path_or_conf {
@@ -349,7 +354,8 @@ impl ImageLayer {
        };

        let loaded =
-            ImageLayerInner::load(&path, self.desc.image_layer_lsn(), expected_summary).await?;
+            ImageLayerInner::load(&path, self.desc.image_layer_lsn(), expected_summary, ctx)
+                .await?;

        if let PathOrConf::Path(ref path) = self.path_or_conf {
            // not production code
@@ -436,12 +442,13 @@ impl ImageLayerInner {
        path: &std::path::Path,
        lsn: Lsn,
        summary: Option<Summary>,
+        ctx: &RequestContext,
    ) -> anyhow::Result<Self> {
        let file = VirtualFile::open(path)
            .await
            .with_context(|| format!("Failed to open file '{}'", path.display()))?;
        let file = FileBlockReader::new(file);
-        let summary_blk = file.read_blk(0).await?;
+        let summary_blk = file.read_blk(0, ctx).await?;
        let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;

        if let Some(mut expected_summary) = summary {
@@ -470,16 +477,30 @@ impl ImageLayerInner {
        &self,
        key: Key,
        reconstruct_state: &mut ValueReconstructState,
+        ctx: &RequestContext,
    ) -> anyhow::Result<ValueReconstructResult> {
        let file = &self.file;
        let tree_reader = DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, file);

        let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
        key.write_to_byte_slice(&mut keybuf);
-        if let Some(offset) = tree_reader.get(&keybuf).await? {
+        if let Some(offset) = tree_reader
+            .get(
+                &keybuf,
+                &RequestContextBuilder::extend(ctx)
+                    .page_content_kind(PageContentKind::ImageLayerBtreeNode)
+                    .build(),
+            )
+            .await?
+        {
            let blob = file
                .block_cursor()
-                .read_blob(offset)
+                .read_blob(
+                    offset,
+                    &RequestContextBuilder::extend(ctx)
+                        .page_content_kind(PageContentKind::ImageLayerValue)
+                        .build(),
+                )
                .await
                .with_context(|| format!("failed to read value from offset {}", offset))?;
            let value = Bytes::from(blob);
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -5,7 +5,7 @@
 //! its position in the file, is kept in memory, though.
 //!
 use crate::config::PageServerConf;
-use crate::context::RequestContext;
+use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
 use crate::repository::{Key, Value};
 use crate::tenant::block_io::BlockReader;
 use crate::tenant::ephemeral_file::EphemeralFile;
@@ -106,7 +106,7 @@ impl InMemoryLayer {
    /// debugging function to print out the contents of the layer
    ///
    /// this is likely completly unused
-    pub async fn dump(&self, verbose: bool, _ctx: &RequestContext) -> Result<()> {
+    pub async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
        let inner = self.inner.read().await;

        let end_str = self.end_lsn_or_max();
@@ -125,7 +125,7 @@ impl InMemoryLayer {
        for (key, vec_map) in inner.index.iter() {
            for (lsn, pos) in vec_map.as_slice() {
                let mut desc = String::new();
-                cursor.read_blob_into_buf(*pos, &mut buf).await?;
+                cursor.read_blob_into_buf(*pos, &mut buf, ctx).await?;
                let val = Value::des(&buf);
                match val {
                    Ok(Value::Image(img)) => {
@@ -158,11 +158,15 @@ impl InMemoryLayer {
        key: Key,
        lsn_range: Range<Lsn>,
        reconstruct_state: &mut ValueReconstructState,
-        _ctx: &RequestContext,
+        ctx: &RequestContext,
    ) -> anyhow::Result<ValueReconstructResult> {
        ensure!(lsn_range.start >= self.start_lsn);
        let mut need_image = true;

+        let ctx = RequestContextBuilder::extend(ctx)
+            .page_content_kind(PageContentKind::InMemoryLayer)
+            .build();
+
        let inner = self.inner.read().await;

        let reader = inner.file.block_cursor();
@@ -171,7 +175,7 @@ impl InMemoryLayer {
        if let Some(vec_map) = inner.index.get(&key) {
            let slice = vec_map.slice_range(lsn_range);
            for (entry_lsn, pos) in slice.iter().rev() {
-                let buf = reader.read_blob(*pos).await?;
+                let buf = reader.read_blob(*pos, &ctx).await?;
                let value = Value::des(&buf)?;
                match value {
                    Value::Image(img) => {
@@ -263,7 +267,13 @@ impl InMemoryLayer {

    /// Common subroutine of the public put_wal_record() and put_page_image() functions.
    /// Adds the page version to the in-memory tree
-    pub async fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> Result<()> {
+    pub async fn put_value(
+        &self,
+        key: Key,
+        lsn: Lsn,
+        val: &Value,
+        ctx: &RequestContext,
+    ) -> Result<()> {
        trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn);
        let inner: &mut _ = &mut *self.inner.write().await;
        self.assert_writable();
@@ -275,7 +285,15 @@ impl InMemoryLayer {
            let mut buf = smallvec::SmallVec::<[u8; 256]>::new();
            buf.clear();
            val.ser_into(&mut buf)?;
-            inner.file.write_blob(&buf).await?
+            inner
+                .file
+                .write_blob(
+                    &buf,
+                    &RequestContextBuilder::extend(ctx)
+                        .page_content_kind(PageContentKind::InMemoryLayer)
+                        .build(),
+                )
+                .await?
        };

        let vec_map = inner.index.entry(key).or_default();
@@ -313,7 +331,7 @@ impl InMemoryLayer {
    /// Write this frozen in-memory layer to disk.
    ///
    /// Returns a new delta layer with all the same data as this in-memory layer
-    pub(crate) async fn write_to_disk(&self) -> Result<DeltaLayer> {
+    pub(crate) async fn write_to_disk(&self, ctx: &RequestContext) -> Result<DeltaLayer> {
        // Grab the lock in read-mode. We hold it over the I/O, but because this
        // layer is not writeable anymore, no one should be trying to acquire the
        // write lock on it, so we shouldn't block anyone. There's one exception
@@ -343,11 +361,14 @@ impl InMemoryLayer {
        let mut keys: Vec<(&Key, &VecMap<Lsn, u64>)> = inner.index.iter().collect();
        keys.sort_by_key(|k| k.0);

+        let ctx = RequestContextBuilder::extend(ctx)
+            .page_content_kind(PageContentKind::InMemoryLayer)
+            .build();
        for (key, vec_map) in keys.iter() {
            let key = **key;
            // Write all page versions
            for (lsn, pos) in vec_map.as_slice() {
-                cursor.read_blob_into_buf(*pos, &mut buf).await?;
+                cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?;
                let will_init = Value::des(&buf)?.will_init();
                delta_layer_writer
                    .put_value_bytes(key, *lsn, &buf, will_init)
--- a/pageserver/src/tenant/storage_layer/remote_layer.rs
+++ b/pageserver/src/tenant/storage_layer/remote_layer.rs
@@ -74,7 +74,7 @@ impl Layer for RemoteLayer {
        _reconstruct_state: &mut ValueReconstructState,
        _ctx: &RequestContext,
    ) -> Result<ValueReconstructResult> {
-        bail!("layer {self} needs to be downloaded");
+        Err(anyhow::anyhow!("layer {self} needs to be downloaded"))
    }
 }

--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -471,7 +471,7 @@ impl Timeline {
        // The cached image can be returned directly if there is no WAL between the cached image
        // and requested LSN. The cached image can also be used to reduce the amount of WAL needed
        // for redo.
-        let cached_page_img = match self.lookup_cached_page(&key, lsn).await {
+        let cached_page_img = match self.lookup_cached_page(&key, lsn, ctx).await {
            Some((cached_lsn, cached_img)) => {
                match cached_lsn.cmp(&lsn) {
                    Ordering::Less => {} // there might be WAL between cached_lsn and lsn, we need to check
@@ -585,15 +585,7 @@ impl Timeline {
            Err(e) => {
                // don't count the time spent waiting for lock below, and also in walreceiver.status(), towards the wait_lsn_time_histo
                drop(_timer);
-                let walreceiver_status = {
-                    match &*self.walreceiver.lock().unwrap() {
-                        None => "stopping or stopped".to_string(),
-                        Some(walreceiver) => match walreceiver.status() {
-                            Some(status) => status.to_human_readable_string(),
-                            None => "Not active".to_string(),
-                        },
-                    }
-                };
+                let walreceiver_status = self.walreceiver_status();
                Err(anyhow::Error::new(e).context({
                    format!(
                        "Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}, WalReceiver status: {}",
@@ -607,6 +599,16 @@ impl Timeline {
        }
    }

+    pub(crate) fn walreceiver_status(&self) -> String {
+        match &*self.walreceiver.lock().unwrap() {
+            None => "stopping or stopped".to_string(),
+            Some(walreceiver) => match walreceiver.status() {
+                Some(status) => status.to_human_readable_string(),
+                None => "Not active".to_string(),
+            },
+        }
+    }
+
    /// Check that it is valid to request operations with that lsn.
    pub fn check_lsn_is_in_scope(
        &self,
@@ -2516,13 +2518,18 @@ impl Timeline {
        }
    }

-    async fn lookup_cached_page(&self, key: &Key, lsn: Lsn) -> Option<(Lsn, Bytes)> {
+    async fn lookup_cached_page(
+        &self,
+        key: &Key,
+        lsn: Lsn,
+        ctx: &RequestContext,
+    ) -> Option<(Lsn, Bytes)> {
        let cache = page_cache::get();

        // FIXME: It's pointless to check the cache for things that are not 8kB pages.
        // We should look at the key to determine if it's a cacheable object
        let (lsn, read_guard) = cache
-            .lookup_materialized_page(self.tenant_id, self.timeline_id, key, lsn)
+            .lookup_materialized_page(self.tenant_id, self.timeline_id, key, lsn, ctx)
            .await?;
        let img = Bytes::from(read_guard.to_vec());
        Some((lsn, img))
@@ -2556,10 +2563,16 @@ impl Timeline {
        Ok(layer)
    }

-    async fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> anyhow::Result<()> {
+    async fn put_value(
+        &self,
+        key: Key,
+        lsn: Lsn,
+        val: &Value,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<()> {
        //info!("PUT: key {} at {}", key, lsn);
        let layer = self.get_layer_for_write(lsn).await?;
-        layer.put_value(key, lsn, val).await?;
+        layer.put_value(key, lsn, val, ctx).await?;
        Ok(())
    }

@@ -2731,7 +2744,7 @@ impl Timeline {
                // Normal case, write out a L0 delta layer file.
                // `create_delta_layer` will not modify the layer map.
                // We will remove frozen layer and add delta layer in one atomic operation later.
-                let layer = self.create_delta_layer(&frozen_layer).await?;
+                let layer = self.create_delta_layer(&frozen_layer, ctx).await?;
                (
                    HashMap::from([(
                        layer.filename(),
@@ -2756,9 +2769,7 @@ impl Timeline {

                // update metrics
                let sz = l.layer_desc().file_size;
-                self.metrics.resident_physical_size_gauge.add(sz);
-                self.metrics.num_persistent_files_created.inc_by(1);
-                self.metrics.persistent_bytes_written.inc_by(sz);
+                self.metrics.record_new_file_metrics(sz);
            }

            guard.finish_flush_l0_layer(delta_layer_to_add, &frozen_layer);
@@ -2856,19 +2867,21 @@ impl Timeline {
    async fn create_delta_layer(
        self: &Arc<Self>,
        frozen_layer: &Arc<InMemoryLayer>,
+        ctx: &RequestContext,
    ) -> anyhow::Result<DeltaLayer> {
        let span = tracing::info_span!("blocking");
        let new_delta: DeltaLayer = tokio::task::spawn_blocking({
            let _g = span.entered();
            let self_clone = Arc::clone(self);
            let frozen_layer = Arc::clone(frozen_layer);
+            let ctx = ctx.attached_child();
            move || {
                // Write it out
                // Keep this inside `spawn_blocking` and `Handle::current`
                // as long as the write path is still sync and the read impl
                // is still not fully async. Otherwise executor threads would
                // be blocked.
-                let new_delta = Handle::current().block_on(frozen_layer.write_to_disk())?;
+                let new_delta = Handle::current().block_on(frozen_layer.write_to_disk(&ctx))?;
                let new_delta_path = new_delta.path();

                // Sync it to disk.
@@ -3133,9 +3146,8 @@ impl Timeline {
                LayerFileMetadata::new(metadata.len(), self.generation),
            );

-            self.metrics
-                .resident_physical_size_gauge
-                .add(metadata.len());
+            // update metrics
+            self.metrics.record_new_file_metrics(metadata.len());
            let l = Arc::new(l);
            l.access_stats().record_residence_event(
                LayerResidenceStatus::Resident,
@@ -3575,7 +3587,7 @@ impl Timeline {
            key, lsn, ref val, ..
        } in all_values_iter
        {
-            let value = val.load().await?;
+            let value = val.load(ctx).await?;
            let same_key = prev_key.map_or(false, |prev_key| prev_key == key);
            // We need to check key boundaries once we reach next key or end of layer with the same key
            if !same_key || lsn == dup_end_lsn {
@@ -3817,10 +3829,8 @@ impl Timeline {
                )?;
            }

-            // update the timeline's physical size
-            self.metrics
-                .resident_physical_size_gauge
-                .add(metadata.len());
+            // update metrics, including the timeline's physical size
+            self.metrics.record_new_file_metrics(metadata.len());

            new_layer_paths.insert(
                new_delta_path,
@@ -4702,8 +4712,14 @@ impl<'a> TimelineWriter<'a> {
    ///
    /// This will implicitly extend the relation, if the page is beyond the
    /// current end-of-file.
-    pub async fn put(&self, key: Key, lsn: Lsn, value: &Value) -> anyhow::Result<()> {
-        self.tl.put_value(key, lsn, value).await
+    pub async fn put(
+        &self,
+        key: Key,
+        lsn: Lsn,
+        value: &Value,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<()> {
+        self.tl.put_value(key, lsn, value, ctx).await
    }

    pub async fn delete(&self, key_range: Range<Key>, lsn: Lsn) -> anyhow::Result<()> {
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -328,9 +328,24 @@ impl Timeline {
        // Make one of the tenant's timelines draw the short straw and run the calculation.
        // The others wait until the calculation is done so that they take into account the
        // imitated accesses that the winner made.
-        let Ok(tenant) = crate::tenant::mgr::get_tenant(self.tenant_id, true).await else {
-            // likely, we're shutting down
-            return ControlFlow::Break(());
+        //
+        // It is critical that we are responsive to cancellation here. Otherwise, we deadlock
+        // with tenant deletion (which holds TENANTS in read mode) and any other task that
+        // attempts to acquire TENANTS in write mode before we call get_tenant here.
+        // See https://github.com/neondatabase/neon/issues/5284.
+        let res = tokio::select! {
+            _ = cancel.cancelled() => {
+                return ControlFlow::Break(());
+            }
+            res = crate::tenant::mgr::get_tenant(self.tenant_id, true) => {
+                res
+            }
+        };
+        let tenant = match res {
+            Ok(t) => t,
+            Err(_) => {
+                return ControlFlow::Break(());
+            }
        };
        let mut state = tenant.eviction_task_tenant_state.lock().await;
        match state.last_layer_access_imitation {
--- a/pageserver/src/tenant/timeline/walreceiver.rs
+++ b/pageserver/src/tenant/timeline/walreceiver.rs
@@ -135,7 +135,7 @@ impl WalReceiver {
        .await;
    }

-    pub(super) fn status(&self) -> Option<ConnectionManagerStatus> {
+    pub(crate) fn status(&self) -> Option<ConnectionManagerStatus> {
        self.manager_status.read().unwrap().clone()
    }
 }
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -10,16 +10,15 @@
 //! This is similar to PostgreSQL's virtual file descriptor facility in
 //! src/backend/storage/file/fd.c
 //!
-use crate::metrics::{STORAGE_IO_SIZE, STORAGE_IO_TIME};
+use crate::metrics::{StorageIoOperation, STORAGE_IO_SIZE, STORAGE_IO_TIME_METRIC};
 use crate::tenant::TENANTS_SEGMENT_NAME;
-use futures::Future;
 use once_cell::sync::OnceCell;
 use std::fs::{self, File, OpenOptions};
 use std::io::{Error, ErrorKind, Seek, SeekFrom};
 use std::os::unix::fs::FileExt;
 use std::path::{Path, PathBuf};
 use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
-use tokio::sync::{RwLock, RwLockWriteGuard};
+use std::sync::{RwLock, RwLockWriteGuard};

 ///
 /// A virtual file descriptor. You can use this just like std::fs::File, but internally
@@ -111,7 +110,7 @@ impl OpenFiles {
    ///
    /// On return, we hold a lock on the slot, and its 'tag' has been updated
    /// recently_used has been set. It's all ready for reuse.
-    async fn find_victim_slot(&self) -> (SlotHandle, RwLockWriteGuard<SlotInner>) {
+    fn find_victim_slot(&self) -> (SlotHandle, RwLockWriteGuard<SlotInner>) {
        //
        // Run the clock algorithm to find a slot to replace.
        //
@@ -143,7 +142,7 @@ impl OpenFiles {
                }
                retries += 1;
            } else {
-                slot_guard = slot.inner.write().await;
+                slot_guard = slot.inner.write().unwrap();
                index = next;
                break;
            }
@@ -156,8 +155,8 @@ impl OpenFiles {
        if let Some(old_file) = slot_guard.file.take() {
            // the normal path of dropping VirtualFile uses "close", use "close-by-replace" here to
            // distinguish the two.
-            STORAGE_IO_TIME
-                .with_label_values(&["close-by-replace"])
+            STORAGE_IO_TIME_METRIC
+                .get(StorageIoOperation::CloseByReplace)
                .observe_closure_duration(|| drop(old_file));
        }

@@ -245,9 +244,10 @@ impl VirtualFile {
            tenant_id = "*".to_string();
            timeline_id = "*".to_string();
        }
-        let (handle, mut slot_guard) = get_open_files().find_victim_slot().await;
-        let file = STORAGE_IO_TIME
-            .with_label_values(&["open"])
+        let (handle, mut slot_guard) = get_open_files().find_victim_slot();
+
+        let file = STORAGE_IO_TIME_METRIC
+            .get(StorageIoOperation::Open)
            .observe_closure_duration(|| open_options.open(path))?;

        // Strip all options other than read and write.
@@ -331,22 +331,21 @@ impl VirtualFile {

    /// Call File::sync_all() on the underlying File.
    pub async fn sync_all(&self) -> Result<(), Error> {
-        self.with_file("fsync", |file| async move { file.sync_all() })
+        self.with_file(StorageIoOperation::Fsync, |file| file.sync_all())
            .await?
    }

    pub async fn metadata(&self) -> Result<fs::Metadata, Error> {
-        self.with_file("metadata", |file| async move { file.metadata() })
+        self.with_file(StorageIoOperation::Metadata, |file| file.metadata())
            .await?
    }

    /// Helper function that looks up the underlying File for this VirtualFile,
    /// opening it and evicting some other File if necessary. It calls 'func'
    /// with the physical File.
-    async fn with_file<F, R, FR>(&self, _op: &str, func: F) -> Result<R, Error>
+    async fn with_file<F, R>(&self, op: StorageIoOperation, mut func: F) -> Result<R, Error>
    where
-        F: FnOnce(&File) -> FR,
-        FR: Future<Output = R>,
+        F: FnMut(&File) -> R,
    {
        let open_files = get_open_files();

@@ -357,17 +356,19 @@ impl VirtualFile {
            // We only need to hold the handle lock while we read the current handle. If
            // another thread closes the file and recycles the slot for a different file,
            // we will notice that the handle we read is no longer valid and retry.
-            let mut handle = *self.handle.read().await;
+            let mut handle = *self.handle.read().unwrap();
            loop {
                // Check if the slot contains our File
                {
                    let slot = &open_files.slots[handle.index];
-                    let slot_guard = slot.inner.read().await;
+                    let slot_guard = slot.inner.read().unwrap();
                    if slot_guard.tag == handle.tag {
                        if let Some(file) = &slot_guard.file {
                            // Found a cached file descriptor.
                            slot.recently_used.store(true, Ordering::Relaxed);
-                            return Ok(func(file).await);
+                            return Ok(STORAGE_IO_TIME_METRIC
+                                .get(op)
+                                .observe_closure_duration(|| func(file)));
                        }
                    }
                }
@@ -375,7 +376,7 @@ impl VirtualFile {
                // The slot didn't contain our File. We will have to open it ourselves,
                // but before that, grab a write lock on handle in the VirtualFile, so
                // that no other thread will try to concurrently open the same file.
-                let handle_guard = self.handle.write().await;
+                let handle_guard = self.handle.write().unwrap();

                // If another thread changed the handle while we were not holding the lock,
                // then the handle might now be valid again. Loop back to retry.
@@ -389,15 +390,17 @@ impl VirtualFile {

        // We need to open the file ourselves. The handle in the VirtualFile is
        // now locked in write-mode. Find a free slot to put it in.
-        let (handle, mut slot_guard) = open_files.find_victim_slot().await;
+        let (handle, mut slot_guard) = open_files.find_victim_slot();

        // Open the physical file
-        let file = STORAGE_IO_TIME
-            .with_label_values(&["open"])
+        let file = STORAGE_IO_TIME_METRIC
+            .get(StorageIoOperation::Open)
            .observe_closure_duration(|| self.open_options.open(&self.path))?;

        // Perform the requested operation on it
-        let result = func(&file).await;
+        let result = STORAGE_IO_TIME_METRIC
+            .get(op)
+            .observe_closure_duration(|| func(&file));

        // Store the File in the slot and update the handle in the VirtualFile
        // to point to it.
@@ -421,7 +424,7 @@ impl VirtualFile {
            }
            SeekFrom::End(offset) => {
                self.pos = self
-                    .with_file("seek", |mut file| async move {
+                    .with_file(StorageIoOperation::Seek, |mut file| {
                        file.seek(SeekFrom::End(offset))
                    })
                    .await??
@@ -513,7 +516,7 @@ impl VirtualFile {

    pub async fn read_at(&self, buf: &mut [u8], offset: u64) -> Result<usize, Error> {
        let result = self
-            .with_file("read", |file| async move { file.read_at(buf, offset) })
+            .with_file(StorageIoOperation::Read, |file| file.read_at(buf, offset))
            .await?;
        if let Ok(size) = result {
            STORAGE_IO_SIZE
@@ -525,7 +528,7 @@ impl VirtualFile {

    async fn write_at(&self, buf: &[u8], offset: u64) -> Result<usize, Error> {
        let result = self
-            .with_file("write", |file| async move { file.write_at(buf, offset) })
+            .with_file(StorageIoOperation::Write, |file| file.write_at(buf, offset))
            .await?;
        if let Ok(size) = result {
            STORAGE_IO_SIZE
@@ -568,25 +571,18 @@ impl VirtualFile {
 impl Drop for VirtualFile {
    /// If a VirtualFile is dropped, close the underlying file if it was open.
    fn drop(&mut self) {
-        let handle = self.handle.get_mut();
+        let handle = self.handle.get_mut().unwrap();

-        // We don't have async drop so we cannot wait for the lock here.
-        // Instead, do a best-effort attempt at closing the underlying
-        // file descriptor by using `try_write`.
-        // This best-effort attempt should be quite good though
-        // as we have `&mut self` access. In other words, if the slot
-        // is still occupied by our file, we should be the only ones
-        // accessing it (and if it has been reassigned since, we don't
-        // need to bother with dropping anyways).
+        // We could check with a read-lock first, to avoid waiting on an
+        // unrelated I/O.
        let slot = &get_open_files().slots[handle.index];
-        let Ok(mut slot_guard) = slot.inner.try_write() else { return };
-
+        let mut slot_guard = slot.inner.write().unwrap();
        if slot_guard.tag == handle.tag {
            slot.recently_used.store(false, Ordering::Relaxed);
            // there is also operation "close-by-replace" for closes done on eviction for
            // comparison.
-            STORAGE_IO_TIME
-                .with_label_values(&["close"])
+            STORAGE_IO_TIME_METRIC
+                .get(StorageIoOperation::Close)
                .observe_closure_duration(|| drop(slot_guard.file.take()));
        }
    }
@@ -654,6 +650,12 @@ mod tests {
        File(File),
    }

+    impl From<VirtualFile> for MaybeVirtualFile {
+        fn from(vf: VirtualFile) -> Self {
+            MaybeVirtualFile::VirtualFile(vf)
+        }
+    }
+
    impl MaybeVirtualFile {
        async fn read_exact_at(&self, buf: &mut [u8], offset: u64) -> Result<(), Error> {
            match self {
@@ -891,4 +893,54 @@ mod tests {

        Ok(())
    }
+
+    #[tokio::test]
+    async fn test_atomic_overwrite_basic() {
+        let testdir = crate::config::PageServerConf::test_repo_dir("test_atomic_overwrite_basic");
+        std::fs::create_dir_all(&testdir).unwrap();
+
+        let path = testdir.join("myfile");
+        let tmp_path = testdir.join("myfile.tmp");
+
+        VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"foo")
+            .await
+            .unwrap();
+        let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap());
+        let post = file.read_string().await.unwrap();
+        assert_eq!(post, "foo");
+        assert!(!tmp_path.exists());
+        drop(file);
+
+        VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"bar")
+            .await
+            .unwrap();
+        let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap());
+        let post = file.read_string().await.unwrap();
+        assert_eq!(post, "bar");
+        assert!(!tmp_path.exists());
+        drop(file);
+    }
+
+    #[tokio::test]
+    async fn test_atomic_overwrite_preexisting_tmp() {
+        let testdir =
+            crate::config::PageServerConf::test_repo_dir("test_atomic_overwrite_preexisting_tmp");
+        std::fs::create_dir_all(&testdir).unwrap();
+
+        let path = testdir.join("myfile");
+        let tmp_path = testdir.join("myfile.tmp");
+
+        std::fs::write(&tmp_path, "some preexisting junk that should be removed").unwrap();
+        assert!(tmp_path.exists());
+
+        VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"foo")
+            .await
+            .unwrap();
+
+        let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap());
+        let post = file.read_string().await.unwrap();
+        assert_eq!(post, "foo");
+        assert!(!tmp_path.exists());
+        drop(file);
+    }
 }
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -25,7 +25,7 @@ use postgres_ffi::v14::nonrelfile_utils::clogpage_precedes;
 use postgres_ffi::v14::nonrelfile_utils::slru_may_delete_clogsegment;
 use postgres_ffi::{fsm_logical_to_physical, page_is_new, page_set_lsn};

-use anyhow::{Context, Result};
+use anyhow::{bail, Context, Result};
 use bytes::{Buf, Bytes, BytesMut};
 use tracing::*;

@@ -106,6 +106,10 @@ impl<'a> WalIngest<'a> {
            self.ingest_heapam_record(&mut buf, modification, decoded, ctx)
                .await?;
        }
+        if decoded.xl_rmid == pg_constants::RM_NEON_ID {
+            self.ingest_neonrmgr_record(&mut buf, modification, decoded, ctx)
+                .await?;
+        }
        // Handle other special record types
        if decoded.xl_rmid == pg_constants::RM_SMGR_ID
            && (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
@@ -172,6 +176,32 @@ impl<'a> WalIngest<'a> {
                            .await?;
                    }
                }
+            } else if self.timeline.pg_version == 16 {
+                if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
+                    == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_WAL_LOG
+                {
+                    debug!("XLOG_DBASE_CREATE_WAL_LOG: noop");
+                } else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
+                    == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_FILE_COPY
+                {
+                    // The XLOG record was renamed between v14 and v15,
+                    // but the record format is the same.
+                    // So we can reuse XlCreateDatabase here.
+                    debug!("XLOG_DBASE_CREATE_FILE_COPY");
+                    let createdb = XlCreateDatabase::decode(&mut buf);
+                    self.ingest_xlog_dbase_create(modification, &createdb, ctx)
+                        .await?;
+                } else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
+                    == postgres_ffi::v16::bindings::XLOG_DBASE_DROP
+                {
+                    let dropdb = XlDropDatabase::decode(&mut buf);
+                    for tablespace_id in dropdb.tablespace_ids {
+                        trace!("Drop db {}, {}", tablespace_id, dropdb.db_id);
+                        modification
+                            .drop_dbdir(tablespace_id, dropdb.db_id, ctx)
+                            .await?;
+                    }
+                }
            }
        } else if decoded.xl_rmid == pg_constants::RM_TBLSPC_ID {
            trace!("XLOG_TBLSPC_CREATE/DROP is not handled yet");
@@ -333,7 +363,7 @@ impl<'a> WalIngest<'a> {

        // Now that this record has been fully handled, including updating the
        // checkpoint data, let the repository know that it is up-to-date to this LSN
-        modification.commit().await?;
+        modification.commit(ctx).await?;

        Ok(())
    }
@@ -414,57 +444,215 @@ impl<'a> WalIngest<'a> {
        // need to clear the corresponding bits in the visibility map.
        let mut new_heap_blkno: Option<u32> = None;
        let mut old_heap_blkno: Option<u32> = None;
-        if decoded.xl_rmid == pg_constants::RM_HEAP_ID {
-            let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK;
-            if info == pg_constants::XLOG_HEAP_INSERT {
-                let xlrec = XlHeapInsert::decode(buf);
-                assert_eq!(0, buf.remaining());
-                if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 {
-                    new_heap_blkno = Some(decoded.blocks[0].blkno);
-                }
-            } else if info == pg_constants::XLOG_HEAP_DELETE {
-                let xlrec = XlHeapDelete::decode(buf);
-                assert_eq!(0, buf.remaining());
-                if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 {
-                    new_heap_blkno = Some(decoded.blocks[0].blkno);
-                }
-            } else if info == pg_constants::XLOG_HEAP_UPDATE
-                || info == pg_constants::XLOG_HEAP_HOT_UPDATE
-            {
-                let xlrec = XlHeapUpdate::decode(buf);
-                // the size of tuple data is inferred from the size of the record.
-                // we can't validate the remaining number of bytes without parsing
-                // the tuple data.
-                if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 {
-                    old_heap_blkno = Some(decoded.blocks[0].blkno);
-                }
-                if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 {
-                    // PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a
-                    // non-HOT update where the new tuple goes to different page than
-                    // the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is
-                    // set.
-                    new_heap_blkno = Some(decoded.blocks[1].blkno);
-                }
-            }
-        } else if decoded.xl_rmid == pg_constants::RM_HEAP2_ID {
-            let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK;
-            if info == pg_constants::XLOG_HEAP2_MULTI_INSERT {
-                let xlrec = XlHeapMultiInsert::decode(buf);
+        let mut flags = pg_constants::VISIBILITYMAP_VALID_BITS;

-                let offset_array_len = if decoded.xl_info & pg_constants::XLOG_HEAP_INIT_PAGE > 0 {
-                    // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set
-                    0
+        match self.timeline.pg_version {
+            14 => {
+                if decoded.xl_rmid == pg_constants::RM_HEAP_ID {
+                    let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK;
+
+                    if info == pg_constants::XLOG_HEAP_INSERT {
+                        let xlrec = v14::XlHeapInsert::decode(buf);
+                        assert_eq!(0, buf.remaining());
+                        if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 {
+                            new_heap_blkno = Some(decoded.blocks[0].blkno);
+                        }
+                    } else if info == pg_constants::XLOG_HEAP_DELETE {
+                        let xlrec = v14::XlHeapDelete::decode(buf);
+                        assert_eq!(0, buf.remaining());
+                        if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 {
+                            new_heap_blkno = Some(decoded.blocks[0].blkno);
+                        }
+                    } else if info == pg_constants::XLOG_HEAP_UPDATE
+                        || info == pg_constants::XLOG_HEAP_HOT_UPDATE
+                    {
+                        let xlrec = v14::XlHeapUpdate::decode(buf);
+                        // the size of tuple data is inferred from the size of the record.
+                        // we can't validate the remaining number of bytes without parsing
+                        // the tuple data.
+                        if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 {
+                            old_heap_blkno = Some(decoded.blocks.last().unwrap().blkno);
+                        }
+                        if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 {
+                            // PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a
+                            // non-HOT update where the new tuple goes to different page than
+                            // the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is
+                            // set.
+                            new_heap_blkno = Some(decoded.blocks[0].blkno);
+                        }
+                    } else if info == pg_constants::XLOG_HEAP_LOCK {
+                        let xlrec = v14::XlHeapLock::decode(buf);
+                        if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 {
+                            old_heap_blkno = Some(decoded.blocks[0].blkno);
+                            flags = pg_constants::VISIBILITYMAP_ALL_FROZEN;
+                        }
+                    }
+                } else if decoded.xl_rmid == pg_constants::RM_HEAP2_ID {
+                    let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK;
+                    if info == pg_constants::XLOG_HEAP2_MULTI_INSERT {
+                        let xlrec = v14::XlHeapMultiInsert::decode(buf);
+
+                        let offset_array_len =
+                            if decoded.xl_info & pg_constants::XLOG_HEAP_INIT_PAGE > 0 {
+                                // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set
+                                0
+                            } else {
+                                std::mem::size_of::<u16>() * xlrec.ntuples as usize
+                            };
+                        assert_eq!(offset_array_len, buf.remaining());
+
+                        if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 {
+                            new_heap_blkno = Some(decoded.blocks[0].blkno);
+                        }
+                    } else if info == pg_constants::XLOG_HEAP2_LOCK_UPDATED {
+                        let xlrec = v14::XlHeapLockUpdated::decode(buf);
+                        if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 {
+                            old_heap_blkno = Some(decoded.blocks[0].blkno);
+                            flags = pg_constants::VISIBILITYMAP_ALL_FROZEN;
+                        }
+                    }
                } else {
-                    std::mem::size_of::<u16>() * xlrec.ntuples as usize
-                };
-                assert_eq!(offset_array_len, buf.remaining());
-
-                if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 {
-                    new_heap_blkno = Some(decoded.blocks[0].blkno);
+                    bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid);
                }
            }
+            15 => {
+                if decoded.xl_rmid == pg_constants::RM_HEAP_ID {
+                    let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK;
+
+                    if info == pg_constants::XLOG_HEAP_INSERT {
+                        let xlrec = v15::XlHeapInsert::decode(buf);
+                        assert_eq!(0, buf.remaining());
+                        if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 {
+                            new_heap_blkno = Some(decoded.blocks[0].blkno);
+                        }
+                    } else if info == pg_constants::XLOG_HEAP_DELETE {
+                        let xlrec = v15::XlHeapDelete::decode(buf);
+                        assert_eq!(0, buf.remaining());
+                        if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 {
+                            new_heap_blkno = Some(decoded.blocks[0].blkno);
+                        }
+                    } else if info == pg_constants::XLOG_HEAP_UPDATE
+                        || info == pg_constants::XLOG_HEAP_HOT_UPDATE
+                    {
+                        let xlrec = v15::XlHeapUpdate::decode(buf);
+                        // the size of tuple data is inferred from the size of the record.
+                        // we can't validate the remaining number of bytes without parsing
+                        // the tuple data.
+                        if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 {
+                            old_heap_blkno = Some(decoded.blocks.last().unwrap().blkno);
+                        }
+                        if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 {
+                            // PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a
+                            // non-HOT update where the new tuple goes to different page than
+                            // the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is
+                            // set.
+                            new_heap_blkno = Some(decoded.blocks[0].blkno);
+                        }
+                    } else if info == pg_constants::XLOG_HEAP_LOCK {
+                        let xlrec = v15::XlHeapLock::decode(buf);
+                        if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 {
+                            old_heap_blkno = Some(decoded.blocks[0].blkno);
+                            flags = pg_constants::VISIBILITYMAP_ALL_FROZEN;
+                        }
+                    }
+                } else if decoded.xl_rmid == pg_constants::RM_HEAP2_ID {
+                    let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK;
+                    if info == pg_constants::XLOG_HEAP2_MULTI_INSERT {
+                        let xlrec = v15::XlHeapMultiInsert::decode(buf);
+
+                        let offset_array_len =
+                            if decoded.xl_info & pg_constants::XLOG_HEAP_INIT_PAGE > 0 {
+                                // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set
+                                0
+                            } else {
+                                std::mem::size_of::<u16>() * xlrec.ntuples as usize
+                            };
+                        assert_eq!(offset_array_len, buf.remaining());
+
+                        if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 {
+                            new_heap_blkno = Some(decoded.blocks[0].blkno);
+                        }
+                    } else if info == pg_constants::XLOG_HEAP2_LOCK_UPDATED {
+                        let xlrec = v15::XlHeapLockUpdated::decode(buf);
+                        if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 {
+                            old_heap_blkno = Some(decoded.blocks[0].blkno);
+                            flags = pg_constants::VISIBILITYMAP_ALL_FROZEN;
+                        }
+                    }
+                } else {
+                    bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid);
+                }
+            }
+            16 => {
+                if decoded.xl_rmid == pg_constants::RM_HEAP_ID {
+                    let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK;
+
+                    if info == pg_constants::XLOG_HEAP_INSERT {
+                        let xlrec = v16::XlHeapInsert::decode(buf);
+                        assert_eq!(0, buf.remaining());
+                        if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 {
+                            new_heap_blkno = Some(decoded.blocks[0].blkno);
+                        }
+                    } else if info == pg_constants::XLOG_HEAP_DELETE {
+                        let xlrec = v16::XlHeapDelete::decode(buf);
+                        assert_eq!(0, buf.remaining());
+                        if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 {
+                            new_heap_blkno = Some(decoded.blocks[0].blkno);
+                        }
+                    } else if info == pg_constants::XLOG_HEAP_UPDATE
+                        || info == pg_constants::XLOG_HEAP_HOT_UPDATE
+                    {
+                        let xlrec = v16::XlHeapUpdate::decode(buf);
+                        // the size of tuple data is inferred from the size of the record.
+                        // we can't validate the remaining number of bytes without parsing
+                        // the tuple data.
+                        if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 {
+                            old_heap_blkno = Some(decoded.blocks.last().unwrap().blkno);
+                        }
+                        if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 {
+                            // PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a
+                            // non-HOT update where the new tuple goes to different page than
+                            // the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is
+                            // set.
+                            new_heap_blkno = Some(decoded.blocks[0].blkno);
+                        }
+                    } else if info == pg_constants::XLOG_HEAP_LOCK {
+                        let xlrec = v16::XlHeapLock::decode(buf);
+                        if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 {
+                            old_heap_blkno = Some(decoded.blocks[0].blkno);
+                            flags = pg_constants::VISIBILITYMAP_ALL_FROZEN;
+                        }
+                    }
+                } else if decoded.xl_rmid == pg_constants::RM_HEAP2_ID {
+                    let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK;
+                    if info == pg_constants::XLOG_HEAP2_MULTI_INSERT {
+                        let xlrec = v16::XlHeapMultiInsert::decode(buf);
+
+                        let offset_array_len =
+                            if decoded.xl_info & pg_constants::XLOG_HEAP_INIT_PAGE > 0 {
+                                // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set
+                                0
+                            } else {
+                                std::mem::size_of::<u16>() * xlrec.ntuples as usize
+                            };
+                        assert_eq!(offset_array_len, buf.remaining());
+
+                        if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 {
+                            new_heap_blkno = Some(decoded.blocks[0].blkno);
+                        }
+                    } else if info == pg_constants::XLOG_HEAP2_LOCK_UPDATED {
+                        let xlrec = v16::XlHeapLockUpdated::decode(buf);
+                        if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 {
+                            old_heap_blkno = Some(decoded.blocks[0].blkno);
+                            flags = pg_constants::VISIBILITYMAP_ALL_FROZEN;
+                        }
+                    }
+                } else {
+                    bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid);
+                }
+            }
+            _ => {}
        }
-        // FIXME: What about XLOG_HEAP_LOCK and XLOG_HEAP2_LOCK_UPDATED?

        // Clear the VM bits if required.
        if new_heap_blkno.is_some() || old_heap_blkno.is_some() {
@@ -508,7 +696,7 @@ impl<'a> WalIngest<'a> {
                        NeonWalRecord::ClearVisibilityMapFlags {
                            new_heap_blkno,
                            old_heap_blkno,
-                            flags: pg_constants::VISIBILITYMAP_VALID_BITS,
+                            flags,
                        },
                        ctx,
                    )
@@ -524,7 +712,7 @@ impl<'a> WalIngest<'a> {
                            NeonWalRecord::ClearVisibilityMapFlags {
                                new_heap_blkno,
                                old_heap_blkno: None,
-                                flags: pg_constants::VISIBILITYMAP_VALID_BITS,
+                                flags,
                            },
                            ctx,
                        )
@@ -538,7 +726,178 @@ impl<'a> WalIngest<'a> {
                            NeonWalRecord::ClearVisibilityMapFlags {
                                new_heap_blkno: None,
                                old_heap_blkno,
-                                flags: pg_constants::VISIBILITYMAP_VALID_BITS,
+                                flags,
+                            },
+                            ctx,
+                        )
+                        .await?;
+                    }
+                }
+            }
+        }
+
+        Ok(())
+    }
+
+    async fn ingest_neonrmgr_record(
+        &mut self,
+        buf: &mut Bytes,
+        modification: &mut DatadirModification<'_>,
+        decoded: &mut DecodedWALRecord,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<()> {
+        // Handle VM bit updates that are implicitly part of heap records.
+
+        // First, look at the record to determine which VM bits need
+        // to be cleared. If either of these variables is set, we
+        // need to clear the corresponding bits in the visibility map.
+        let mut new_heap_blkno: Option<u32> = None;
+        let mut old_heap_blkno: Option<u32> = None;
+        let mut flags = pg_constants::VISIBILITYMAP_VALID_BITS;
+
+        assert_eq!(decoded.xl_rmid, pg_constants::RM_NEON_ID);
+
+        match self.timeline.pg_version {
+            16 => {
+                let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK;
+
+                match info {
+                    pg_constants::XLOG_NEON_HEAP_INSERT => {
+                        let xlrec = v16::rm_neon::XlNeonHeapInsert::decode(buf);
+                        assert_eq!(0, buf.remaining());
+                        if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 {
+                            new_heap_blkno = Some(decoded.blocks[0].blkno);
+                        }
+                    }
+                    pg_constants::XLOG_NEON_HEAP_DELETE => {
+                        let xlrec = v16::rm_neon::XlNeonHeapDelete::decode(buf);
+                        assert_eq!(0, buf.remaining());
+                        if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 {
+                            new_heap_blkno = Some(decoded.blocks[0].blkno);
+                        }
+                    }
+                    pg_constants::XLOG_NEON_HEAP_UPDATE
+                    | pg_constants::XLOG_NEON_HEAP_HOT_UPDATE => {
+                        let xlrec = v16::rm_neon::XlNeonHeapUpdate::decode(buf);
+                        // the size of tuple data is inferred from the size of the record.
+                        // we can't validate the remaining number of bytes without parsing
+                        // the tuple data.
+                        if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 {
+                            old_heap_blkno = Some(decoded.blocks.last().unwrap().blkno);
+                        }
+                        if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 {
+                            // PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a
+                            // non-HOT update where the new tuple goes to different page than
+                            // the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is
+                            // set.
+                            new_heap_blkno = Some(decoded.blocks[0].blkno);
+                        }
+                    }
+                    pg_constants::XLOG_NEON_HEAP_MULTI_INSERT => {
+                        let xlrec = v16::rm_neon::XlNeonHeapMultiInsert::decode(buf);
+
+                        let offset_array_len =
+                            if decoded.xl_info & pg_constants::XLOG_HEAP_INIT_PAGE > 0 {
+                                // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set
+                                0
+                            } else {
+                                std::mem::size_of::<u16>() * xlrec.ntuples as usize
+                            };
+                        assert_eq!(offset_array_len, buf.remaining());
+
+                        if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 {
+                            new_heap_blkno = Some(decoded.blocks[0].blkno);
+                        }
+                    }
+                    pg_constants::XLOG_NEON_HEAP_LOCK => {
+                        let xlrec = v16::rm_neon::XlNeonHeapLock::decode(buf);
+                        if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 {
+                            old_heap_blkno = Some(decoded.blocks[0].blkno);
+                            flags = pg_constants::VISIBILITYMAP_ALL_FROZEN;
+                        }
+                    }
+                    info => bail!("Unknown WAL record type for Neon RMGR: {}", info),
+                }
+            }
+            _ => bail!(
+                "Neon RMGR has no known compatibility with PostgreSQL version {}",
+                self.timeline.pg_version
+            ),
+        }
+
+        // Clear the VM bits if required.
+        if new_heap_blkno.is_some() || old_heap_blkno.is_some() {
+            let vm_rel = RelTag {
+                forknum: VISIBILITYMAP_FORKNUM,
+                spcnode: decoded.blocks[0].rnode_spcnode,
+                dbnode: decoded.blocks[0].rnode_dbnode,
+                relnode: decoded.blocks[0].rnode_relnode,
+            };
+
+            let mut new_vm_blk = new_heap_blkno.map(pg_constants::HEAPBLK_TO_MAPBLOCK);
+            let mut old_vm_blk = old_heap_blkno.map(pg_constants::HEAPBLK_TO_MAPBLOCK);
+
+            // Sometimes, Postgres seems to create heap WAL records with the
+            // ALL_VISIBLE_CLEARED flag set, even though the bit in the VM page is
+            // not set. In fact, it's possible that the VM page does not exist at all.
+            // In that case, we don't want to store a record to clear the VM bit;
+            // replaying it would fail to find the previous image of the page, because
+            // it doesn't exist. So check if the VM page(s) exist, and skip the WAL
+            // record if it doesn't.
+            let vm_size = self.get_relsize(vm_rel, modification.lsn, ctx).await?;
+            if let Some(blknum) = new_vm_blk {
+                if blknum >= vm_size {
+                    new_vm_blk = None;
+                }
+            }
+            if let Some(blknum) = old_vm_blk {
+                if blknum >= vm_size {
+                    old_vm_blk = None;
+                }
+            }
+
+            if new_vm_blk.is_some() || old_vm_blk.is_some() {
+                if new_vm_blk == old_vm_blk {
+                    // An UPDATE record that needs to clear the bits for both old and the
+                    // new page, both of which reside on the same VM page.
+                    self.put_rel_wal_record(
+                        modification,
+                        vm_rel,
+                        new_vm_blk.unwrap(),
+                        NeonWalRecord::ClearVisibilityMapFlags {
+                            new_heap_blkno,
+                            old_heap_blkno,
+                            flags,
+                        },
+                        ctx,
+                    )
+                    .await?;
+                } else {
+                    // Clear VM bits for one heap page, or for two pages that reside on
+                    // different VM pages.
+                    if let Some(new_vm_blk) = new_vm_blk {
+                        self.put_rel_wal_record(
+                            modification,
+                            vm_rel,
+                            new_vm_blk,
+                            NeonWalRecord::ClearVisibilityMapFlags {
+                                new_heap_blkno,
+                                old_heap_blkno: None,
+                                flags,
+                            },
+                            ctx,
+                        )
+                        .await?;
+                    }
+                    if let Some(old_vm_blk) = old_vm_blk {
+                        self.put_rel_wal_record(
+                            modification,
+                            vm_rel,
+                            old_vm_blk,
+                            NeonWalRecord::ClearVisibilityMapFlags {
+                                new_heap_blkno: None,
+                                old_heap_blkno,
+                                flags,
                            },
                            ctx,
                        )
@@ -1202,7 +1561,7 @@ mod tests {
        let mut m = tline.begin_modification(Lsn(0x10));
        m.put_checkpoint(ZERO_CHECKPOINT.clone())?;
        m.put_relmap_file(0, 111, Bytes::from(""), ctx).await?; // dummy relmapper file
-        m.commit().await?;
+        m.commit(ctx).await?;
        let walingest = WalIngest::new(tline, Lsn(0x10), ctx).await?;

        Ok(walingest)
@@ -1221,22 +1580,22 @@ mod tests {
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"), &ctx)
            .await?;
-        m.commit().await?;
+        m.commit(&ctx).await?;
        let mut m = tline.begin_modification(Lsn(0x30));
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 3"), &ctx)
            .await?;
-        m.commit().await?;
+        m.commit(&ctx).await?;
        let mut m = tline.begin_modification(Lsn(0x40));
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1 at 4"), &ctx)
            .await?;
-        m.commit().await?;
+        m.commit(&ctx).await?;
        let mut m = tline.begin_modification(Lsn(0x50));
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 2, TEST_IMG("foo blk 2 at 5"), &ctx)
            .await?;
-        m.commit().await?;
+        m.commit(&ctx).await?;

        assert_current_logical_size(&tline, Lsn(0x50));

@@ -1322,7 +1681,7 @@ mod tests {
        walingest
            .put_rel_truncation(&mut m, TESTREL_A, 2, &ctx)
            .await?;
-        m.commit().await?;
+        m.commit(&ctx).await?;
        assert_current_logical_size(&tline, Lsn(0x60));

        // Check reported size and contents after truncation
@@ -1364,7 +1723,7 @@ mod tests {
        walingest
            .put_rel_truncation(&mut m, TESTREL_A, 0, &ctx)
            .await?;
-        m.commit().await?;
+        m.commit(&ctx).await?;
        assert_eq!(
            tline
                .get_rel_size(TESTREL_A, Lsn(0x68), false, &ctx)
@@ -1377,7 +1736,7 @@ mod tests {
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1"), &ctx)
            .await?;
-        m.commit().await?;
+        m.commit(&ctx).await?;
        assert_eq!(
            tline
                .get_rel_size(TESTREL_A, Lsn(0x70), false, &ctx)
@@ -1402,7 +1761,7 @@ mod tests {
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 1500, TEST_IMG("foo blk 1500"), &ctx)
            .await?;
-        m.commit().await?;
+        m.commit(&ctx).await?;
        assert_eq!(
            tline
                .get_rel_size(TESTREL_A, Lsn(0x80), false, &ctx)
@@ -1441,7 +1800,7 @@ mod tests {
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"), &ctx)
            .await?;
-        m.commit().await?;
+        m.commit(&ctx).await?;

        // Check that rel exists and size is correct
        assert_eq!(
@@ -1460,7 +1819,7 @@ mod tests {
        // Drop rel
        let mut m = tline.begin_modification(Lsn(0x30));
        walingest.put_rel_drop(&mut m, TESTREL_A, &ctx).await?;
-        m.commit().await?;
+        m.commit(&ctx).await?;

        // Check that rel is not visible anymore
        assert_eq!(
@@ -1478,7 +1837,7 @@ mod tests {
        walingest
            .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 4"), &ctx)
            .await?;
-        m.commit().await?;
+        m.commit(&ctx).await?;

        // Check that rel exists and size is correct
        assert_eq!(
@@ -1517,7 +1876,7 @@ mod tests {
                .put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data), &ctx)
                .await?;
        }
-        m.commit().await?;
+        m.commit(&ctx).await?;

        // The relation was created at LSN 20, not visible at LSN 1 yet.
        assert_eq!(
@@ -1562,7 +1921,7 @@ mod tests {
        walingest
            .put_rel_truncation(&mut m, TESTREL_A, 1, &ctx)
            .await?;
-        m.commit().await?;
+        m.commit(&ctx).await?;

        // Check reported size and contents after truncation
        assert_eq!(
@@ -1611,7 +1970,7 @@ mod tests {
                .put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data), &ctx)
                .await?;
        }
-        m.commit().await?;
+        m.commit(&ctx).await?;

        assert_eq!(
            tline
@@ -1658,7 +2017,7 @@ mod tests {
            walingest
                .put_rel_page_image(&mut m, TESTREL_A, blknum as BlockNumber, img, &ctx)
                .await?;
-            m.commit().await?;
+            m.commit(&ctx).await?;
        }

        assert_current_logical_size(&tline, Lsn(lsn));
@@ -1674,7 +2033,7 @@ mod tests {
        walingest
            .put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE, &ctx)
            .await?;
-        m.commit().await?;
+        m.commit(&ctx).await?;
        assert_eq!(
            tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?,
            RELSEG_SIZE
@@ -1687,7 +2046,7 @@ mod tests {
        walingest
            .put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE - 1, &ctx)
            .await?;
-        m.commit().await?;
+        m.commit(&ctx).await?;
        assert_eq!(
            tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?,
            RELSEG_SIZE - 1
@@ -1703,7 +2062,7 @@ mod tests {
            walingest
                .put_rel_truncation(&mut m, TESTREL_A, size as BlockNumber, &ctx)
                .await?;
-            m.commit().await?;
+            m.commit(&ctx).await?;
            assert_eq!(
                tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?,
                size as BlockNumber
--- a/pageserver/src/walrecord.rs
+++ b/pageserver/src/walrecord.rs
@@ -4,9 +4,10 @@

 use anyhow::Result;
 use bytes::{Buf, Bytes};
+use postgres_ffi::dispatch_pgversion;
 use postgres_ffi::pg_constants;
 use postgres_ffi::BLCKSZ;
-use postgres_ffi::{BlockNumber, OffsetNumber, TimestampTz};
+use postgres_ffi::{BlockNumber, TimestampTz};
 use postgres_ffi::{MultiXactId, MultiXactOffset, MultiXactStatus, Oid, TransactionId};
 use postgres_ffi::{XLogRecord, XLOG_SIZE_OF_XLOG_RECORD};
 use serde::{Deserialize, Serialize};
@@ -76,9 +77,12 @@ pub struct DecodedBkpBlock {
    pub flags: u8,

    /* Information on full-page image, if any */
-    pub has_image: bool,   /* has image, even for consistency checking */
-    pub apply_image: bool, /* has image that should be restored */
-    pub will_init: bool,   /* record doesn't need previous page version to apply */
+    pub has_image: bool,
+    /* has image, even for consistency checking */
+    pub apply_image: bool,
+    /* has image that should be restored */
+    pub will_init: bool,
+    /* record doesn't need previous page version to apply */
    //char	   *bkp_image;
    pub hole_offset: u16,
    pub hole_length: u16,
@@ -134,6 +138,325 @@ impl XlRelmapUpdate {
    }
 }

+pub mod v14 { // heap record structs and their decoders; `v15` re-exports all six, `v16` three of them
+    use bytes::{Buf, Bytes};
+    use postgres_ffi::{OffsetNumber, TransactionId};
+
+    #[repr(C)]
+    #[derive(Debug)]
+    pub struct XlHeapInsert {
+        pub offnum: OffsetNumber,
+        pub flags: u8,
+    }
+
+    impl XlHeapInsert {
+        pub fn decode(buf: &mut Bytes) -> XlHeapInsert { // consumes 3 bytes in field order, little-endian; `Buf` getters panic if `buf` runs short
+            XlHeapInsert {
+                offnum: buf.get_u16_le(),
+                flags: buf.get_u8(),
+            }
+        }
+    }
+
+    #[repr(C)]
+    #[derive(Debug)]
+    pub struct XlHeapMultiInsert {
+        pub flags: u8,
+        pub _padding: u8,
+        pub ntuples: u16,
+    }
+
+    impl XlHeapMultiInsert {
+        pub fn decode(buf: &mut Bytes) -> XlHeapMultiInsert { // consumes 4 bytes in field order, little-endian
+            XlHeapMultiInsert {
+                flags: buf.get_u8(),
+                _padding: buf.get_u8(), // one byte read so the cursor lands on ntuples; stored, not interpreted here
+                ntuples: buf.get_u16_le(),
+            }
+        }
+    }
+
+    #[repr(C)]
+    #[derive(Debug)]
+    pub struct XlHeapDelete {
+        pub xmax: TransactionId,
+        pub offnum: OffsetNumber,
+        pub _padding: u16,
+        pub t_cid: u32,
+        pub infobits_set: u8,
+        pub flags: u8,
+    }
+
+    impl XlHeapDelete {
+        pub fn decode(buf: &mut Bytes) -> XlHeapDelete { // consumes 14 bytes in field order, little-endian
+            XlHeapDelete {
+                xmax: buf.get_u32_le(),
+                offnum: buf.get_u16_le(),
+                _padding: buf.get_u16_le(), // two bytes read so the cursor lands on t_cid; stored, not interpreted here
+                t_cid: buf.get_u32_le(),
+                infobits_set: buf.get_u8(),
+                flags: buf.get_u8(),
+            }
+        }
+    }
+
+    #[repr(C)]
+    #[derive(Debug)]
+    pub struct XlHeapUpdate {
+        pub old_xmax: TransactionId,
+        pub old_offnum: OffsetNumber,
+        pub old_infobits_set: u8,
+        pub flags: u8,
+        pub t_cid: u32,
+        pub new_xmax: TransactionId,
+        pub new_offnum: OffsetNumber,
+    }
+
+    impl XlHeapUpdate {
+        pub fn decode(buf: &mut Bytes) -> XlHeapUpdate { // consumes 18 bytes in field order, little-endian
+            XlHeapUpdate {
+                old_xmax: buf.get_u32_le(),
+                old_offnum: buf.get_u16_le(),
+                old_infobits_set: buf.get_u8(),
+                flags: buf.get_u8(),
+                t_cid: buf.get_u32_le(),
+                new_xmax: buf.get_u32_le(),
+                new_offnum: buf.get_u16_le(),
+            }
+        }
+    }
+
+    #[repr(C)]
+    #[derive(Debug)]
+    pub struct XlHeapLock {
+        pub locking_xid: TransactionId,
+        pub offnum: OffsetNumber,
+        pub _padding: u16,
+        pub t_cid: u32,
+        pub infobits_set: u8,
+        pub flags: u8,
+    }
+
+    impl XlHeapLock {
+        pub fn decode(buf: &mut Bytes) -> XlHeapLock { // consumes 14 bytes in field order, little-endian
+            XlHeapLock {
+                locking_xid: buf.get_u32_le(),
+                offnum: buf.get_u16_le(),
+                _padding: buf.get_u16_le(), // two bytes read so the cursor lands on t_cid; stored, not interpreted here
+                t_cid: buf.get_u32_le(),
+                infobits_set: buf.get_u8(),
+                flags: buf.get_u8(),
+            }
+        }
+    }
+
+    #[repr(C)]
+    #[derive(Debug)]
+    pub struct XlHeapLockUpdated {
+        pub xmax: TransactionId,
+        pub offnum: OffsetNumber,
+        pub infobits_set: u8,
+        pub flags: u8,
+    }
+
+    impl XlHeapLockUpdated {
+        pub fn decode(buf: &mut Bytes) -> XlHeapLockUpdated { // consumes 8 bytes in field order, little-endian
+            XlHeapLockUpdated {
+                xmax: buf.get_u32_le(),
+                offnum: buf.get_u16_le(),
+                infobits_set: buf.get_u8(),
+                flags: buf.get_u8(),
+            }
+        }
+    }
+}
+
+pub mod v15 { // defines nothing of its own: all six heap record structs are the `v14` ones
+    pub use super::v14::{
+        XlHeapDelete, XlHeapInsert, XlHeapLock, XlHeapLockUpdated, XlHeapMultiInsert, XlHeapUpdate,
+    };
+}
+
+pub mod v16 { // reuses three `v14` structs, redefines delete/update/lock without t_cid, and adds `rm_neon`
+    pub use super::v14::{XlHeapInsert, XlHeapLockUpdated, XlHeapMultiInsert};
+    use bytes::{Buf, Bytes};
+    use postgres_ffi::{OffsetNumber, TransactionId};
+
+    pub struct XlHeapDelete { // NOTE(review): unlike its siblings, has no #[repr(C)] / #[derive(Debug)] - confirm intended
+        pub xmax: TransactionId,
+        pub offnum: OffsetNumber,
+        pub infobits_set: u8,
+        pub flags: u8,
+    }
+
+    impl XlHeapDelete {
+        pub fn decode(buf: &mut Bytes) -> XlHeapDelete { // consumes 8 bytes in field order, little-endian; `Buf` getters panic if `buf` runs short
+            XlHeapDelete {
+                xmax: buf.get_u32_le(),
+                offnum: buf.get_u16_le(),
+                infobits_set: buf.get_u8(),
+                flags: buf.get_u8(),
+            }
+        }
+    }
+
+    #[repr(C)]
+    #[derive(Debug)]
+    pub struct XlHeapUpdate {
+        pub old_xmax: TransactionId,
+        pub old_offnum: OffsetNumber,
+        pub old_infobits_set: u8,
+        pub flags: u8,
+        pub new_xmax: TransactionId,
+        pub new_offnum: OffsetNumber,
+    }
+
+    impl XlHeapUpdate {
+        pub fn decode(buf: &mut Bytes) -> XlHeapUpdate { // consumes 14 bytes in field order, little-endian
+            XlHeapUpdate {
+                old_xmax: buf.get_u32_le(),
+                old_offnum: buf.get_u16_le(),
+                old_infobits_set: buf.get_u8(),
+                flags: buf.get_u8(),
+                new_xmax: buf.get_u32_le(),
+                new_offnum: buf.get_u16_le(),
+            }
+        }
+    }
+
+    #[repr(C)]
+    #[derive(Debug)]
+    pub struct XlHeapLock {
+        pub locking_xid: TransactionId,
+        pub offnum: OffsetNumber,
+        pub infobits_set: u8,
+        pub flags: u8,
+    }
+
+    impl XlHeapLock {
+        pub fn decode(buf: &mut Bytes) -> XlHeapLock { // consumes 8 bytes in field order, little-endian
+            XlHeapLock {
+                locking_xid: buf.get_u32_le(),
+                offnum: buf.get_u16_le(),
+                infobits_set: buf.get_u8(),
+                flags: buf.get_u8(),
+            }
+        }
+    }
+
+    /* Since PG16, we have the Neon RMGR (RM_NEON_ID) to manage Neon-flavored WAL. */
+    pub mod rm_neon {
+        use bytes::{Buf, Bytes};
+        use postgres_ffi::{OffsetNumber, TransactionId};
+
+        #[repr(C)]
+        #[derive(Debug)]
+        pub struct XlNeonHeapInsert {
+            pub offnum: OffsetNumber,
+            pub flags: u8,
+        }
+
+        impl XlNeonHeapInsert {
+            pub fn decode(buf: &mut Bytes) -> XlNeonHeapInsert { // consumes 3 bytes in field order, little-endian
+                XlNeonHeapInsert {
+                    offnum: buf.get_u16_le(),
+                    flags: buf.get_u8(),
+                }
+            }
+        }
+
+        #[repr(C)]
+        #[derive(Debug)]
+        pub struct XlNeonHeapMultiInsert {
+            pub flags: u8,
+            pub _padding: u8,
+            pub ntuples: u16,
+            pub t_cid: u32,
+        }
+
+        impl XlNeonHeapMultiInsert {
+            pub fn decode(buf: &mut Bytes) -> XlNeonHeapMultiInsert { // consumes 8 bytes in field order, little-endian
+                XlNeonHeapMultiInsert {
+                    flags: buf.get_u8(),
+                    _padding: buf.get_u8(), // one byte read so the cursor lands on ntuples; stored, not interpreted here
+                    ntuples: buf.get_u16_le(),
+                    t_cid: buf.get_u32_le(),
+                }
+            }
+        }
+
+        #[repr(C)]
+        #[derive(Debug)]
+        pub struct XlNeonHeapDelete {
+            pub xmax: TransactionId,
+            pub offnum: OffsetNumber,
+            pub infobits_set: u8,
+            pub flags: u8,
+            pub t_cid: u32,
+        }
+
+        impl XlNeonHeapDelete {
+            pub fn decode(buf: &mut Bytes) -> XlNeonHeapDelete { // consumes 12 bytes in field order, little-endian
+                XlNeonHeapDelete {
+                    xmax: buf.get_u32_le(),
+                    offnum: buf.get_u16_le(),
+                    infobits_set: buf.get_u8(),
+                    flags: buf.get_u8(),
+                    t_cid: buf.get_u32_le(),
+                }
+            }
+        }
+
+        #[repr(C)]
+        #[derive(Debug)]
+        pub struct XlNeonHeapUpdate {
+            pub old_xmax: TransactionId,
+            pub old_offnum: OffsetNumber,
+            pub old_infobits_set: u8,
+            pub flags: u8,
+            pub t_cid: u32,
+            pub new_xmax: TransactionId,
+            pub new_offnum: OffsetNumber,
+        }
+
+        impl XlNeonHeapUpdate {
+            pub fn decode(buf: &mut Bytes) -> XlNeonHeapUpdate { // consumes 18 bytes in field order, little-endian
+                XlNeonHeapUpdate {
+                    old_xmax: buf.get_u32_le(),
+                    old_offnum: buf.get_u16_le(),
+                    old_infobits_set: buf.get_u8(),
+                    flags: buf.get_u8(),
+                    t_cid: buf.get_u32_le(), // little-endian like every other field; plain `get_u32` would read big-endian
+                    new_xmax: buf.get_u32_le(),
+                    new_offnum: buf.get_u16_le(),
+                }
+            }
+        }
+
+        #[repr(C)]
+        #[derive(Debug)]
+        pub struct XlNeonHeapLock {
+            pub locking_xid: TransactionId,
+            pub t_cid: u32,
+            pub offnum: OffsetNumber,
+            pub infobits_set: u8,
+            pub flags: u8,
+        }
+
+        impl XlNeonHeapLock {
+            pub fn decode(buf: &mut Bytes) -> XlNeonHeapLock { // consumes 12 bytes in field order, little-endian
+                XlNeonHeapLock {
+                    locking_xid: buf.get_u32_le(),
+                    t_cid: buf.get_u32_le(),
+                    offnum: buf.get_u16_le(),
+                    infobits_set: buf.get_u8(),
+                    flags: buf.get_u8(),
+                }
+            }
+        }
+    }
+}
+
 #[repr(C)]
 #[derive(Debug)]
 pub struct XlSmgrCreate {
@@ -223,90 +546,6 @@ impl XlDropDatabase {
    }
 }

-#[repr(C)]
-#[derive(Debug)]
-pub struct XlHeapInsert {
-    pub offnum: OffsetNumber,
-    pub flags: u8,
-}
-
-impl XlHeapInsert {
-    pub fn decode(buf: &mut Bytes) -> XlHeapInsert {
-        XlHeapInsert {
-            offnum: buf.get_u16_le(),
-            flags: buf.get_u8(),
-        }
-    }
-}
-
-#[repr(C)]
-#[derive(Debug)]
-pub struct XlHeapMultiInsert {
-    pub flags: u8,
-    pub _padding: u8,
-    pub ntuples: u16,
-}
-
-impl XlHeapMultiInsert {
-    pub fn decode(buf: &mut Bytes) -> XlHeapMultiInsert {
-        XlHeapMultiInsert {
-            flags: buf.get_u8(),
-            _padding: buf.get_u8(),
-            ntuples: buf.get_u16_le(),
-        }
-    }
-}
-
-#[repr(C)]
-#[derive(Debug)]
-pub struct XlHeapDelete {
-    pub xmax: TransactionId,
-    pub offnum: OffsetNumber,
-    pub _padding: u16,
-    pub t_cid: u32,
-    pub infobits_set: u8,
-    pub flags: u8,
-}
-
-impl XlHeapDelete {
-    pub fn decode(buf: &mut Bytes) -> XlHeapDelete {
-        XlHeapDelete {
-            xmax: buf.get_u32_le(),
-            offnum: buf.get_u16_le(),
-            _padding: buf.get_u16_le(),
-            t_cid: buf.get_u32_le(),
-            infobits_set: buf.get_u8(),
-            flags: buf.get_u8(),
-        }
-    }
-}
-
-#[repr(C)]
-#[derive(Debug)]
-pub struct XlHeapUpdate {
-    pub old_xmax: TransactionId,
-    pub old_offnum: OffsetNumber,
-    pub old_infobits_set: u8,
-    pub flags: u8,
-    pub t_cid: u32,
-    pub new_xmax: TransactionId,
-    pub new_offnum: OffsetNumber,
-}
-
-impl XlHeapUpdate {
-    pub fn decode(buf: &mut Bytes) -> XlHeapUpdate {
-        XlHeapUpdate {
-            old_xmax: buf.get_u32_le(),
-            old_offnum: buf.get_u16_le(),
-            old_infobits_set: buf.get_u8(),
-            flags: buf.get_u8(),
-            t_cid: buf.get_u32(),
-            new_xmax: buf.get_u32_le(),
-            new_offnum: buf.get_u16_le(),
-        }
-    }
-}
-
 ///
 /// Note: Parsing some fields is missing, because they're not needed.
 ///
@@ -321,9 +560,10 @@ pub struct XlXactParsedRecord {
    pub xact_time: TimestampTz,
    pub xinfo: u32,

-    pub db_id: Oid, /* MyDatabaseId */
-    pub ts_id: Oid, /* MyDatabaseTableSpace */
-
+    pub db_id: Oid,
+    /* MyDatabaseId */
+    pub ts_id: Oid,
+    /* MyDatabaseTableSpace */
    pub subxacts: Vec<TransactionId>,

    pub xnodes: Vec<RelFileNode>,
@@ -455,9 +695,12 @@ impl MultiXactMember {
 #[repr(C)]
 #[derive(Debug)]
 pub struct XlMultiXactCreate {
-    pub mid: MultiXactId,      /* new MultiXact's ID */
-    pub moff: MultiXactOffset, /* its starting offset in members file */
-    pub nmembers: u32,         /* number of member XIDs */
+    pub mid: MultiXactId,
+    /* new MultiXact's ID */
+    pub moff: MultiXactOffset,
+    /* its starting offset in members file */
+    pub nmembers: u32,
+    /* number of member XIDs */
    pub members: Vec<MultiXactMember>,
 }

@@ -484,7 +727,8 @@ impl XlMultiXactCreate {
 pub struct XlMultiXactTruncate {
    pub oldest_multi_db: Oid,
    /* to-be-truncated range of multixact offsets */
-    pub start_trunc_off: MultiXactId, /* just for completeness' sake */
+    pub start_trunc_off: MultiXactId,
+    /* just for completeness' sake */
    pub end_trunc_off: MultiXactId,

    /* to-be-truncated range of multixact members */
@@ -626,12 +870,10 @@ pub fn decode_wal_record(
                    blk.hole_offset = buf.get_u16_le();
                    blk.bimg_info = buf.get_u8();

-                    blk.apply_image = if pg_version == 14 {
-                        (blk.bimg_info & postgres_ffi::v14::bindings::BKPIMAGE_APPLY) != 0
-                    } else {
-                        assert_eq!(pg_version, 15);
-                        (blk.bimg_info & postgres_ffi::v15::bindings::BKPIMAGE_APPLY) != 0
-                    };
+                    blk.apply_image = dispatch_pgversion!(
+                        pg_version,
+                        (blk.bimg_info & pgv::bindings::BKPIMAGE_APPLY) != 0
+                    );

                    let blk_img_is_compressed =
                        postgres_ffi::bkpimage_is_compressed(blk.bimg_info, pg_version)?;
--- a/pgxn/hnsw/hnsw.control
+++ b/pgxn/hnsw/hnsw.control
@@ -2,4 +2,3 @@ comment = '** Deprecated ** Please use pg_embedding instead'
 default_version = '0.1.0'
 module_pathname = '$libdir/hnsw'
 relocatable = true
-trusted = true
--- a/pgxn/neon/file_cache.c
+++ b/pgxn/neon/file_cache.c
@@ -19,13 +19,16 @@
 #include <fcntl.h>

 #include "postgres.h"
+
+#include "neon_pgversioncompat.h"
+
 #include "funcapi.h"
 #include "miscadmin.h"
 #include "pgstat.h"
 #include "pagestore_client.h"
 #include "access/parallel.h"
 #include "postmaster/bgworker.h"
-#include "storage/relfilenode.h"
+#include RELFILEINFO_HDR
 #include "storage/buf_internals.h"
 #include "storage/latch.h"
 #include "storage/ipc.h"
@@ -77,6 +80,7 @@ typedef struct FileCacheEntry

 typedef struct FileCacheControl
 {
+	uint64 generation; /* generation is needed to handle correct hash reenabling */
 	uint32 size; /* size of cache file in chunks */
 	uint32 used; /* number of used chunks */
 	dlist_head lru; /* double linked list for LRU replacement algorithm */
@@ -88,7 +92,6 @@ static LWLockId lfc_lock;
 static int   lfc_max_size;
 static int   lfc_size_limit;
 static int   lfc_free_space_watermark;
-static bool	lfc_disabled_by_failure = false;
 static char* lfc_path;
 static  FileCacheControl* lfc_ctl;
 static shmem_startup_hook_type prev_shmem_startup_hook;
@@ -97,10 +100,62 @@ static shmem_request_hook_type prev_shmem_request_hook;
 #endif
 static int   lfc_shrinking_factor; /* power of two by which local cache size will be shrinked when lfc_free_space_watermark is reached */

-#define DISABLE_LFC() (lfc_max_size = 0, lfc_disabled_by_failure = true, lfc_desc = -1)
-
 void FileCacheMonitorMain(Datum main_arg);

+/*
+ * Local file cache is optional and Neon can work without it, so on any error with the
+ * cache we disable it instead of throwing an error. 'op' names the failed operation for the log.
+ * Re-enabling must stay possible once the source of failure (lack of disk space, permissions,...) is fixed.
+ * All cache content is invalidated to avoid reading of stale or corrupted data.
+ */
+static void
+lfc_disable(char const* op)
+{
+	HASH_SEQ_STATUS status;
+	FileCacheEntry* entry;
+
+	elog(WARNING, "Failed to %s local file cache at %s: %m, disabling local file cache", op, lfc_path); /* first, while errno is intact for %m */
+
+	if (lfc_desc > 0)
+		close(lfc_desc);
+
+	lfc_desc = -1;
+	lfc_size_limit = 0; /* zero makes the lfc_* entry points take their fast exit */
+
+	/* Invalidate hash */
+	LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
+
+	hash_seq_init(&status, lfc_hash);
+	while ((entry = hash_seq_search(&status)) != NULL)
+	{
+		memset(entry->bitmap, 0, sizeof entry->bitmap); /* before removal: the entry must not be touched afterwards */
+		hash_search(lfc_hash, &entry->key, HASH_REMOVE, NULL);
+	}
+	/* No hash_seq_term(): a scan that ran until hash_seq_search() returned NULL is already closed */
+	lfc_ctl->generation += 1; /* lets lfc_read() notice that its pinned entry is gone */
+	lfc_ctl->size = 0;
+	lfc_ctl->used = 0;
+	dlist_init(&lfc_ctl->lru);
+
+	LWLockRelease(lfc_lock);
+}
+
+static bool
+lfc_ensure_opened(void)
+{
+	/* Descriptor already valid in this backend: nothing to open */
+	if (lfc_desc > 0)
+		return true;
+
+	lfc_desc = BasicOpenFile(lfc_path, O_RDWR|O_CREAT);
+	if (lfc_desc >= 0)
+		return true;
+
+	/* Open failed: lfc_disable() logs a WARNING and switches the cache off */
+	lfc_disable("open");
+	return false;
+}
+
 static void
 lfc_shmem_startup(void)
 {
@@ -126,6 +181,7 @@ lfc_shmem_startup(void)
 								 lfc_size+1, lfc_size+1,
 								 &info,
 								 HASH_ELEM | HASH_BLOBS);
+		lfc_ctl->generation = 0;
 		lfc_ctl->size = 0;
 		lfc_ctl->used = 0;
 		dlist_init(&lfc_ctl->lru);
@@ -166,8 +222,9 @@ lfc_change_limit_hook(int newval, void *extra)
 	/*
 	 * Stats collector detach shared memory, so we should not try to access shared memory here.
 	 * Parallel workers first assign default value (0), so not perform truncation in parallel workers.
+	 * The Postmaster can handle SIGHUP and it has access to shared memory (UsedShmemSegAddr != NULL), but has no PGPROC.
 	 */
-	if (!lfc_ctl || !UsedShmemSegAddr || IsParallelWorker())
+	if (!lfc_ctl || !MyProc || !UsedShmemSegAddr || IsParallelWorker())
 		return;

 	/* Open cache file if not done yet */
@@ -331,7 +388,7 @@ lfc_init(void)
 							   NULL,
 							   NULL);

-	if (lfc_max_size == 0 || lfc_disabled_by_failure)
+	if (lfc_max_size == 0)
 		return;

 	if (lfc_free_space_watermark != 0)
@@ -352,7 +409,7 @@ lfc_init(void)
 * Returns true if page is found in local cache.
 */
 bool
-lfc_cache_contains(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno)
+lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
 {
 	BufferTag tag;
 	FileCacheEntry* entry;
@@ -360,10 +417,10 @@ lfc_cache_contains(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno)
 	bool found;
 	uint32 hash;

-	if (lfc_size_limit == 0 || lfc_disabled_by_failure) /* fast exit if file cache is disabled */
+	if (lfc_size_limit == 0) /* fast exit if file cache is disabled */
 		return false;

-	tag.rnode = rnode;
+	CopyNRelFileInfoToBufTag(tag, rinfo);
 	tag.forkNum = forkNum;
 	tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK-1);
 	hash = get_hash_value(lfc_hash, &tag);
@@ -379,7 +436,7 @@ lfc_cache_contains(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno)
 * Evict a page (if present) from the local file cache
 */
 void
-lfc_evict(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno)
+lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
 {
 	BufferTag tag;
 	FileCacheEntry* entry;
@@ -387,10 +444,12 @@ lfc_evict(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno)
 	int chunk_offs = blkno & (BLOCKS_PER_CHUNK-1);
 	uint32 hash;

-	if (lfc_size_limit == 0 || lfc_disabled_by_failure) /* fast exit if file cache is disabled */
+	if (lfc_size_limit == 0) /* fast exit if file cache is disabled */
 		return;

-	INIT_BUFFERTAG(tag, rnode, forkNum, (blkno & ~(BLOCKS_PER_CHUNK-1)));
+	CopyNRelFileInfoToBufTag(tag, rinfo);
+	tag.forkNum = forkNum;
+	tag.blockNum = (blkno & ~(BLOCKS_PER_CHUNK - 1));

 	hash = get_hash_value(lfc_hash, &tag);

@@ -448,7 +507,7 @@ lfc_evict(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno)
 * In case of error lfc_size_limit is set to zero to disable any further opera-tins with cache.
 */
 bool
-lfc_read(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
+lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 		 char *buffer)
 {
 	BufferTag tag;
@@ -457,11 +516,16 @@ lfc_read(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
 	int chunk_offs = blkno & (BLOCKS_PER_CHUNK-1);
 	bool result = true;
 	uint32 hash;
+	uint64 generation;
+	uint32 entry_offset;

-	if (lfc_size_limit == 0 || lfc_disabled_by_failure) /* fast exit if file cache is disabled */
+	if (lfc_size_limit == 0) /* fast exit if file cache is disabled */
 		return false;

-	tag.rnode = rnode;
+	if (!lfc_ensure_opened())
+		return false;
+
+	CopyNRelFileInfoToBufTag(tag, rinfo);
 	tag.forkNum = forkNum;
 	tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK-1);
 	hash = get_hash_value(lfc_hash, &tag);
@@ -477,37 +541,29 @@ lfc_read(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
 	/* Unlink entry from LRU list to pin it for the duration of IO operation */
 	if (entry->access_count++ == 0)
 		dlist_delete(&entry->lru_node);
+	generation = lfc_ctl->generation;
+	entry_offset = entry->offset;
+
 	LWLockRelease(lfc_lock);

-	/* Open cache file if not done yet */
-	if (lfc_desc <= 0)
+	rc = pread(lfc_desc, buffer, BLCKSZ, ((off_t)entry_offset*BLOCKS_PER_CHUNK + chunk_offs)*BLCKSZ);
+	if (rc != BLCKSZ)
 	{
-		lfc_desc = BasicOpenFile(lfc_path, O_RDWR|O_CREAT);
-
-		if (lfc_desc < 0) {
-			elog(LOG, "Failed to open file cache %s: %m", lfc_path);
-			DISABLE_LFC();
-			result = false;
-		}
-	}
-
-
-	if (lfc_desc > 0)
-	{
-		rc = pread(lfc_desc, buffer, BLCKSZ, ((off_t)entry->offset*BLOCKS_PER_CHUNK + chunk_offs)*BLCKSZ);
-		if (rc != BLCKSZ)
-		{
-			elog(INFO, "Failed to read file cache: %m");
-			DISABLE_LFC();
-			result = false;
-		}
+		lfc_disable("read");
+		return false;
 	}

 	/* Place entry to the head of LRU list */
 	LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
-	Assert(entry->access_count > 0);
-	if (--entry->access_count == 0)
-		dlist_push_tail(&lfc_ctl->lru, &entry->lru_node);
+	if (lfc_ctl->generation == generation)
+	{
+		Assert(entry->access_count > 0);
+		if (--entry->access_count == 0)
+			dlist_push_tail(&lfc_ctl->lru, &entry->lru_node);
+	}
+	else
+		result = false;
+
 	LWLockRelease(lfc_lock);

 	return result;
@@ -518,8 +574,12 @@ lfc_read(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
 * If cache is full then evict some other page.
 */
 void
-lfc_write(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
+lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
+#if PG_MAJORVERSION_NUM < 16
 		  char *buffer)
+#else
+		  const void *buffer)
+#endif
 {
 	BufferTag tag;
 	FileCacheEntry* entry;
@@ -528,12 +588,17 @@ lfc_write(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
 	int chunk_offs = blkno & (BLOCKS_PER_CHUNK-1);
 	uint32 hash;

-	if (lfc_size_limit == 0 || lfc_disabled_by_failure) /* fast exit if file cache is disabled */
+	if (lfc_size_limit == 0) /* fast exit if file cache is disabled */
+		return;
+
+	if (!lfc_ensure_opened())
 		return;

-	tag.rnode = rnode;
 	tag.forkNum = forkNum;
 	tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK-1);
+	
+	CopyNRelFileInfoToBufTag(tag, rinfo);
+	
 	hash = get_hash_value(lfc_hash, &tag);

 	LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
@@ -572,34 +637,23 @@ lfc_write(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
 		entry->access_count = 1;
 		memset(entry->bitmap, 0, sizeof entry->bitmap);
 	}
-	LWLockRelease(lfc_lock);

-	/* Open cache file if not done yet */
-	if (lfc_desc <= 0)
+	rc = pwrite(lfc_desc, buffer, BLCKSZ, ((off_t)entry->offset*BLOCKS_PER_CHUNK + chunk_offs)*BLCKSZ);
+	if (rc != BLCKSZ)
 	{
-		lfc_desc = BasicOpenFile(lfc_path, O_RDWR|O_CREAT);
-		if (lfc_desc < 0) {
-			elog(WARNING, "Failed to open file cache %s: %m, disabling file cache", lfc_path);
-			DISABLE_LFC(); /* disable file cache */
-		}
+		LWLockRelease(lfc_lock);
+		lfc_disable("write");
 	}
-	if (lfc_desc > 0)
+	else
 	{
-		rc = pwrite(lfc_desc, buffer, BLCKSZ, ((off_t)entry->offset*BLOCKS_PER_CHUNK + chunk_offs)*BLCKSZ);
-		if (rc != BLCKSZ)
-		{
-			elog(WARNING, "Failed to write file cache: %m, disabling file cache");
-			DISABLE_LFC(); /* disable file cache */
-		}
-	}
-	/* Place entry to the head of LRU list */
-	LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
-	Assert(entry->access_count > 0);
-	if (--entry->access_count == 0)
-		dlist_push_tail(&lfc_ctl->lru, &entry->lru_node);
-	if (lfc_size_limit != 0)
+		/* Place entry to the head of LRU list */
+		Assert(entry->access_count > 0);
+		if (--entry->access_count == 0)
+			dlist_push_tail(&lfc_ctl->lru, &entry->lru_node);
+
 		entry->bitmap[chunk_offs >> 5] |= (1 << (chunk_offs & 31));
-	LWLockRelease(lfc_lock);
+		LWLockRelease(lfc_lock);
+	}
 }

 /*
@@ -676,8 +730,13 @@ local_cache_pages(PG_FUNCTION_ARGS)
 		tupledesc = CreateTemplateTupleDesc(expected_tupledesc->natts);
 		TupleDescInitEntry(tupledesc, (AttrNumber) 1, "pageoffs",
 						   INT8OID, -1, 0);
+#if PG_MAJORVERSION_NUM < 16
 		TupleDescInitEntry(tupledesc, (AttrNumber) 2, "relfilenode",
 						   OIDOID, -1, 0);
+#else
+		TupleDescInitEntry(tupledesc, (AttrNumber) 2, "relfilenumber",
+						   OIDOID, -1, 0);
+#endif
 		TupleDescInitEntry(tupledesc, (AttrNumber) 3, "reltablespace",
 						   OIDOID, -1, 0);
 		TupleDescInitEntry(tupledesc, (AttrNumber) 4, "reldatabase",
@@ -699,6 +758,7 @@ local_cache_pages(PG_FUNCTION_ARGS)
 			for (int i = 0; i < BLOCKS_PER_CHUNK; i++)
 				n_pages += (entry->bitmap[i >> 5] & (1 << (i & 31))) != 0;
 		}
+		hash_seq_term(&status);
 		fctx->record = (LocalCachePagesRec *)
 			MemoryContextAllocHuge(CurrentMemoryContext,
 								   sizeof(LocalCachePagesRec) * n_pages);
@@ -727,9 +787,9 @@ local_cache_pages(PG_FUNCTION_ARGS)
 				if (entry->bitmap[i >> 5] & (1 << (i & 31)))
 				{
 					fctx->record[n_pages].pageoffs = entry->offset*BLOCKS_PER_CHUNK + i;
-					fctx->record[n_pages].relfilenode = entry->key.rnode.relNode;
-					fctx->record[n_pages].reltablespace = entry->key.rnode.spcNode;
-					fctx->record[n_pages].reldatabase = entry->key.rnode.dbNode;
+					fctx->record[n_pages].relfilenode = NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key));
+					fctx->record[n_pages].reltablespace = NInfoGetSpcOid(BufTagGetNRelFileInfo(entry->key));
+					fctx->record[n_pages].reldatabase = NInfoGetDbOid(BufTagGetNRelFileInfo(entry->key));
 					fctx->record[n_pages].forknum = entry->key.forkNum;
 					fctx->record[n_pages].blocknum = entry->key.blockNum + i;
 					fctx->record[n_pages].accesscount = entry->access_count;
@@ -737,6 +797,7 @@ local_cache_pages(PG_FUNCTION_ARGS)
 				}
 			}
 		}
+		hash_seq_term(&status);
 		Assert(n_pages == funcctx->max_calls);
 		LWLockRelease(lfc_lock);
 	}
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -442,7 +442,7 @@ pg_init_libpagestore(void)
 							"Maximal attempts to reconnect to pages server (with 1 second timeout)",
 							NULL,
 							&max_reconnect_attempts,
-							10, 0, INT_MAX,
+							60, 0, INT_MAX,
 							PGC_USERSET,
 							0,
 							NULL, NULL, NULL);
--- a/pgxn/neon/neon.c
+++ b/pgxn/neon/neon.c
@@ -33,6 +33,14 @@ void		_PG_init(void);
 void
 _PG_init(void)
 {
+	/*
+	 * Also load 'neon_rmgr'. This makes it unnecessary to list both 'neon'
+	 * and 'neon_rmgr' in shared_preload_libraries.
+	 */
+#if PG_VERSION_NUM >= 160000
+	load_file("$libdir/neon_rmgr", false);
+#endif
+
 	pg_init_libpagestore();
 	pg_init_walproposer();

@@ -40,9 +48,9 @@ _PG_init(void)

 	pg_init_extension_server();

-        // Important: This must happen after other parts of the extension
-        // are loaded, otherwise any settings to GUCs that were set before
-        // the extension was loaded will be removed.
+	// Important: This must happen after other parts of the extension
+	// are loaded, otherwise any settings to GUCs that were set before
+	// the extension was loaded will be removed.
 	EmitWarningsOnPlaceholders("neon");
 }

--- a/pgxn/neon/neon_pgversioncompat.h
+++ b/pgxn/neon/neon_pgversioncompat.h
@@ -0,0 +1,112 @@
+/*
+ * Compatibility macros to cover up differences between supported PostgreSQL versions,
+ * to help with compiling the same sources for all of them.
+ */
+
+#ifndef NEON_PGVERSIONCOMPAT_H
+#define NEON_PGVERSIONCOMPAT_H
+
+#define NRelFileInfoBackendIsTemp(rinfo) ((rinfo).backend != InvalidBackendId) /* takes an NRelFileInfoBackend value */
+
+#define RelFileInfoEquals(a, b) ( \
+	NInfoGetSpcOid(a) == NInfoGetSpcOid(b) && \
+	NInfoGetDbOid(a) == NInfoGetDbOid(b) && \
+	NInfoGetRelNumber(a) == NInfoGetRelNumber(b) \
+)
+
+/* buftag population & RelFileNode/RelFileLocator rework */
+#if PG_MAJORVERSION_NUM < 16
+
+#define InitBufferTag(tag, rfn, fn, bn) INIT_BUFFERTAG(*(tag), *(rfn), fn, bn) /* tag and rfn are pointers */
+
+#define USE_RELFILENODE
+
+#define RELFILEINFO_HDR "storage/relfilenode.h" /* used as: #include RELFILEINFO_HDR */
+
+#define NRelFileInfo RelFileNode
+#define NRelFileInfoBackend RelFileNodeBackend
+#define NRelFileNumber Oid
+
+#define InfoFromRelation(rel) (rel)->rd_node
+#define InfoFromSMgrRel(srel) (srel)->smgr_rnode.node
+#define InfoBFromSMgrRel(srel) (srel)->smgr_rnode
+#define InfoFromNInfoB(ninfob) (ninfob).node
+
+#define RelFileInfoFmt(rinfo) \
+	(rinfo).spcNode, \
+	(rinfo).dbNode, \
+	(rinfo).relNode /* expands to three comma-separated values: tablespace, database, relation */
+
+#define RelFileInfoBackendFmt(ninfob) \
+	(ninfob).backend, \
+	(ninfob).node.spcNode, \
+	(ninfob).node.dbNode, \
+	(ninfob).node.relNode /* four comma-separated values: backend first, then as RelFileInfoFmt */
+
+#define NInfoGetSpcOid(ninfo)		(ninfo).spcNode
+#define NInfoGetDbOid(ninfo)		(ninfo).dbNode
+#define NInfoGetRelNumber(ninfo)	(ninfo).relNode
+
+#define CopyNRelFileInfoToBufTag(tag, rinfo) \
+	do { \
+		(tag).rnode = (rinfo); \
+	} while (false) /* no trailing ';' here: the caller supplies it, keeping if/else safe */
+
+#define BufTagGetNRelFileInfo(tag) (tag).rnode
+
+#define SMgrRelGetRelInfo(reln) \
+	((reln)->smgr_rnode.node)
+
+#define DropRelationAllLocalBuffers DropRelFileNodeAllLocalBuffers
+
+#else /* major version >= 16 */
+
+#define USE_RELFILELOCATOR
+
+#define BUFFERTAGS_EQUAL(a, b) BufferTagsEqual(&(a), &(b))
+
+#define RELFILEINFO_HDR "storage/relfilelocator.h" /* used as: #include RELFILEINFO_HDR */
+
+#define NRelFileInfo RelFileLocator
+#define NRelFileInfoBackend RelFileLocatorBackend
+
+#define InfoFromRelation(rel) (rel)->rd_locator
+#define InfoFromSMgrRel(srel) (srel)->smgr_rlocator.locator
+#define InfoBFromSMgrRel(srel) (srel)->smgr_rlocator
+#define InfoFromNInfoB(ninfob) (ninfob).locator
+
+#define RelFileInfoFmt(rinfo) \
+	(rinfo).spcOid, \
+	(rinfo).dbOid, \
+	(rinfo).relNumber /* expands to three comma-separated values: tablespace, database, relation */
+#define RelFileInfoBackendFmt(ninfob) \
+	(ninfob).backend, \
+	(ninfob).locator.spcOid, \
+	(ninfob).locator.dbOid, \
+	(ninfob).locator.relNumber /* four comma-separated values: backend first, then as RelFileInfoFmt */
+
+#define NInfoGetSpcOid(ninfo)		(ninfo).spcOid
+#define NInfoGetDbOid(ninfo)		(ninfo).dbOid
+#define NInfoGetRelNumber(ninfo)	(ninfo).relNumber
+
+#define CopyNRelFileInfoToBufTag(tag, rinfo) \
+	do { \
+		(tag).spcOid = (rinfo).spcOid; \
+		(tag).dbOid = (rinfo).dbOid; \
+		(tag).relNumber = (rinfo).relNumber; \
+	} while (false) /* no trailing ';' here: the caller supplies it, keeping if/else safe */
+
+#define BufTagGetNRelFileInfo(tag) \
+	((RelFileLocator) { \
+		.spcOid = (tag).spcOid, \
+		.dbOid = (tag).dbOid, \
+		.relNumber = (tag).relNumber, \
+	})
+
+#define SMgrRelGetRelInfo(reln) \
+	((reln)->smgr_rlocator) /* NOTE(review): pre-16 variant yields smgr_rnode.node; should this be .locator? confirm */
+
+#define DropRelationAllLocalBuffers DropRelationAllLocalBuffers /* identity mapping: no rename on this branch */
+#endif
+
+#endif //NEON_PGVERSIONCOMPAT_H
--- a/pgxn/neon/pagestore_client.h
+++ b/pgxn/neon/pagestore_client.h
@@ -14,9 +14,10 @@
 #define pageserver_h

 #include "postgres.h"
+#include "neon_pgversioncompat.h"

 #include "access/xlogdefs.h"
-#include "storage/relfilenode.h"
+#include RELFILEINFO_HDR
 #include "storage/block.h"
 #include "storage/smgr.h"
 #include "lib/stringinfo.h"
@@ -71,14 +72,14 @@ typedef struct
 typedef struct
 {
 	NeonRequest req;
-	RelFileNode rnode;
+	NRelFileInfo rinfo;
 	ForkNumber	forknum;
 }			NeonExistsRequest;

 typedef struct
 {
 	NeonRequest req;
-	RelFileNode rnode;
+	NRelFileInfo rinfo;
 	ForkNumber	forknum;
 }			NeonNblocksRequest;

@@ -91,7 +92,7 @@ typedef struct
 typedef struct
 {
 	NeonRequest req;
-	RelFileNode rnode;
+	NRelFileInfo rinfo;
 	ForkNumber	forknum;
 	BlockNumber blkno;
 }			NeonGetPageRequest;
@@ -164,7 +165,7 @@ extern char *neon_tenant;
 extern bool wal_redo;
 extern int32 max_cluster_size;

-extern const f_smgr *smgr_neon(BackendId backend, RelFileNode rnode);
+extern const f_smgr *smgr_neon(BackendId backend, NRelFileInfo rinfo);
 extern void smgr_init_neon(void);
 extern void readahead_buffer_resize(int newsize, void *extra);

@@ -175,19 +176,35 @@ extern void neon_open(SMgrRelation reln);
 extern void neon_close(SMgrRelation reln, ForkNumber forknum);
 extern void neon_create(SMgrRelation reln, ForkNumber forknum, bool isRedo);
 extern bool neon_exists(SMgrRelation reln, ForkNumber forknum);
-extern void neon_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo);
+extern void neon_unlink(NRelFileInfoBackend rnode, ForkNumber forknum, bool isRedo);
+#if PG_MAJORVERSION_NUM < 16
 extern void neon_extend(SMgrRelation reln, ForkNumber forknum,
 						BlockNumber blocknum, char *buffer, bool skipFsync);
+#else
+extern void neon_extend(SMgrRelation reln, ForkNumber forknum,
+						BlockNumber blocknum, const void *buffer, bool skipFsync);
+extern void neon_zeroextend(SMgrRelation reln, ForkNumber forknum,
+							BlockNumber blocknum, int nbuffers, bool skipFsync);
+#endif
+
 extern bool neon_prefetch(SMgrRelation reln, ForkNumber forknum,
 						  BlockNumber blocknum);
+
+#if PG_MAJORVERSION_NUM < 16
 extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 					  char *buffer);
-
-extern void neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
+extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno,
 							 XLogRecPtr request_lsn, bool request_latest, char *buffer);
-
 extern void neon_write(SMgrRelation reln, ForkNumber forknum,
 					   BlockNumber blocknum, char *buffer, bool skipFsync);
+#else
+extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+					  void *buffer);
+extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno,
+							 XLogRecPtr request_lsn, bool request_latest, void *buffer);
+extern void neon_write(SMgrRelation reln, ForkNumber forknum,
+					   BlockNumber blocknum, const void *buffer, bool skipFsync);
+#endif
 extern void neon_writeback(SMgrRelation reln, ForkNumber forknum,
 						   BlockNumber blocknum, BlockNumber nblocks);
 extern BlockNumber neon_nblocks(SMgrRelation reln, ForkNumber forknum);
@@ -198,16 +215,22 @@ extern void neon_immedsync(SMgrRelation reln, ForkNumber forknum);

 /* utils for neon relsize cache */
 extern void relsize_hash_init(void);
-extern bool get_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber *size);
-extern void set_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size);
-extern void update_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size);
-extern void forget_cached_relsize(RelFileNode rnode, ForkNumber forknum);
+extern bool get_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber *size);
+extern void set_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size);
+extern void update_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size);
+extern void forget_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum);

 /* functions for local file cache */
-extern void lfc_write(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, char *buffer);
-extern bool lfc_read(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, char *buffer);
-extern bool lfc_cache_contains(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno);
-extern void lfc_evict(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno);
+#if PG_MAJORVERSION_NUM < 16
+extern void lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
+					  char *buffer);
+#else
+extern void lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
+					  const void *buffer);
+#endif
+extern bool lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, char *buffer);
+extern bool lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno);
+extern void lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno);
 extern void lfc_init(void);


--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -58,7 +58,6 @@
 #include "postmaster/autovacuum.h"
 #include "replication/walsender.h"
 #include "storage/bufmgr.h"
-#include "storage/relfilenode.h"
 #include "storage/buf_internals.h"
 #include "storage/smgr.h"
 #include "storage/md.h"
@@ -86,7 +85,10 @@
 static char *hexdump_page(char *page);
 #endif

-#define IS_LOCAL_REL(reln) (reln->smgr_rnode.node.dbNode != 0 && reln->smgr_rnode.node.relNode > FirstNormalObjectId)
+#define IS_LOCAL_REL(reln) (\
+	NInfoGetDbOid(InfoFromSMgrRel(reln)) != 0 && \
+		NInfoGetRelNumber(InfoFromSMgrRel(reln)) > FirstNormalObjectId \
+)

 const int	SmgrTrace = DEBUG5;

@@ -160,6 +162,7 @@ typedef enum PrefetchStatus {
 typedef struct PrefetchRequest {
 	BufferTag	buftag; /* must be first entry in the struct */
 	XLogRecPtr	effective_request_lsn;
+	XLogRecPtr	actual_request_lsn;
 	NeonResponse *response; /* may be null */
 	PrefetchStatus status;
 	uint64		my_ring_index;
@@ -255,7 +258,7 @@ static bool prefetch_wait_for(uint64 ring_index);
 static void prefetch_cleanup_trailing_unused(void);
 static inline void prefetch_set_unused(uint64 ring_index);

-static XLogRecPtr neon_get_request_lsn(bool *latest, RelFileNode rnode,
+static XLogRecPtr neon_get_request_lsn(bool *latest, NRelFileInfo rinfo,
 									   ForkNumber forknum, BlockNumber blkno);

 static bool
@@ -314,6 +317,7 @@ compact_prefetch_buffers(void)
 		target_slot->status = source_slot->status;
 		target_slot->response = source_slot->response;
 		target_slot->effective_request_lsn = source_slot->effective_request_lsn;
+		target_slot->actual_request_lsn = source_slot->actual_request_lsn;
 		target_slot->my_ring_index = empty_ring_index;

 		prfh_delete(MyPState->prf_hash, source_slot);
@@ -634,7 +638,7 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force
 		.req.tag = T_NeonGetPageRequest,
 		.req.latest = false,
 		.req.lsn = 0,
-		.rnode = slot->buftag.rnode,
+		.rinfo = BufTagGetNRelFileInfo(slot->buftag),
 		.forknum = slot->buftag.forkNum,
 		.blkno = slot->buftag.blockNum,
 	};
@@ -643,13 +647,13 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force
 	{
 		request.req.lsn = *force_lsn;
 		request.req.latest = *force_latest;
-		slot->effective_request_lsn = *force_lsn;
+		slot->actual_request_lsn = slot->effective_request_lsn = *force_lsn;
 	}
 	else
 	{
 		XLogRecPtr lsn = neon_get_request_lsn(
 			&request.req.latest,
-			slot->buftag.rnode,
+			BufTagGetNRelFileInfo(slot->buftag),
 			slot->buftag.forkNum,
 			slot->buftag.blockNum
 		);
@@ -671,7 +675,7 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force
 		 * The best LSN to use for effective_request_lsn would be
 		 * XLogCtl->Insert.RedoRecPtr, but that's expensive to access.
 		 */
-		request.req.lsn = lsn;
+		slot->actual_request_lsn = request.req.lsn = lsn;
 		prefetch_lsn = Max(prefetch_lsn, lsn);
 		slot->effective_request_lsn = prefetch_lsn;
 	}
@@ -893,9 +897,9 @@ nm_pack_request(NeonRequest * msg)

 				pq_sendbyte(&s, msg_req->req.latest);
 				pq_sendint64(&s, msg_req->req.lsn);
-				pq_sendint32(&s, msg_req->rnode.spcNode);
-				pq_sendint32(&s, msg_req->rnode.dbNode);
-				pq_sendint32(&s, msg_req->rnode.relNode);
+				pq_sendint32(&s, NInfoGetSpcOid(msg_req->rinfo));
+				pq_sendint32(&s, NInfoGetDbOid(msg_req->rinfo));
+				pq_sendint32(&s, NInfoGetRelNumber(msg_req->rinfo));
 				pq_sendbyte(&s, msg_req->forknum);

 				break;
@@ -906,9 +910,9 @@ nm_pack_request(NeonRequest * msg)

 				pq_sendbyte(&s, msg_req->req.latest);
 				pq_sendint64(&s, msg_req->req.lsn);
-				pq_sendint32(&s, msg_req->rnode.spcNode);
-				pq_sendint32(&s, msg_req->rnode.dbNode);
-				pq_sendint32(&s, msg_req->rnode.relNode);
+				pq_sendint32(&s, NInfoGetSpcOid(msg_req->rinfo));
+				pq_sendint32(&s, NInfoGetDbOid(msg_req->rinfo));
+				pq_sendint32(&s, NInfoGetRelNumber(msg_req->rinfo));
 				pq_sendbyte(&s, msg_req->forknum);

 				break;
@@ -929,9 +933,9 @@ nm_pack_request(NeonRequest * msg)

 				pq_sendbyte(&s, msg_req->req.latest);
 				pq_sendint64(&s, msg_req->req.lsn);
-				pq_sendint32(&s, msg_req->rnode.spcNode);
-				pq_sendint32(&s, msg_req->rnode.dbNode);
-				pq_sendint32(&s, msg_req->rnode.relNode);
+				pq_sendint32(&s, NInfoGetSpcOid(msg_req->rinfo));
+				pq_sendint32(&s, NInfoGetDbOid(msg_req->rinfo));
+				pq_sendint32(&s, NInfoGetRelNumber(msg_req->rinfo));
 				pq_sendbyte(&s, msg_req->forknum);
 				pq_sendint32(&s, msg_req->blkno);

@@ -1063,10 +1067,7 @@ nm_to_string(NeonMessage * msg)
 				NeonExistsRequest *msg_req = (NeonExistsRequest *) msg;

 				appendStringInfoString(&s, "{\"type\": \"NeonExistsRequest\"");
-				appendStringInfo(&s, ", \"rnode\": \"%u/%u/%u\"",
-								 msg_req->rnode.spcNode,
-								 msg_req->rnode.dbNode,
-								 msg_req->rnode.relNode);
+				appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo));
 				appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum);
 				appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn));
 				appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest);
@@ -1079,10 +1080,7 @@ nm_to_string(NeonMessage * msg)
 				NeonNblocksRequest *msg_req = (NeonNblocksRequest *) msg;

 				appendStringInfoString(&s, "{\"type\": \"NeonNblocksRequest\"");
-				appendStringInfo(&s, ", \"rnode\": \"%u/%u/%u\"",
-								 msg_req->rnode.spcNode,
-								 msg_req->rnode.dbNode,
-								 msg_req->rnode.relNode);
+				appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo));
 				appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum);
 				appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn));
 				appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest);
@@ -1095,10 +1093,7 @@ nm_to_string(NeonMessage * msg)
 				NeonGetPageRequest *msg_req = (NeonGetPageRequest *) msg;

 				appendStringInfoString(&s, "{\"type\": \"NeonGetPageRequest\"");
-				appendStringInfo(&s, ", \"rnode\": \"%u/%u/%u\"",
-								 msg_req->rnode.spcNode,
-								 msg_req->rnode.dbNode,
-								 msg_req->rnode.relNode);
+				appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo));
 				appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum);
 				appendStringInfo(&s, ", \"blkno\": %u", msg_req->blkno);
 				appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn));
@@ -1187,13 +1182,13 @@ nm_to_string(NeonMessage * msg)
 * directly because it skips the logging if the LSN is new enough.
 */
 static XLogRecPtr
-log_newpage_copy(RelFileNode *rnode, ForkNumber forkNum, BlockNumber blkno,
+log_newpage_copy(NRelFileInfo *rinfo, ForkNumber forkNum, BlockNumber blkno,
 				 Page page, bool page_std)
 {
 	PGAlignedBlock copied_buffer;

 	memcpy(copied_buffer.data, page, BLCKSZ);
-	return log_newpage(rnode, forkNum, blkno, copied_buffer.data, page_std);
+	return log_newpage(rinfo, forkNum, blkno, copied_buffer.data, page_std);
 }

 /*
@@ -1210,9 +1205,14 @@ PageIsEmptyHeapPage(char *buffer)
 }

 static void
-neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool force)
+neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+#if PG_MAJORVERSION_NUM < 16
+				 char *buffer, bool force)
+#else
+				 const char *buffer, bool force) 
+#endif
 {
-	XLogRecPtr	lsn = PageGetLSN(buffer);
+	XLogRecPtr	lsn = PageGetLSN((Page) buffer);

 	if (ShutdownRequestPending)
 		return;
@@ -1232,15 +1232,14 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, ch
 		/* FSM is never WAL-logged and we don't care. */
 		XLogRecPtr	recptr;

-		recptr = log_newpage_copy(&reln->smgr_rnode.node, forknum, blocknum, buffer, false);
+		recptr = log_newpage_copy(&InfoFromSMgrRel(reln), forknum, blocknum,
+								  (Page) buffer, false);
 		XLogFlush(recptr);
 		lsn = recptr;
 		ereport(SmgrTrace,
 				(errmsg("Page %u of relation %u/%u/%u.%u was force logged. Evicted at lsn=%X/%X",
 						blocknum,
-						reln->smgr_rnode.node.spcNode,
-						reln->smgr_rnode.node.dbNode,
-						reln->smgr_rnode.node.relNode,
+						RelFileInfoFmt(InfoFromSMgrRel(reln)),
 						forknum, LSN_FORMAT_ARGS(lsn))));
 	}
 	else if (lsn == InvalidXLogRecPtr)
@@ -1263,24 +1262,20 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, ch
 		 * sign: it implies that the page was not WAL-logged, and its contents
 		 * will be lost when it's evicted.
 		 */
-		if (PageIsNew(buffer))
+		if (PageIsNew((Page) buffer))
 		{
 			ereport(SmgrTrace,
 					(errmsg("Page %u of relation %u/%u/%u.%u is all-zeros",
 							blocknum,
-							reln->smgr_rnode.node.spcNode,
-							reln->smgr_rnode.node.dbNode,
-							reln->smgr_rnode.node.relNode,
+							RelFileInfoFmt(InfoFromSMgrRel(reln)),
 							forknum)));
 		}
-		else if (PageIsEmptyHeapPage(buffer))
+		else if (PageIsEmptyHeapPage((Page) buffer))
 		{
 			ereport(SmgrTrace,
 					(errmsg("Page %u of relation %u/%u/%u.%u is an empty heap page with no LSN",
 							blocknum,
-							reln->smgr_rnode.node.spcNode,
-							reln->smgr_rnode.node.dbNode,
-							reln->smgr_rnode.node.relNode,
+							RelFileInfoFmt(InfoFromSMgrRel(reln)),
 							forknum)));
 		}
 		else
@@ -1288,9 +1283,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, ch
 			ereport(PANIC,
 					(errmsg("Page %u of relation %u/%u/%u.%u is evicted with zero LSN",
 							blocknum,
-							reln->smgr_rnode.node.spcNode,
-							reln->smgr_rnode.node.dbNode,
-							reln->smgr_rnode.node.relNode,
+							RelFileInfoFmt(InfoFromSMgrRel(reln)),
 							forknum)));
 		}
 	}
@@ -1299,9 +1292,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, ch
 		ereport(SmgrTrace,
 				(errmsg("Page %u of relation %u/%u/%u.%u is already wal logged at lsn=%X/%X",
 						blocknum,
-						reln->smgr_rnode.node.spcNode,
-						reln->smgr_rnode.node.dbNode,
-						reln->smgr_rnode.node.relNode,
+						RelFileInfoFmt(InfoFromSMgrRel(reln)),
 						forknum, LSN_FORMAT_ARGS(lsn))));
 	}

@@ -1309,7 +1300,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, ch
 	 * Remember the LSN on this page. When we read the page again, we must
 	 * read the same or newer version of it.
 	 */
-	SetLastWrittenLSNForBlock(lsn, reln->smgr_rnode.node, forknum, blocknum);
+	SetLastWrittenLSNForBlock(lsn, InfoFromSMgrRel(reln), forknum, blocknum);
 }

 /*
@@ -1379,7 +1370,7 @@ nm_adjust_lsn(XLogRecPtr lsn)
 * Return LSN for requesting pages and number of blocks from page server
 */
 static XLogRecPtr
-neon_get_request_lsn(bool *latest, RelFileNode rnode, ForkNumber forknum, BlockNumber blkno)
+neon_get_request_lsn(bool *latest, NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno)
 {
 	XLogRecPtr	lsn;

@@ -1394,7 +1385,7 @@ neon_get_request_lsn(bool *latest, RelFileNode rnode, ForkNumber forknum, BlockN
 		/*
 		 * Get the last written LSN of this page.
 		 */
-		lsn = GetLastWrittenLSN(rnode, forknum, blkno);
+		lsn = GetLastWrittenLSN(rinfo, forknum, blkno);
 		lsn = nm_adjust_lsn(lsn);

 		elog(DEBUG1, "neon_get_request_lsn GetXLogReplayRecPtr %X/%X request lsn 0 ",
@@ -1416,7 +1407,7 @@ neon_get_request_lsn(bool *latest, RelFileNode rnode, ForkNumber forknum, BlockN
 		 * so our request cannot concern those.
 		 */
 		*latest = true;
-		lsn = GetLastWrittenLSN(rnode, forknum, blkno);
+		lsn = GetLastWrittenLSN(rinfo, forknum, blkno);
 		Assert(lsn != InvalidXLogRecPtr);
 		elog(DEBUG1, "neon_get_request_lsn GetLastWrittenLSN lsn %X/%X ",
 			 (uint32) ((lsn) >> 32), (uint32) (lsn));
@@ -1485,7 +1476,7 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
 			elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
 	}

-	if (get_cached_relsize(reln->smgr_rnode.node, forkNum, &n_blocks))
+	if (get_cached_relsize(InfoFromSMgrRel(reln), forkNum, &n_blocks))
 	{
 		return true;
 	}
@@ -1500,20 +1491,26 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
 	 *
 	 * For now, handle that special case here.
 	 */
+#if PG_MAJORVERSION_NUM >= 16
+	if (reln->smgr_rlocator.locator.spcOid == 0 &&
+		reln->smgr_rlocator.locator.dbOid == 0 &&
+		reln->smgr_rlocator.locator.relNumber == 0)
+#else
 	if (reln->smgr_rnode.node.spcNode == 0 &&
 		reln->smgr_rnode.node.dbNode == 0 &&
 		reln->smgr_rnode.node.relNode == 0)
+#endif
 	{
 		return false;
 	}

-	request_lsn = neon_get_request_lsn(&latest, reln->smgr_rnode.node, forkNum, REL_METADATA_PSEUDO_BLOCKNO);
+	request_lsn = neon_get_request_lsn(&latest, InfoFromSMgrRel(reln), forkNum, REL_METADATA_PSEUDO_BLOCKNO);
 	{
 		NeonExistsRequest request = {
 			.req.tag = T_NeonExistsRequest,
 			.req.latest = latest,
 			.req.lsn = request_lsn,
-			.rnode = reln->smgr_rnode.node,
+			.rinfo = InfoFromSMgrRel(reln),
 		.forknum = forkNum};

 		resp = page_server_request(&request);
@@ -1529,9 +1526,7 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
 			ereport(ERROR,
 					(errcode(ERRCODE_IO_ERROR),
 					 errmsg("could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X",
-							reln->smgr_rnode.node.spcNode,
-							reln->smgr_rnode.node.dbNode,
-							reln->smgr_rnode.node.relNode,
+							RelFileInfoFmt(InfoFromSMgrRel(reln)),
 							forkNum,
 							(uint32) (request_lsn >> 32), (uint32) request_lsn),
 					 errdetail("page server returned error: %s",
@@ -1571,9 +1566,7 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
 	}

 	elog(SmgrTrace, "Create relation %u/%u/%u.%u",
-		 reln->smgr_rnode.node.spcNode,
-		 reln->smgr_rnode.node.dbNode,
-		 reln->smgr_rnode.node.relNode,
+		 RelFileInfoFmt(InfoFromSMgrRel(reln)),
 		 forkNum);

 	/*
@@ -1597,12 +1590,12 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
 	 */
 	if (isRedo)
 	{
-		update_cached_relsize(reln->smgr_rnode.node, forkNum, 0);
-		get_cached_relsize(reln->smgr_rnode.node, forkNum,
+		update_cached_relsize(InfoFromSMgrRel(reln), forkNum, 0);
+		get_cached_relsize(InfoFromSMgrRel(reln), forkNum,
 						   &reln->smgr_cached_nblocks[forkNum]);
 	}
 	else
-		set_cached_relsize(reln->smgr_rnode.node, forkNum, 0);
+		set_cached_relsize(InfoFromSMgrRel(reln), forkNum, 0);

 #ifdef DEBUG_COMPARE_LOCAL
 	if (IS_LOCAL_REL(reln))
@@ -1629,17 +1622,17 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
 * we are usually not in a transaction anymore when this is called.
 */
 void
-neon_unlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo)
+neon_unlink(NRelFileInfoBackend rinfo, ForkNumber forkNum, bool isRedo)
 {
 	/*
 	 * Might or might not exist locally, depending on whether it's an unlogged
 	 * or permanent relation (or if DEBUG_COMPARE_LOCAL is set). Try to
 	 * unlink, it won't do any harm if the file doesn't exist.
 	 */
-	mdunlink(rnode, forkNum, isRedo);
-	if (!RelFileNodeBackendIsTemp(rnode))
+	mdunlink(rinfo, forkNum, isRedo);
+	if (!NRelFileInfoBackendIsTemp(rinfo))
 	{
-		forget_cached_relsize(rnode.node, forkNum);
+		forget_cached_relsize(InfoFromNInfoB(rinfo), forkNum);
 	}
 }

@@ -1653,8 +1646,13 @@ neon_unlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo)
 *		causes intervening file space to become filled with zeroes.
 */
 void
+#if PG_MAJORVERSION_NUM < 16
 neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
 			char *buffer, bool skipFsync)
+#else
+neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
+			const void *buffer, bool skipFsync)
+#endif
 {
 	XLogRecPtr	lsn;
 	BlockNumber	n_blocks = 0;
@@ -1707,17 +1705,15 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
 		neon_wallog_page(reln, forkNum, n_blocks++, buffer, true);

 	neon_wallog_page(reln, forkNum, blkno, buffer, false);
-	set_cached_relsize(reln->smgr_rnode.node, forkNum, blkno + 1);
+	set_cached_relsize(InfoFromSMgrRel(reln), forkNum, blkno + 1);

-	lsn = PageGetLSN(buffer);
+	lsn = PageGetLSN((Page) buffer);
 	elog(SmgrTrace, "smgrextend called for %u/%u/%u.%u blk %u, page LSN: %X/%08X",
-		 reln->smgr_rnode.node.spcNode,
-		 reln->smgr_rnode.node.dbNode,
-		 reln->smgr_rnode.node.relNode,
+		 RelFileInfoFmt(InfoFromSMgrRel(reln)),
 		 forkNum, blkno,
 		 (uint32) (lsn >> 32), (uint32) lsn);

-	lfc_write(reln->smgr_rnode.node, forkNum, blkno, buffer);
+	lfc_write(InfoFromSMgrRel(reln), forkNum, blkno, buffer);

 #ifdef DEBUG_COMPARE_LOCAL
 	if (IS_LOCAL_REL(reln))
@@ -1732,11 +1728,98 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
 	if (lsn == InvalidXLogRecPtr)
 	{
 		lsn = GetXLogInsertRecPtr();
-		SetLastWrittenLSNForBlock(lsn, reln->smgr_rnode.node, forkNum, blkno);
+		SetLastWrittenLSNForBlock(lsn, InfoFromSMgrRel(reln), forkNum, blkno);
 	}
-	SetLastWrittenLSNForRelation(lsn, reln->smgr_rnode.node, forkNum);
+	SetLastWrittenLSNForRelation(lsn, InfoFromSMgrRel(reln), forkNum);
 }

+#if PG_MAJORVERSION_NUM >= 16
+/*
+ *	neon_zeroextend() -- Add nblocks all-zero pages to a relation fork, starting at blocknum.
+ */
+void
+neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
+				int nblocks, bool skipFsync)
+{
+	const PGAlignedBlock buffer = {0};	/* the zero page that gets logged and cached */
+	int			remblocks = nblocks;
+	XLogRecPtr	lsn = 0;
+
+	switch (reln->smgr_relpersistence)
+	{
+		case 0:
+			elog(ERROR, "cannot call smgrextend() on rel with unknown persistence");
+		case RELPERSISTENCE_PERMANENT:
+			break;
+		case RELPERSISTENCE_TEMP:
+		case RELPERSISTENCE_UNLOGGED:
+			mdzeroextend(reln, forkNum, blocknum, nblocks, skipFsync);
+			return;
+		default:
+			elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
+	}
+
+	if (max_cluster_size > 0 &&
+		reln->smgr_relpersistence == RELPERSISTENCE_PERMANENT &&
+		!IsAutoVacuumWorkerProcess())
+	{
+		uint64		current_size = GetZenithCurrentClusterSize();
+
+		if (current_size >= ((uint64) max_cluster_size) * 1024 * 1024)
+			ereport(ERROR,
+					(errcode(ERRCODE_DISK_FULL),
+					 errmsg("could not extend file because cluster size limit (%d MB) has been exceeded",
+							max_cluster_size),
+					 errhint("This limit is defined by neon.max_cluster_size GUC")));
+	}
+
+	/*
+	 * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any
+	 * more --- we mustn't create a block whose number actually is
+	 * InvalidBlockNumber or larger.
+	 */
+	if ((uint64) blocknum + nblocks >= (uint64) InvalidBlockNumber)
+		ereport(ERROR,
+				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+					errmsg("cannot extend file \"%s\" beyond %u blocks",
+						   relpath(reln->smgr_rlocator, forkNum),
+						   InvalidBlockNumber)));
+
+	/* Don't log any pages if we're not allowed to do so. */
+	if (!XLogInsertAllowed())
+		return;
+
+	/* Each record registers up to XLR_MAX_BLOCK_ID blocks; reserve WAL-insert space first. */
+	XLogEnsureRecordSpace(Min(remblocks, (XLR_MAX_BLOCK_ID - 1)), 0);
+	while (remblocks > 0)
+	{
+		int			count = Min(remblocks, XLR_MAX_BLOCK_ID);
+
+		XLogBeginInsert();
+
+		for (int i = 0; i < count; i++)
+			XLogRegisterBlock(i, &InfoFromSMgrRel(reln), forkNum, blocknum + i,
+							  (char *) buffer.data, REGBUF_FORCE_IMAGE | REGBUF_STANDARD);
+
+		lsn = XLogInsert(RM_XLOG_ID, XLOG_FPI);
+
+		for (int i = 0; i < count; i++)
+		{
+			lfc_write(InfoFromSMgrRel(reln), forkNum, blocknum + i, buffer.data);
+			SetLastWrittenLSNForBlock(lsn, InfoFromSMgrRel(reln), forkNum,
+									  blocknum + i);
+		}
+
+		blocknum += count;		/* blocknum ends up as the new relation size */
+		remblocks -= count;
+	}
+
+	Assert(lsn != 0);
+	SetLastWrittenLSNForRelation(lsn, InfoFromSMgrRel(reln), forkNum);
+	set_cached_relsize(InfoFromSMgrRel(reln), forkNum, blocknum);
+}
+#endif
+
 /*
 *  neon_open() -- Initialize newly-opened relation.
 */
@@ -1792,14 +1875,14 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
 			elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
 	}

-	if (lfc_cache_contains(reln->smgr_rnode.node, forknum, blocknum))
+	if (lfc_cache_contains(InfoFromSMgrRel(reln), forknum, blocknum))
 		return false;

 	tag = (BufferTag) {
-		.rnode = reln->smgr_rnode.node,
 		.forkNum = forknum,
 		.blockNum = blocknum
 	};
+	CopyNRelFileInfoToBufTag(tag, InfoFromSMgrRel(reln));

 	ring_index = prefetch_register_buffer(tag, NULL, NULL);

@@ -1851,9 +1934,15 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum,
 * While function is defined in the neon extension it's used within neon_test_utils directly.
 * To avoid breaking tests in the runtime please keep function signature in sync.
 */
-void
-neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
+#if PG_MAJORVERSION_NUM < 16
+void PGDLLEXPORT
+neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 				 XLogRecPtr request_lsn, bool request_latest, char *buffer)
+#else
+void PGDLLEXPORT
+neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
+				 XLogRecPtr request_lsn, bool request_latest, void *buffer)
+#endif
 {
 	NeonResponse *resp;
 	BufferTag	buftag;
@@ -1862,11 +1951,12 @@ neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
 	PrefetchRequest *slot;

 	buftag = (BufferTag) {
-		.rnode = rnode,
 		.forkNum = forkNum,
 		.blockNum = blkno,
 	};

+	CopyNRelFileInfoToBufTag(buftag, rinfo);
+
 	/*
 	 * The redo process does not lock pages that it needs to replay but are
 	 * not in the shared buffers, so a concurrent process may request the
@@ -1957,7 +2047,7 @@ neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
 	{
 		case T_NeonGetPageResponse:
 			memcpy(buffer, ((NeonGetPageResponse *) resp)->page, BLCKSZ);
-			lfc_write(rnode, forkNum, blkno, buffer);
+			lfc_write(rinfo, forkNum, blkno, buffer);
 			break;

 		case T_NeonErrorResponse:
@@ -1965,9 +2055,7 @@ neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
 					(errcode(ERRCODE_IO_ERROR),
 					 errmsg("could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X",
 							blkno,
-							rnode.spcNode,
-							rnode.dbNode,
-							rnode.relNode,
+							RelFileInfoFmt(rinfo),
 							forkNum,
 							(uint32) (request_lsn >> 32), (uint32) request_lsn),
 					 errdetail("page server returned error: %s",
@@ -1987,7 +2075,11 @@ neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
 */
 void
 neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
+#if PG_MAJORVERSION_NUM < 16
 		  char *buffer)
+#else
+		  void *buffer)
+#endif
 {
 	bool		latest;
 	XLogRecPtr	request_lsn;
@@ -2010,13 +2102,13 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
 	}

 	/* Try to read from local file cache */
-	if (lfc_read(reln->smgr_rnode.node, forkNum, blkno, buffer))
+	if (lfc_read(InfoFromSMgrRel(reln), forkNum, blkno, buffer))
 	{
 		return;
 	}

-	request_lsn = neon_get_request_lsn(&latest, reln->smgr_rnode.node, forkNum, blkno);
-	neon_read_at_lsn(reln->smgr_rnode.node, forkNum, blkno, request_lsn, latest, buffer);
+	request_lsn = neon_get_request_lsn(&latest, InfoFromSMgrRel(reln), forkNum, blkno);
+	neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsn, latest, buffer);

 #ifdef DEBUG_COMPARE_LOCAL
 	if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln))
@@ -2030,27 +2122,23 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
 		memcpy(pageserver_masked, buffer, BLCKSZ);
 		memcpy(mdbuf_masked, mdbuf, BLCKSZ);

-		if (PageIsNew(mdbuf))
+		if (PageIsNew((Page) mdbuf))
 		{
-			if (!PageIsNew(pageserver_masked))
+			if (!PageIsNew((Page) pageserver_masked))
 			{
 				elog(PANIC, "page is new in MD but not in Page Server at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n",
 					 blkno,
-					 reln->smgr_rnode.node.spcNode,
-					 reln->smgr_rnode.node.dbNode,
-					 reln->smgr_rnode.node.relNode,
+					 RelFileInfoFmt(InfoFromSMgrRel(reln)),
 					 forkNum,
 					 (uint32) (request_lsn >> 32), (uint32) request_lsn,
 					 hexdump_page(buffer));
 			}
 		}
-		else if (PageIsNew(buffer))
+		else if (PageIsNew((Page) buffer))
 		{
 			elog(PANIC, "page is new in Page Server but not in MD at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n",
 				 blkno,
-				 reln->smgr_rnode.node.spcNode,
-				 reln->smgr_rnode.node.dbNode,
-				 reln->smgr_rnode.node.relNode,
+				 RelFileInfoFmt(InfoFromSMgrRel(reln)),
 				 forkNum,
 				 (uint32) (request_lsn >> 32), (uint32) request_lsn,
 				 hexdump_page(mdbuf));
@@ -2065,9 +2153,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
 			{
 				elog(PANIC, "heap buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n",
 					 blkno,
-					 reln->smgr_rnode.node.spcNode,
-					 reln->smgr_rnode.node.dbNode,
-					 reln->smgr_rnode.node.relNode,
+					 RelFileInfoFmt(InfoFromSMgrRel(reln)),
 					 forkNum,
 					 (uint32) (request_lsn >> 32), (uint32) request_lsn,
 					 hexdump_page(mdbuf_masked),
@@ -2086,9 +2172,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
 				{
 					elog(PANIC, "btree buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n",
 						 blkno,
-						 reln->smgr_rnode.node.spcNode,
-						 reln->smgr_rnode.node.dbNode,
-						 reln->smgr_rnode.node.relNode,
+						 RelFileInfoFmt(InfoFromSMgrRel(reln)),
 						 forkNum,
 						 (uint32) (request_lsn >> 32), (uint32) request_lsn,
 						 hexdump_page(mdbuf_masked),
@@ -2130,7 +2214,11 @@ hexdump_page(char *page)
 */
 void
 neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+#if PG_MAJORVERSION_NUM < 16
 		   char *buffer, bool skipFsync)
+#else
+		   const void *buffer, bool skipFsync)
+#endif
 {
 	XLogRecPtr	lsn;

@@ -2168,15 +2256,13 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,

 	neon_wallog_page(reln, forknum, blocknum, buffer, false);

-	lsn = PageGetLSN(buffer);
+	lsn = PageGetLSN((Page) buffer);
 	elog(SmgrTrace, "smgrwrite called for %u/%u/%u.%u blk %u, page LSN: %X/%08X",
-		 reln->smgr_rnode.node.spcNode,
-		 reln->smgr_rnode.node.dbNode,
-		 reln->smgr_rnode.node.relNode,
+		 RelFileInfoFmt(InfoFromSMgrRel(reln)),
 		 forknum, blocknum,
 		 (uint32) (lsn >> 32), (uint32) lsn);

-	lfc_write(reln->smgr_rnode.node, forknum, blocknum, buffer);
+	lfc_write(InfoFromSMgrRel(reln), forknum, blocknum, buffer);

 #ifdef DEBUG_COMPARE_LOCAL
 	if (IS_LOCAL_REL(reln))
@@ -2212,23 +2298,21 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
 			elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
 	}

-	if (get_cached_relsize(reln->smgr_rnode.node, forknum, &n_blocks))
+	if (get_cached_relsize(InfoFromSMgrRel(reln), forknum, &n_blocks))
 	{
 		elog(SmgrTrace, "cached nblocks for %u/%u/%u.%u: %u blocks",
-			 reln->smgr_rnode.node.spcNode,
-			 reln->smgr_rnode.node.dbNode,
-			 reln->smgr_rnode.node.relNode,
+			 RelFileInfoFmt(InfoFromSMgrRel(reln)),
 			 forknum, n_blocks);
 		return n_blocks;
 	}

-	request_lsn = neon_get_request_lsn(&latest, reln->smgr_rnode.node, forknum, REL_METADATA_PSEUDO_BLOCKNO);
+	request_lsn = neon_get_request_lsn(&latest, InfoFromSMgrRel(reln), forknum, REL_METADATA_PSEUDO_BLOCKNO);
 	{
 		NeonNblocksRequest request = {
 			.req.tag = T_NeonNblocksRequest,
 			.req.latest = latest,
 			.req.lsn = request_lsn,
-			.rnode = reln->smgr_rnode.node,
+			.rinfo = InfoFromSMgrRel(reln),
 			.forknum = forknum,
 		};

@@ -2245,9 +2329,7 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
 			ereport(ERROR,
 					(errcode(ERRCODE_IO_ERROR),
 					 errmsg("could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X",
-							reln->smgr_rnode.node.spcNode,
-							reln->smgr_rnode.node.dbNode,
-							reln->smgr_rnode.node.relNode,
+							RelFileInfoFmt(InfoFromSMgrRel(reln)),
 							forknum,
 							(uint32) (request_lsn >> 32), (uint32) request_lsn),
 					 errdetail("page server returned error: %s",
@@ -2257,12 +2339,10 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
 		default:
 			elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
 	}
-	update_cached_relsize(reln->smgr_rnode.node, forknum, n_blocks);
+	update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks);

 	elog(SmgrTrace, "neon_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks",
-		 reln->smgr_rnode.node.spcNode,
-		 reln->smgr_rnode.node.dbNode,
-		 reln->smgr_rnode.node.relNode,
+		 RelFileInfoFmt(InfoFromSMgrRel(reln)),
 		 forknum,
 		 (uint32) (request_lsn >> 32), (uint32) request_lsn,
 		 n_blocks);
@@ -2281,7 +2361,7 @@ neon_dbsize(Oid dbNode)
 	int64		db_size;
 	XLogRecPtr	request_lsn;
 	bool		latest;
-	RelFileNode dummy_node = {InvalidOid, InvalidOid, InvalidOid};
+	NRelFileInfo dummy_node = {0};

 	request_lsn = neon_get_request_lsn(&latest, dummy_node, MAIN_FORKNUM, REL_METADATA_PSEUDO_BLOCKNO);
 	{
@@ -2350,7 +2430,7 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
 			elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
 	}

-	set_cached_relsize(reln->smgr_rnode.node, forknum, nblocks);
+	set_cached_relsize(InfoFromSMgrRel(reln), forknum, nblocks);

 	/*
 	 * Truncating a relation drops all its buffers from the buffer cache
@@ -2378,7 +2458,7 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
 	 * for the extended pages, so there's no harm in leaving behind obsolete
 	 * entries for the truncated chunks.
 	 */
-	SetLastWrittenLSNForRelation(lsn, reln->smgr_rnode.node, forknum);
+	SetLastWrittenLSNForRelation(lsn, InfoFromSMgrRel(reln), forknum);

 #ifdef DEBUG_COMPARE_LOCAL
 	if (IS_LOCAL_REL(reln))
@@ -2448,9 +2528,7 @@ neon_start_unlogged_build(SMgrRelation reln)

 	ereport(SmgrTrace,
 			(errmsg("starting unlogged build of relation %u/%u/%u",
-					reln->smgr_rnode.node.spcNode,
-					reln->smgr_rnode.node.dbNode,
-					reln->smgr_rnode.node.relNode)));
+					RelFileInfoFmt(InfoFromSMgrRel(reln)))));

 	switch (reln->smgr_relpersistence)
 	{
@@ -2500,9 +2578,7 @@ neon_finish_unlogged_build_phase_1(SMgrRelation reln)

 	ereport(SmgrTrace,
 			(errmsg("finishing phase 1 of unlogged build of relation %u/%u/%u",
-					reln->smgr_rnode.node.spcNode,
-					reln->smgr_rnode.node.dbNode,
-					reln->smgr_rnode.node.relNode)));
+					RelFileInfoFmt(InfoFromSMgrRel(reln)))));

 	if (unlogged_build_phase == UNLOGGED_BUILD_NOT_PERMANENT)
 		return;
@@ -2525,18 +2601,16 @@ neon_finish_unlogged_build_phase_1(SMgrRelation reln)
 static void
 neon_end_unlogged_build(SMgrRelation reln)
 {
+	NRelFileInfoBackend rinfob = InfoBFromSMgrRel(reln);
+
 	Assert(unlogged_build_rel == reln);

 	ereport(SmgrTrace,
 			(errmsg("ending unlogged build of relation %u/%u/%u",
-					reln->smgr_rnode.node.spcNode,
-					reln->smgr_rnode.node.dbNode,
-					reln->smgr_rnode.node.relNode)));
+					RelFileInfoFmt(InfoFromNInfoB(rinfob)))));

 	if (unlogged_build_phase != UNLOGGED_BUILD_NOT_PERMANENT)
 	{
-		RelFileNodeBackend rnode;
-
 		Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_2);
 		Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED);

@@ -2544,19 +2618,17 @@ neon_end_unlogged_build(SMgrRelation reln)
 		reln->smgr_relpersistence = RELPERSISTENCE_PERMANENT;

 		/* Remove local copy */
-		rnode = reln->smgr_rnode;
+		rinfob = InfoBFromSMgrRel(reln);
 		for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++)
 		{
 			elog(SmgrTrace, "forgetting cached relsize for %u/%u/%u.%u",
-				 rnode.node.spcNode,
-				 rnode.node.dbNode,
-				 rnode.node.relNode,
+				 RelFileInfoFmt(InfoFromNInfoB(rinfob)),
 				 forknum);

-			forget_cached_relsize(rnode.node, forknum);
+			forget_cached_relsize(InfoFromNInfoB(rinfob), forknum);
 			mdclose(reln, forknum);
 			/* use isRedo == true, so that we drop it immediately */
-			mdunlink(rnode, forknum, true);
+			mdunlink(rinfob, forknum, true);
 		}
 	}

@@ -2608,6 +2680,9 @@ static const struct f_smgr neon_smgr =
 	.smgr_exists = neon_exists,
 	.smgr_unlink = neon_unlink,
 	.smgr_extend = neon_extend,
+#if PG_MAJORVERSION_NUM >= 16
+	.smgr_zeroextend = neon_zeroextend,
+#endif
 	.smgr_prefetch = neon_prefetch,
 	.smgr_read = neon_read,
 	.smgr_write = neon_write,
@@ -2622,12 +2697,12 @@ static const struct f_smgr neon_smgr =
 };

 const f_smgr *
-smgr_neon(BackendId backend, RelFileNode rnode)
+smgr_neon(BackendId backend, NRelFileInfo rinfo)
 {

 	/* Don't use page server for temp relations */
 	if (backend != InvalidBackendId)
-		return smgr_standard(backend, rnode);
+		return smgr_standard(backend, rinfo);
 	else
 		return &neon_smgr;
 }
@@ -2681,7 +2756,7 @@ bool
 neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
 {
 	XLogRecPtr	end_recptr = record->EndRecPtr;
-	RelFileNode	rnode;
+	NRelFileInfo rinfo;
 	ForkNumber	forknum;
 	BlockNumber	blkno;
 	BufferTag	tag;
@@ -2695,10 +2770,10 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
 		return true;

 #if PG_VERSION_NUM < 150000
-	if (!XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blkno))
+	if (!XLogRecGetBlockTag(record, block_id, &rinfo, &forknum, &blkno))
 		elog(PANIC, "failed to locate backup block with ID %d", block_id);
 #else
-	XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blkno);
+	XLogRecGetBlockTag(record, block_id, &rinfo, &forknum, &blkno);
 #endif

 	/*
@@ -2706,10 +2781,13 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
 	 * regardless of whether the block is stored in shared buffers.
 	 * See also this function's top comment.
 	 */
-	if (!OidIsValid(rnode.dbNode))
+	if (!OidIsValid(NInfoGetDbOid(rinfo)))
 		return false;

-	INIT_BUFFERTAG(tag, rnode, forknum, blkno);
+	CopyNRelFileInfoToBufTag(tag, rinfo);
+	tag.forkNum = forknum;
+	tag.blockNum = blkno;
+
 	hash = BufTableHashCode(&tag);
 	partitionLock = BufMappingPartitionLock(hash);

@@ -2725,24 +2803,24 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
 	no_redo_needed = buffer < 0;

 	/* In both cases st lwlsn past this WAL record */
-	SetLastWrittenLSNForBlock(end_recptr, rnode, forknum, blkno);
+	SetLastWrittenLSNForBlock(end_recptr, rinfo, forknum, blkno);

 	/* we don't have the buffer in memory, update lwLsn past this record,
 	 * also evict page fro file cache
 	 */
 	if (no_redo_needed)
-		lfc_evict(rnode, forknum, blkno);
+		lfc_evict(rinfo, forknum, blkno);


 	LWLockRelease(partitionLock);

 	/* Extend the relation if we know its size */
-	if (get_cached_relsize(rnode, forknum, &relsize))
+	if (get_cached_relsize(rinfo, forknum, &relsize))
 	{
 		if (relsize < blkno + 1)
 		{
-			update_cached_relsize(rnode, forknum, blkno + 1);
-			SetLastWrittenLSNForRelation(end_recptr, rnode, forknum);
+			update_cached_relsize(rinfo, forknum, blkno + 1);
+			SetLastWrittenLSNForRelation(end_recptr, rinfo, forknum);
 		}
 	}
 	else
@@ -2763,7 +2841,7 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
 				.latest = false,
 				.tag = T_NeonNblocksRequest,
 			},
-			.rnode = rnode,
+			.rinfo = rinfo,
 			.forknum = forknum,
 		};

@@ -2774,8 +2852,8 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)

 		Assert(nbresponse->n_blocks > blkno);

-		set_cached_relsize(rnode, forknum, nbresponse->n_blocks);
-		SetLastWrittenLSNForRelation(end_recptr, rnode, forknum);
+		set_cached_relsize(rinfo, forknum, nbresponse->n_blocks);
+		SetLastWrittenLSNForRelation(end_recptr, rinfo, forknum);

 		elog(SmgrTrace, "Set length to %d", nbresponse->n_blocks);
 	}
--- a/pgxn/neon/relsize_cache.c
+++ b/pgxn/neon/relsize_cache.c
@@ -14,8 +14,10 @@
 */
 #include "postgres.h"

+#include "neon_pgversioncompat.h"
+
 #include "pagestore_client.h"
-#include "storage/relfilenode.h"
+#include RELFILEINFO_HDR
 #include "storage/smgr.h"
 #include "storage/lwlock.h"
 #include "storage/ipc.h"
@@ -30,7 +32,7 @@

 typedef struct
 {
-	RelFileNode rnode;
+	NRelFileInfo rinfo;
 	ForkNumber	forknum;
 } RelTag;

@@ -75,7 +77,7 @@ neon_smgr_shmem_startup(void)
 }

 bool
-get_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber *size)
+get_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber *size)
 {
 	bool		found = false;

@@ -84,7 +86,7 @@ get_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber *size)
 		RelTag		tag;
 		RelSizeEntry *entry;

-		tag.rnode = rnode;
+		tag.rinfo = rinfo;
 		tag.forknum = forknum;
 		LWLockAcquire(relsize_lock, LW_SHARED);
 		entry = hash_search(relsize_hash, &tag, HASH_FIND, NULL);
@@ -99,14 +101,14 @@ get_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber *size)
 }

 void
-set_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size)
+set_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size)
 {
 	if (relsize_hash_size > 0)
 	{
 		RelTag		tag;
 		RelSizeEntry *entry;

-		tag.rnode = rnode;
+		tag.rinfo = rinfo;
 		tag.forknum = forknum;
 		LWLockAcquire(relsize_lock, LW_EXCLUSIVE);
 		entry = hash_search(relsize_hash, &tag, HASH_ENTER, NULL);
@@ -116,7 +118,7 @@ set_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size)
 }

 void
-update_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size)
+update_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size)
 {
 	if (relsize_hash_size > 0)
 	{
@@ -124,7 +126,7 @@ update_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size)
 		RelSizeEntry *entry;
 		bool		found;

-		tag.rnode = rnode;
+		tag.rinfo = rinfo;
 		tag.forknum = forknum;
 		LWLockAcquire(relsize_lock, LW_EXCLUSIVE);
 		entry = hash_search(relsize_hash, &tag, HASH_ENTER, &found);
@@ -135,13 +137,13 @@ update_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size)
 }

 void
-forget_cached_relsize(RelFileNode rnode, ForkNumber forknum)
+forget_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum)
 {
 	if (relsize_hash_size > 0)
 	{
 		RelTag		tag;

-		tag.rnode = rnode;
+		tag.rinfo = rinfo;
 		tag.forknum = forknum;
 		LWLockAcquire(relsize_lock, LW_EXCLUSIVE);
 		hash_search(relsize_hash, &tag, HASH_REMOVE, NULL);
--- a/pgxn/neon/walproposer.c
+++ b/pgxn/neon/walproposer.c
@@ -51,6 +51,9 @@
 #include "libpq/pqformat.h"
 #include "replication/slot.h"
 #include "replication/walreceiver.h"
+#if PG_VERSION_NUM >= 160000
+#include "replication/walsender_private.h"
+#endif
 #include "postmaster/bgworker.h"
 #include "postmaster/interrupt.h"
 #include "postmaster/postmaster.h"
@@ -73,10 +76,10 @@

 static bool syncSafekeepers = false;

-char	   *wal_acceptors_list;
-int			wal_acceptor_reconnect_timeout;
-int			wal_acceptor_connection_timeout;
-bool		am_wal_proposer;
+char	   *wal_acceptors_list = "";
+int			wal_acceptor_reconnect_timeout = 1000;
+int			wal_acceptor_connection_timeout = 10000;
+bool		am_wal_proposer = false;

 #define WAL_PROPOSER_SLOT_NAME "wal_proposer_slot"

@@ -191,7 +194,7 @@ pg_init_walproposer(void)
 /*
 * Entry point for `postgres --sync-safekeepers`.
 */
-void
+PGDLLEXPORT void
 WalProposerSync(int argc, char *argv[])
 {
 	struct stat stat_buf;
@@ -315,7 +318,7 @@ nwp_shmem_startup_hook(void)
 /*
 * WAL proposer bgworker entry point.
 */
-void
+PGDLLEXPORT void
 WalProposerMain(Datum main_arg)
 {
 #if PG_VERSION_NUM >= 150000
@@ -383,21 +386,55 @@ WalProposerPoll(void)
 {
 	while (true)
 	{
-		Safekeeper *sk;
-		int			rc;
-		WaitEvent	event;
+		Safekeeper *sk = NULL;
+		bool		wait_timeout = false;
+		bool		late_cv_trigger = false;
+		WaitEvent	event = {0};
+		int			rc = 0;
 		TimestampTz now = GetCurrentTimestamp();
+		long		timeout = TimeToReconnect(now);

-		rc = WaitEventSetWait(waitEvents, TimeToReconnect(now),
+#if PG_MAJORVERSION_NUM >= 16
+		if (WalSndCtl != NULL)
+			ConditionVariablePrepareToSleep(&WalSndCtl->wal_flush_cv);
+#endif
+
+		/*
+		 * Wait for a wait event to happen, or timeout:
+		 *  - Safekeeper socket can become available for READ or WRITE
+		 *  - Our latch got set, because
+		 *     * PG15-: We got woken up by a process triggering the WalSender
+		 *     * PG16+: WalSndCtl->wal_flush_cv was triggered
+		 */
+		rc = WaitEventSetWait(waitEvents, timeout,
 							  &event, 1, WAIT_EVENT_WAL_SENDER_MAIN);
-		sk = (Safekeeper *) event.user_data;
+#if PG_MAJORVERSION_NUM >= 16
+		if (WalSndCtl != NULL)
+			late_cv_trigger = ConditionVariableCancelSleep();
+#endif

+		/*
+		 * If wait is terminated by latch set (walsenders' latch is set on
+		 * each wal flush), then exit loop. (no need for pm death check due to
+		 * WL_EXIT_ON_PM_DEATH)
+		 */
+		if ((rc == 1 && event.events & WL_LATCH_SET) || late_cv_trigger)
+		{
+			/* Reset our latch */
+			ResetLatch(MyLatch);
+
+			break;
+		}
+		
 		/*
 		 * If the event contains something that one of our safekeeper states
 		 * was waiting for, we'll advance its state.
 		 */
-		if (rc != 0 && (event.events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE)))
+		if (rc == 1 && (event.events & (WL_SOCKET_MASK)))
+		{
+			sk = (Safekeeper *) event.user_data;
 			AdvancePollState(sk, event.events);
+		}

 		/*
 		 * If the timeout expired, attempt to reconnect to any safekeepers
@@ -405,15 +442,26 @@ WalProposerPoll(void)
 		 */
 		ReconnectSafekeepers();

-		/*
-		 * If wait is terminated by latch set (walsenders' latch is set on
-		 * each wal flush), then exit loop. (no need for pm death check due to
-		 * WL_EXIT_ON_PM_DEATH)
-		 */
-		if (rc != 0 && (event.events & WL_LATCH_SET))
+		if (rc == 0) /* timeout expired */
 		{
-			ResetLatch(MyLatch);
-			break;
+			wait_timeout = true;
+
+			/*
+			 * Ensure flushrecptr is set to a recent value. This fixes a case
+			 * where we've not been notified of new WAL records when we were
+			 * planning on consuming them.
+			 */
+			if (!syncSafekeepers) {
+				XLogRecPtr flushed;
+
+#if PG_MAJORVERSION_NUM < 15
+				flushed = GetFlushRecPtr();
+#else
+				flushed = GetFlushRecPtr(NULL);
+#endif
+				if (flushed > availableLsn)
+					break;
+			}
 		}

 		now = GetCurrentTimestamp();
@@ -611,7 +659,8 @@ UpdateEventSet(Safekeeper *sk, uint32 events)
 	ModifyWaitEvent(waitEvents, sk->eventPos, events, NULL);
 }

-/* Hack: provides a way to remove the event corresponding to an individual walproposer from the set.
+/*
+ * Hack: provides a way to remove the event corresponding to an individual walproposer from the set.
 *
 * Note: Internally, this completely reconstructs the event set. It should be avoided if possible.
 */
@@ -1408,7 +1457,12 @@ WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRec
 			elog(FATAL, "could not append password to the safekeeper connection string");
 	}

+#if PG_MAJORVERSION_NUM < 16
 	wrconn = walrcv_connect(conninfo, false, "wal_proposer_recovery", &err);
+#else
+	wrconn = walrcv_connect(conninfo, false, false, "wal_proposer_recovery", &err);
+#endif
+
 	if (!wrconn)
 	{
 		ereport(WARNING,
@@ -2242,9 +2296,10 @@ HandleSafekeeperResponse(void)
 			if (synced)
 				n_synced++;
 		}
+
 		if (n_synced >= quorum)
 		{
-			/* All safekeepers synced! */
+			/* A quorum of safekeepers has been synced! */
 			
 			/*
 			 * Send empty message to broadcast latest truncateLsn to all safekeepers.
@@ -2539,8 +2594,15 @@ backpressure_throttling_impl(void)
 	? PrevProcessInterruptsCallback()
 	: false;

-	/* Don't throttle read only transactions and wal sender. */
-	if (am_walsender || !TransactionIdIsValid(GetCurrentTransactionIdIfAny()))
+	/*
+	 * Don't throttle read-only transactions or the WAL sender.
+	 * Do throttle CREATE INDEX CONCURRENTLY, however: it performs some
+	 * stages outside a transaction, even though it writes a lot of WAL.
+	 * Check the PROC_IN_SAFE_IC flag to cover that case.
+	 */
+	if (am_walsender
+		|| (!(MyProc->statusFlags & PROC_IN_SAFE_IC)
+			&& !TransactionIdIsValid(GetCurrentTransactionIdIfAny())))
 		return retry;

 	/* Calculate replicas lag */
--- a/pgxn/neon/walproposer.h
+++ b/pgxn/neon/walproposer.h
@@ -379,8 +379,8 @@ typedef struct Safekeeper
 	AppendResponse appendResponse;	/* feedback for master */
 } Safekeeper;

-extern void WalProposerSync(int argc, char *argv[]);
-extern void WalProposerMain(Datum main_arg);
+extern void PGDLLEXPORT WalProposerSync(int argc, char *argv[]);
+extern void PGDLLEXPORT WalProposerMain(Datum main_arg);
 extern void WalProposerBroadcast(XLogRecPtr startpos, XLogRecPtr endpos);
 extern void WalProposerPoll(void);
 extern void ParsePageserverFeedbackMessage(StringInfo reply_message,
--- a/pgxn/neon/walproposer_utils.c
+++ b/pgxn/neon/walproposer_utils.c
@@ -25,6 +25,9 @@
 #include "access/xlogutils.h"
 #include "access/xlogrecovery.h"
 #endif
+#if PG_MAJORVERSION_NUM >= 16
+#include "utils/guc.h"
+#endif

 /*
 * These variables are used similarly to openLogFile/SegNo,
@@ -558,11 +561,11 @@ StartProposerReplication(StartReplicationCmd *cmd)
 static void
 WalSndLoop(void)
 {
+	/* Clear any already-pending wakeups */
+	ResetLatch(MyLatch);
+
 	for (;;)
 	{
-		/* Clear any already-pending wakeups */
-		ResetLatch(MyLatch);
-
 		CHECK_FOR_INTERRUPTS();

 		XLogBroadcastWalProposer();
--- a/pgxn/neon_rmgr/Makefile
+++ b/pgxn/neon_rmgr/Makefile
@@ -0,0 +1,19 @@
+# pgxn/neon_rmgr/Makefile
+
+
+MODULE_big = neon_rmgr
+OBJS = \
+	$(WIN32RES) \
+	neon_rmgr.o \
+	neon_rmgr_decode.o \
+	neon_rmgr_desc.o
+
+
+EXTENSION = neon_rmgr
+DATA = 
+PGFILEDESC = "Neon WAL Resource Manager - custom WAL records used to make Neon work (since PG 16)"
+
+
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
--- a/pgxn/neon_rmgr/neon_rmgr.c
+++ b/pgxn/neon_rmgr/neon_rmgr.c
@@ -0,0 +1,886 @@
+#include "postgres.h"
+#include "fmgr.h"
+
+#if PG_MAJORVERSION_NUM >= 16
+#include "access/bufmask.h"
+#include "access/heapam_xlog.h"
+#include "access/htup_details.h"
+#include "access/neon_xlog.h"
+#include "access/rmgr.h"
+#include "access/visibilitymap.h"
+#include "access/xlog_internal.h"
+#include "access/xlogutils.h"
+#include "miscadmin.h"
+#include "storage/buf.h"
+#include "storage/bufmgr.h"
+#include "storage/bufpage.h"
+#include "storage/freespace.h"
+#include "neon_rmgr.h"
+
+PG_MODULE_MAGIC;
+void		_PG_init(void);
+
+static void neon_rm_redo(XLogReaderState *record);
+static void neon_rm_startup(void);
+static void neon_rm_cleanup(void);
+static void neon_rm_mask(char *pagedata, BlockNumber blkno);
+
+static void redo_neon_heap_insert(XLogReaderState *record);
+static void redo_neon_heap_delete(XLogReaderState *record);
+static void redo_neon_heap_update(XLogReaderState *record, bool hot_update);
+static void redo_neon_heap_lock(XLogReaderState *record);
+static void redo_neon_heap_multi_insert(XLogReaderState *record);
+
+const static RmgrData NeonRmgr = {
+	.rm_name = "neon",
+	.rm_redo = neon_rm_redo,
+	.rm_desc = neon_rm_desc,
+	.rm_identify = neon_rm_identify,
+	.rm_startup = neon_rm_startup,
+	.rm_cleanup = neon_rm_cleanup,
+	.rm_mask = neon_rm_mask,
+	.rm_decode = neon_rm_decode,
+};
+
+void
+_PG_init(void)
+{
+	if (!process_shared_preload_libraries_in_progress)
+		return;
+
+	RegisterCustomRmgr(RM_NEON_ID, &NeonRmgr);
+}
+
+static void
+neon_rm_redo(XLogReaderState *record)
+{
+	uint8		info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
+
+	switch (info & XLOG_NEON_OPMASK)
+	{
+		case XLOG_NEON_HEAP_INSERT:
+			redo_neon_heap_insert(record);
+			break;
+		case XLOG_NEON_HEAP_DELETE:
+			redo_neon_heap_delete(record);
+			break;
+		case XLOG_NEON_HEAP_UPDATE:
+			redo_neon_heap_update(record, false);
+			break;
+		case XLOG_NEON_HEAP_HOT_UPDATE:
+			redo_neon_heap_update(record, true);
+			break;
+		case XLOG_NEON_HEAP_LOCK:
+			redo_neon_heap_lock(record);
+			break;
+		case XLOG_NEON_HEAP_MULTI_INSERT:
+			redo_neon_heap_multi_insert(record);
+			break;
+		default:
+			elog(PANIC, "neon_rm_redo: unknown op code %u", info);
+	}
+}
+
+static void
+neon_rm_startup(void)
+{
+	/* nothing to do here */
+}
+
+static void
+neon_rm_cleanup(void)
+{
+	/* nothing to do here */
+}
+
+static void
+neon_rm_mask(char *pagedata, BlockNumber blkno)
+{
+	Page		page = (Page) pagedata;
+	OffsetNumber off;
+
+	mask_page_lsn_and_checksum(page);
+
+	mask_page_hint_bits(page);
+	mask_unused_space(page);
+
+	for (off = 1; off <= PageGetMaxOffsetNumber(page); off++)
+	{
+		ItemId		iid = PageGetItemId(page, off);
+		char	   *page_item;
+
+		page_item = (char *) (page + ItemIdGetOffset(iid));
+
+		if (ItemIdIsNormal(iid))
+		{
+			HeapTupleHeader page_htup = (HeapTupleHeader) page_item;
+
+			/*
+			 * If xmin of a tuple is not yet frozen, we should ignore
+			 * differences in hint bits, since they can be set without
+			 * emitting WAL.
+			 */
+			if (!HeapTupleHeaderXminFrozen(page_htup))
+				page_htup->t_infomask &= ~HEAP_XACT_MASK;
+			else
+			{
+				/* Still we need to mask xmax hint bits. */
+				page_htup->t_infomask &= ~HEAP_XMAX_INVALID;
+				page_htup->t_infomask &= ~HEAP_XMAX_COMMITTED;
+			}
+
+			/*
+			 * During replay, we set Command Id to FirstCommandId. Hence, mask
+			 * it. See heap_xlog_insert() for details.
+			 */
+			page_htup->t_choice.t_heap.t_field3.t_cid = MASK_MARKER;
+
+			/*
+			 * For a speculative tuple, heap_insert() does not set ctid in the
+			 * caller-passed heap tuple itself, leaving the ctid field to
+			 * contain a speculative token value - a per-backend monotonically
+			 * increasing identifier. Besides, it does not WAL-log ctid under
+			 * any circumstances.
+			 *
+			 * During redo, heap_xlog_insert() sets t_ctid to current block
+			 * number and self offset number. It doesn't care about any
+			 * speculative insertions on the primary. Hence, we set t_ctid to
+			 * current block number and self offset number to ignore any
+			 * inconsistency.
+			 */
+			if (HeapTupleHeaderIsSpeculative(page_htup))
+				ItemPointerSet(&page_htup->t_ctid, blkno, off);
+
+			/*
+			 * NB: Not ignoring ctid changes due to the tuple having moved
+			 * (i.e. HeapTupleHeaderIndicatesMovedPartitions), because that's
+			 * important information that needs to be in-sync between primary
+			 * and standby, and thus is WAL logged.
+			 */
+		}
+
+		/*
+		 * Ignore any padding bytes after the tuple, when the length of the
+		 * item is not MAXALIGNed.
+		 */
+		if (ItemIdHasStorage(iid))
+		{
+			int			len = ItemIdGetLength(iid);
+			int			padlen = MAXALIGN(len) - len;
+
+			if (padlen > 0)
+				memset(page_item + len, MASK_MARKER, padlen);
+		}
+	}
+}
+
+
+/*
+ * COPIED FROM heapam.c
+ * Given an "infobits" field from an XLog record, set the correct bits in the
+ * given infomask and infomask2 for the tuple touched by the record.
+ *
+ * (This is the reverse of compute_infobits).
+ */
+static void
+fix_infomask_from_infobits(uint8 infobits, uint16 *infomask, uint16 *infomask2)
+{
+	*infomask &= ~(HEAP_XMAX_IS_MULTI | HEAP_XMAX_LOCK_ONLY |
+				   HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_EXCL_LOCK);
+	*infomask2 &= ~HEAP_KEYS_UPDATED;
+
+	if (infobits & XLHL_XMAX_IS_MULTI)
+		*infomask |= HEAP_XMAX_IS_MULTI;
+	if (infobits & XLHL_XMAX_LOCK_ONLY)
+		*infomask |= HEAP_XMAX_LOCK_ONLY;
+	if (infobits & XLHL_XMAX_EXCL_LOCK)
+		*infomask |= HEAP_XMAX_EXCL_LOCK;
+	/* note HEAP_XMAX_SHR_LOCK isn't considered here */
+	if (infobits & XLHL_XMAX_KEYSHR_LOCK)
+		*infomask |= HEAP_XMAX_KEYSHR_LOCK;
+
+	if (infobits & XLHL_KEYS_UPDATED)
+		*infomask2 |= HEAP_KEYS_UPDATED;
+}
+
+static void
+redo_neon_heap_insert(XLogReaderState *record)
+{
+	XLogRecPtr	lsn = record->EndRecPtr;
+	xl_neon_heap_insert *xlrec = (xl_neon_heap_insert *) XLogRecGetData(record);
+	Buffer		buffer;
+	Page		page;
+	union
+	{
+		HeapTupleHeaderData hdr;
+		char		data[MaxHeapTupleSize];
+	}			tbuf;
+	HeapTupleHeader htup;
+	xl_neon_heap_header xlhdr;
+	uint32		newlen;
+	Size		freespace = 0;
+	RelFileLocator target_locator;
+	BlockNumber blkno;
+	ItemPointerData target_tid;
+	XLogRedoAction action;
+
+	XLogRecGetBlockTag(record, 0, &target_locator, NULL, &blkno);
+	ItemPointerSetBlockNumber(&target_tid, blkno);
+	ItemPointerSetOffsetNumber(&target_tid, xlrec->offnum);
+
+	/*
+	 * The visibility map may need to be fixed even if the heap page is
+	 * already up-to-date.
+	 */
+	if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED)
+	{
+		Relation	reln = CreateFakeRelcacheEntry(target_locator);
+		Buffer		vmbuffer = InvalidBuffer;
+
+		visibilitymap_pin(reln, blkno, &vmbuffer);
+		visibilitymap_clear(reln, blkno, vmbuffer, VISIBILITYMAP_VALID_BITS);
+		ReleaseBuffer(vmbuffer);
+		FreeFakeRelcacheEntry(reln);
+	}
+
+	/*
+	 * If we inserted the first and only tuple on the page, re-initialize the
+	 * page from scratch.
+	 */
+	if (XLogRecGetInfo(record) & XLOG_HEAP_INIT_PAGE)
+	{
+		buffer = XLogInitBufferForRedo(record, 0);
+		page = BufferGetPage(buffer);
+		PageInit(page, BufferGetPageSize(buffer), 0);
+		action = BLK_NEEDS_REDO;
+	}
+	else
+		action = XLogReadBufferForRedo(record, 0, &buffer);
+	if (action == BLK_NEEDS_REDO)
+	{
+		Size		datalen;
+		char	   *data;
+
+		page = BufferGetPage(buffer);
+
+		if (PageGetMaxOffsetNumber(page) + 1 < xlrec->offnum)
+			elog(PANIC, "neon_rm_redo: invalid max offset number");
+
+		data = XLogRecGetBlockData(record, 0, &datalen);
+
+		newlen = datalen - SizeOfNeonHeapHeader;
+		Assert(datalen > SizeOfNeonHeapHeader && newlen <= MaxHeapTupleSize);
+		memcpy((char *) &xlhdr, data, SizeOfNeonHeapHeader);
+		data += SizeOfNeonHeapHeader;
+
+		htup = &tbuf.hdr;
+		MemSet((char *) htup, 0, SizeofHeapTupleHeader);
+		/* PG73FORMAT: get bitmap [+ padding] [+ oid] + data */
+		memcpy((char *) htup + SizeofHeapTupleHeader,
+			   data,
+			   newlen);
+		newlen += SizeofHeapTupleHeader;
+		htup->t_infomask2 = xlhdr.t_infomask2;
+		htup->t_infomask = xlhdr.t_infomask;
+		htup->t_hoff = xlhdr.t_hoff;
+		HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record));
+		HeapTupleHeaderSetCmin(htup, xlhdr.t_cid);
+		htup->t_ctid = target_tid;
+
+		if (PageAddItem(page, (Item) htup, newlen, xlrec->offnum,
+						true, true) == InvalidOffsetNumber)
+			elog(PANIC, "neon_rm_redo: failed to add tuple");
+
+		freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */
+
+		PageSetLSN(page, lsn);
+
+		if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED)
+			PageClearAllVisible(page);
+
+		/* XLH_INSERT_ALL_FROZEN_SET implies that all tuples are visible */
+		if (xlrec->flags & XLH_INSERT_ALL_FROZEN_SET)
+			PageSetAllVisible(page);
+
+		MarkBufferDirty(buffer);
+	}
+	if (BufferIsValid(buffer))
+		UnlockReleaseBuffer(buffer);
+
+	/*
+	 * If the page is running low on free space, update the FSM as well.
+	 * Arbitrarily, our definition of "low" is less than 20%. We can't do much
+	 * better than that without knowing the fill-factor for the table.
+	 *
+	 * XXX: Don't do this if the page was restored from full page image. We
+	 * don't bother to update the FSM in that case, it doesn't need to be
+	 * totally accurate anyway.
+	 */
+	if (action == BLK_NEEDS_REDO && freespace < BLCKSZ / 5)
+		XLogRecordPageWithFreeSpace(target_locator, blkno, freespace);
+}
+
+static void
+redo_neon_heap_delete(XLogReaderState *record)
+{
+	XLogRecPtr	lsn = record->EndRecPtr;
+	xl_neon_heap_delete *xlrec = (xl_neon_heap_delete *) XLogRecGetData(record);
+	Buffer		buffer;
+	Page		page;
+	ItemId		lp = NULL;
+	HeapTupleHeader htup;
+	BlockNumber blkno;
+	RelFileLocator target_locator;
+	ItemPointerData target_tid;
+
+	XLogRecGetBlockTag(record, 0, &target_locator, NULL, &blkno);
+	ItemPointerSetBlockNumber(&target_tid, blkno);
+	ItemPointerSetOffsetNumber(&target_tid, xlrec->offnum);
+
+	/*
+	 * The visibility map may need to be fixed even if the heap page is
+	 * already up-to-date.
+	 */
+	if (xlrec->flags & XLH_DELETE_ALL_VISIBLE_CLEARED)
+	{
+		Relation	reln = CreateFakeRelcacheEntry(target_locator);
+		Buffer		vmbuffer = InvalidBuffer;
+
+		visibilitymap_pin(reln, blkno, &vmbuffer);
+		visibilitymap_clear(reln, blkno, vmbuffer, VISIBILITYMAP_VALID_BITS);
+		ReleaseBuffer(vmbuffer);
+		FreeFakeRelcacheEntry(reln);
+	}
+
+	if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
+	{
+		page = BufferGetPage(buffer);
+
+		if (PageGetMaxOffsetNumber(page) >= xlrec->offnum)
+			lp = PageGetItemId(page, xlrec->offnum);
+
+		if (PageGetMaxOffsetNumber(page) < xlrec->offnum || !ItemIdIsNormal(lp))
+			elog(PANIC, "neon_rm_redo: invalid lp");
+
+		htup = (HeapTupleHeader) PageGetItem(page, lp);
+
+		htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
+		htup->t_infomask2 &= ~HEAP_KEYS_UPDATED;
+		HeapTupleHeaderClearHotUpdated(htup);
+		fix_infomask_from_infobits(xlrec->infobits_set,
+								   &htup->t_infomask, &htup->t_infomask2);
+		if (!(xlrec->flags & XLH_DELETE_IS_SUPER))
+			HeapTupleHeaderSetXmax(htup, xlrec->xmax);
+		else
+			HeapTupleHeaderSetXmin(htup, InvalidTransactionId);
+		HeapTupleHeaderSetCmax(htup, xlrec->t_cid, false);
+
+		/* Mark the page as a candidate for pruning */
+		PageSetPrunable(page, XLogRecGetXid(record));
+
+		if (xlrec->flags & XLH_DELETE_ALL_VISIBLE_CLEARED)
+			PageClearAllVisible(page);
+
+		/* Make sure t_ctid is set correctly */
+		if (xlrec->flags & XLH_DELETE_IS_PARTITION_MOVE)
+			HeapTupleHeaderSetMovedPartitions(htup);
+		else
+			htup->t_ctid = target_tid;
+		PageSetLSN(page, lsn);
+		MarkBufferDirty(buffer);
+	}
+	if (BufferIsValid(buffer))
+		UnlockReleaseBuffer(buffer);
+}
+
+static void
+redo_neon_heap_update(XLogReaderState *record, bool hot_update)
+{
+	XLogRecPtr	lsn = record->EndRecPtr;
+	xl_neon_heap_update *xlrec = (xl_neon_heap_update *) XLogRecGetData(record);
+	RelFileLocator rlocator;
+	BlockNumber oldblk;
+	BlockNumber newblk;
+	ItemPointerData newtid;
+	Buffer		obuffer,
+				nbuffer;
+	Page		page;
+	OffsetNumber offnum;
+	ItemId		lp = NULL;
+	HeapTupleData oldtup;
+	HeapTupleHeader htup;
+	uint16		prefixlen = 0,
+				suffixlen = 0;
+	char	   *newp;
+	union
+	{
+		HeapTupleHeaderData hdr;
+		char		data[MaxHeapTupleSize];
+	}			tbuf;
+	xl_neon_heap_header xlhdr;
+	uint32		newlen;
+	Size		freespace = 0;
+	XLogRedoAction oldaction;
+	XLogRedoAction newaction;
+
+	/* initialize to keep the compiler quiet */
+	oldtup.t_data = NULL;
+	oldtup.t_len = 0;
+
+	XLogRecGetBlockTag(record, 0, &rlocator, NULL, &newblk);
+	if (XLogRecGetBlockTagExtended(record, 1, NULL, NULL, &oldblk, NULL))
+	{
+		/* HOT updates are never done across pages */
+		Assert(!hot_update);
+	}
+	else
+		oldblk = newblk;
+
+	ItemPointerSet(&newtid, newblk, xlrec->new_offnum);
+
+	/*
+	 * The visibility map may need to be fixed even if the heap page is
+	 * already up-to-date.
+	 */
+	if (xlrec->flags & XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED)
+	{
+		Relation	reln = CreateFakeRelcacheEntry(rlocator);
+		Buffer		vmbuffer = InvalidBuffer;
+
+		visibilitymap_pin(reln, oldblk, &vmbuffer);
+		visibilitymap_clear(reln, oldblk, vmbuffer, VISIBILITYMAP_VALID_BITS);
+		ReleaseBuffer(vmbuffer);
+		FreeFakeRelcacheEntry(reln);
+	}
+
+	/*
+	 * In normal operation, it is important to lock the two pages in
+	 * page-number order, to avoid possible deadlocks against other update
+	 * operations going the other way.  However, during WAL replay there can
+	 * be no other update happening, so we don't need to worry about that. But
+	 * we *do* need to worry that we don't expose an inconsistent state to Hot
+	 * Standby queries --- so the original page can't be unlocked before we've
+	 * added the new tuple to the new page.
+	 */
+
+	/* Deal with old tuple version */
+	oldaction = XLogReadBufferForRedo(record, (oldblk == newblk) ? 0 : 1,
+									  &obuffer);
+	if (oldaction == BLK_NEEDS_REDO)
+	{
+		page = BufferGetPage(obuffer);
+		offnum = xlrec->old_offnum;
+		if (PageGetMaxOffsetNumber(page) >= offnum)
+			lp = PageGetItemId(page, offnum);
+
+		if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
+			elog(PANIC, "neon_rm_redo: invalid lp");
+
+		htup = (HeapTupleHeader) PageGetItem(page, lp);
+
+		oldtup.t_data = htup;
+		oldtup.t_len = ItemIdGetLength(lp);
+
+		htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
+		htup->t_infomask2 &= ~HEAP_KEYS_UPDATED;
+		if (hot_update)
+			HeapTupleHeaderSetHotUpdated(htup);
+		else
+			HeapTupleHeaderClearHotUpdated(htup);
+		fix_infomask_from_infobits(xlrec->old_infobits_set, &htup->t_infomask,
+								   &htup->t_infomask2);
+		HeapTupleHeaderSetXmax(htup, xlrec->old_xmax);
+		HeapTupleHeaderSetCmax(htup, xlrec->t_cid, false);
+		/* Set forward chain link in t_ctid */
+		htup->t_ctid = newtid;
+
+		/* Mark the page as a candidate for pruning */
+		PageSetPrunable(page, XLogRecGetXid(record));
+
+		if (xlrec->flags & XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED)
+			PageClearAllVisible(page);
+
+		PageSetLSN(page, lsn);
+		MarkBufferDirty(obuffer);
+	}
+
+	/*
+	 * Read the page the new tuple goes into, if different from old.
+	 */
+	if (oldblk == newblk)
+	{
+		nbuffer = obuffer;
+		newaction = oldaction;
+	}
+	else if (XLogRecGetInfo(record) & XLOG_HEAP_INIT_PAGE)
+	{
+		nbuffer = XLogInitBufferForRedo(record, 0);
+		page = (Page) BufferGetPage(nbuffer);
+		PageInit(page, BufferGetPageSize(nbuffer), 0);
+		newaction = BLK_NEEDS_REDO;
+	}
+	else
+		newaction = XLogReadBufferForRedo(record, 0, &nbuffer);
+
+	/*
+	 * The visibility map may need to be fixed even if the heap page is
+	 * already up-to-date.
+	 */
+	if (xlrec->flags & XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED)
+	{
+		Relation	reln = CreateFakeRelcacheEntry(rlocator);
+		Buffer		vmbuffer = InvalidBuffer;
+
+		visibilitymap_pin(reln, newblk, &vmbuffer);
+		visibilitymap_clear(reln, newblk, vmbuffer, VISIBILITYMAP_VALID_BITS);
+		ReleaseBuffer(vmbuffer);
+		FreeFakeRelcacheEntry(reln);
+	}
+
+	/* Deal with new tuple */
+	if (newaction == BLK_NEEDS_REDO)
+	{
+		char	   *recdata;
+		char	   *recdata_end;
+		Size		datalen;
+		Size		tuplen;
+
+		recdata = XLogRecGetBlockData(record, 0, &datalen);
+		recdata_end = recdata + datalen;
+
+		page = BufferGetPage(nbuffer);
+
+		offnum = xlrec->new_offnum;
+		if (PageGetMaxOffsetNumber(page) + 1 < offnum)
+			elog(PANIC, "neon_rm_redo: invalid max offset number");
+
+		if (xlrec->flags & XLH_UPDATE_PREFIX_FROM_OLD)
+		{
+			Assert(newblk == oldblk);
+			memcpy(&prefixlen, recdata, sizeof(uint16));
+			recdata += sizeof(uint16);
+		}
+		if (xlrec->flags & XLH_UPDATE_SUFFIX_FROM_OLD)
+		{
+			Assert(newblk == oldblk);
+			memcpy(&suffixlen, recdata, sizeof(uint16));
+			recdata += sizeof(uint16);
+		}
+
+		memcpy((char *) &xlhdr, recdata, SizeOfNeonHeapHeader);
+		recdata += SizeOfNeonHeapHeader;
+
+		tuplen = recdata_end - recdata;
+		Assert(tuplen <= MaxHeapTupleSize);
+
+		htup = &tbuf.hdr;
+		MemSet((char *) htup, 0, SizeofHeapTupleHeader);
+
+		/*
+		 * Reconstruct the new tuple using the prefix and/or suffix from the
+		 * old tuple, and the data stored in the WAL record.
+		 */
+		newp = (char *) htup + SizeofHeapTupleHeader;
+		if (prefixlen > 0)
+		{
+			int			len;
+
+			/* copy bitmap [+ padding] [+ oid] from WAL record */
+			len = xlhdr.t_hoff - SizeofHeapTupleHeader;
+			memcpy(newp, recdata, len);
+			recdata += len;
+			newp += len;
+
+			/* copy prefix from old tuple */
+			memcpy(newp, (char *) oldtup.t_data + oldtup.t_data->t_hoff, prefixlen);
+			newp += prefixlen;
+
+			/* copy new tuple data from WAL record */
+			len = tuplen - (xlhdr.t_hoff - SizeofHeapTupleHeader);
+			memcpy(newp, recdata, len);
+			recdata += len;
+			newp += len;
+		}
+		else
+		{
+			/*
+			 * copy bitmap [+ padding] [+ oid] + data from record, all in one
+			 * go
+			 */
+			memcpy(newp, recdata, tuplen);
+			recdata += tuplen;
+			newp += tuplen;
+		}
+		Assert(recdata == recdata_end);
+
+		/* copy suffix from old tuple */
+		if (suffixlen > 0)
+			memcpy(newp, (char *) oldtup.t_data + oldtup.t_len - suffixlen, suffixlen);
+
+		newlen = SizeofHeapTupleHeader + tuplen + prefixlen + suffixlen;
+		htup->t_infomask2 = xlhdr.t_infomask2;
+		htup->t_infomask = xlhdr.t_infomask;
+		htup->t_hoff = xlhdr.t_hoff;
+
+		HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record));
+		HeapTupleHeaderSetCmin(htup, xlhdr.t_cid);
+		HeapTupleHeaderSetXmax(htup, xlrec->new_xmax);
+		/* Make sure there is no forward chain link in t_ctid */
+		htup->t_ctid = newtid;
+
+		offnum = PageAddItem(page, (Item) htup, newlen, offnum, true, true);
+		if (offnum == InvalidOffsetNumber)
+			elog(PANIC, "neon_rm_redo: failed to add tuple");
+
+		if (xlrec->flags & XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED)
+			PageClearAllVisible(page);
+
+		freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */
+
+		PageSetLSN(page, lsn);
+		MarkBufferDirty(nbuffer);
+	}
+
+	if (BufferIsValid(nbuffer) && nbuffer != obuffer)
+		UnlockReleaseBuffer(nbuffer);
+	if (BufferIsValid(obuffer))
+		UnlockReleaseBuffer(obuffer);
+
+	/*
+	 * If the new page is running low on free space, update the FSM as well.
+	 * Arbitrarily, our definition of "low" is less than 20%. We can't do much
+	 * better than that without knowing the fill-factor for the table.
+	 *
+	 * However, don't update the FSM on HOT updates, because after crash
+	 * recovery, either the old or the new tuple will certainly be dead and
+	 * prunable. After pruning, the page will have roughly as much free space
+	 * as it did before the update, assuming the new tuple is about the same
+	 * size as the old one.
+	 *
+	 * XXX: Don't do this if the page was restored from full page image. We
+	 * don't bother to update the FSM in that case, it doesn't need to be
+	 * totally accurate anyway.
+	 */
+	if (newaction == BLK_NEEDS_REDO && !hot_update && freespace < BLCKSZ / 5)
+		XLogRecordPageWithFreeSpace(rlocator, newblk, freespace);
+}
+
+static void
+redo_neon_heap_lock(XLogReaderState *record)
+{
+	XLogRecPtr	lsn = record->EndRecPtr;
+	xl_neon_heap_lock *xlrec = (xl_neon_heap_lock *) XLogRecGetData(record);
+	Buffer		buffer;
+	Page		page;
+	OffsetNumber offnum;
+	ItemId		lp = NULL;
+	HeapTupleHeader htup;
+
+	/*
+	 * Replay of an xl_neon_heap_lock record.  The visibility map may need to
+	 * be fixed even if the heap page is already up-to-date, so do it first.
+	 */
+	if (xlrec->flags & XLH_LOCK_ALL_FROZEN_CLEARED)
+	{
+		RelFileLocator rlocator;
+		Buffer		vmbuffer = InvalidBuffer;
+		BlockNumber block;
+		Relation	reln;
+
+		XLogRecGetBlockTag(record, 0, &rlocator, NULL, &block);
+		reln = CreateFakeRelcacheEntry(rlocator);
+
+		visibilitymap_pin(reln, block, &vmbuffer);
+		visibilitymap_clear(reln, block, vmbuffer, VISIBILITYMAP_ALL_FROZEN);
+
+		ReleaseBuffer(vmbuffer);
+		FreeFakeRelcacheEntry(reln);
+	}
+
+	if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
+	{
+		page = (Page) BufferGetPage(buffer);
+
+		offnum = xlrec->offnum;
+		if (PageGetMaxOffsetNumber(page) >= offnum)
+			lp = PageGetItemId(page, offnum);
+
+		if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
+			elog(PANIC, "neon_rm_redo: invalid lp");
+
+		htup = (HeapTupleHeader) PageGetItem(page, lp);
+
+		htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
+		htup->t_infomask2 &= ~HEAP_KEYS_UPDATED;
+		fix_infomask_from_infobits(xlrec->infobits_set, &htup->t_infomask,
+								   &htup->t_infomask2);
+
+		/*
+		 * Clear relevant update flags, but only if the modified infomask says
+		 * there's no update.
+		 */
+		if (HEAP_XMAX_IS_LOCKED_ONLY(htup->t_infomask))
+		{
+			HeapTupleHeaderClearHotUpdated(htup);
+			/* Point t_ctid back at this tuple: no forward chain link */
+			ItemPointerSet(&htup->t_ctid,
+						   BufferGetBlockNumber(buffer),
+						   offnum);
+		}
+		HeapTupleHeaderSetXmax(htup, xlrec->xmax);
+		HeapTupleHeaderSetCmax(htup, xlrec->t_cid, false);	/* cid from record */
+		PageSetLSN(page, lsn);
+		MarkBufferDirty(buffer);
+	}
+	if (BufferIsValid(buffer))
+		UnlockReleaseBuffer(buffer);
+}
+
+static void
+redo_neon_heap_multi_insert(XLogReaderState *record)
+{
+	XLogRecPtr	lsn = record->EndRecPtr;
+	xl_neon_heap_multi_insert *xlrec;
+	RelFileLocator rlocator;
+	BlockNumber blkno;
+	Buffer		buffer;
+	Page		page;
+	union
+	{
+		HeapTupleHeaderData hdr;
+		char		data[MaxHeapTupleSize];
+	}			tbuf;
+	HeapTupleHeader htup;
+	uint32		newlen;
+	Size		freespace = 0;
+	int			i;
+	bool		isinit = (XLogRecGetInfo(record) & XLOG_HEAP_INIT_PAGE) != 0;
+	XLogRedoAction action;
+
+	/*
+	 * Replay of an xl_neon_heap_multi_insert record.  Insertion doesn't
+	 * overwrite MVCC data, so no conflict processing is required.
+	 */
+	xlrec = (xl_neon_heap_multi_insert *) XLogRecGetData(record);
+
+	XLogRecGetBlockTag(record, 0, &rlocator, NULL, &blkno);
+
+	/* check that the mutually exclusive flags are not both set */
+	Assert(!((xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED) &&
+			 (xlrec->flags & XLH_INSERT_ALL_FROZEN_SET)));
+
+	/*
+	 * The visibility map may need to be fixed even if the heap page is
+	 * already up-to-date.
+	 */
+	if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED)
+	{
+		Relation	reln = CreateFakeRelcacheEntry(rlocator);
+		Buffer		vmbuffer = InvalidBuffer;
+
+		visibilitymap_pin(reln, blkno, &vmbuffer);
+		visibilitymap_clear(reln, blkno, vmbuffer, VISIBILITYMAP_VALID_BITS);
+		ReleaseBuffer(vmbuffer);
+		FreeFakeRelcacheEntry(reln);
+	}
+
+	if (isinit)
+	{
+		buffer = XLogInitBufferForRedo(record, 0);
+		page = BufferGetPage(buffer);
+		PageInit(page, BufferGetPageSize(buffer), 0);
+		action = BLK_NEEDS_REDO;
+	}
+	else
+		action = XLogReadBufferForRedo(record, 0, &buffer);
+	if (action == BLK_NEEDS_REDO)
+	{
+		char	   *tupdata;
+		char	   *endptr;
+		Size		len;
+
+		/* Tuples are stored as block data */
+		tupdata = XLogRecGetBlockData(record, 0, &len);
+		endptr = tupdata + len;
+
+		page = (Page) BufferGetPage(buffer);
+
+		for (i = 0; i < xlrec->ntuples; i++)
+		{
+			OffsetNumber offnum;
+			xl_neon_multi_insert_tuple *xlhdr;
+
+			/*
+			 * If we're reinitializing the page, the tuples are stored in
+			 * order from FirstOffsetNumber. Otherwise there's an array of
+			 * offsets in the WAL record, and the tuples come after that.
+			 */
+			if (isinit)
+				offnum = FirstOffsetNumber + i;
+			else
+				offnum = xlrec->offsets[i];
+			if (PageGetMaxOffsetNumber(page) + 1 < offnum)
+				elog(PANIC, "neon_rm_redo: invalid max offset number");
+
+			xlhdr = (xl_neon_multi_insert_tuple *) SHORTALIGN(tupdata);
+			tupdata = ((char *) xlhdr) + SizeOfNeonMultiInsertTuple;
+
+			newlen = xlhdr->datalen;
+			Assert(newlen <= MaxHeapTupleSize);
+			htup = &tbuf.hdr;
+			MemSet((char *) htup, 0, SizeofHeapTupleHeader);
+			/* copy bitmap [+ padding] [+ oid] + data in after the zeroed header */
+			memcpy((char *) htup + SizeofHeapTupleHeader,
+				   (char *) tupdata,
+				   newlen);
+			tupdata += newlen;
+
+			newlen += SizeofHeapTupleHeader;
+			htup->t_infomask2 = xlhdr->t_infomask2;
+			htup->t_infomask = xlhdr->t_infomask;
+			htup->t_hoff = xlhdr->t_hoff;
+			HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record));
+			HeapTupleHeaderSetCmin(htup, xlrec->t_cid);	/* same cid for every tuple */
+			ItemPointerSetBlockNumber(&htup->t_ctid, blkno);
+			ItemPointerSetOffsetNumber(&htup->t_ctid, offnum);
+
+			offnum = PageAddItem(page, (Item) htup, newlen, offnum, true, true);
+			if (offnum == InvalidOffsetNumber)
+				elog(PANIC, "neon_rm_redo: failed to add tuple");
+		}
+		if (tupdata != endptr)
+			elog(PANIC, "neon_rm_redo: total tuple length mismatch");
+
+		freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */
+
+		PageSetLSN(page, lsn);
+
+		if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED)
+			PageClearAllVisible(page);
+
+		/* XLH_INSERT_ALL_FROZEN_SET implies that all tuples are visible */
+		if (xlrec->flags & XLH_INSERT_ALL_FROZEN_SET)
+			PageSetAllVisible(page);
+
+		MarkBufferDirty(buffer);
+	}
+	if (BufferIsValid(buffer))
+		UnlockReleaseBuffer(buffer);
+
+	/*
+	 * If the page is running low on free space, update the FSM as well.
+	 * Arbitrarily, our definition of "low" is less than 20%. We can't do much
+	 * better than that without knowing the fill-factor for the table.
+	 *
+	 * XXX: Don't do this if the page was restored from full page image. We
+	 * don't bother to update the FSM in that case, it doesn't need to be
+	 * totally accurate anyway.
+	 */
+	if (action == BLK_NEEDS_REDO && freespace < BLCKSZ / 5)
+		XLogRecordPageWithFreeSpace(rlocator, blkno, freespace);
+}
+
+#else
+/* safeguard for older PostgreSQL versions */
+PG_MODULE_MAGIC;
+#endif
--- a/pgxn/neon_rmgr/neon_rmgr.control
+++ b/pgxn/neon_rmgr/neon_rmgr.control
@@ -0,0 +1,4 @@
+# neon_rmgr extension
+comment = 'Neon WAL Resource Manager - custom WAL records used to make Neon work (since PG 16)'
+default_version = '1.0'
+module_pathname = '$libdir/neon_rmgr'
--- a/pgxn/neon_rmgr/neon_rmgr.h
+++ b/pgxn/neon_rmgr/neon_rmgr.h
@@ -0,0 +1,13 @@
+#ifndef NEON_RMGR_H
+#define NEON_RMGR_H
+#if PG_MAJORVERSION_NUM >= 16
+#include "access/xlog_internal.h"
+#include "replication/decode.h"
+#include "replication/logical.h"
+
+extern void neon_rm_desc(StringInfo buf, XLogReaderState *record);
+extern void neon_rm_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf);
+extern const char *neon_rm_identify(uint8 info);
+
+#endif
+#endif //NEON_RMGR_H
--- a/pgxn/neon_rmgr/neon_rmgr_decode.c
+++ b/pgxn/neon_rmgr/neon_rmgr_decode.c
@@ -0,0 +1,404 @@
+#include "postgres.h"
+
+#if PG_MAJORVERSION_NUM >= 16
+#include "access/heapam_xlog.h"
+#include "access/neon_xlog.h"
+#include "replication/decode.h"
+#include "replication/logical.h"
+#include "replication/snapbuild.h"
+
+#include "neon_rmgr.h"
+
+/* individual record(group)'s handlers */
+static void DecodeNeonInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf);
+static void DecodeNeonUpdate(LogicalDecodingContext *ctx, XLogRecordBuffer *buf);
+static void DecodeNeonDelete(LogicalDecodingContext *ctx, XLogRecordBuffer *buf);
+static void DecodeNeonMultiInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf);
+
+/* common function to decode tuples */
+static void DecodeXLogTuple(char *data, Size len, ReorderBufferTupleBuf *tuple);
+
+
+/*
+ * Decode one Neon rmgr WAL record, queueing any row change it carries.
+ */
+void
+neon_rm_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
+{
+	uint8		info = XLogRecGetInfo(buf->record) & XLOG_NEON_OPMASK;
+	TransactionId xid = XLogRecGetXid(buf->record);
+	SnapBuild  *builder = ctx->snapshot_builder;
+
+	ReorderBufferProcessXid(ctx->reorder, xid, buf->origptr);
+
+	/* No point decoding data changes without a snapshot or when fast-forwarding */
+	if (SnapBuildCurrentState(builder) < SNAPBUILD_FULL_SNAPSHOT ||
+		ctx->fast_forward)
+		return;
+
+	switch (info)
+	{
+		case XLOG_NEON_HEAP_INSERT:
+			if (SnapBuildProcessChange(builder, xid, buf->origptr))
+				DecodeNeonInsert(ctx, buf);
+			break;
+		case XLOG_NEON_HEAP_DELETE:
+			if (SnapBuildProcessChange(builder, xid, buf->origptr))
+				DecodeNeonDelete(ctx, buf);
+			break;
+		case XLOG_NEON_HEAP_UPDATE:
+		case XLOG_NEON_HEAP_HOT_UPDATE:
+			if (SnapBuildProcessChange(builder, xid, buf->origptr))
+				DecodeNeonUpdate(ctx, buf);
+			break;
+		case XLOG_NEON_HEAP_LOCK:
+			break;				/* nothing is queued for lock records */
+		case XLOG_NEON_HEAP_MULTI_INSERT:
+			if (SnapBuildProcessChange(builder, xid, buf->origptr))
+				DecodeNeonMultiInsert(ctx, buf);
+			break;
+		default:
+			elog(ERROR, "neon_rm_decode: unexpected record type: %u", info);
+			break;
+	}
+}
+
+/* Ask the output plugin's origin filter, if any, whether to skip origin_id. */
+static inline bool
+FilterByOrigin(LogicalDecodingContext *ctx, RepOriginId origin_id)
+{
+	/* with no callback registered nothing is ever filtered out */
+	return ctx->callbacks.filter_by_origin_cb != NULL &&
+		filter_by_origin_cb_wrapper(ctx, origin_id);
+}
+
+/*
+ * Parse XLOG_NEON_HEAP_INSERT (not MULTI_INSERT!) records into tuplebufs.
+ *
+ * Inserts can contain the new tuple; records without one are skipped.
+ */
+static void
+DecodeNeonInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
+{
+	Size		datalen;
+	char	   *tupledata;
+	Size		tuplelen;
+	XLogReaderState *r = buf->record;
+	xl_neon_heap_insert *xlrec;
+	ReorderBufferChange *change;
+	RelFileLocator target_locator;
+
+	xlrec = (xl_neon_heap_insert *) XLogRecGetData(r);
+
+	/*
+	 * Ignore insert records without new tuples (this does happen when
+	 * raw_heap_insert marks the TOAST record as HEAP_INSERT_NO_LOGICAL).
+	 */
+	if (!(xlrec->flags & XLH_INSERT_CONTAINS_NEW_TUPLE))
+		return;
+
+	/* only interested in our database */
+	XLogRecGetBlockTag(r, 0, &target_locator, NULL, NULL);
+	if (target_locator.dbOid != ctx->slot->data.database)
+		return;
+
+	/* output plugin doesn't look for this origin, no need to queue */
+	if (FilterByOrigin(ctx, XLogRecGetOrigin(r)))
+		return;
+
+	change = ReorderBufferGetChange(ctx->reorder);
+	if (!(xlrec->flags & XLH_INSERT_IS_SPECULATIVE))
+		change->action = REORDER_BUFFER_CHANGE_INSERT;
+	else
+		change->action = REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT;
+	change->origin_id = XLogRecGetOrigin(r);
+
+	memcpy(&change->data.tp.rlocator, &target_locator, sizeof(RelFileLocator));
+
+	tupledata = XLogRecGetBlockData(r, 0, &datalen);
+	tuplelen = datalen - SizeOfNeonHeapHeader;	/* block data = header + body */
+
+	change->data.tp.newtuple =
+		ReorderBufferGetTupleBuf(ctx->reorder, tuplelen);
+
+	DecodeXLogTuple(tupledata, datalen, change->data.tp.newtuple);
+
+	change->data.tp.clear_toast_afterwards = true;
+
+	ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr,
+							 change,
+							 xlrec->flags & XLH_INSERT_ON_TOAST_RELATION);
+}
+
+/*
+ * Parse XLOG_NEON_HEAP_DELETE from wal into proper tuplebufs.  Deletes can
+ * possibly contain the old primary key, stored after the fixed-size part.
+ */
+static void
+DecodeNeonDelete(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
+{
+	XLogReaderState *r = buf->record;
+	xl_neon_heap_delete *xlrec;
+	ReorderBufferChange *change;
+	RelFileLocator target_locator;
+
+	xlrec = (xl_neon_heap_delete *) XLogRecGetData(r);
+
+	/* only interested in our database */
+	XLogRecGetBlockTag(r, 0, &target_locator, NULL, NULL);
+	if (target_locator.dbOid != ctx->slot->data.database)
+		return;
+
+	/* output plugin doesn't look for this origin, no need to queue */
+	if (FilterByOrigin(ctx, XLogRecGetOrigin(r)))
+		return;
+
+	change = ReorderBufferGetChange(ctx->reorder);
+
+	if (xlrec->flags & XLH_DELETE_IS_SUPER)
+		change->action = REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT;
+	else
+		change->action = REORDER_BUFFER_CHANGE_DELETE;
+
+	change->origin_id = XLogRecGetOrigin(r);
+
+	memcpy(&change->data.tp.rlocator, &target_locator, sizeof(RelFileLocator));
+
+	/* old primary key stored */
+	if (xlrec->flags & XLH_DELETE_CONTAINS_OLD)
+	{
+		/* tuple image follows xl_neon_heap_delete, so strip that prefix */
+		Size		datalen = XLogRecGetDataLen(r) - SizeOfNeonHeapDelete;
+		Size		tuplelen = datalen - SizeOfNeonHeapHeader;
+
+		Assert(XLogRecGetDataLen(r) > (SizeOfNeonHeapDelete + SizeOfNeonHeapHeader));
+
+		change->data.tp.oldtuple =
+			ReorderBufferGetTupleBuf(ctx->reorder, tuplelen);
+
+		DecodeXLogTuple((char *) xlrec + SizeOfNeonHeapDelete,
+						datalen, change->data.tp.oldtuple);
+	}
+
+	change->data.tp.clear_toast_afterwards = true;
+
+	ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr,
+							 change, false);
+}
+
+/*
+ * Parse XLOG_NEON_HEAP_UPDATE and XLOG_NEON_HEAP_HOT_UPDATE, which have the
+ * same layout in the record, from wal into proper tuplebufs.
+ *
+ * Updates can possibly contain a new tuple and the old primary key.
+ */
+static void
+DecodeNeonUpdate(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
+{
+	XLogReaderState *r = buf->record;
+	xl_neon_heap_update *xlrec;
+	ReorderBufferChange *change;
+	char	   *data;
+	RelFileLocator target_locator;
+
+	xlrec = (xl_neon_heap_update *) XLogRecGetData(r);
+
+	/* only interested in our database */
+	XLogRecGetBlockTag(r, 0, &target_locator, NULL, NULL);
+	if (target_locator.dbOid != ctx->slot->data.database)
+		return;
+
+	/* output plugin doesn't look for this origin, no need to queue */
+	if (FilterByOrigin(ctx, XLogRecGetOrigin(r)))
+		return;
+
+	change = ReorderBufferGetChange(ctx->reorder);
+	change->action = REORDER_BUFFER_CHANGE_UPDATE;
+	change->origin_id = XLogRecGetOrigin(r);
+	memcpy(&change->data.tp.rlocator, &target_locator, sizeof(RelFileLocator));
+
+	if (xlrec->flags & XLH_UPDATE_CONTAINS_NEW_TUPLE)
+	{
+		Size		datalen;
+		Size		tuplelen;
+
+		data = XLogRecGetBlockData(r, 0, &datalen);
+
+		tuplelen = datalen - SizeOfNeonHeapHeader;
+
+		change->data.tp.newtuple =
+			ReorderBufferGetTupleBuf(ctx->reorder, tuplelen);
+
+		DecodeXLogTuple(data, datalen, change->data.tp.newtuple);
+	}
+
+	if (xlrec->flags & XLH_UPDATE_CONTAINS_OLD)
+	{
+		Size		datalen;
+		Size		tuplelen;
+
+		/* caution, remaining data in record is not aligned */
+		data = XLogRecGetData(r) + SizeOfNeonHeapUpdate;
+		datalen = XLogRecGetDataLen(r) - SizeOfNeonHeapUpdate;
+		tuplelen = datalen - SizeOfNeonHeapHeader;
+
+		change->data.tp.oldtuple =
+			ReorderBufferGetTupleBuf(ctx->reorder, tuplelen);
+
+		DecodeXLogTuple(data, datalen, change->data.tp.oldtuple);
+	}
+
+	change->data.tp.clear_toast_afterwards = true;
+
+	ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr,
+							 change, false);
+}
+
+/*
+ * Decode an XLOG_NEON_HEAP_MULTI_INSERT record into multiple tuplebufs.
+ *
+ * Currently MULTI_INSERT will always contain the full tuples.
+ */
+static void
+DecodeNeonMultiInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
+{
+	XLogReaderState *r = buf->record;
+	xl_neon_heap_multi_insert *xlrec;
+	int			i;
+	char	   *data;
+	char	   *tupledata;
+	Size		tuplelen;
+	RelFileLocator rlocator;
+
+	xlrec = (xl_neon_heap_multi_insert *) XLogRecGetData(r);
+
+	/*
+	 * Ignore insert records without new tuples.  This happens when a
+	 * multi_insert is done on a catalog or on a non-persistent relation.
+	 */
+	if (!(xlrec->flags & XLH_INSERT_CONTAINS_NEW_TUPLE))
+		return;
+
+	/* only interested in our database */
+	XLogRecGetBlockTag(r, 0, &rlocator, NULL, NULL);
+	if (rlocator.dbOid != ctx->slot->data.database)
+		return;
+
+	/* output plugin doesn't look for this origin, no need to queue */
+	if (FilterByOrigin(ctx, XLogRecGetOrigin(r)))
+		return;
+
+	/*
+	 * We know that this multi_insert isn't for a catalog, so the block should
+	 * always have data even if a full-page write of it is taken.
+	 */
+	tupledata = XLogRecGetBlockData(r, 0, &tuplelen);
+	Assert(tupledata != NULL);
+
+	data = tupledata;
+	for (i = 0; i < xlrec->ntuples; i++)
+	{
+		ReorderBufferChange *change;
+		xl_neon_multi_insert_tuple *xlhdr;
+		int			datalen;
+		ReorderBufferTupleBuf *tuple;
+		HeapTupleHeader header;
+
+		change = ReorderBufferGetChange(ctx->reorder);
+		change->action = REORDER_BUFFER_CHANGE_INSERT;
+		change->origin_id = XLogRecGetOrigin(r);
+
+		memcpy(&change->data.tp.rlocator, &rlocator, sizeof(RelFileLocator));
+
+		xlhdr = (xl_neon_multi_insert_tuple *) SHORTALIGN(data);
+		data = ((char *) xlhdr) + SizeOfNeonMultiInsertTuple;
+		datalen = xlhdr->datalen;
+
+		change->data.tp.newtuple =
+			ReorderBufferGetTupleBuf(ctx->reorder, datalen);
+
+		tuple = change->data.tp.newtuple;
+		header = tuple->tuple.t_data;
+
+		/* not a disk based tuple */
+		ItemPointerSetInvalid(&tuple->tuple.t_self);
+
+		/*
+		 * We can only figure this out after reassembling the transactions.
+		 */
+		tuple->tuple.t_tableOid = InvalidOid;
+
+		tuple->tuple.t_len = datalen + SizeofHeapTupleHeader;
+
+		memset(header, 0, SizeofHeapTupleHeader);
+
+		memcpy((char *) tuple->tuple.t_data + SizeofHeapTupleHeader,
+			   (char *) data,
+			   datalen);
+		header->t_infomask = xlhdr->t_infomask;
+		header->t_infomask2 = xlhdr->t_infomask2;
+		header->t_hoff = xlhdr->t_hoff;
+
+		/*
+		 * Reset toast reassembly state only after the last row in the last
+		 * xl_multi_insert_tuple record emitted by one heap_multi_insert()
+		 * call.
+		 */
+		if (xlrec->flags & XLH_INSERT_LAST_IN_MULTI &&
+			(i + 1) == xlrec->ntuples)
+			change->data.tp.clear_toast_afterwards = true;
+		else
+			change->data.tp.clear_toast_afterwards = false;
+
+		ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r),
+								 buf->origptr, change, false);
+
+		/* move to the next xl_neon_multi_insert_tuple entry */
+		data += datalen;
+	}
+	Assert(data == tupledata + tuplelen);
+}
+
+/*
+ * Build a tuplebuf from a single-tuple WAL image, as logged for insert,
+ * update and delete records (but not multi-insert).
+ *
+ * 'data' must point at an xl_neon_heap_header followed by the tuple body and
+ * 'len' must cover both; callers compute them as they are record specific.
+ */
+static void
+DecodeXLogTuple(char *data, Size len, ReorderBufferTupleBuf *tuple)
+{
+	HeapTupleData *htup = &tuple->tuple;
+	HeapTupleHeader hdr = htup->t_data;
+	xl_neon_heap_header walhdr;
+	int			bodylen = len - SizeOfNeonHeapHeader;
+
+	Assert(bodylen >= 0);
+
+	/* data is not stored aligned, copy the header to aligned storage */
+	memcpy((char *) &walhdr, data, SizeOfNeonHeapHeader);
+
+	/* not a disk based tuple */
+	ItemPointerSetInvalid(&htup->t_self);
+
+	/* we can only figure this out after reassembling the transactions */
+	htup->t_tableOid = InvalidOid;
+
+	/* the header is rebuilt here, so the length is body plus a full header */
+	htup->t_len = bodylen + SizeofHeapTupleHeader;
+
+	/* start from a zeroed tuple header and append the logged body after it */
+	memset(hdr, 0, SizeofHeapTupleHeader);
+	memcpy((char *) hdr + SizeofHeapTupleHeader,
+		   data + SizeOfNeonHeapHeader,
+		   bodylen);
+
+	/* only these header fields come from the WAL record */
+	hdr->t_infomask = walhdr.t_infomask;
+	hdr->t_infomask2 = walhdr.t_infomask2;
+	hdr->t_hoff = walhdr.t_hoff;
+}
+
+
+#endif
--- a/pgxn/neon_rmgr/neon_rmgr_desc.c
+++ b/pgxn/neon_rmgr/neon_rmgr_desc.c
@@ -0,0 +1,181 @@
+#include "postgres.h"
+#if PG_MAJORVERSION_NUM >= 16
+#include "access/heapam_xlog.h"
+#include "access/neon_xlog.h"
+#include "access/rmgr.h"
+#include "access/rmgrdesc_utils.h"
+#include "access/xlog_internal.h"
+#include "miscadmin.h"
+#include "storage/buf.h"
+#include "storage/bufpage.h"
+
+#include "neon_rmgr.h"
+
+/*
+ * Append "<keyname>: [FLAG, ...]"; keyname must not end in space/punctuation.
+ */
+static void
+infobits_desc(StringInfo buf, uint8 infobits, const char *keyname)
+{
+	static const struct
+	{
+		uint8		bit;
+		const char *label;
+	}			flagnames[] = {
+		{XLHL_XMAX_IS_MULTI, "IS_MULTI, "},
+		{XLHL_XMAX_LOCK_ONLY, "LOCK_ONLY, "},
+		{XLHL_XMAX_EXCL_LOCK, "EXCL_LOCK, "},
+		{XLHL_XMAX_KEYSHR_LOCK, "KEYSHR_LOCK, "},
+		{XLHL_KEYS_UPDATED, "KEYS_UPDATED, "},
+	};
+
+	appendStringInfo(buf, "%s: [", keyname);
+	Assert(buf->data[buf->len - 1] != ' ');
+	for (int i = 0; i < lengthof(flagnames); i++)
+		if (infobits & flagnames[i].bit)
+			appendStringInfoString(buf, flagnames[i].label);
+	/* a trailing blank means a flag was written: drop its final ", " */
+	if (buf->data[buf->len - 1] == ' ')
+	{
+		Assert(buf->data[buf->len - 2] == ',');
+		buf->len -= 2;
+		buf->data[buf->len] = '\0';
+	}
+	appendStringInfoString(buf, "]");
+}
+
+/* Append "flags: [...]" for XLH_TRUNCATE_* bits. NOTE(review): no caller in this file. */
+static void
+truncate_flags_desc(StringInfo buf, uint8 flags)
+{
+	bool		cascade = (flags & XLH_TRUNCATE_CASCADE) != 0;
+	bool		restart = (flags & XLH_TRUNCATE_RESTART_SEQS) != 0;
+
+	appendStringInfoString(buf, "flags: [");
+	if (cascade)
+		appendStringInfoString(buf, "CASCADE, ");
+	if (restart)
+		appendStringInfoString(buf, "RESTART_SEQS, ");
+	if (cascade || restart)
+	{
+		Assert(buf->data[buf->len - 2] == ',');
+		buf->len -= 2;			/* drop the ", " left by the last flag */
+		buf->data[buf->len] = '\0';
+	}
+	appendStringInfoString(buf, "]");
+}
+
+/*
+ * Append a description of a Neon rmgr record to buf, chosen by its opcode
+ * (the rmgr info bits masked with XLOG_NEON_OPMASK).
+ */
+void
+neon_rm_desc(StringInfo buf, XLogReaderState *record)
+{
+	char	   *rec = XLogRecGetData(record);
+	uint8		rminfo = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
+
+	switch (rminfo & XLOG_NEON_OPMASK)
+	{
+		case XLOG_NEON_HEAP_INSERT:
+			{
+				xl_neon_heap_insert *ins = (xl_neon_heap_insert *) rec;
+
+				appendStringInfo(buf, "off: %u, flags: 0x%02X",
+								 ins->offnum, ins->flags);
+				break;
+			}
+		case XLOG_NEON_HEAP_DELETE:
+			{
+				xl_neon_heap_delete *del = (xl_neon_heap_delete *) rec;
+
+				appendStringInfo(buf, "xmax: %u, off: %u, ",
+								 del->xmax, del->offnum);
+				infobits_desc(buf, del->infobits_set, "infobits");
+				appendStringInfo(buf, ", flags: 0x%02X", del->flags);
+				break;
+			}
+		case XLOG_NEON_HEAP_UPDATE:
+		case XLOG_NEON_HEAP_HOT_UPDATE:
+			{
+				xl_neon_heap_update *upd = (xl_neon_heap_update *) rec;
+
+				appendStringInfo(buf, "old_xmax: %u, old_off: %u, ",
+								 upd->old_xmax, upd->old_offnum);
+				infobits_desc(buf, upd->old_infobits_set, "old_infobits");
+				appendStringInfo(buf, ", flags: 0x%02X, new_xmax: %u, new_off: %u",
+								 upd->flags, upd->new_xmax, upd->new_offnum);
+				break;
+			}
+		case XLOG_NEON_HEAP_LOCK:
+			{
+				xl_neon_heap_lock *lock = (xl_neon_heap_lock *) rec;
+
+				appendStringInfo(buf, "xmax: %u, off: %u, ",
+								 lock->xmax, lock->offnum);
+				infobits_desc(buf, lock->infobits_set, "infobits");
+				appendStringInfo(buf, ", flags: 0x%02X", lock->flags);
+				break;
+			}
+		case XLOG_NEON_HEAP_MULTI_INSERT:
+			{
+				xl_neon_heap_multi_insert *multi = (xl_neon_heap_multi_insert *) rec;
+				bool		isinit = (XLogRecGetInfo(record) & XLOG_NEON_INIT_PAGE) != 0;
+
+				appendStringInfo(buf, "ntuples: %d, flags: 0x%02X",
+								 multi->ntuples, multi->flags);
+				/* offsets are printed only with block data and no INIT_PAGE bit */
+				if (XLogRecHasBlockData(record, 0) && !isinit)
+				{
+					appendStringInfoString(buf, ", offsets:");
+					array_desc(buf, multi->offsets, sizeof(OffsetNumber),
+							   multi->ntuples, &offset_elem_desc, NULL);
+				}
+				break;
+			}
+	}
+}
+
+/*
+ * Map the rmgr-private bits of a record's info byte to a record-type name.
+ *
+ * Returns NULL when the value matches none of the Neon heap opcodes listed
+ * below, either bare or combined with XLOG_NEON_INIT_PAGE where a "+INIT"
+ * variant exists.
+ */
+const char *
+neon_rm_identify(uint8 info)
+{
+	switch (info & ~XLR_INFO_MASK)
+	{
+		case XLOG_NEON_HEAP_INSERT:
+			return "INSERT";
+		case XLOG_NEON_HEAP_INSERT | XLOG_NEON_INIT_PAGE:
+			return "INSERT+INIT";
+		case XLOG_NEON_HEAP_DELETE:
+			return "DELETE";
+		case XLOG_NEON_HEAP_UPDATE:
+			return "UPDATE";
+		case XLOG_NEON_HEAP_UPDATE | XLOG_NEON_INIT_PAGE:
+			return "UPDATE+INIT";
+		case XLOG_NEON_HEAP_HOT_UPDATE:
+			return "HOT_UPDATE";
+
+			/*
+			 * Neon records carry XLOG_NEON_INIT_PAGE, as in every other
+			 * "+INIT" case here, not the heap rmgr's XLOG_HEAP_INIT_PAGE.
+			 */
+		case XLOG_NEON_HEAP_HOT_UPDATE | XLOG_NEON_INIT_PAGE:
+			return "HOT_UPDATE+INIT";
+		case XLOG_NEON_HEAP_LOCK:
+			return "LOCK";
+		case XLOG_NEON_HEAP_MULTI_INSERT:
+			return "MULTI_INSERT";
+		case XLOG_NEON_HEAP_MULTI_INSERT | XLOG_NEON_INIT_PAGE:
+			return "MULTI_INSERT+INIT";
+	}
+
+	return NULL;
+}
+
+#endif
--- a/pgxn/neon_test_utils/neontest.c
+++ b/pgxn/neon_test_utils/neontest.c
@@ -10,6 +10,8 @@
 */
 #include "postgres.h"

+#include "../neon/neon_pgversioncompat.h"
+
 #include "access/relation.h"
 #include "access/xact.h"
 #include "access/xlog.h"
@@ -39,8 +41,13 @@ PG_FUNCTION_INFO_V1(neon_xlogflush);
 * Linkage to functions in neon module.
 * The signature here would need to be updated whenever function parameters change in pagestore_smgr.c
 */
-typedef void (*neon_read_at_lsn_type) (RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
+#if PG_MAJORVERSION_NUM < 16
+typedef void (*neon_read_at_lsn_type) (NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 									   XLogRecPtr request_lsn, bool request_latest, char *buffer);
+#else
+typedef void (*neon_read_at_lsn_type) (NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
+									   XLogRecPtr request_lsn, bool request_latest, void *buffer);
+#endif

 static neon_read_at_lsn_type neon_read_at_lsn_ptr;

@@ -115,7 +122,7 @@ clear_buffer_cache(PG_FUNCTION_ARGS)
 			uint32		buf_state;
 			Buffer		bufferid;
 			bool		isvalid;
-			RelFileNode rnode;
+			NRelFileInfo rinfo;
 			ForkNumber	forknum;
 			BlockNumber blocknum;

@@ -128,7 +135,7 @@ clear_buffer_cache(PG_FUNCTION_ARGS)
 			else
 				isvalid = false;
 			bufferid = BufferDescriptorGetBuffer(bufHdr);
-			rnode = bufHdr->tag.rnode;
+			rinfo = BufTagGetNRelFileInfo(bufHdr->tag);
 			forknum = bufHdr->tag.forkNum;
 			blocknum = bufHdr->tag.blockNum;

@@ -141,7 +148,7 @@ clear_buffer_cache(PG_FUNCTION_ARGS)
 			 */
 			if (isvalid)
 			{
-				if (ReadRecentBuffer(rnode, forknum, blocknum, bufferid))
+				if (ReadRecentBuffer(rinfo, forknum, blocknum, bufferid))
 					ReleaseBuffer(bufferid);
 			}
 		}
@@ -238,7 +245,7 @@ get_raw_page_at_lsn(PG_FUNCTION_ARGS)
 	SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ);
 	raw_page_data = VARDATA(raw_page);

-	neon_read_at_lsn(rel->rd_node, forknum, blkno, read_lsn, request_latest, raw_page_data);
+	neon_read_at_lsn(InfoFromRelation(rel), forknum, blkno, read_lsn, request_latest, raw_page_data);

 	relation_close(rel, AccessShareLock);

@@ -267,10 +274,17 @@ get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS)
 		PG_RETURN_NULL();

 	{
-		RelFileNode rnode = {
+		NRelFileInfo rinfo = {
+#if PG_MAJORVERSION_NUM < 16
 			.spcNode = PG_GETARG_OID(0),
 			.dbNode = PG_GETARG_OID(1),
-		.relNode = PG_GETARG_OID(2)};
+			.relNode = PG_GETARG_OID(2)
+#else
+			.spcOid = PG_GETARG_OID(0),
+			.dbOid = PG_GETARG_OID(1),
+			.relNumber = PG_GETARG_OID(2)
+#endif
+		};

 		ForkNumber	forknum = PG_GETARG_UINT32(3);

@@ -284,7 +298,7 @@ get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS)
 		SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ);
 		raw_page_data = VARDATA(raw_page);

-		neon_read_at_lsn(rnode, forknum, blkno, read_lsn, request_latest, raw_page_data);
+		neon_read_at_lsn(rinfo, forknum, blkno, read_lsn, request_latest, raw_page_data);
 		PG_RETURN_BYTEA_P(raw_page);
 	}
 }
--- a/pgxn/neon_walredo/inmem_smgr.c
+++ b/pgxn/neon_walredo/inmem_smgr.c
@@ -18,10 +18,12 @@
 */
 #include "postgres.h"

+#include "../neon/neon_pgversioncompat.h"
+
 #include "access/xlog.h"
 #include "storage/block.h"
 #include "storage/buf_internals.h"
-#include "storage/relfilenode.h"
+#include RELFILEINFO_HDR
 #include "storage/smgr.h"

 #if PG_VERSION_NUM >= 150000
@@ -43,10 +45,12 @@ static int	used_pages;
 static int
 locate_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno)
 {
+	NRelFileInfo rinfo = InfoFromSMgrRel(reln);
+
 	/* We only hold a small number of pages, so linear search */
 	for (int i = 0; i < used_pages; i++)
 	{
-		if (RelFileNodeEquals(reln->smgr_rnode.node, page_tag[i].rnode)
+		if (RelFileInfoEquals(rinfo, BufTagGetNRelFileInfo(page_tag[i]))
 			&& forknum == page_tag[i].forkNum
 			&& blkno == page_tag[i].blockNum)
 		{
@@ -63,15 +67,26 @@ static void inmem_open(SMgrRelation reln);
 static void inmem_close(SMgrRelation reln, ForkNumber forknum);
 static void inmem_create(SMgrRelation reln, ForkNumber forknum, bool isRedo);
 static bool inmem_exists(SMgrRelation reln, ForkNumber forknum);
-static void inmem_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo);
-static void inmem_extend(SMgrRelation reln, ForkNumber forknum,
-						 BlockNumber blocknum, char *buffer, bool skipFsync);
+static void inmem_unlink(NRelFileInfoBackend rinfo, ForkNumber forknum, bool isRedo);
 static bool inmem_prefetch(SMgrRelation reln, ForkNumber forknum,
 						   BlockNumber blocknum);
+#if PG_MAJORVERSION_NUM < 16	/* before PG16: extend/read/write take char * page buffers */
+static void inmem_extend(SMgrRelation reln, ForkNumber forknum,
+						 BlockNumber blocknum, char *buffer, bool skipFsync);
 static void inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 					   char *buffer);
 static void inmem_write(SMgrRelation reln, ForkNumber forknum,
 						BlockNumber blocknum, char *buffer, bool skipFsync);
+#else	/* PG16+: void * buffers (const for extend/write), plus inmem_zeroextend */
+static void inmem_extend(SMgrRelation reln, ForkNumber forknum,
+						 BlockNumber blocknum, const void *buffer, bool skipFsync);
+static void inmem_zeroextend(SMgrRelation reln, ForkNumber forknum,
+							 BlockNumber blocknum, int nblocks, bool skipFsync);
+static void inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+					   void *buffer);
+static void inmem_write(SMgrRelation reln, ForkNumber forknum,
+						BlockNumber blocknum, const void *buffer, bool skipFsync);
+#endif
 static void inmem_writeback(SMgrRelation reln, ForkNumber forknum,
 							BlockNumber blocknum, BlockNumber nblocks);
 static BlockNumber inmem_nblocks(SMgrRelation reln, ForkNumber forknum);
@@ -95,9 +110,11 @@ inmem_init(void)
 static bool
 inmem_exists(SMgrRelation reln, ForkNumber forknum)
 {
+	NRelFileInfo rinfo = InfoFromSMgrRel(reln);
+
 	for (int i = 0; i < used_pages; i++)
 	{
-		if (RelFileNodeEquals(reln->smgr_rnode.node, page_tag[i].rnode)
+		if (RelFileInfoEquals(rinfo, BufTagGetNRelFileInfo(page_tag[i]))
 			&& forknum == page_tag[i].forkNum)
 		{
 			return true;
@@ -120,7 +137,7 @@ inmem_create(SMgrRelation reln, ForkNumber forknum, bool isRedo)
 *	inmem_unlink() -- Unlink a relation.
 */
 static void
-inmem_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo)
+inmem_unlink(NRelFileInfoBackend rinfo, ForkNumber forknum, bool isRedo)	/* no-op: the body is empty */
 {
 }

@@ -135,12 +152,28 @@ inmem_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo)
 */
 static void
 inmem_extend(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
+#if PG_MAJORVERSION_NUM < 16
 			 char *buffer, bool skipFsync)
+#else
+			 const void *buffer, bool skipFsync)
+#endif
 {
 	/* extending is the same as smgrwrite() for us: delegate to inmem_write() */
 	inmem_write(reln, forknum, blkno, buffer, skipFsync);
 }

+#if PG_MAJORVERSION_NUM >= 16	/* zero-extend variant, compiled only for PG16+ */
+static void
+inmem_zeroextend(SMgrRelation reln, ForkNumber forknum,
+				 BlockNumber blocknum, int nblocks, bool skipFsync)
+{
+	char buffer[BLCKSZ] = {0};	/* a single all-zero page, reused for every block */
+
+	for (int i = 0; i < nblocks; i++)	/* covers blocknum .. blocknum + nblocks - 1 */
+		inmem_extend(reln, forknum, blocknum + i, buffer, skipFsync);
+}
+#endif
+
 /*
 *  inmem_open() -- Initialize newly-opened relation.
 */
@@ -180,7 +213,11 @@ inmem_writeback(SMgrRelation reln, ForkNumber forknum,
 */
 static void
 inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
+#if PG_MAJORVERSION_NUM < 16
 		   char *buffer)
+#else
+		   void *buffer)
+#endif
 {
 	int			pg;

@@ -200,7 +237,11 @@ inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
 */
 static void
 inmem_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+#if PG_MAJORVERSION_NUM < 16
 			char *buffer, bool skipFsync)
+#else
+			const void *buffer, bool skipFsync)
+#endif
 {
 	int			pg;

@@ -216,9 +257,7 @@ inmem_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 		 */
 		elog(used_pages >= WARN_PAGES ? WARNING : DEBUG1,
 			 "inmem_write() called for %u/%u/%u.%u blk %u: used_pages %u",
-			 reln->smgr_rnode.node.spcNode,
-			 reln->smgr_rnode.node.dbNode,
-			 reln->smgr_rnode.node.relNode,
+			 RelFileInfoFmt(InfoFromSMgrRel(reln)),
 			 forknum,
 			 blocknum,
 			 used_pages);
@@ -227,14 +266,13 @@ inmem_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,

 		pg = used_pages;
 		used_pages++;
-		INIT_BUFFERTAG(page_tag[pg], reln->smgr_rnode.node, forknum, blocknum);
+
+		InitBufferTag(&page_tag[pg], &InfoFromSMgrRel(reln), forknum, blocknum);
 	}
 	else
 	{
 		elog(DEBUG1, "inmem_write() called for %u/%u/%u.%u blk %u: found at %u",
-			 reln->smgr_rnode.node.spcNode,
-			 reln->smgr_rnode.node.dbNode,
-			 reln->smgr_rnode.node.relNode,
+			 RelFileInfoFmt(InfoFromSMgrRel(reln)),
 			 forknum,
 			 blocknum,
 			 used_pages);
@@ -287,6 +325,9 @@ static const struct f_smgr inmem_smgr =
 	.smgr_exists = inmem_exists,
 	.smgr_unlink = inmem_unlink,
 	.smgr_extend = inmem_extend,
+#if PG_MAJORVERSION_NUM >= 16
+	.smgr_zeroextend = inmem_zeroextend,
+#endif
 	.smgr_prefetch = inmem_prefetch,
 	.smgr_read = inmem_read,
 	.smgr_write = inmem_write,
@@ -297,11 +338,11 @@ static const struct f_smgr inmem_smgr =
 };

 const f_smgr *
-smgr_inmem(BackendId backend, RelFileNode rnode)
+smgr_inmem(BackendId backend, NRelFileInfo rinfo)
 {
 	Assert(InRecovery);
 	if (backend != InvalidBackendId)
-		return smgr_standard(backend, rnode);
+		return smgr_standard(backend, rinfo);	/* a specific backend was given: defer to smgr_standard() */
 	else
 		return &inmem_smgr;	/* InvalidBackendId: use the in-memory smgr table */
 }
--- a/pgxn/neon_walredo/inmem_smgr.h
+++ b/pgxn/neon_walredo/inmem_smgr.h
@@ -11,7 +11,7 @@
 #ifndef INMEM_SMGR_H
 #define INMEM_SMGR_H

-extern const f_smgr *smgr_inmem(BackendId backend, RelFileNode rnode);
+extern const f_smgr *smgr_inmem(BackendId backend, NRelFileInfo rinfo);	/* returns the f_smgr to use for (backend, rinfo) */
 extern void smgr_init_inmem(void);

 #endif /* INMEM_SMGR_H */
--- a/pgxn/neon_walredo/walredoproc.c
+++ b/pgxn/neon_walredo/walredoproc.c
@@ -43,6 +43,8 @@

 #include "postgres.h"

+#include "../neon/neon_pgversioncompat.h"
+
 #include <fcntl.h>
 #include <limits.h>
 #include <signal.h>
@@ -61,9 +63,11 @@
 #include <malloc.h>
 #endif

+#if PG_MAJORVERSION_NUM < 16
 #ifndef HAVE_GETRUSAGE
 #include "rusagestub.h"
 #endif
+#endif

 #include "access/clog.h"
 #include "access/commit_ts.h"
@@ -187,7 +191,7 @@ enter_seccomp_mode(void)
 * backend processes. Some initialization was done in CallExtMain
 * already.
 */
-void
+PGDLLEXPORT void
 WalRedoMain(int argc, char *argv[])
 {
 	int			firstchar;
@@ -200,7 +204,7 @@ WalRedoMain(int argc, char *argv[])

 	/*
 	 * WAL redo does not need a large number of buffers. And speed of
-	 * DropRelFileNodeAllLocalBuffers() is proportional to the number of
+	 * DropRelationAllLocalBuffers() is proportional to the number of
 	 * buffers. So let's keep it small (default value is 1024)
 	 */
 	num_temp_buffers = 4;
@@ -212,6 +216,12 @@ WalRedoMain(int argc, char *argv[])
 	smgr_hook = smgr_inmem;
 	smgr_init_hook = smgr_init_inmem;

+#if PG_VERSION_NUM >= 160000
+	/* make rmgr registry believe we can register the resource manager */
+	process_shared_preload_libraries_in_progress = true;
+	load_file("$libdir/neon_rmgr", false);
+	process_shared_preload_libraries_in_progress = false;
+#endif

 	/* Initialize MaxBackends (if under postmaster, was done already) */
 	MaxConnections = 1;
@@ -300,6 +310,9 @@ WalRedoMain(int argc, char *argv[])
 	 */
 	MemoryContextSwitchTo(MessageContext);
 	initStringInfo(&input_message);
+#if PG_MAJORVERSION_NUM >= 16
+	MyBackendType = B_BACKEND;
+#endif

 	for (;;)
 	{
@@ -534,16 +547,16 @@ CreateFakeSharedMemoryAndSemaphores()

 /* Version compatibility wrapper for ReadBufferWithoutRelcache: the PG15+ call takes one extra argument (passed as true) */
 static inline Buffer
-NeonRedoReadBuffer(RelFileNode rnode,
+NeonRedoReadBuffer(NRelFileInfo rinfo,
 		   ForkNumber forkNum, BlockNumber blockNum,
 		   ReadBufferMode mode)
 {
 #if PG_VERSION_NUM >= 150000
-	return ReadBufferWithoutRelcache(rnode, forkNum, blockNum, mode,
+	return ReadBufferWithoutRelcache(rinfo, forkNum, blockNum, mode,
 									 NULL, /* no strategy */
 									 true); /* WAL redo is only performed on permanent rels */
 #else
-	return ReadBufferWithoutRelcache(rnode, forkNum, blockNum, mode,
+	return ReadBufferWithoutRelcache(rinfo, forkNum, blockNum, mode,
 									 NULL); /* no strategy */
 #endif
 }
@@ -647,7 +660,7 @@ ReadRedoCommand(StringInfo inBuf)
 static void
 BeginRedoForBlock(StringInfo input_message)
 {
-	RelFileNode rnode;
+	NRelFileInfo rinfo;
 	ForkNumber forknum;
 	BlockNumber blknum;
 	SMgrRelation reln;
@@ -662,22 +675,26 @@ BeginRedoForBlock(StringInfo input_message)
 	 * BlockNumber
 	 */
 	forknum = pq_getmsgbyte(input_message);
-	rnode.spcNode = pq_getmsgint(input_message, 4);
-	rnode.dbNode = pq_getmsgint(input_message, 4);
-	rnode.relNode = pq_getmsgint(input_message, 4);
+#if PG_MAJORVERSION_NUM < 16
+	rinfo.spcNode = pq_getmsgint(input_message, 4);
+	rinfo.dbNode = pq_getmsgint(input_message, 4);
+	rinfo.relNode = pq_getmsgint(input_message, 4);
+#else
+	rinfo.spcOid = pq_getmsgint(input_message, 4);
+	rinfo.dbOid = pq_getmsgint(input_message, 4);
+	rinfo.relNumber = pq_getmsgint(input_message, 4);
+#endif
 	blknum = pq_getmsgint(input_message, 4);
 	wal_redo_buffer = InvalidBuffer;

-	INIT_BUFFERTAG(target_redo_tag, rnode, forknum, blknum);
+	InitBufferTag(&target_redo_tag, &rinfo, forknum, blknum);

 	elog(TRACE, "BeginRedoForBlock %u/%u/%u.%d blk %u",
-		 target_redo_tag.rnode.spcNode,
-		 target_redo_tag.rnode.dbNode,
-		 target_redo_tag.rnode.relNode,
+		 RelFileInfoFmt(rinfo),
 		 target_redo_tag.forkNum,
 		 target_redo_tag.blockNum);

-	reln = smgropen(rnode, InvalidBackendId, RELPERSISTENCE_PERMANENT);
+	reln = smgropen(rinfo, InvalidBackendId, RELPERSISTENCE_PERMANENT);
 	if (reln->smgr_cached_nblocks[forknum] == InvalidBlockNumber ||
 		reln->smgr_cached_nblocks[forknum] < blknum + 1)
 	{
@@ -691,7 +708,7 @@ BeginRedoForBlock(StringInfo input_message)
 static void
 PushPage(StringInfo input_message)
 {
-	RelFileNode rnode;
+	NRelFileInfo rinfo;
 	ForkNumber forknum;
 	BlockNumber blknum;
 	const char *content;
@@ -709,13 +726,19 @@ PushPage(StringInfo input_message)
 	 * 8k page content
 	 */
 	forknum = pq_getmsgbyte(input_message);
-	rnode.spcNode = pq_getmsgint(input_message, 4);
-	rnode.dbNode = pq_getmsgint(input_message, 4);
-	rnode.relNode = pq_getmsgint(input_message, 4);
+#if PG_MAJORVERSION_NUM < 16
+	rinfo.spcNode = pq_getmsgint(input_message, 4);
+	rinfo.dbNode = pq_getmsgint(input_message, 4);
+	rinfo.relNode = pq_getmsgint(input_message, 4);
+#else
+	rinfo.spcOid = pq_getmsgint(input_message, 4);
+	rinfo.dbOid = pq_getmsgint(input_message, 4);
+	rinfo.relNumber = pq_getmsgint(input_message, 4);
+#endif
 	blknum = pq_getmsgint(input_message, 4);
 	content = pq_getmsgbytes(input_message, BLCKSZ);

-	buf = NeonRedoReadBuffer(rnode, forknum, blknum, RBM_ZERO_AND_LOCK);
+	buf = NeonRedoReadBuffer(rinfo, forknum, blknum, RBM_ZERO_AND_LOCK);
 	wal_redo_buffer = buf;
 	page = BufferGetPage(buf);
 	memcpy(page, content, BLCKSZ);
@@ -831,7 +854,7 @@ ApplyRecord(StringInfo input_message)
 	 */
 	if (BufferIsInvalid(wal_redo_buffer))
 	{
-		wal_redo_buffer = NeonRedoReadBuffer(target_redo_tag.rnode,
+		wal_redo_buffer = NeonRedoReadBuffer(BufTagGetNRelFileInfo(target_redo_tag),
 											 target_redo_tag.forkNum,
 											 target_redo_tag.blockNum,
 											 RBM_NORMAL);
@@ -878,26 +901,29 @@ static bool
 redo_block_filter(XLogReaderState *record, uint8 block_id)
 {
 	BufferTag	target_tag;
+	NRelFileInfo rinfo;

 #if PG_VERSION_NUM >= 150000
 	XLogRecGetBlockTag(record, block_id,
-					   &target_tag.rnode, &target_tag.forkNum, &target_tag.blockNum);
+					   &rinfo, &target_tag.forkNum, &target_tag.blockNum);
 #else
 	if (!XLogRecGetBlockTag(record, block_id,
-							&target_tag.rnode, &target_tag.forkNum, &target_tag.blockNum))
+							&rinfo, &target_tag.forkNum, &target_tag.blockNum))
 	{
 		/* Caller specified a bogus block_id */
 		elog(PANIC, "failed to locate backup block with ID %d", block_id);
 	}
 #endif
+	CopyNRelFileInfoToBufTag(target_tag, rinfo);

 	/*
 	 * Can a WAL redo function ever access a relation other than the one that
 	 * it modifies? I don't see why it would.
+	 * Custom RMGRs may be affected by this.
 	 */
-	if (!RelFileNodeEquals(target_tag.rnode, target_redo_tag.rnode))
+	if (!RelFileInfoEquals(rinfo, BufTagGetNRelFileInfo(target_redo_tag)))
 		elog(WARNING, "REDO accessing unexpected page: %u/%u/%u.%u blk %u",
-			 target_tag.rnode.spcNode, target_tag.rnode.dbNode, target_tag.rnode.relNode, target_tag.forkNum, target_tag.blockNum);
+			 RelFileInfoFmt(rinfo), target_tag.forkNum, target_tag.blockNum);

 	/*
 	 * If this block isn't one we are currently restoring, then return 'true'
@@ -914,7 +940,7 @@ redo_block_filter(XLogReaderState *record, uint8 block_id)
 static void
 GetPage(StringInfo input_message)
 {
-	RelFileNode rnode;
+	NRelFileInfo rinfo;
 	ForkNumber forknum;
 	BlockNumber blknum;
 	Buffer		buf;
@@ -931,14 +957,20 @@ GetPage(StringInfo input_message)
 	 * BlockNumber
 	 */
 	forknum = pq_getmsgbyte(input_message);
-	rnode.spcNode = pq_getmsgint(input_message, 4);
-	rnode.dbNode = pq_getmsgint(input_message, 4);
-	rnode.relNode = pq_getmsgint(input_message, 4);
+#if PG_MAJORVERSION_NUM < 16
+	rinfo.spcNode = pq_getmsgint(input_message, 4);
+	rinfo.dbNode = pq_getmsgint(input_message, 4);
+	rinfo.relNode = pq_getmsgint(input_message, 4);
+#else
+	rinfo.spcOid = pq_getmsgint(input_message, 4);
+	rinfo.dbOid = pq_getmsgint(input_message, 4);
+	rinfo.relNumber = pq_getmsgint(input_message, 4);
+#endif
 	blknum = pq_getmsgint(input_message, 4);

 	/* FIXME: check that we got a BeginRedoForBlock message or this earlier */

-	buf = NeonRedoReadBuffer(rnode, forknum, blknum, RBM_NORMAL);
+	buf = NeonRedoReadBuffer(rinfo, forknum, blknum, RBM_NORMAL);
 	Assert(buf == wal_redo_buffer);
 	page = BufferGetPage(buf);
 	/* single thread, so don't bother locking the page */
@@ -961,7 +993,7 @@ GetPage(StringInfo input_message)
 	} while (tot_written < BLCKSZ);

 	ReleaseBuffer(buf);
-	DropRelFileNodeAllLocalBuffers(rnode);
+	DropRelationAllLocalBuffers(rinfo);
 	wal_redo_buffer = InvalidBuffer;

 	elog(TRACE, "Page sent back for block %u", blknum);
--- a/Show More
+++ b/Show More