fmt

proxy: track control-plane durations per connection request
proxy: make auth more type safe (#5689)
2026-04-14 13:00:37 +00:00 · 2023-12-08 15:58:17 +00:00 · 2023-12-08 12:29:50 +00:00 · 2023-12-08 11:48:37 +00:00 · 2023-12-08 10:12:37 +00:00 · 2023-12-08 14:03:13 +04:00
431 changed files with 56206 additions and 23284 deletions
--- a/.cargo/config.toml
+++ b/.cargo/config.toml
@@ -1,17 +1,3 @@
-# The binaries are really slow, if you compile them in 'dev' mode with the defaults.
-# Enable some optimizations even in 'dev' mode, to make tests faster. The basic
-# optimizations enabled by "opt-level=1" don't affect debuggability too much.
-#
-# See https://www.reddit.com/r/rust/comments/gvrgca/this_is_a_neat_trick_for_getting_good_runtime/
-#
-[profile.dev.package."*"]
-# Set the default for dependencies in Development mode.
-opt-level = 3
-
-[profile.dev]
-# Turn on a small amount of optimization in Development mode.
-opt-level = 1
-
 [build]
 # This is only present for local builds, as it will be overridden
 # by the RUSTDOCFLAGS env var in CI.
--- a/.config/hakari.toml
+++ b/.config/hakari.toml
@@ -22,5 +22,11 @@ platforms = [
    # "x86_64-pc-windows-msvc",
 ]

+[final-excludes]
+# vm_monitor benefits from the same Cargo.lock as the rest of our artifacts, but
+# it is built primarily in a separate repo neondatabase/autoscaling and thus is excluded
+# from depending on workspace-hack because most of the dependencies are not used.
+workspace-members = ["vm_monitor"]
+
 # Write out exact versions rather than a semver range. (Defaults to false.)
 # exact-versions = true
--- a/.dockerignore
+++ b/.dockerignore
@@ -14,10 +14,12 @@
 !pgxn/
 !proxy/
 !safekeeper/
+!s3_scrubber/
 !storage_broker/
 !trace/
 !vendor/postgres-v14/
 !vendor/postgres-v15/
+!vendor/postgres-v16/
 !workspace_hack/
 !neon_local/
 !scripts/ninstall.sh
--- a/.github/ISSUE_TEMPLATE/epic-template.md
+++ b/.github/ISSUE_TEMPLATE/epic-template.md
@@ -17,8 +17,9 @@ assignees: ''
 ## Implementation ideas


-## Tasks
- [ ]
+```[tasklist]
+### Tasks
+```


 ## Other related tasks and Epics
--- a/.github/PULL_REQUEST_TEMPLATE/release-pr.md
+++ b/.github/PULL_REQUEST_TEMPLATE/release-pr.md
@@ -3,7 +3,7 @@
 **NB: this PR must be merged only by 'Create a merge commit'!**

 ### Checklist when preparing for release
- [ ] Read or refresh [the release flow guide](https://github.com/neondatabase/cloud/wiki/Release:-general-flow)
+- [ ] Read or refresh [the release flow guide](https://www.notion.so/neondatabase/Release-general-flow-61f2e39fd45d4d14a70c7749604bd70b)
 - [ ] Ask in the [cloud Slack channel](https://neondb.slack.com/archives/C033A2WE6BZ) that you are going to rollout the release. Any blockers?
 - [ ] Does this release contain any db migrations? Destructive ones? What is the rollback plan?

--- a/.github/actionlint.yml
+++ b/.github/actionlint.yml
@@ -0,0 +1,12 @@
+self-hosted-runner:
+  labels:
+    - arm64
+    - dev
+    - gen3
+    - large
+    - small
+    - us-east-2
+config-variables:
+  - REMOTE_STORAGE_AZURE_CONTAINER
+  - REMOTE_STORAGE_AZURE_REGION
+  - SLACK_UPCOMING_RELEASE_CHANNEL_ID
--- a/.github/actions/allure-report-generate/action.yml
+++ b/.github/actions/allure-report-generate/action.yml
@@ -76,8 +76,8 @@ runs:
          rm -f ${ALLURE_ZIP}
        fi
      env:
-        ALLURE_VERSION: 2.23.1
-        ALLURE_ZIP_SHA256: 11141bfe727504b3fd80c0f9801eb317407fd0ac983ebb57e671f14bac4bcd86
+        ALLURE_VERSION: 2.24.0
+        ALLURE_ZIP_SHA256: 60b1d6ce65d9ef24b23cf9c2c19fd736a123487c38e54759f1ed1a7a77353c90

    # Potentially we could have several running build for the same key (for example, for the main branch), so we use improvised lock for this
    - name: Acquire lock
@@ -203,6 +203,10 @@ runs:
        COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
        BASE_S3_URL: ${{ steps.generate-report.outputs.base-s3-url }}
      run: |
+        if [ ! -d "${WORKDIR}/report/data/test-cases" ]; then
+          exit 0
+        fi
+
        export DATABASE_URL=${REGRESS_TEST_RESULT_CONNSTR_NEW}

        ./scripts/pysync
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -70,6 +70,9 @@ runs:
        name: compatibility-snapshot-${{ inputs.build_type }}-pg${{ inputs.pg_version }}
        path: /tmp/compatibility_snapshot_pg${{ inputs.pg_version }}
        prefix: latest
+        # The lack of compatibility snapshot (for example, for the new Postgres version)
+        # shouldn't fail the whole job. Only relevant test should fail.
+        skip-if-does-not-exist: true

    - name: Checkout
      if: inputs.needs_postgres_source == 'true'
--- a/.github/workflows/actionlint.yml
+++ b/.github/workflows/actionlint.yml
@@ -0,0 +1,31 @@
+name: Lint GitHub Workflows
+
+on:
+  push:
+    branches:
+      - main
+      - release
+    paths:
+      - '.github/workflows/*.ya?ml'
+  pull_request:
+    paths:
+      - '.github/workflows/*.ya?ml'
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: ${{ github.event_name == 'pull_request' }}
+
+jobs:
+  actionlint:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: reviewdog/action-actionlint@v1
+        env:
+          # SC2046 - Quote this to prevent word splitting. - https://www.shellcheck.net/wiki/SC2046
+          # SC2086 - Double quote to prevent globbing and word splitting. - https://www.shellcheck.net/wiki/SC2086
+          SHELLCHECK_OPTS: --exclude=SC2046,SC2086
+        with:
+          fail_on_error: true
+          filter_mode: nofilter
+          level: error
--- a/.github/workflows/approved-for-ci-run.yml
+++ b/.github/workflows/approved-for-ci-run.yml
@@ -2,7 +2,9 @@ name: Handle `approved-for-ci-run` label
 # This workflow helps to run CI pipeline for PRs made by external contributors (from forks).

 on:
-  pull_request:
+  pull_request_target:
+    branches:
+      - main
    types:
      # Default types that triggers a workflow ([1]):
      # - [1] https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#pull_request
@@ -14,42 +16,103 @@ on:
      # Actual magic happens here:
      - labeled

+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number }}
+
 env:
  GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
  PR_NUMBER: ${{ github.event.pull_request.number }}
+  BRANCH: "ci-run/pr-${{ github.event.pull_request.number }}"
+
+# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
+permissions: {}
+
+defaults:
+  run:
+    shell: bash -euo pipefail {0}

 jobs:
  remove-label:
    # Remove `approved-for-ci-run` label if the workflow is triggered by changes in a PR.
    # The PR should be reviewed and labelled manually again.

-    runs-on: [ ubuntu-latest ]
+    permissions:
+      pull-requests: write # For `gh pr edit`

    if: |
      contains(fromJSON('["opened", "synchronize", "reopened", "closed"]'), github.event.action) &&
      contains(github.event.pull_request.labels.*.name, 'approved-for-ci-run')

+    runs-on: ubuntu-latest
+
    steps:
      - run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run"

-  create-branch:
-    # Create a local branch for an `approved-for-ci-run` labelled PR to run CI pipeline in it.
+  create-or-update-pr-for-ci-run:
+    # Create local PR for an `approved-for-ci-run` labelled PR to run CI pipeline in it.

-    runs-on: [ ubuntu-latest ]
+    permissions:
+      pull-requests: write # for `gh pr edit`
+      # For `git push` and `gh pr create` we use CI_ACCESS_TOKEN

    if: |
      github.event.action == 'labeled' &&
      contains(github.event.pull_request.labels.*.name, 'approved-for-ci-run')

+    runs-on: ubuntu-latest
+
    steps:
      - run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run"

      - uses: actions/checkout@v3
        with:
          ref: main
+          token: ${{ secrets.CI_ACCESS_TOKEN }}

      - run: gh pr checkout "${PR_NUMBER}"

-      - run: git checkout -b "ci-run/pr-${PR_NUMBER}"
+      - run: git checkout -b "${BRANCH}"

-      - run: git push --force origin "ci-run/pr-${PR_NUMBER}"
+      - run: git push --force origin "${BRANCH}"
+
+      - name: Create a Pull Request for CI run (if required)
+        env:
+          GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
+        run: |
+          cat << EOF > body.md
+            This Pull Request is created automatically to run the CI pipeline for #${PR_NUMBER}
+
+            Please do not alter or merge/close it.
+
+            Feel free to review/comment/discuss the original PR #${PR_NUMBER}.
+          EOF
+
+          ALREADY_CREATED="$(gh pr --repo ${GITHUB_REPOSITORY} list --head ${BRANCH} --base main --json number --jq '.[].number')"
+          if [ -z "${ALREADY_CREATED}" ]; then
+            gh pr --repo "${GITHUB_REPOSITORY}" create --title "CI run for PR #${PR_NUMBER}" \
+                                                       --body-file "body.md" \
+                                                       --head "${BRANCH}" \
+                                                       --base "main" \
+                                                       --draft
+          fi
+
+  cleanup:
+    # Close PRs and delete branches if the original PR is closed.
+
+    permissions:
+      contents: write # for `--delete-branch` flag in `gh pr close`
+      pull-requests: write # for `gh pr close`
+
+    if: |
+      github.event.action == 'closed' &&
+      github.event.pull_request.head.repo.full_name != github.repository
+
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Close PR and delete `ci-run/pr-${{ env.PR_NUMBER }}` branch
+        run: |
+          CLOSED="$(gh pr --repo ${GITHUB_REPOSITORY} list --head ${BRANCH} --json 'closed' --jq '.[].closed')"
+          if [ "${CLOSED}" == "false" ]; then
+            gh pr --repo "${GITHUB_REPOSITORY}" close "${BRANCH}" --delete-branch
+          fi
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -117,6 +117,7 @@ jobs:
    outputs:
      pgbench-compare-matrix: ${{ steps.pgbench-compare-matrix.outputs.matrix }}
      olap-compare-matrix: ${{ steps.olap-compare-matrix.outputs.matrix }}
+      tpch-compare-matrix: ${{ steps.tpch-compare-matrix.outputs.matrix }}

    steps:
    - name: Generate matrix for pgbench benchmark
@@ -136,11 +137,11 @@ jobs:
        }'

        if [ "$(date +%A)" = "Saturday" ]; then
-          matrix=$(echo $matrix | jq '.include += [{ "platform": "rds-postgres", "db_size": "10gb"},
+          matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "db_size": "10gb"},
                                                   { "platform": "rds-aurora",   "db_size": "50gb"}]')
        fi

-        echo "matrix=$(echo $matrix | jq --compact-output '.')" >> $GITHUB_OUTPUT
+        echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT

    - name: Generate matrix for OLAP benchmarks
      id: olap-compare-matrix
@@ -152,11 +153,30 @@ jobs:
        }'

        if [ "$(date +%A)" = "Saturday" ]; then
-          matrix=$(echo $matrix | jq '.include += [{ "platform": "rds-postgres" },
+          matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres" },
                                                   { "platform": "rds-aurora"   }]')
        fi

-        echo "matrix=$(echo $matrix | jq --compact-output '.')" >> $GITHUB_OUTPUT
+        echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
+
+    - name: Generate matrix for TPC-H benchmarks
+      id: tpch-compare-matrix
+      run: |
+        matrix='{
+          "platform": [
+            "neon-captest-reuse"
+          ],
+          "scale": [
+            "10"
+          ]
+        }'
+
+        if [ "$(date +%A)" = "Saturday" ]; then
+          matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "scale": "10" },
+                                                   { "platform": "rds-aurora",   "scale": "10" }]')
+        fi
+
+        echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT

  pgbench-compare:
    needs: [ generate-matrices ]
@@ -233,7 +253,11 @@ jobs:

        echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

-        psql ${CONNSTR} -c "SELECT version();"
+        QUERY="SELECT version();"
+        if [[ "${PLATFORM}" = "neon"* ]]; then
+          QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
+        fi
+        psql ${CONNSTR} -c "${QUERY}"

    - name: Benchmark init
      uses: ./.github/actions/run-python-test-set
@@ -358,7 +382,11 @@ jobs:

        echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

-        psql ${CONNSTR} -c "SELECT version();"
+        QUERY="SELECT version();"
+        if [[ "${PLATFORM}" = "neon"* ]]; then
+          QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
+        fi
+        psql ${CONNSTR} -c "${QUERY}"

    - name: ClickBench benchmark
      uses: ./.github/actions/run-python-test-set
@@ -372,6 +400,7 @@ jobs:
        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
        BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
+        TEST_OLAP_SCALE: 10

    - name: Create Allure report
      if: ${{ !cancelled() }}
@@ -398,7 +427,7 @@ jobs:

    strategy:
      fail-fast: false
-      matrix: ${{ fromJson(needs.generate-matrices.outputs.olap-compare-matrix) }}
+      matrix: ${{ fromJson(needs.generate-matrices.outputs.tpch-compare-matrix) }}

    env:
      POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
@@ -407,6 +436,7 @@ jobs:
      BUILD_TYPE: remote
      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
      PLATFORM: ${{ matrix.platform }}
+      TEST_OLAP_SCALE: ${{ matrix.scale }}

    runs-on: [ self-hosted, us-east-2, x64 ]
    container:
@@ -428,18 +458,17 @@ jobs:
        ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
        echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH

-    - name: Set up Connection String
-      id: set-up-connstr
+    - name: Get Connstring Secret Name
      run: |
        case "${PLATFORM}" in
          neon-captest-reuse)
-            CONNSTR=${{ secrets.BENCHMARK_CAPTEST_TPCH_S10_CONNSTR }}
+            ENV_PLATFORM=CAPTEST_TPCH
            ;;
          rds-aurora)
-            CONNSTR=${{ secrets.BENCHMARK_RDS_AURORA_TPCH_S10_CONNSTR }}
+            ENV_PLATFORM=RDS_AURORA_TPCH
            ;;
          rds-postgres)
-            CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_TPCH_S10_CONNSTR }}
+            ENV_PLATFORM=RDS_AURORA_TPCH
            ;;
          *)
            echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'"
@@ -447,9 +476,21 @@ jobs:
            ;;
        esac

+        CONNSTR_SECRET_NAME="BENCHMARK_${ENV_PLATFORM}_S${TEST_OLAP_SCALE}_CONNSTR"
+        echo "CONNSTR_SECRET_NAME=${CONNSTR_SECRET_NAME}" >> $GITHUB_ENV
+
+    - name: Set up Connection String
+      id: set-up-connstr
+      run: |
+        CONNSTR=${{ secrets[env.CONNSTR_SECRET_NAME] }}
+
        echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

-        psql ${CONNSTR} -c "SELECT version();"
+        QUERY="SELECT version();"
+        if [[ "${PLATFORM}" = "neon"* ]]; then
+          QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
+        fi
+        psql ${CONNSTR} -c "${QUERY}"

    - name: Run TPC-H benchmark
      uses: ./.github/actions/run-python-test-set
@@ -463,6 +504,7 @@ jobs:
        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
        BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
+        TEST_OLAP_SCALE: ${{ matrix.scale }}

    - name: Create Allure report
      if: ${{ !cancelled() }}
@@ -534,7 +576,11 @@ jobs:

        echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

-        psql ${CONNSTR} -c "SELECT version();"
+        QUERY="SELECT version();"
+        if [[ "${PLATFORM}" = "neon"* ]]; then
+          QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
+        fi
+        psql ${CONNSTR} -c "${QUERY}"

    - name: Run user examples
      uses: ./.github/actions/run-python-test-set
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -5,7 +5,6 @@ on:
    branches:
      - main
      - release
-      - ci-run/pr-*
  pull_request:

 defaults:
@@ -24,7 +23,30 @@ env:
  AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}

 jobs:
+  check-permissions:
+    runs-on: ubuntu-latest
+
+    steps:
+    - name: Disallow PRs from forks
+      if: |
+        github.event_name == 'pull_request' &&
+        github.event.pull_request.head.repo.full_name != github.repository
+
+      run: |
+        if [ "${{ contains(fromJSON('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event.pull_request.author_association) }}" = "true" ]; then
+          MESSAGE="Please create a PR from a branch of ${GITHUB_REPOSITORY} instead of a fork"
+        else
+          MESSAGE="The PR should be reviewed and labelled with 'approved-for-ci-run' to trigger a CI run"
+        fi
+
+        echo >&2 "We don't run CI for PRs from forks"
+        echo >&2 "${MESSAGE}"
+
+        exit 1
+
+
  tag:
+    needs: [ check-permissions ]
    runs-on: [ self-hosted, gen3, small ]
    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
    outputs:
@@ -53,6 +75,7 @@ jobs:
        id: build-tag

  check-codestyle-python:
+    needs: [ check-permissions ]
    runs-on: [ self-hosted, gen3, small ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
@@ -85,6 +108,7 @@ jobs:
        run: poetry run mypy .

  check-codestyle-rust:
+    needs: [ check-permissions ]
    runs-on: [ self-hosted, gen3, large ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
@@ -148,9 +172,10 @@ jobs:
      # https://github.com/EmbarkStudios/cargo-deny
      - name: Check rust licenses/bans/advisories/sources
        if: ${{ !cancelled() }}
-        run: cargo deny check
+        run: cargo deny check --hide-inclusion-graph

  build-neon:
+    needs: [ check-permissions, tag ]
    runs-on: [ self-hosted, gen3, large ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
@@ -162,6 +187,7 @@ jobs:
    env:
      BUILD_TYPE: ${{ matrix.build_type }}
      GIT_VERSION: ${{ github.event.pull_request.head.sha || github.sha }}
+      BUILD_TAG: ${{ needs.tag.outputs.build-tag }}

    steps:
      - name: Fix git ownership
@@ -187,7 +213,7 @@ jobs:
          # Eventually it will be replaced by a regression test https://github.com/neondatabase/neon/pull/4603

          FAILED=false
-          for postgres in postgres-v14 postgres-v15; do
+          for postgres in postgres-v14 postgres-v15 postgres-v16; do
            expected=$(cat vendor/revisions.json | jq --raw-output '."'"${postgres}"'"')
            actual=$(git rev-parse "HEAD:vendor/${postgres}")
            if [ "${expected}" != "${actual}" ]; then
@@ -209,6 +235,10 @@ jobs:
        id: pg_v15_rev
        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT

+      - name: Set pg 16 revision for caching
+        id: pg_v16_rev
+        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT
+
      # Set some environment variables used by all the steps.
      #
      # CARGO_FLAGS is extra options to pass to "cargo build", "cargo test" etc.
@@ -229,10 +259,12 @@ jobs:
            cov_prefix=""
            CARGO_FLAGS="--locked --release"
          fi
-          echo "cov_prefix=${cov_prefix}" >> $GITHUB_ENV
-          echo "CARGO_FEATURES=${CARGO_FEATURES}" >> $GITHUB_ENV
-          echo "CARGO_FLAGS=${CARGO_FLAGS}" >> $GITHUB_ENV
-          echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo" >> $GITHUB_ENV
+          {
+            echo "cov_prefix=${cov_prefix}"
+            echo "CARGO_FEATURES=${CARGO_FEATURES}"
+            echo "CARGO_FLAGS=${CARGO_FLAGS}"
+            echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo"
+          } >> $GITHUB_ENV

      # Disabled for now
      # Don't include the ~/.cargo/registry/src directory. It contains just
@@ -267,6 +299,13 @@ jobs:
          path: pg_install/v15
          key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}

+      - name: Cache postgres v16 build
+        id: cache_pg_16
+        uses: actions/cache@v3
+        with:
+          path: pg_install/v16
+          key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
+
      - name: Build postgres v14
        if: steps.cache_pg_14.outputs.cache-hit != 'true'
        run: mold -run make postgres-v14 -j$(nproc)
@@ -275,9 +314,16 @@ jobs:
        if: steps.cache_pg_15.outputs.cache-hit != 'true'
        run: mold -run make postgres-v15 -j$(nproc)

+      - name: Build postgres v16
+        if: steps.cache_pg_16.outputs.cache-hit != 'true'
+        run: mold -run make postgres-v16 -j$(nproc)
+
      - name: Build neon extensions
        run: mold -run make neon-pg-ext -j$(nproc)

+      - name: Build walproposer-lib
+        run: mold -run make walproposer-lib -j$(nproc)
+
      - name: Run cargo build
        run: |
          ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests
@@ -293,6 +339,16 @@ jobs:
          # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
          ${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3

+          # Run separate tests for real Azure Blob Storage
+          # XXX: replace region with `eu-central-1`-like region
+          export ENABLE_REAL_AZURE_REMOTE_STORAGE=y
+          export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}"
+          export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}"
+          export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
+          export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
+          # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
+          ${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test test_real_azure
+
      - name: Install rust binaries
        run: |
          # Install target binaries
@@ -348,17 +404,17 @@ jobs:
        uses: ./.github/actions/save-coverage-data

  regress-tests:
+    needs: [ check-permissions, build-neon, tag ]
    runs-on: [ self-hosted, gen3, large ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      # Default shared memory is 64mb
      options: --init --shm-size=512mb
-    needs: [ build-neon ]
    strategy:
      fail-fast: false
      matrix:
        build_type: [ debug, release ]
-        pg_version: [ v14, v15 ]
+        pg_version: [ v14, v15, v16 ]
    steps:
      - name: Checkout
        uses: actions/checkout@v3
@@ -378,20 +434,21 @@ jobs:
          rerun_flaky: true
          pg_version: ${{ matrix.pg_version }}
        env:
-          TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}
+          TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
          CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
+          BUILD_TAG: ${{ needs.tag.outputs.build-tag }}

      - name: Merge and upload coverage data
        if: matrix.build_type == 'debug' && matrix.pg_version == 'v14'
        uses: ./.github/actions/save-coverage-data

  benchmarks:
+    needs: [ check-permissions, build-neon ]
    runs-on: [ self-hosted, gen3, small ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      # Default shared memory is 64mb
      options: --init --shm-size=512mb
-    needs: [ build-neon ]
    if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
    strategy:
      fail-fast: false
@@ -413,17 +470,18 @@ jobs:
        env:
          VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
          PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
-          TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}"
+          TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}"
      # XXX: no coverage data handling here, since benchmarks are run on release builds,
      # while coverage is currently collected for the debug ones

  create-test-report:
+    needs: [ check-permissions, regress-tests, coverage-report, benchmarks ]
+    if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }}
+
    runs-on: [ self-hosted, gen3, small ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init
-    needs: [ regress-tests, benchmarks ]
-    if: ${{ !cancelled() }}

    steps:
      - uses: actions/checkout@v3
@@ -449,42 +507,40 @@ jobs:
              reportJsonUrl: "${{ steps.create-allure-report.outputs.report-json-url }}",
            }

+            const coverage = {
+              coverageUrl: "${{ needs.coverage-report.outputs.coverage-html }}",
+              summaryJsonUrl: "${{ needs.coverage-report.outputs.coverage-json }}",
+            }
+
            const script = require("./scripts/comment-test-report.js")
            await script({
              github,
              context,
              fetch,
              report,
+              coverage,
            })

  coverage-report:
+    needs: [ check-permissions, regress-tests ]
+
    runs-on: [ self-hosted, gen3, small ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init
-    needs: [ regress-tests ]
    strategy:
      fail-fast: false
      matrix:
        build_type: [ debug ]
+    outputs:
+        coverage-html: ${{ steps.upload-coverage-report-new.outputs.report-url }}
+        coverage-json: ${{ steps.upload-coverage-report-new.outputs.summary-json }}
    steps:
      - name: Checkout
        uses: actions/checkout@v3
        with:
          submodules: true
-          fetch-depth: 1
-
-#      Disabled for now
-#      - name: Restore cargo deps cache
-#        id: cache_cargo
-#        uses: actions/cache@v3
-#        with:
-#          path: |
-#            ~/.cargo/registry/
-#            !~/.cargo/registry/src
-#            ~/.cargo/git/
-#            target/
-#          key: v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }}
+          fetch-depth: 0

      - name: Get Neon artifact
        uses: ./.github/actions/download
@@ -527,13 +583,48 @@ jobs:
          REPORT_URL=https://${BUCKET}.s3.amazonaws.com/code-coverage/${COMMIT_SHA}/index.html
          echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT

+      - name: Build coverage report NEW
+        id: upload-coverage-report-new
+        env:
+          BUCKET: neon-github-public-dev
+          # A differential coverage report is available only for PRs.
+          # (i.e. for pushes into main/release branches we have a regular coverage report)
+          COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
+          BASE_SHA: ${{ github.event.pull_request.base.sha || github.sha }}
+        run: |
+          CURRENT="${COMMIT_SHA}"
+          BASELINE="$(git merge-base $BASE_SHA $CURRENT)"
+
+          cp /tmp/coverage/report/lcov.info ./${CURRENT}.info
+
+          GENHTML_ARGS="--ignore-errors path,unmapped,empty --synthesize-missing --demangle-cpp rustfilt --output-directory lcov-html ${CURRENT}.info"
+
+          # Use differential coverage if the baseline coverage exists.
+          # It can be missing if the coverage report wasn't uploaded yet or tests have failed on BASELINE commit.
+          if aws s3 cp --only-show-errors s3://${BUCKET}/code-coverage/${BASELINE}/lcov.info ./${BASELINE}.info; then
+            git diff ${BASELINE} ${CURRENT} -- '*.rs' > baseline-current.diff
+
+            GENHTML_ARGS="--baseline-file ${BASELINE}.info --diff-file baseline-current.diff ${GENHTML_ARGS}"
+          fi
+
+          genhtml ${GENHTML_ARGS}
+
+          aws s3 cp --only-show-errors --recursive ./lcov-html/ s3://${BUCKET}/code-coverage/${COMMIT_SHA}/lcov
+
+          REPORT_URL=https://${BUCKET}.s3.amazonaws.com/code-coverage/${COMMIT_SHA}/lcov/index.html
+          echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT
+
+          REPORT_URL=https://${BUCKET}.s3.amazonaws.com/code-coverage/${COMMIT_SHA}/lcov/summary.json
+          echo "summary-json=${REPORT_URL}" >> $GITHUB_OUTPUT
+
      - uses: actions/github-script@v6
        env:
          REPORT_URL: ${{ steps.upload-coverage-report.outputs.report-url }}
+          REPORT_URL_NEW: ${{ steps.upload-coverage-report-new.outputs.report-url }}
          COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
        with:
          script: |
-            const { REPORT_URL, COMMIT_SHA } = process.env
+            const { REPORT_URL, REPORT_URL_NEW, COMMIT_SHA } = process.env

            await github.rest.repos.createCommitStatus({
              owner: context.repo.owner,
@@ -544,12 +635,21 @@ jobs:
              context: 'Code coverage report',
            })

+            await github.rest.repos.createCommitStatus({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              sha: `${COMMIT_SHA}`,
+              state: 'success',
+              target_url: `${REPORT_URL_NEW}`,
+              context: 'Code coverage report NEW',
+            })
+
  trigger-e2e-tests:
+    needs: [ check-permissions, promote-images, tag ]
    runs-on: [ self-hosted, gen3, small ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
      options: --init
-    needs: [ promote-images, tag ]
    steps:
      - name: Set PR's status to pending and request a remote CI test
        run: |
@@ -590,8 +690,8 @@ jobs:
            }"

  neon-image:
+    needs: [ check-permissions, tag ]
    runs-on: [ self-hosted, gen3, large ]
-    needs: [ tag ]
    container: gcr.io/kaniko-project/executor:v1.9.2-debug
    defaults:
      run:
@@ -628,6 +728,7 @@ jobs:
                           --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
                           --context .
                           --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
+                           --build-arg BUILD_TAG=${{ needs.tag.outputs.build-tag }}
                           --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
                           --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
                           --destination neondatabase/neon:${{needs.tag.outputs.build-tag}}
@@ -638,7 +739,7 @@ jobs:

  compute-tools-image:
    runs-on: [ self-hosted, gen3, large ]
-    needs: [ tag ]
+    needs: [ check-permissions, tag ]
    container: gcr.io/kaniko-project/executor:v1.9.2-debug
    defaults:
      run:
@@ -683,17 +784,17 @@ jobs:
        run: rm -rf ~/.ecr

  compute-node-image:
+    needs: [ check-permissions, tag ]
    runs-on: [ self-hosted, gen3, large ]
    container:
      image: gcr.io/kaniko-project/executor:v1.9.2-debug
      # Workaround for "Resolving download.osgeo.org (download.osgeo.org)... failed: Temporary failure in name resolution.""
      # Should be prevented by https://github.com/neondatabase/neon/issues/4281
      options: --add-host=download.osgeo.org:140.211.15.30
-    needs: [ tag ]
    strategy:
      fail-fast: false
      matrix:
-        version: [ v14, v15 ]
+        version: [ v14, v15, v16 ]
    defaults:
      run:
        shell: sh -eu {0}
@@ -742,17 +843,17 @@ jobs:
        run: rm -rf ~/.ecr

  vm-compute-node-image:
+    needs: [ check-permissions, tag, compute-node-image ]
    runs-on: [ self-hosted, gen3, large ]
-    needs: [ tag, compute-node-image ]
    strategy:
      fail-fast: false
      matrix:
-        version: [ v14, v15 ]
+        version: [ v14, v15, v16 ]
    defaults:
      run:
        shell: sh -eu {0}
    env:
-      VM_BUILDER_VERSION: v0.17.5
+      VM_BUILDER_VERSION: v0.19.0

    steps:
      - name: Checkout
@@ -774,8 +875,7 @@ jobs:
      - name: Build vm image
        run: |
          ./vm-builder \
-            -enable-file-cache \
-            -cgroup-uid=postgres \
+            -spec=vm-image-spec.yaml \
            -src=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \
            -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}

@@ -784,7 +884,7 @@ jobs:
          docker push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}

  test-images:
-    needs: [ tag, neon-image, compute-node-image, compute-tools-image ]
+    needs: [ check-permissions, tag, neon-image, compute-node-image, compute-tools-image ]
    runs-on: [ self-hosted, gen3, small ]

    steps:
@@ -827,8 +927,8 @@ jobs:
          docker compose -f ./docker-compose/docker-compose.yml down

  promote-images:
+    needs: [ check-permissions, tag, test-images, vm-compute-node-image ]
    runs-on: [ self-hosted, gen3, small ]
-    needs: [ tag, test-images, vm-compute-node-image ]
    container: golang:1.19-bullseye
    # Don't add if-condition here.
    # The job should always be run because we have dependant other jobs that shouldn't be skipped
@@ -848,6 +948,7 @@ jobs:
        run: |
          crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} vm-compute-node-v14
          crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} vm-compute-node-v15
+          crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} vm-compute-node-v16

      - name: Add latest tag to images
        if: |
@@ -860,6 +961,8 @@ jobs:
          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
+          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:${{needs.tag.outputs.build-tag}} latest
+          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} latest

      - name: Push images to production ECR
        if: |
@@ -872,6 +975,8 @@ jobs:
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:latest
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:latest
+          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:latest
+          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:latest

      - name: Configure Docker Hub login
        run: |
@@ -883,6 +988,7 @@ jobs:
        run: |
          crane push vm-compute-node-v14 neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}}
          crane push vm-compute-node-v15 neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}}
+          crane push vm-compute-node-v16 neondatabase/vm-compute-node-v16:${{needs.tag.outputs.build-tag}}

      - name: Push latest tags to Docker Hub
        if: |
@@ -895,21 +1001,19 @@ jobs:
          crane tag neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
          crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
          crane tag neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
+          crane tag neondatabase/compute-node-v16:${{needs.tag.outputs.build-tag}} latest
+          crane tag neondatabase/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} latest

      - name: Cleanup ECR folder
        run: rm -rf ~/.ecr

-  build-private-extensions:
-    runs-on: [ self-hosted, gen3, small ]
-    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
-      options: --init
-    needs: [ tag ]
+  trigger-custom-extensions-build-and-wait:
+    needs: [ check-permissions, tag ]
+    runs-on: ubuntu-latest
    steps:
      - name: Set PR's status to pending and request a remote CI test
        run: |
-          COMMIT_SHA=${{ github.event.pull_request.head.sha }}
-          COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}}
+          COMMIT_SHA=${{ github.event.pull_request.head.sha || github.sha }}
          REMOTE_REPO="${{ github.repository_owner }}/build-custom-extensions"

          curl -f -X POST \
@@ -939,11 +1043,50 @@ jobs:
              }
            }"

+      - name: Wait for extension build to finish
+        env:
+          GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
+        run: |
+          TIMEOUT=1800 # 30 minutes, usually it takes ~2-3 minutes, but if runners are busy, it might take longer
+          INTERVAL=15 # try each N seconds
+
+          last_status="" # a variable to carry the last status of the "build-and-upload-extensions" context
+
+          for ((i=0; i <= TIMEOUT; i+=INTERVAL)); do
+            sleep $INTERVAL
+
+            # Get statuses for the latest commit in the PR / branch
+            gh api \
+              -H "Accept: application/vnd.github+json" \
+              -H "X-GitHub-Api-Version: 2022-11-28" \
+              "/repos/${{ github.repository }}/statuses/${{ github.event.pull_request.head.sha || github.sha }}" > statuses.json
+
+            # Get the latest status for the "build-and-upload-extensions" context
+            last_status=$(jq --raw-output '[.[] | select(.context == "build-and-upload-extensions")] | sort_by(.created_at)[-1].state' statuses.json)
+            if [ "${last_status}" = "pending" ]; then
+              # Extension build is still in progress.
+              continue
+            elif [ "${last_status}" = "success" ]; then
+              # Extension build is successful.
+              exit 0
+            else
+              # Status is neither "pending" nor "success", exit the loop and fail the job.
+              break
+            fi
+          done
+
+          # Extension build failed or timed out, print `statuses.json` for debugging and fail the job.
+          jq '.' statuses.json
+
+          echo >&2 "Status of extension build is '${last_status}' != 'success'"
+          exit 1
+
  deploy:
+    needs: [ check-permissions, promote-images, tag, regress-tests, trigger-custom-extensions-build-and-wait ]
+    if: ( github.ref_name == 'main' || github.ref_name == 'release' ) && github.event_name != 'workflow_dispatch'
+
    runs-on: [ self-hosted, gen3, small ]
    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
-    needs: [ promote-images, tag, regress-tests ]
-    if: ( github.ref_name == 'main' || github.ref_name == 'release' ) && github.event_name != 'workflow_dispatch'
    steps:
      - name: Fix git ownership
        run: |
@@ -966,7 +1109,10 @@ jobs:
          GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
        run: |
          if [[ "$GITHUB_REF_NAME" == "main" ]]; then
-            gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}}
+            gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false
+
+            # TODO: move deployPreprodRegion to release (`"$GITHUB_REF_NAME" == "release"` block), once Staging supports different compute tag prefixes for different regions
+            gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=true
          elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
            gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f disclamerAcknowledged=true
          else
@@ -981,20 +1127,35 @@ jobs:
          # Retry script for 5XX server errors: https://github.com/actions/github-script#retries
          retries: 5
          script: |
-            github.rest.git.createRef({
+            await github.rest.git.createRef({
              owner: context.repo.owner,
              repo: context.repo.repo,
              ref: "refs/tags/${{ needs.tag.outputs.build-tag }}",
              sha: context.sha,
            })

+      - name: Create GitHub release
+        if: github.ref_name == 'release'
+        uses: actions/github-script@v6
+        with:
+          # Retry script for 5XX server errors: https://github.com/actions/github-script#retries
+          retries: 5
+          script: |
+            await github.rest.repos.createRelease({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              tag_name: "${{ needs.tag.outputs.build-tag }}",
+              generate_release_notes: true,
+            })
+
  promote-compatibility-data:
+    needs: [ check-permissions, promote-images, tag, regress-tests ]
+    if: github.ref_name == 'release'
+
    runs-on: [ self-hosted, gen3, small ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
      options: --init
-    needs: [ promote-images, tag, regress-tests ]
-    if: github.ref_name == 'release' && github.event_name != 'workflow_dispatch'
    steps:
      - name: Promote compatibility snapshot for the release
        env:
@@ -1002,7 +1163,7 @@ jobs:
          PREFIX: artifacts/latest
        run: |
          # Update compatibility snapshot for the release
-          for pg_version in v14 v15; do
+          for pg_version in v14 v15 v16; do
            for build_type in debug release; do
              OLD_FILENAME=compatibility-snapshot-${build_type}-pg${pg_version}-${GITHUB_RUN_ID}.tar.zst
              NEW_FILENAME=compatibility-snapshot-${build_type}-pg${pg_version}.tar.zst
--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -4,7 +4,6 @@ on:
  push:
    branches:
      - main
-      - ci-run/pr-*
  pull_request:

 defaults:
@@ -22,7 +21,10 @@ env:

 jobs:
  check-macos-build:
-    if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos')
+    if: |
+      contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos')  ||
+      contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
+      github.ref_name == 'main'
    timeout-minutes: 90
    runs-on: macos-latest

@@ -33,13 +35,13 @@ jobs:

    steps:
      - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
        with:
          submodules: true
          fetch-depth: 1

      - name: Install macOS postgres dependencies
-        run: brew install flex bison openssl protobuf
+        run: brew install flex bison openssl protobuf icu4c pkg-config

      - name: Set pg 14 revision for caching
        id: pg_v14_rev
@@ -49,6 +51,10 @@ jobs:
        id: pg_v15_rev
        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT

+      - name: Set pg 16 revision for caching
+        id: pg_v16_rev
+        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT
+
      - name: Cache postgres v14 build
        id: cache_pg_14
        uses: actions/cache@v3
@@ -63,6 +69,13 @@ jobs:
          path: pg_install/v15
          key: v1-${{ runner.os }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}

+      - name: Cache postgres v16 build
+        id: cache_pg_16
+        uses: actions/cache@v3
+        with:
+          path: pg_install/v16
+          key: v1-${{ runner.os }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
+
      - name: Set extra env for macOS
        run: |
          echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV
@@ -80,14 +93,21 @@ jobs:

      - name: Build postgres v14
        if: steps.cache_pg_14.outputs.cache-hit != 'true'
-        run: make postgres-v14 -j$(nproc)
+        run: make postgres-v14 -j$(sysctl -n hw.ncpu)

      - name: Build postgres v15
        if: steps.cache_pg_15.outputs.cache-hit != 'true'
-        run: make postgres-v15 -j$(nproc)
+        run: make postgres-v15 -j$(sysctl -n hw.ncpu)
+
+      - name: Build postgres v16
+        if: steps.cache_pg_16.outputs.cache-hit != 'true'
+        run: make postgres-v16 -j$(sysctl -n hw.ncpu)

      - name: Build neon extensions
-        run: make neon-pg-ext -j$(nproc)
+        run: make neon-pg-ext -j$(sysctl -n hw.ncpu)
+
+      - name: Build walproposer-lib
+        run: make walproposer-lib -j$(sysctl -n hw.ncpu)

      - name: Run cargo build
        run: cargo build --all --release
@@ -95,8 +115,182 @@ jobs:
      - name: Check that no warnings are produced
        run: ./run_clippy.sh

+  check-linux-arm-build:
+    timeout-minutes: 90
+    runs-on: [ self-hosted, dev, arm64 ]
+
+    env:
+      # Use release build only, to have less debug info around
+      # Hence keeping target/ (and general cache size) smaller
+      BUILD_TYPE: release
+      CARGO_FEATURES: --features testing
+      CARGO_FLAGS: --locked --release
+      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
+      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
+
+    container:
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
+      options: --init
+
+    steps:
+      - name: Fix git ownership
+        run: |
+          # Workaround for `fatal: detected dubious ownership in repository at ...`
+          #
+          # Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers
+          #   Ref https://github.com/actions/checkout/issues/785
+          #
+          git config --global --add safe.directory ${{ github.workspace }}
+          git config --global --add safe.directory ${GITHUB_WORKSPACE}
+
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          submodules: true
+          fetch-depth: 1
+
+      - name: Set pg 14 revision for caching
+        id: pg_v14_rev
+        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT
+
+      - name: Set pg 15 revision for caching
+        id: pg_v15_rev
+        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT
+
+      - name: Set pg 16 revision for caching
+        id: pg_v16_rev
+        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT
+
+      - name: Set env variables
+        run: |
+          echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo" >> $GITHUB_ENV
+
+      - name: Cache postgres v14 build
+        id: cache_pg_14
+        uses: actions/cache@v3
+        with:
+          path: pg_install/v14
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
+
+      - name: Cache postgres v15 build
+        id: cache_pg_15
+        uses: actions/cache@v3
+        with:
+          path: pg_install/v15
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
+
+      - name: Cache postgres v16 build
+        id: cache_pg_16
+        uses: actions/cache@v3
+        with:
+          path: pg_install/v16
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
+
+      - name: Build postgres v14
+        if: steps.cache_pg_14.outputs.cache-hit != 'true'
+        run: mold -run make postgres-v14 -j$(nproc)
+
+      - name: Build postgres v15
+        if: steps.cache_pg_15.outputs.cache-hit != 'true'
+        run: mold -run make postgres-v15 -j$(nproc)
+
+      - name: Build postgres v16
+        if: steps.cache_pg_16.outputs.cache-hit != 'true'
+        run: mold -run make postgres-v16 -j$(nproc)
+
+      - name: Build neon extensions
+        run: mold -run make neon-pg-ext -j$(nproc)
+
+      - name: Build walproposer-lib
+        run: mold -run make walproposer-lib -j$(nproc)
+
+      - name: Run cargo build
+        run: |
+          mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests
+
+      - name: Run cargo test
+        run: |
+          cargo test $CARGO_FLAGS $CARGO_FEATURES
+
+          # Run separate tests for real S3
+          export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
+          export REMOTE_STORAGE_S3_BUCKET=neon-github-public-dev
+          export REMOTE_STORAGE_S3_REGION=eu-central-1
+          # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
+          cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3
+
+          # Run separate tests for real Azure Blob Storage
+          # XXX: replace region with `eu-central-1`-like region
+          export ENABLE_REAL_AZURE_REMOTE_STORAGE=y
+          export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}"
+          export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}"
+          export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
+          export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
+          # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
+          cargo test $CARGO_FLAGS --package remote_storage --test test_real_azure
+
+  check-codestyle-rust-arm:
+    timeout-minutes: 90
+    runs-on: [ self-hosted, dev, arm64 ]
+
+    container:
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
+      options: --init
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          submodules: true
+          fetch-depth: 1
+
+      # Some of our rust modules use FFI and need those to be checked
+      - name: Get postgres headers
+        run: make postgres-headers -j$(nproc)
+
+      # cargo hack runs the given cargo subcommand (clippy in this case) for all feature combinations.
+      # This will catch compiler & clippy warnings in all feature combinations.
+      # TODO: use cargo hack for build and test as well, but, that's quite expensive.
+      # NB: keep clippy args in sync with ./run_clippy.sh
+      - run: |
+          CLIPPY_COMMON_ARGS="$( source .neon_clippy_args; echo "$CLIPPY_COMMON_ARGS")"
+          if [ "$CLIPPY_COMMON_ARGS" = "" ]; then
+            echo "No clippy args found in .neon_clippy_args"
+            exit 1
+          fi
+          echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV
+      - name: Run cargo clippy (debug)
+        run: cargo hack --feature-powerset clippy $CLIPPY_COMMON_ARGS
+      - name: Run cargo clippy (release)
+        run: cargo hack --feature-powerset clippy --release $CLIPPY_COMMON_ARGS
+
+      - name: Check documentation generation
+        run: cargo doc --workspace --no-deps --document-private-items
+        env:
+            RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links"
+
+      # Use `${{ !cancelled() }}` to run quick tests after the longer clippy run
+      - name: Check formatting
+        if: ${{ !cancelled() }}
+        run: cargo fmt --all -- --check
+
+      # https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci
+      - name: Check rust dependencies
+        if: ${{ !cancelled() }}
+        run: |
+          cargo hakari generate --diff  # workspace-hack Cargo.toml is up-to-date
+          cargo hakari manage-deps --dry-run  # all workspace crates depend on workspace-hack
+
+      # https://github.com/EmbarkStudios/cargo-deny
+      - name: Check rust licenses/bans/advisories/sources
+        if: ${{ !cancelled() }}
+        run: cargo deny check
+
  gather-rust-build-stats:
-    if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-extra-build-stats')
+    if: |
+      contains(github.event.pull_request.labels.*.name, 'run-extra-build-stats') ||
+      contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
+      github.ref_name == 'main'
    runs-on: [ self-hosted, gen3, large ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
@@ -112,7 +306,7 @@ jobs:

    steps:
      - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
        with:
          submodules: true
          fetch-depth: 1
@@ -121,6 +315,9 @@ jobs:
      - name: Get postgres headers
        run: make postgres-headers -j$(nproc)

+      - name: Build walproposer-lib
+        run: make walproposer-lib -j$(nproc)
+
      - name: Produce the build stats
        run: cargo build --all --release --timings

--- a/.github/workflows/release-notify.yml
+++ b/.github/workflows/release-notify.yml
@@ -0,0 +1,29 @@
+name: Notify Slack channel about upcoming release
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.number }}
+  cancel-in-progress: true
+
+on:
+  pull_request:
+    branches:
+      - release
+    types:
+      # Default types that trigger a workflow:
+      # - https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#pull_request
+      - opened
+      - synchronize
+      - reopened
+      # Additional types that we want to handle:
+      - closed
+
+jobs:
+  notify:
+    runs-on: [ ubuntu-latest ]
+
+    steps:
+      - uses: neondatabase/dev-actions/release-pr-notify@main
+        with:
+          slack-token: ${{ secrets.SLACK_BOT_TOKEN }}
+          slack-channel-id: ${{ vars.SLACK_UPCOMING_RELEASE_CHANNEL_ID || 'C05QQ9J1BRC' }} # if not set, then `#test-release-notifications`
+          github-token: ${{ secrets.GITHUB_TOKEN }}
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -2,16 +2,19 @@ name: Create Release Branch

 on:
  schedule:
-    - cron: '0 10 * * 2'
+    - cron: '0 6 * * 1'
  workflow_dispatch:

 jobs:
  create_release_branch:
-    runs-on: [ubuntu-latest]
+    runs-on: [ ubuntu-latest ]
+
+    permissions:
+      contents: write # for `git push`

    steps:
    - name: Check out code
-      uses: actions/checkout@v3
+      uses: actions/checkout@v4
      with:
        ref: main

@@ -26,9 +29,16 @@ jobs:
      run: git push origin releases/${{ steps.date.outputs.date }}

    - name: Create pull request into release
-      uses: thomaseizinger/create-pull-request@e3972219c86a56550fb70708d96800d8e24ba862 # 1.3.0
-      with:
-        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        head: releases/${{ steps.date.outputs.date }}
-        base: release
-        title: Release ${{ steps.date.outputs.date }}
+      env:
+        GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
+      run: |
+        cat << EOF > body.md
+          ## Release ${{ steps.date.outputs.date }}
+
+          **Please merge this PR using 'Create a merge commit'!**
+        EOF
+
+        gh pr create --title "Release ${{ steps.date.outputs.date }}" \
+                     --body-file "body.md" \
+                     --head "releases/${{ steps.date.outputs.date }}" \
+                     --base "release"
--- a/.gitignore
+++ b/.gitignore
@@ -18,3 +18,6 @@ test_output/
 *.o
 *.so
 *.Po
+
+# pgindent typedef lists
+*.list
--- a/.gitmodules
+++ b/.gitmodules
@@ -6,3 +6,7 @@
 	path = vendor/postgres-v15
 	url = https://github.com/neondatabase/postgres.git
 	branch = REL_15_STABLE_neon
+[submodule "vendor/postgres-v16"]
+	path = vendor/postgres-v16
+	url = https://github.com/neondatabase/postgres.git
+	branch = REL_16_STABLE_neon
--- a/2
+++ b/2
@@ -5,7 +5,7 @@
 /libs/remote_storage/ @neondatabase/storage
 /libs/safekeeper_api/ @neondatabase/safekeepers
 /libs/vm_monitor/ @neondatabase/autoscaling @neondatabase/compute
-/pageserver/ @neondatabase/compute @neondatabase/storage
+/pageserver/ @neondatabase/storage
 /pgxn/ @neondatabase/compute
 /proxy/ @neondatabase/proxy
 /safekeeper/ @neondatabase/safekeepers
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -9,6 +9,24 @@ refactoring, additional comments, and so forth. Let's try to raise the
 bar, and clean things up as we go. Try to leave code in a better shape
 than it was before.

+## Pre-commit hook
+
+We have a sample pre-commit hook in `pre-commit.py`.
+To set it up, run:
+
+```bash
+ln -s ../../pre-commit.py .git/hooks/pre-commit
+```
+
+This will run the following checks on staged files before each commit:
+- `rustfmt`
+- checks for python files, see [obligatory checks](/docs/sourcetree.md#obligatory-checks).
+
+There is also a separate script `./run_clippy.sh` that runs `cargo clippy` on the whole project
+and `./scripts/reformat` that runs all formatting tools to ensure the project is up to date.
+
+If you want to skip the hook, run `git commit` with `--no-verify` option.
+
 ## Submitting changes

 1. Get at least one +1 on your PR before you push.
@@ -27,3 +45,28 @@ your patch's fault. Help to fix the root cause if something else has
 broken the CI, before pushing.

 *Happy Hacking!*
+
+# How to run a CI pipeline on Pull Requests from external contributors
+_An instruction for maintainers_
+
+## TL;DR:
+- Review the PR
+- If and only if it looks **safe** (i.e. it doesn't contain any malicious code which could expose secrets or harm the CI), then:
+    - Press the "Approve and run" button in GitHub UI
+    - Add the `approved-for-ci-run` label to the PR
+
+Repeat all steps after any change to the PR.
+- When the changes are ready to get merged — merge the original PR (not the internal one)
+
+## Longer version:
+
+GitHub Actions triggered by the `pull_request` event don't share repository secrets with the forks (for security reasons).
+So, passing the CI pipeline on Pull Requests from external contributors is impossible.
+
+We're using the following approach to make it work:
+- After the review, assign the `approved-for-ci-run` label to the PR if changes look safe
+- A GitHub Action will create an internal branch and a new PR with the same changes (for example, for a PR `#1234`, it'll be a branch `ci-run/pr-1234`)
+- Because the PR is created from the internal branch, it is able to access repository secrets (that's why it's crucial to make sure that the PR doesn't contain any malicious code that could expose our secrets or intentionally harm the CI)
+- The label gets removed automatically, so to run CI again with new changes, the label should be added again (after the review)
+
+For details see [`approved-for-ci-run.yml`](.github/workflows/approved-for-ci-run.yml)
--- a/Cargo.lock
+++ b/Cargo.lock
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,4 +1,5 @@
 [workspace]
+resolver = "2"
 members = [
    "compute_tools",
    "control_plane",
@@ -7,6 +8,7 @@ members = [
    "proxy",
    "safekeeper",
    "storage_broker",
+    "s3_scrubber",
    "workspace_hack",
    "trace",
    "libs/compute_api",
@@ -24,6 +26,7 @@ members = [
    "libs/tracing-utils",
    "libs/postgres_ffi/wal_craft",
    "libs/vm_monitor",
+    "libs/walproposer",
 ]

 [workspace.package]
@@ -33,15 +36,20 @@ license = "Apache-2.0"
 ## All dependency versions, used in the project
 [workspace.dependencies]
 anyhow = { version = "1.0", features = ["backtrace"] }
-async-compression = { version = "0.4.0", features = ["tokio", "gzip"] }
+arc-swap = "1.6"
+async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
+azure_core = "0.16"
+azure_identity = "0.16"
+azure_storage = "0.16"
+azure_storage_blobs = "0.16"
 flate2 = "1.0.26"
 async-stream = "0.3"
 async-trait = "0.1"
-aws-config = { version = "0.55", default-features = false, features=["rustls"] }
-aws-sdk-s3 = "0.27"
-aws-smithy-http = "0.55"
-aws-credential-types = "0.55"
-aws-types = "0.55"
+aws-config = { version = "1.0", default-features = false, features=["rustls"] }
+aws-sdk-s3 = "1.0"
+aws-smithy-async = { version = "1.0", default-features = false, features=["rt-tokio"] }
+aws-smithy-types = "1.0"
+aws-credential-types = "1.0"
 axum = { version = "0.6.20", features = ["ws"] }
 base64 = "0.13.0"
 bincode = "1.3"
@@ -49,6 +57,7 @@ bindgen = "0.65"
 bstr = "1.0"
 byteorder = "1.4"
 bytes = "1.0"
+camino = "1.1.6"
 cfg-if = "1.0.0"
 chrono = { version = "0.4", default-features = false, features = ["clock"] }
 clap = { version = "4.0", features = ["derive"] }
@@ -57,7 +66,7 @@ comfy-table = "6.1"
 const_format = "0.2"
 crc32c = "0.6"
 crossbeam-utils = "0.8.5"
-dashmap = "5.5.0"
+dashmap = { version = "5.5.0", features = ["raw-api"] }
 either = "1.8"
 enum-map = "2.4.2"
 enumset = "1.0.12"
@@ -73,11 +82,13 @@ hex = "0.4"
 hex-literal = "0.4"
 hmac = "0.12.1"
 hostname = "0.3.1"
+http-types = { version = "2", default-features = false }
 humantime = "2.1"
 humantime-serde = "1.1.1"
 hyper = "0.14"
-hyper-tungstenite = "0.9"
+hyper-tungstenite = "0.11"
 inotify = "0.10.2"
+ipnet = "2.9.0"
 itertools = "0.10"
 jsonwebtoken = "8"
 libc = "0.2"
@@ -105,31 +116,37 @@ reqwest-middleware = "0.2.0"
 reqwest-retry = "0.2.2"
 routerify = "3"
 rpds = "0.13"
-rustls = "0.20"
+rustc-hash = "1.1.0"
+rustls = "0.21"
 rustls-pemfile = "1"
 rustls-split = "0.3"
 scopeguard = "1.1"
 sysinfo = "0.29.2"
-sentry = { version = "0.30", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
+sd-notify = "0.4.1"
+sentry = { version = "0.31", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
 serde = { version = "1.0", features = ["derive"] }
 serde_json = "1"
+serde_path_to_error = "0.1"
 serde_with = "2.0"
+serde_assert = "0.5.0"
 sha2 = "0.10.2"
 signal-hook = "0.3"
 smallvec = "1.11"
+smol_str = { version = "0.2.0", features = ["serde"] }
 socket2 = "0.5"
 strum = "0.24"
 strum_macros = "0.24"
 svg_fmt = "0.4.1"
 sync_wrapper = "0.1.2"
 tar = "0.4"
+task-local-extensions = "0.1.4"
 test-context = "0.1"
 thiserror = "1.0"
-tls-listener = { version = "0.6", features = ["rustls", "hyper-h1"] }
+tls-listener = { version = "0.7", features = ["rustls", "hyper-h1"] }
 tokio = { version = "1.17", features = ["macros"] }
 tokio-io-timeout = "1.2.0"
-tokio-postgres-rustls = "0.9.0"
-tokio-rustls = "0.23"
+tokio-postgres-rustls = "0.10.0"
+tokio-rustls = "0.24"
 tokio-stream = "0.1"
 tokio-tar = "0.3"
 tokio-util = { version = "0.7", features = ["io"] }
@@ -143,7 +160,7 @@ tracing-subscriber = { version = "0.3", default_features = false, features = ["s
 url = "2.2"
 uuid = { version = "1.2", features = ["v4", "serde"] }
 walkdir = "2.3.2"
-webpki-roots = "0.23"
+webpki-roots = "0.25"
 x509-parser = "0.15"

 ## TODO replace this with tracing
@@ -151,11 +168,11 @@ env_logger = "0.10"
 log = "0.4"

 ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
-postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
-postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
-postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
+postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
+postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
+postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
+postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }

 ## Other git libraries
 heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending
@@ -176,22 +193,23 @@ tenant_size_model = { version = "0.1", path = "./libs/tenant_size_model/" }
 tracing-utils = { version = "0.1", path = "./libs/tracing-utils/" }
 utils = { version = "0.1", path = "./libs/utils/" }
 vm_monitor = { version = "0.1", path = "./libs/vm_monitor/" }
+walproposer = { version = "0.1", path = "./libs/walproposer/" }

 ## Common library dependency
 workspace_hack = { version = "0.1", path = "./workspace_hack/" }

 ## Build dependencies
 criterion = "0.5.1"
-rcgen = "0.10"
-rstest = "0.17"
-tempfile = "3.4"
+rcgen = "0.11"
+rstest = "0.18"
+camino-tempfile = "1.0.2"
 tonic-build = "0.9"

 [patch.crates-io]

 # This is only needed for proxy's tests.
 # TODO: we should probably fork `tokio-postgres-rustls` instead.
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }

 ################# Binary contents sections

--- a/Dockerfile
+++ b/Dockerfile
@@ -12,6 +12,7 @@ WORKDIR /home/nonroot

 COPY --chown=nonroot vendor/postgres-v14 vendor/postgres-v14
 COPY --chown=nonroot vendor/postgres-v15 vendor/postgres-v15
+COPY --chown=nonroot vendor/postgres-v16 vendor/postgres-v16
 COPY --chown=nonroot pgxn pgxn
 COPY --chown=nonroot Makefile Makefile
 COPY --chown=nonroot scripts/ninstall.sh scripts/ninstall.sh
@@ -26,6 +27,7 @@ RUN set -e \
 FROM $REPOSITORY/$IMAGE:$TAG AS build
 WORKDIR /home/nonroot
 ARG GIT_VERSION=local
+ARG BUILD_TAG

 # Enable https://github.com/paritytech/cachepot to cache Rust crates' compilation results in Docker builds.
 # Set up cachepot to use an AWS S3 bucket for cache results, to reuse it between `docker build` invocations.
@@ -39,6 +41,7 @@ ARG CACHEPOT_BUCKET=neon-github-dev

 COPY --from=pg-build /home/nonroot/pg_install/v14/include/postgresql/server pg_install/v14/include/postgresql/server
 COPY --from=pg-build /home/nonroot/pg_install/v15/include/postgresql/server pg_install/v15/include/postgresql/server
+COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_install/v16/include/postgresql/server
 COPY --chown=nonroot . .

 # Show build caching stats to check if it was used in the end.
@@ -65,6 +68,7 @@ RUN set -e \
    && apt install -y \
        libreadline-dev \
        libseccomp-dev \
+        libicu67 \
        openssl \
        ca-certificates \
    && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \
@@ -75,12 +79,13 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/pg_sni_router
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver          /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/pagectl             /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper          /usr/local/bin
-COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker         /usr/local/bin
+COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker      /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy               /usr/local/bin
-COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local               /usr/local/bin
+COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local          /usr/local/bin

 COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/
 COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/
+COPY --from=pg-build /home/nonroot/pg_install/v16 /usr/local/v16/
 COPY --from=pg-build /home/nonroot/postgres_install.tar.gz /data/

 # By default, pageserver uses `.neon/` working directory in WORKDIR, so create one and fill it with the dummy config.
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -74,8 +74,8 @@ RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar

 ENV PATH "/usr/local/pgsql/bin:$PATH"

-RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.2.tar.gz -O postgis.tar.gz && \
-    echo "9a2a219da005a1730a39d1959a1c7cec619b1efb009b65be80ffc25bad299068 postgis.tar.gz" | sha256sum --check && \
+RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.3.tar.gz -O postgis.tar.gz && \
+    echo "74eb356e3f85f14233791013360881b6748f78081cc688ff9d6f0f673a762d13 postgis.tar.gz" | sha256sum --check && \
    mkdir postgis-src && cd postgis-src && tar xvzf ../postgis.tar.gz --strip-components=1 -C . && \
    find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
    ./autogen.sh && \
@@ -124,8 +124,21 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 RUN apt update && \
    apt install -y ninja-build python3-dev libncurses5 binutils clang

-RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.5.tar.gz -O plv8.tar.gz && \
-    echo "1e108d5df639e4c189e1c5bdfa2432a521c126ca89e7e5a969d46899ca7bf106 plv8.tar.gz" | sha256sum --check && \
+RUN case "${PG_VERSION}" in \
+      "v14" | "v15") \
+        export PLV8_VERSION=3.1.5 \
+        export PLV8_CHECKSUM=1e108d5df639e4c189e1c5bdfa2432a521c126ca89e7e5a969d46899ca7bf106 \
+        ;; \
+      "v16") \
+        export PLV8_VERSION=3.1.8 \
+        export PLV8_CHECKSUM=92b10c7db39afdae97ff748c9ec54713826af222c459084ad002571b79eb3f49 \
+        ;; \
+      *) \
+        echo "Export the valid PG_VERSION variable" && exit 1 \
+        ;; \
+    esac && \
+    wget https://github.com/plv8/plv8/archive/refs/tags/v${PLV8_VERSION}.tar.gz -O plv8.tar.gz && \
+    echo "${PLV8_CHECKSUM} plv8.tar.gz" | sha256sum --check && \
    mkdir plv8-src && cd plv8-src && tar xvzf ../plv8.tar.gz --strip-components=1 -C . && \
    export PATH="/usr/local/pgsql/bin:$PATH" && \
    make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \
@@ -172,8 +185,8 @@ RUN wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz
    cp -R /h3/usr / && \
    rm -rf build

-RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.2.tar.gz -O h3-pg.tar.gz && \
-    echo "c135aa45999b2ad1326d2537c1cadef96d52660838e4ca371706c08fdea1a956 h3-pg.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3-pg.tar.gz && \
+    echo "5c17f09a820859ffe949f847bebf1be98511fb8f1bd86f94932512c00479e324 h3-pg.tar.gz" | sha256sum --check && \
    mkdir h3-pg-src && cd h3-pg-src && tar xvzf ../h3-pg.tar.gz --strip-components=1 -C . && \
    export PATH="/usr/local/pgsql/bin:$PATH" && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
@@ -211,8 +224,8 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -
 FROM build-deps AS vector-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.5.0.tar.gz -O pgvector.tar.gz && \
-    echo "d8aa3504b215467ca528525a6de12c3f85f9891b091ce0e5864dd8a9b757f77b pgvector.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.5.1.tar.gz -O pgvector.tar.gz && \
+    echo "cc7a8e034a96e30a819911ac79d32f6bc47bdd1aa2de4d7d4904e26b83209dc8 pgvector.tar.gz" | sha256sum --check && \
    mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -243,8 +256,8 @@ RUN wget https://github.com/michelp/pgjwt/archive/9742dab1b2f297ad3811120db7b214
 FROM build-deps AS hypopg-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.3.1.tar.gz -O hypopg.tar.gz && \
-    echo "e7f01ee0259dc1713f318a108f987663d60f3041948c2ada57a94b469565ca8e hypopg.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.0.tar.gz -O hypopg.tar.gz && \
+    echo "0821011743083226fc9b813c1f2ef5897a91901b57b6bea85a78e466187c6819 hypopg.tar.gz" | sha256sum --check && \
    mkdir hypopg-src && cd hypopg-src && tar xvzf ../hypopg.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -307,8 +320,8 @@ RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.2.0.tar.gz -O pgta
 FROM build-deps AS ip4r-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.1.tar.gz -O ip4r.tar.gz && \
-    echo "78b9f0c1ae45c22182768fe892a32d533c82281035e10914111400bf6301c726 ip4r.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O ip4r.tar.gz && \
+    echo "0f7b1f159974f49a47842a8ab6751aecca1ed1142b6d5e38d81b064b2ead1b4b ip4r.tar.gz" | sha256sum --check && \
    mkdir ip4r-src && cd ip4r-src && tar xvzf ../ip4r.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -323,8 +336,8 @@ RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.1.tar.gz -O i
 FROM build-deps AS prefix-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.9.tar.gz -O prefix.tar.gz && \
-    echo "38d30a08d0241a8bbb8e1eb8f0152b385051665a8e621c8899e7c5068f8b511e prefix.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O prefix.tar.gz && \
+    echo "4342f251432a5f6fb05b8597139d3ccde8dcf87e8ca1498e7ee931ca057a8575 prefix.tar.gz" | sha256sum --check && \
    mkdir prefix-src && cd prefix-src && tar xvzf ../prefix.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -339,8 +352,8 @@ RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.9.tar.gz -O pr
 FROM build-deps AS hll-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.17.tar.gz -O hll.tar.gz && \
-    echo "9a18288e884f197196b0d29b9f178ba595b0dfc21fbf7a8699380e77fa04c1e9 hll.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar.gz -O hll.tar.gz && \
+    echo "e2f55a6f4c4ab95ee4f1b4a2b73280258c5136b161fe9d059559556079694f0e hll.tar.gz" | sha256sum --check && \
    mkdir hll-src && cd hll-src && tar xvzf ../hll.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -355,8 +368,8 @@ RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.17.tar
 FROM build-deps AS plpgsql-check-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.3.2.tar.gz -O plpgsql_check.tar.gz && \
-    echo "9d81167c4bbeb74eebf7d60147b21961506161addc2aee537f95ad8efeae427b plpgsql_check.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.5.3.tar.gz -O plpgsql_check.tar.gz && \
+    echo "6631ec3e7fb3769eaaf56e3dfedb829aa761abf163d13dba354b4c218508e1c0 plpgsql_check.tar.gz" | sha256sum --check && \
    mkdir plpgsql_check-src && cd plpgsql_check-src && tar xvzf ../plpgsql_check.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
@@ -371,12 +384,23 @@ RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.3.2.tar.gz
 FROM build-deps AS timescaledb-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

+ARG PG_VERSION
 ENV PATH "/usr/local/pgsql/bin:$PATH"

-RUN apt-get update && \
+RUN case "${PG_VERSION}" in \
+      "v14" | "v15") \
+        export TIMESCALEDB_VERSION=2.10.1 \
+        export TIMESCALEDB_CHECKSUM=6fca72a6ed0f6d32d2b3523951ede73dc5f9b0077b38450a029a5f411fdb8c73 \
+        ;; \
+      *) \
+        export TIMESCALEDB_VERSION=2.13.0 \
+        export TIMESCALEDB_CHECKSUM=584a351c7775f0e067eaa0e7277ea88cab9077cc4c455cbbf09a5d9723dce95d \
+        ;; \
+    esac && \
+    apt-get update && \
    apt-get install -y cmake && \
-    wget https://github.com/timescale/timescaledb/archive/refs/tags/2.10.1.tar.gz -O timescaledb.tar.gz && \
-    echo "6fca72a6ed0f6d32d2b3523951ede73dc5f9b0077b38450a029a5f411fdb8c73 timescaledb.tar.gz" | sha256sum --check && \
+    wget https://github.com/timescale/timescaledb/archive/refs/tags/${TIMESCALEDB_VERSION}.tar.gz -O timescaledb.tar.gz && \
+    echo "${TIMESCALEDB_CHECKSUM} timescaledb.tar.gz" | sha256sum --check && \
    mkdir timescaledb-src && cd timescaledb-src && tar xvzf ../timescaledb.tar.gz --strip-components=1 -C . && \
    ./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOOL=OFF -DAPACHE_ONLY:BOOL=ON -DCMAKE_BUILD_TYPE=Release && \
    cd build && \
@@ -405,6 +429,10 @@ RUN case "${PG_VERSION}" in \
        export PG_HINT_PLAN_VERSION=15_1_5_0 \
        export PG_HINT_PLAN_CHECKSUM=564cbbf4820973ffece63fbf76e3c0af62c4ab23543142c7caaa682bc48918be \
        ;; \
+      "v16") \
+        export PG_HINT_PLAN_VERSION=16_1_6_0 \
+        export PG_HINT_PLAN_CHECKSUM=fc85a9212e7d2819d4ae4ac75817481101833c3cfa9f0fe1f980984e12347d00 \
+        ;; \
      *) \
        echo "Export the valid PG_HINT_PLAN_VERSION variable" && exit 1 \
        ;; \
@@ -452,8 +480,8 @@ FROM build-deps AS pg-cron-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 ENV PATH "/usr/local/pgsql/bin/:$PATH"
-RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.5.2.tar.gz -O pg_cron.tar.gz && \
-    echo "6f7f0980c03f1e2a6a747060e67bf4a303ca2a50e941e2c19daeed2b44dec744 pg_cron.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.0.tar.gz -O pg_cron.tar.gz && \
+    echo "383a627867d730222c272bfd25cd5e151c578d73f696d32910c7db8c665cc7db pg_cron.tar.gz" | sha256sum --check && \
    mkdir pg_cron-src && cd pg_cron-src && tar xvzf ../pg_cron.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
@@ -479,8 +507,8 @@ RUN apt-get update && \
        libfreetype6-dev

 ENV PATH "/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH"
-RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_1.tar.gz -O rdkit.tar.gz && \
-    echo "db346afbd0ba52c843926a2a62f8a38c7b774ffab37eaf382d789a824f21996c rdkit.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.gz -O rdkit.tar.gz && \
+    echo "bdbf9a2e6988526bfeb8c56ce3cdfe2998d60ac289078e2215374288185e8c8d rdkit.tar.gz" | sha256sum --check && \
    mkdir rdkit-src && cd rdkit-src && tar xvzf ../rdkit.tar.gz --strip-components=1 -C . && \
    cmake \
        -D RDK_BUILD_CAIRO_SUPPORT=OFF \
@@ -551,12 +579,19 @@ FROM build-deps AS pg-embedding-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 ENV PATH "/usr/local/pgsql/bin/:$PATH"
-RUN wget https://github.com/neondatabase/pg_embedding/archive/refs/tags/0.3.5.tar.gz -O pg_embedding.tar.gz && \
-    echo "0e95b27b8b6196e2cf0a0c9ec143fe2219b82e54c5bb4ee064e76398cbe69ae9 pg_embedding.tar.gz" | sha256sum --check && \
+RUN case "${PG_VERSION}" in \
+      "v14" | "v15") \
+        export PG_EMBEDDING_VERSION=0.3.5 \
+        export PG_EMBEDDING_CHECKSUM=0e95b27b8b6196e2cf0a0c9ec143fe2219b82e54c5bb4ee064e76398cbe69ae9 \
+        ;; \
+      *) \
+        echo "pg_embedding not supported on this PostgreSQL version. Use pgvector instead." && exit 0;; \
+    esac && \
+    wget https://github.com/neondatabase/pg_embedding/archive/refs/tags/${PG_EMBEDDING_VERSION}.tar.gz -O pg_embedding.tar.gz && \
+    echo "${PG_EMBEDDING_CHECKSUM} pg_embedding.tar.gz" | sha256sum --check && \
    mkdir pg_embedding-src && cd pg_embedding-src && tar xvzf ../pg_embedding.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
-    make -j $(getconf _NPROCESSORS_ONLN) install && \
-    echo 'trusted = true' >> /usr/local/pgsql/share/extension/embedding.control
+    make -j $(getconf _NPROCESSORS_ONLN) install

 #########################################################################################
 #
@@ -582,7 +617,7 @@ RUN wget https://gitlab.com/dalibo/postgresql_anonymizer/-/archive/1.1.0/postgre
 #########################################################################################
 #
 # Layer "rust extensions"
-# This layer is used to build `pgx` deps
+# This layer is used to build `pgrx` deps
 #
 #########################################################################################
 FROM build-deps AS rust-extensions-build
@@ -602,8 +637,8 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux
    chmod +x rustup-init && \
    ./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \
    rm rustup-init && \
-    cargo install --locked --version 0.7.3 cargo-pgx && \
-    /bin/bash -c 'cargo pgx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config'
+    cargo install --locked --version 0.10.2 cargo-pgrx && \
+    /bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config'

 USER root

@@ -615,14 +650,13 @@ USER root
 #########################################################################################

 FROM rust-extensions-build AS pg-jsonschema-pg-build
+ARG PG_VERSION

-# caeab60d70b2fd3ae421ec66466a3abbb37b7ee6 made on 06/03/2023
-# there is no release tag yet, but we need it due to the superuser fix in the control file, switch to git tag after release >= 0.1.5
-RUN wget https://github.com/supabase/pg_jsonschema/archive/caeab60d70b2fd3ae421ec66466a3abbb37b7ee6.tar.gz -O pg_jsonschema.tar.gz && \
-    echo "54129ce2e7ee7a585648dbb4cef6d73f795d94fe72f248ac01119992518469a4 pg_jsonschema.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.2.0.tar.gz -O pg_jsonschema.tar.gz && \
+    echo "9118fc508a6e231e7a39acaa6f066fcd79af17a5db757b47d2eefbe14f7794f0 pg_jsonschema.tar.gz" | sha256sum --check && \
    mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xvzf ../pg_jsonschema.tar.gz --strip-components=1 -C . && \
-    sed -i 's/pgx = "0.7.1"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
-    cargo pgx install --release && \
+    sed -i 's/pgrx = "0.10.2"/pgrx = { version = "0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
+    cargo pgrx install --release && \
    echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_jsonschema.control

 #########################################################################################
@@ -633,17 +667,13 @@ RUN wget https://github.com/supabase/pg_jsonschema/archive/caeab60d70b2fd3ae421e
 #########################################################################################

 FROM rust-extensions-build AS pg-graphql-pg-build
+ARG PG_VERSION

-# b4988843647450a153439be367168ed09971af85 made on 22/02/2023 (from remove-pgx-contrib-spiext branch)
-# Currently pgx version bump to >= 0.7.2  causes "call to unsafe function" compliation errors in
-# pgx-contrib-spiext. There is a branch that removes that dependency, so use it. It is on the
-# same 1.1 version we've used before.
-RUN wget https://github.com/yrashk/pg_graphql/archive/b4988843647450a153439be367168ed09971af85.tar.gz -O pg_graphql.tar.gz && \
-    echo "0c7b0e746441b2ec24187d0e03555faf935c2159e2839bddd14df6dafbc8c9bd pg_graphql.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.4.0.tar.gz -O pg_graphql.tar.gz && \
+    echo "bd8dc7230282b3efa9ae5baf053a54151ed0e66881c7c53750e2d0c765776edc pg_graphql.tar.gz" | sha256sum --check && \
    mkdir pg_graphql-src && cd pg_graphql-src && tar xvzf ../pg_graphql.tar.gz --strip-components=1 -C . && \
-    sed -i 's/pgx = "~0.7.1"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
-    sed -i 's/pgx-tests = "~0.7.1"/pgx-tests = "0.7.3"/g' Cargo.toml && \
-    cargo pgx install --release && \
+    sed -i 's/pgrx = "=0.10.2"/pgrx = { version = "0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
+    cargo pgrx install --release && \
    # it's needed to enable extension because it uses untrusted C language
    sed -i 's/superuser = false/superuser = true/g' /usr/local/pgsql/share/extension/pg_graphql.control && \
    echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_graphql.control
@@ -656,12 +686,13 @@ RUN wget https://github.com/yrashk/pg_graphql/archive/b4988843647450a153439be367
 #########################################################################################

 FROM rust-extensions-build AS pg-tiktoken-pg-build
+ARG PG_VERSION

-# 801f84f08c6881c8aa30f405fafbf00eec386a72 made on 10/03/2023
-RUN wget https://github.com/kelvich/pg_tiktoken/archive/801f84f08c6881c8aa30f405fafbf00eec386a72.tar.gz -O pg_tiktoken.tar.gz && \
-    echo "52f60ac800993a49aa8c609961842b611b6b1949717b69ce2ec9117117e16e4a pg_tiktoken.tar.gz" | sha256sum --check && \
+# 26806147b17b60763039c6a6878884c41a262318 made on 26/09/2023
+RUN wget https://github.com/kelvich/pg_tiktoken/archive/26806147b17b60763039c6a6878884c41a262318.tar.gz -O pg_tiktoken.tar.gz && \
+    echo "e64e55aaa38c259512d3e27c572da22c4637418cf124caba904cd50944e5004e pg_tiktoken.tar.gz" | sha256sum --check && \
    mkdir pg_tiktoken-src && cd pg_tiktoken-src && tar xvzf ../pg_tiktoken.tar.gz --strip-components=1 -C . && \
-    cargo pgx install --release && \
+    cargo pgrx install --release && \
    echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_tiktoken.control

 #########################################################################################
@@ -672,14 +703,36 @@ RUN wget https://github.com/kelvich/pg_tiktoken/archive/801f84f08c6881c8aa30f405
 #########################################################################################

 FROM rust-extensions-build AS pg-pgx-ulid-build
+ARG PG_VERSION

-RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.0.tar.gz -O pgx_ulid.tar.gz && \
-    echo "908b7358e6f846e87db508ae5349fb56a88ee6305519074b12f3d5b0ff09f791 pgx_ulid.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.3.tar.gz -O pgx_ulid.tar.gz && \
+    echo "ee5db82945d2d9f2d15597a80cf32de9dca67b897f605beb830561705f12683c pgx_ulid.tar.gz" | sha256sum --check && \
    mkdir pgx_ulid-src && cd pgx_ulid-src && tar xvzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \
-    sed -i 's/pgx        = "=0.7.3"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
-    cargo pgx install --release && \
+    echo "******************* Apply a patch for Postgres 16 support; delete in the next release ******************" && \
+    wget https://github.com/pksunkara/pgx_ulid/commit/f84954cf63fc8c80d964ac970d9eceed3c791196.patch && \
+    patch -p1 < f84954cf63fc8c80d964ac970d9eceed3c791196.patch && \
+    echo "********************************************************************************************************" && \
+    sed -i 's/pgrx       = "=0.10.2"/pgrx = { version = "=0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
+    cargo pgrx install --release && \
    echo "trusted = true" >> /usr/local/pgsql/share/extension/ulid.control

+#########################################################################################
+#
+# Layer "wal2json-build"
+# Compile "wal2json" extension
+#
+#########################################################################################
+
+FROM build-deps AS wal2json-pg-build
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
+
+ENV PATH "/usr/local/pgsql/bin/:$PATH"
+RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.gz && \
+    echo "b516653575541cf221b99cf3f8be9b6821f6dbcfc125675c85f35090f824f00e wal2json_2_5.tar.gz" | sha256sum --check && \
+    mkdir wal2json-src && cd wal2json-src && tar xvzf ../wal2json_2_5.tar.gz --strip-components=1 -C . && \
+    make -j $(getconf _NPROCESSORS_ONLN) && \
+    make -j $(getconf _NPROCESSORS_ONLN) install
+
 #########################################################################################
 #
 # Layer "neon-pg-ext-build"
@@ -716,6 +769,7 @@ COPY --from=rdkit-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-uuidv7-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-roaringbitmap-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-embedding-pg-build /usr/local/pgsql/ /usr/local/pgsql/
+COPY --from=wal2json-pg-build /usr/local/pgsql /usr/local/pgsql
 COPY pgxn/ pgxn/

 RUN make -j $(getconf _NPROCESSORS_ONLN) \
@@ -726,6 +780,20 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \
        PG_CONFIG=/usr/local/pgsql/bin/pg_config \
        -C pgxn/neon_utils \
        -s install && \
+    make -j $(getconf _NPROCESSORS_ONLN) \
+        PG_CONFIG=/usr/local/pgsql/bin/pg_config \
+        -C pgxn/neon_rmgr \
+        -s install && \
+    case "${PG_VERSION}" in \
+        "v14" | "v15") \
+        ;; \
+        "v16") \
+            echo "Skipping HNSW for PostgreSQL 16" && exit 0 \
+        ;; \
+        *) \
+            echo "unexpected PostgreSQL version" && exit 1 \
+        ;; \
+        esac && \
    make -j $(getconf _NPROCESSORS_ONLN) \
        PG_CONFIG=/usr/local/pgsql/bin/pg_config \
        -C pgxn/hnsw \
--- a/Makefile
+++ b/Makefile
@@ -29,6 +29,7 @@ else ifeq ($(UNAME_S),Darwin)
 	# It can be configured with OPENSSL_PREFIX variable
 	OPENSSL_PREFIX ?= $(shell brew --prefix openssl@3)
 	PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib
+	PG_CONFIGURE_OPTS += PKG_CONFIG_PATH=$(shell brew --prefix icu4c)/lib/pkgconfig
 	# macOS already has bison and flex in the system, but they are old and result in postgres-v14 target failure
 	# brew formulae are keg-only and not symlinked into HOMEBREW_PREFIX, force their usage
 	EXTRA_PATH_OVERRIDES += $(shell brew --prefix bison)/bin/:$(shell brew --prefix flex)/bin/:
@@ -61,7 +62,7 @@ all: neon postgres neon-pg-ext
 #
 # The 'postgres_ffi' depends on the Postgres headers.
 .PHONY: neon
-neon: postgres-headers
+neon: postgres-headers walproposer-lib
 	+@echo "Compiling Neon"
 	$(CARGO_CMD_PREFIX) cargo build $(CARGO_BUILD_FLAGS)

@@ -71,6 +72,10 @@ neon: postgres-headers
 #
 $(POSTGRES_INSTALL_DIR)/build/%/config.status:
 	+@echo "Configuring Postgres $* build"
+	@test -s $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure || { \
+		echo "\nPostgres submodule not found in $(ROOT_PROJECT_DIR)/vendor/postgres-$*/, execute "; \
+		echo "'git submodule update --init --recursive --depth 2 --progress .' in project root.\n"; \
+		exit 1; }
 	mkdir -p $(POSTGRES_INSTALL_DIR)/build/$*
 	(cd $(POSTGRES_INSTALL_DIR)/build/$* && \
 	env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure \
@@ -83,6 +88,8 @@ $(POSTGRES_INSTALL_DIR)/build/%/config.status:
 # I'm not sure why it wouldn't work, but this is the only place (apart from
 # the "build-all-versions" entry points) where direct mention of PostgreSQL
 # versions is used.
+.PHONY: postgres-configure-v16
+postgres-configure-v16: $(POSTGRES_INSTALL_DIR)/build/v16/config.status
 .PHONY: postgres-configure-v15
 postgres-configure-v15: $(POSTGRES_INSTALL_DIR)/build/v15/config.status
 .PHONY: postgres-configure-v14
@@ -118,6 +125,10 @@ postgres-clean-%:
 	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pageinspect clean
 	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/src/interfaces/libpq clean

+.PHONY: postgres-check-%
+postgres-check-%: postgres-%
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$* MAKELEVEL=0 check
+
 .PHONY: neon-pg-ext-%
 neon-pg-ext-%: postgres-%
 	+@echo "Compiling neon $*"
@@ -130,6 +141,11 @@ neon-pg-ext-%: postgres-%
 	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
 		-C $(POSTGRES_INSTALL_DIR)/build/neon-walredo-$* \
 		-f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile install
+	+@echo "Compiling neon_rmgr $*"
+	mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-rmgr-$*
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
+		-C $(POSTGRES_INSTALL_DIR)/build/neon-rmgr-$* \
+		-f $(ROOT_PROJECT_DIR)/pgxn/neon_rmgr/Makefile install
 	+@echo "Compiling neon_test_utils $*"
 	mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-$*
 	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
@@ -140,11 +156,6 @@ neon-pg-ext-%: postgres-%
 	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
 		-C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \
 		-f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile install
-	+@echo "Compiling hnsw $*"
-	mkdir -p $(POSTGRES_INSTALL_DIR)/build/hnsw-$*
-	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-		-C $(POSTGRES_INSTALL_DIR)/build/hnsw-$* \
-		-f $(ROOT_PROJECT_DIR)/pgxn/hnsw/Makefile install

 .PHONY: neon-pg-ext-clean-%
 neon-pg-ext-clean-%:
@@ -160,35 +171,79 @@ neon-pg-ext-clean-%:
 	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \
 	-C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \
 	-f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile clean
-	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \
-	-C $(POSTGRES_INSTALL_DIR)/build/hnsw-$* \
-	-f $(ROOT_PROJECT_DIR)/pgxn/hnsw/Makefile clean
+
+# Build walproposer as a static library. walproposer source code is located
+# in the pgxn/neon directory.
+# 
+# We also need to include libpgport.a and libpgcommon.a, because walproposer
+# uses some functions from those libraries.
+# 
+# Some object files are removed from libpgport.a and libpgcommon.a because
+# they depend on openssl and other libraries that are not included in our
+# Rust build.
+.PHONY: walproposer-lib
+walproposer-lib: neon-pg-ext-v16
+	+@echo "Compiling walproposer-lib"
+	mkdir -p $(POSTGRES_INSTALL_DIR)/build/walproposer-lib
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v16/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
+		-C $(POSTGRES_INSTALL_DIR)/build/walproposer-lib \
+		-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile walproposer-lib
+	cp $(POSTGRES_INSTALL_DIR)/v16/lib/libpgport.a $(POSTGRES_INSTALL_DIR)/build/walproposer-lib
+	cp $(POSTGRES_INSTALL_DIR)/v16/lib/libpgcommon.a $(POSTGRES_INSTALL_DIR)/build/walproposer-lib
+ifeq ($(UNAME_S),Linux)
+	$(AR) d $(POSTGRES_INSTALL_DIR)/build/walproposer-lib/libpgport.a \
+		pg_strong_random.o
+	$(AR) d $(POSTGRES_INSTALL_DIR)/build/walproposer-lib/libpgcommon.a \
+		pg_crc32c.o \
+		hmac_openssl.o \
+		cryptohash_openssl.o \
+		scram-common.o \
+		md5_common.o \
+		checksum_helper.o
+endif
+
+.PHONY: walproposer-lib-clean
+walproposer-lib-clean:
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v16/bin/pg_config \
+		-C $(POSTGRES_INSTALL_DIR)/build/walproposer-lib \
+		-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile clean

 .PHONY: neon-pg-ext
 neon-pg-ext: \
 	neon-pg-ext-v14 \
-	neon-pg-ext-v15
+	neon-pg-ext-v15 \
+	neon-pg-ext-v16

 .PHONY: neon-pg-ext-clean
 neon-pg-ext-clean: \
 	neon-pg-ext-clean-v14 \
-	neon-pg-ext-clean-v15
+	neon-pg-ext-clean-v15 \
+	neon-pg-ext-clean-v16

 # shorthand to build all Postgres versions
 .PHONY: postgres
 postgres: \
 	postgres-v14 \
-	postgres-v15
+	postgres-v15 \
+	postgres-v16

 .PHONY: postgres-headers
 postgres-headers: \
 	postgres-headers-v14 \
-	postgres-headers-v15
+	postgres-headers-v15 \
+	postgres-headers-v16

 .PHONY: postgres-clean
 postgres-clean: \
 	postgres-clean-v14 \
-	postgres-clean-v15
+	postgres-clean-v15 \
+	postgres-clean-v16
+
+.PHONY: postgres-check
+postgres-check: \
+	postgres-check-v14 \
+	postgres-check-v15 \
+	postgres-check-v16

 # This doesn't remove the effects of 'configure'.
 .PHONY: clean
@@ -205,6 +260,44 @@ distclean:
 fmt:
 	./pre-commit.py --fix-inplace

+postgres-%-pg-bsd-indent: postgres-%
+	+@echo "Compiling pg_bsd_indent"
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/src/tools/pg_bsd_indent/
+
+# Create the typedef list for core Postgres. Note that it should generally be combined
+# with the buildfarm list to cover platform-specific typedefs.
+# https://wiki.postgresql.org/wiki/Running_pgindent_on_non-core_code_or_development_code
+postgres-%-typedefs.list: postgres-%
+	$(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/find_typedef $(POSTGRES_INSTALL_DIR)/$*/bin > $@
+
+# Indent postgres. See src/tools/pgindent/README for details.
+.PHONY: postgres-%-pgindent
+postgres-%-pgindent: postgres-%-pg-bsd-indent postgres-%-typedefs.list
+	+@echo merge with buildfarm typedef to cover all platforms
+	+@echo note: I first tried to download from pgbuildfarm.org, but for unclear reason e.g. \
+		REL_16_STABLE list misses PGSemaphoreData
+	# wget -q -O - "http://www.pgbuildfarm.org/cgi-bin/typedefs.pl?branch=REL_16_STABLE" |\
+	# cat - postgres-$*-typedefs.list | sort | uniq > postgres-$*-typedefs-full.list
+	cat $(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/pgindent/typedefs.list |\
+		cat - postgres-$*-typedefs.list | sort | uniq > postgres-$*-typedefs-full.list
+	+@echo note: you might want to run it on selected files/dirs instead.
+	INDENT=$(POSTGRES_INSTALL_DIR)/build/$*/src/tools/pg_bsd_indent/pg_bsd_indent \
+		$(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/pgindent/pgindent --typedefs postgres-$*-typedefs-full.list \
+		$(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/ \
+		--excludes $(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/pgindent/exclude_file_patterns
+	rm -f pg*.BAK
+
+# Indent pgxn/neon.
+.PHONY: pgindent
+neon-pgindent: postgres-v16-pg-bsd-indent neon-pg-ext-v16
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v16/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
+		FIND_TYPEDEF=$(ROOT_PROJECT_DIR)/vendor/postgres-v16/src/tools/find_typedef \
+		INDENT=$(POSTGRES_INSTALL_DIR)/build/v16/src/tools/pg_bsd_indent/pg_bsd_indent \
+		PGINDENT_SCRIPT=$(ROOT_PROJECT_DIR)/vendor/postgres-v16/src/tools/pgindent/pgindent \
+		-C $(POSTGRES_INSTALL_DIR)/build/neon-v16 \
+		-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile pgindent
+
+
 .PHONY: setup-pre-commit-hook
 setup-pre-commit-hook:
 	ln -s -f $(ROOT_PROJECT_DIR)/pre-commit.py .git/hooks/pre-commit
--- a/4
+++ b/4
@@ -1,5 +1,5 @@
 Neon
 Copyright 2022 Neon Inc.

-The PostgreSQL submodules in vendor/postgres-v14 and vendor/postgres-v15 are licensed under the
-PostgreSQL license. See vendor/postgres-v14/COPYRIGHT and vendor/postgres-v15/COPYRIGHT.
+The PostgreSQL submodules in vendor/ are licensed under the PostgreSQL license.
+See vendor/postgres-vX/COPYRIGHT for details.
--- a/README.md
+++ b/README.md
@@ -29,18 +29,18 @@ See developer documentation in [SUMMARY.md](/docs/SUMMARY.md) for more informati
 ```bash
 apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \
 libssl-dev clang pkg-config libpq-dev cmake postgresql-client protobuf-compiler \
-libcurl4-openssl-dev openssl python-poetry
+libcurl4-openssl-dev openssl python-poetry lsof libicu-dev
 ```
 * On Fedora, these packages are needed:
 ```bash
 dnf install flex bison readline-devel zlib-devel openssl-devel \
  libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler \
-  protobuf-devel libcurl-devel openssl poetry
+  protobuf-devel libcurl-devel openssl poetry lsof libicu-devel
 ```
 * On Arch based systems, these packages are needed:
 ```bash
 pacman -S base-devel readline zlib libseccomp openssl clang \
-postgresql-libs cmake postgresql protobuf curl
+postgresql-libs cmake postgresql protobuf curl lsof
 ```

 Building Neon requires 3.15+ version of `protoc` (protobuf-compiler). If your distribution provides an older version, you can install a newer version from [here](https://github.com/protocolbuffers/protobuf/releases).
@@ -55,7 +55,7 @@ curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
 1. Install XCode and dependencies
 ```
 xcode-select --install
-brew install protobuf openssl flex bison
+brew install protobuf openssl flex bison icu4c pkg-config

 # add openssl to PATH, required for ed25519 keys generation in neon_local
 echo 'export PATH="$(brew --prefix openssl)/bin:$PATH"' >> ~/.zshrc
@@ -149,6 +149,9 @@ tenant 9ef87a5bf0d92544f6fafeeb3239695c successfully created on the pageserver
 Created an initial timeline 'de200bd42b49cc1814412c7e592dd6e9' at Lsn 0/16B5A50 for tenant: 9ef87a5bf0d92544f6fafeeb3239695c
 Setting tenant 9ef87a5bf0d92544f6fafeeb3239695c as a default one

+# create postgres compute node
+> cargo neon endpoint create main
+
 # start postgres compute node
 > cargo neon endpoint start main
 Starting new endpoint main (PostgreSQL v14) on timeline de200bd42b49cc1814412c7e592dd6e9 ...
@@ -185,8 +188,11 @@ Created timeline 'b3b863fa45fa9e57e615f9f2d944e601' at Lsn 0/16F9A00 for tenant:
 (L) main [de200bd42b49cc1814412c7e592dd6e9]
 (L) ┗━ @0/16F9A00: migration_check [b3b863fa45fa9e57e615f9f2d944e601]

+# create postgres on that branch
+> cargo neon endpoint create migration_check --branch-name migration_check
+
 # start postgres on that branch
-> cargo neon endpoint start migration_check --branch-name migration_check
+> cargo neon endpoint start migration_check
 Starting new endpoint migration_check (PostgreSQL v14) on timeline b3b863fa45fa9e57e615f9f2d944e601 ...
 Starting postgres at 'postgresql://cloud_admin@127.0.0.1:55434/postgres'

--- a/clippy.toml
+++ b/clippy.toml
@@ -0,0 +1,5 @@
+disallowed-methods = [
+    "tokio::task::block_in_place",
+    # Allow this for now, to deny it later once we stop using Handle::block_on completely
+    # "tokio::runtime::Handle::block_on",
+]
--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -38,3 +38,4 @@ toml_edit.workspace = true
 remote_storage = { version = "0.1", path = "../libs/remote_storage/" }
 vm_monitor = { version = "0.1", path = "../libs/vm_monitor/" }
 zstd = "0.12.4"
+bytes = "1.0"
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -31,7 +31,7 @@
 //!             -C 'postgresql://cloud_admin@localhost/postgres' \
 //!             -S /var/db/postgres/specs/current.json \
 //!             -b /usr/local/bin/postgres \
-//!             -r {"bucket": "neon-dev-extensions-eu-central-1", "region": "eu-central-1"}
+//!             -r http://pg-ext-s3-gateway
 //! ```
 //!
 use std::collections::HashMap;
@@ -51,7 +51,7 @@ use compute_api::responses::ComputeStatus;

 use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec};
 use compute_tools::configurator::launch_configurator;
-use compute_tools::extension_server::{get_pg_version, init_remote_storage};
+use compute_tools::extension_server::get_pg_version;
 use compute_tools::http::api::launch_http_server;
 use compute_tools::logger::*;
 use compute_tools::monitor::launch_monitor;
@@ -60,7 +60,7 @@ use compute_tools::spec::*;

 // this is an arbitrary build tag. Fine as a default / for testing purposes
 // in-case of not-set environment var
-const BUILD_TAG_DEFAULT: &str = "5670669815";
+const BUILD_TAG_DEFAULT: &str = "latest";

 fn main() -> Result<()> {
    init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;
@@ -74,10 +74,18 @@ fn main() -> Result<()> {
    let pgbin_default = String::from("postgres");
    let pgbin = matches.get_one::<String>("pgbin").unwrap_or(&pgbin_default);

-    let remote_ext_config = matches.get_one::<String>("remote-ext-config");
-    let ext_remote_storage = remote_ext_config.map(|x| {
-        init_remote_storage(x).expect("cannot initialize remote extension storage from config")
-    });
+    let ext_remote_storage = matches
+        .get_one::<String>("remote-ext-config")
+        // Compatibility hack: if the control plane passed a remote-ext-config that does
+        // not start with "http", use the default extension storage proxy gateway instead.
+        // Remove this once the control plane is updated to pass the gateway URL.
+        .map(|conf| {
+            if conf.starts_with("http") {
+                conf.trim_end_matches('/')
+            } else {
+                "http://pg-ext-s3-gateway"
+            }
+        });

    let http_port = *matches
        .get_one::<u16>("http-port")
@@ -156,6 +164,7 @@ fn main() -> Result<()> {
                let path = Path::new(sp);
                let file = File::open(path)?;
                spec = Some(serde_json::from_reader(file)?);
+                live_config_allowed = true;
            } else if let Some(id) = compute_id {
                if let Some(cp_base) = control_plane_uri {
                    live_config_allowed = true;
@@ -197,7 +206,7 @@ fn main() -> Result<()> {
        live_config_allowed,
        state: Mutex::new(new_state),
        state_changed: Condvar::new(),
-        ext_remote_storage,
+        ext_remote_storage: ext_remote_storage.map(|s| s.to_string()),
        ext_download_progress: RwLock::new(HashMap::new()),
        build_tag,
    };
@@ -265,7 +274,13 @@ fn main() -> Result<()> {
            let mut state = compute.state.lock().unwrap();
            state.error = Some(format!("{:?}", err));
            state.status = ComputeStatus::Failed;
-            drop(state);
+            // Notify others that Postgres failed to start. When configuring an empty
+            // compute, it's likely that the API handler is still waiting for a compute
+            // state change. This notifies it that the compute is in the Failed state,
+            // so the control plane learns about it earlier and records a proper error
+            // instead of a timeout.
+            compute.state_changed.notify_all();
+            drop(state); // unlock
            delay_exit = true;
            None
        }
@@ -277,32 +292,26 @@ fn main() -> Result<()> {
        if #[cfg(target_os = "linux")] {
            use std::env;
            use tokio_util::sync::CancellationToken;
-            use tracing::warn;
-            let vm_monitor_addr = matches.get_one::<String>("vm-monitor-addr");
+            let vm_monitor_addr = matches
+                .get_one::<String>("vm-monitor-addr")
+                .expect("--vm-monitor-addr should always be set because it has a default arg");
            let file_cache_connstr = matches.get_one::<String>("filecache-connstr");
            let cgroup = matches.get_one::<String>("cgroup");
-            let file_cache_on_disk = matches.get_flag("file-cache-on-disk");

            // Only make a runtime if we need to.
            // Note: it seems like you can make a runtime in an inner scope and
            // if you start a task in it it won't be dropped. However, make it
            // in the outermost scope just to be safe.
-            let rt = match (env::var_os("AUTOSCALING"), vm_monitor_addr) {
-                (None, None) => None,
-                (None, Some(_)) => {
-                    warn!("--vm-monitor-addr option set but AUTOSCALING env var not present");
-                    None
-                }
-                (Some(_), None) => {
-                    panic!("AUTOSCALING env var present but --vm-monitor-addr option not set")
-                }
-                (Some(_), Some(_)) => Some(
+            let rt = if env::var_os("AUTOSCALING").is_some() {
+                Some(
                    tokio::runtime::Builder::new_multi_thread()
                        .worker_threads(4)
                        .enable_all()
                        .build()
-                        .expect("failed to create tokio runtime for monitor"),
-                ),
+                        .expect("failed to create tokio runtime for monitor")
+                )
+            } else {
+                None
            };

            // This token is used internally by the monitor to clean up all threads
@@ -313,8 +322,7 @@ fn main() -> Result<()> {
                    Box::leak(Box::new(vm_monitor::Args {
                        cgroup: cgroup.cloned(),
                        pgconnstr: file_cache_connstr.cloned(),
-                        addr: vm_monitor_addr.cloned().unwrap(),
-                        file_cache_on_disk,
+                        addr: vm_monitor_addr.clone(),
                    })),
                    token.clone(),
                ))
@@ -485,11 +493,6 @@ fn cli() -> clap::Command {
                )
                .value_name("FILECACHE_CONNSTR"),
        )
-        .arg(
-            Arg::new("file-cache-on-disk")
-                .long("file-cache-on-disk")
-                .action(clap::ArgAction::SetTrue),
-        )
 }

 #[test]
--- a/compute_tools/src/checker.rs
+++ b/compute_tools/src/checker.rs
@@ -1,12 +1,39 @@
-use anyhow::{anyhow, Result};
+use anyhow::{anyhow, Ok, Result};
+use postgres::Client;
 use tokio_postgres::NoTls;
-use tracing::{error, instrument};
+use tracing::{error, instrument, warn};

 use crate::compute::ComputeNode;

+/// Create a special service table for availability checks
+/// only if it does not exist already.
+pub fn create_availability_check_data(client: &mut Client) -> Result<()> {
+    let query = "
+        DO $$
+        BEGIN
+            IF NOT EXISTS(
+                SELECT 1
+                FROM pg_catalog.pg_tables
+                WHERE tablename = 'health_check'
+            )
+            THEN
+            CREATE TABLE health_check (
+                id serial primary key,
+                updated_at timestamptz default now()
+            );
+            INSERT INTO health_check VALUES (1, now())
+                ON CONFLICT (id) DO UPDATE
+                 SET updated_at = now();
+            END IF;
+        END
+        $$;";
+    client.execute(query, &[])?;
+
+    Ok(())
+}
+
 /// Update timestamp in a row in a special service table to check
 /// that we can actually write some data in this particular timeline.
-/// Create table if it's missing.
 #[instrument(skip_all)]
 pub async fn check_writability(compute: &ComputeNode) -> Result<()> {
    // Connect to the database.
@@ -24,21 +51,28 @@ pub async fn check_writability(compute: &ComputeNode) -> Result<()> {
    });

    let query = "
-    CREATE TABLE IF NOT EXISTS health_check (
-        id serial primary key,
-        updated_at timestamptz default now()
-    );
    INSERT INTO health_check VALUES (1, now())
        ON CONFLICT (id) DO UPDATE
         SET updated_at = now();";

-    let result = client.simple_query(query).await?;
-
-    if result.len() != 2 {
-        return Err(anyhow::format_err!(
-            "expected 2 query results, but got {}",
-            result.len()
-        ));
+    match client.simple_query(query).await {
+        Result::Ok(result) => {
+            if result.len() != 1 {
+                return Err(anyhow::anyhow!(
+                    "expected 1 query results, but got {}",
+                    result.len()
+                ));
+            }
+        }
+        Err(err) => {
+            if let Some(state) = err.code() {
+                if state == &tokio_postgres::error::SqlState::DISK_FULL {
+                    warn!("Tenant disk is full");
+                    return Ok(());
+                }
+            }
+            return Err(err.into());
+        }
    }

    Ok(())
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -22,11 +22,12 @@ use utils::id::{TenantId, TimelineId};
 use utils::lsn::Lsn;

 use compute_api::responses::{ComputeMetrics, ComputeStatus};
-use compute_api::spec::{ComputeMode, ComputeSpec};
+use compute_api::spec::{ComputeFeature, ComputeMode, ComputeSpec};
 use utils::measured_stream::MeasuredReader;

-use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
+use remote_storage::{DownloadError, RemotePath};

+use crate::checker::create_availability_check_data;
 use crate::pg_helpers::*;
 use crate::spec::*;
 use crate::sync_sk::{check_if_synced, ping_safekeeper};
@@ -58,8 +59,8 @@ pub struct ComputeNode {
    pub state: Mutex<ComputeState>,
    /// `Condvar` to allow notifying waiters about state changes.
    pub state_changed: Condvar,
-    ///  the S3 bucket that we search for extensions in
-    pub ext_remote_storage: Option<GenericRemoteStorage>,
+    /// the address of extension storage proxy gateway
+    pub ext_remote_storage: Option<String>,
    // key: ext_archive_name, value: started download time, download_completed?
    pub ext_download_progress: RwLock<HashMap<String, (DateTime<Utc>, bool)>>,
    pub build_tag: String,
@@ -251,7 +252,7 @@ fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()>
                    IF NOT EXISTS (
                        SELECT FROM pg_catalog.pg_roles WHERE rolname = 'neon_superuser')
                    THEN
-                        CREATE ROLE neon_superuser CREATEDB CREATEROLE NOLOGIN IN ROLE pg_read_all_data, pg_write_all_data;
+                        CREATE ROLE neon_superuser CREATEDB CREATEROLE NOLOGIN REPLICATION IN ROLE pg_read_all_data, pg_write_all_data;
                        IF array_length(roles, 1) IS NOT NULL THEN
                            EXECUTE format('GRANT neon_superuser TO %s',
                                           array_to_string(ARRAY(SELECT quote_ident(x) FROM unnest(roles) as x), ', '));
@@ -276,6 +277,17 @@ fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()>
 }

 impl ComputeNode {
+    /// Check that compute node has corresponding feature enabled.
+    pub fn has_feature(&self, feature: ComputeFeature) -> bool {
+        let state = self.state.lock().unwrap();
+
+        if let Some(s) = state.pspec.as_ref() {
+            s.spec.features.contains(&feature)
+        } else {
+            false
+        }
+    }
+
    pub fn set_status(&self, status: ComputeStatus) {
        let mut state = self.state.lock().unwrap();
        state.status = status;
@@ -691,11 +703,14 @@ impl ComputeNode {
        // Proceed with post-startup configuration. Note, that order of operations is important.
        let spec = &compute_state.pspec.as_ref().expect("spec must be set").spec;
        create_neon_superuser(spec, &mut client)?;
+        cleanup_instance(&mut client)?;
        handle_roles(spec, &mut client)?;
        handle_databases(spec, &mut client)?;
        handle_role_deletions(spec, self.connstr.as_str(), &mut client)?;
-        handle_grants(spec, self.connstr.as_str())?;
+        handle_grants(spec, &mut client, self.connstr.as_str())?;
        handle_extensions(spec, &mut client)?;
+        handle_extension_neon(&mut client)?;
+        create_availability_check_data(&mut client)?;

        // 'Close' connection
        drop(client);
@@ -707,8 +722,12 @@ impl ComputeNode {
    // `pg_ctl` for start / stop, so this just seems much easier to do as we already
    // have opened connection to Postgres and superuser access.
    #[instrument(skip_all)]
-    fn pg_reload_conf(&self, client: &mut Client) -> Result<()> {
-        client.simple_query("SELECT pg_reload_conf()")?;
+    fn pg_reload_conf(&self) -> Result<()> {
+        let pgctl_bin = Path::new(&self.pgbin).parent().unwrap().join("pg_ctl");
+        Command::new(pgctl_bin)
+            .args(["reload", "-D", &self.pgdata])
+            .output()
+            .expect("cannot run pg_ctl process");
        Ok(())
    }

@@ -720,25 +739,36 @@ impl ComputeNode {

        // Write new config
        let pgdata_path = Path::new(&self.pgdata);
-        config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &spec, None)?;
+        let postgresql_conf_path = pgdata_path.join("postgresql.conf");
+        config::write_postgres_conf(&postgresql_conf_path, &spec, None)?;
+        // temporarily reset max_cluster_size in config
+        // to avoid the possibility of hitting the limit, while we are reconfiguring:
+        // creating new extensions, roles, etc...
+        config::compute_ctl_temp_override_create(pgdata_path, "neon.max_cluster_size=-1")?;
+        self.pg_reload_conf()?;

        let mut client = Client::connect(self.connstr.as_str(), NoTls)?;
-        self.pg_reload_conf(&mut client)?;

        // Proceed with post-startup configuration. Note, that order of operations is important.
        // Disable DDL forwarding because control plane already knows about these roles/databases.
        if spec.mode == ComputeMode::Primary {
            client.simple_query("SET neon.forward_ddl = false")?;
+            cleanup_instance(&mut client)?;
            handle_roles(&spec, &mut client)?;
            handle_databases(&spec, &mut client)?;
            handle_role_deletions(&spec, self.connstr.as_str(), &mut client)?;
-            handle_grants(&spec, self.connstr.as_str())?;
+            handle_grants(&spec, &mut client, self.connstr.as_str())?;
            handle_extensions(&spec, &mut client)?;
+            handle_extension_neon(&mut client)?;
        }

        // 'Close' connection
        drop(client);

+        // reset max_cluster_size in config back to original value and reload config
+        config::compute_ctl_temp_override_remove(pgdata_path)?;
+        self.pg_reload_conf()?;
+
        let unknown_op = "unknown".to_string();
        let op_id = spec.operation_uuid.as_ref().unwrap_or(&unknown_op);
        info!(
@@ -799,7 +829,17 @@ impl ComputeNode {

        let config_time = Utc::now();
        if pspec.spec.mode == ComputeMode::Primary && !pspec.spec.skip_pg_catalog_updates {
+            let pgdata_path = Path::new(&self.pgdata);
+            // temporarily reset max_cluster_size in config
+            // to avoid the possibility of hitting the limit, while we are applying config:
+            // creating new extensions, roles, etc...
+            config::compute_ctl_temp_override_create(pgdata_path, "neon.max_cluster_size=-1")?;
+            self.pg_reload_conf()?;
+
            self.apply_config(&compute_state)?;
+
+            config::compute_ctl_temp_override_remove(pgdata_path)?;
+            self.pg_reload_conf()?;
        }

        let startup_end_time = Utc::now();
@@ -947,12 +987,12 @@ LIMIT 100",
        real_ext_name: String,
        ext_path: RemotePath,
    ) -> Result<u64, DownloadError> {
-        let remote_storage = self
-            .ext_remote_storage
-            .as_ref()
-            .ok_or(DownloadError::BadInput(anyhow::anyhow!(
-                "Remote extensions storage is not configured",
-            )))?;
+        let ext_remote_storage =
+            self.ext_remote_storage
+                .as_ref()
+                .ok_or(DownloadError::BadInput(anyhow::anyhow!(
+                    "Remote extensions storage is not configured",
+                )))?;

        let ext_archive_name = ext_path.object_name().expect("bad path");

@@ -1008,7 +1048,7 @@ LIMIT 100",
        let download_size = extension_server::download_extension(
            &real_ext_name,
            &ext_path,
-            remote_storage,
+            ext_remote_storage,
            &self.pgbin,
        )
        .await
@@ -1037,7 +1077,7 @@ LIMIT 100",
        let remote_extensions = spec
            .remote_extensions
            .as_ref()
-            .ok_or(anyhow::anyhow!("Remote extensions are not configured",))?;
+            .ok_or(anyhow::anyhow!("Remote extensions are not configured"))?;

        info!("parse shared_preload_libraries from spec.cluster.settings");
        let mut libs_vec = Vec::new();
@@ -1078,7 +1118,8 @@ LIMIT 100",

        let mut download_tasks = Vec::new();
        for library in &libs_vec {
-            let (ext_name, ext_path) = remote_extensions.get_ext(library, true)?;
+            let (ext_name, ext_path) =
+                remote_extensions.get_ext(library, true, &self.build_tag, &self.pgversion)?;
            download_tasks.push(self.download_extension(ext_name, ext_path));
        }
        let results = join_all(download_tasks).await;
--- a/compute_tools/src/config.rs
+++ b/compute_tools/src/config.rs
@@ -46,8 +46,6 @@ pub fn write_postgres_conf(
        writeln!(file, "{}", conf)?;
    }

-    write!(file, "{}", &spec.cluster.settings.as_pg_settings())?;
-
    // Add options for connecting to storage
    writeln!(file, "# Neon storage settings")?;
    if let Some(s) = &spec.pageserver_connstring {
@@ -95,5 +93,25 @@ pub fn write_postgres_conf(
        writeln!(file, "neon.extension_server_port={}", port)?;
    }

+    // It is essential to keep this line at the end of the file,
+    // because it is intended to override any settings above.
+    writeln!(file, "include_if_exists = 'compute_ctl_temp_override.conf'")?;
+
+    Ok(())
+}
+
+/// Create the file compute_ctl_temp_override.conf in the pgdata directory
+/// and write the provided options to it.
+pub fn compute_ctl_temp_override_create(pgdata_path: &Path, options: &str) -> Result<()> {
+    let path = pgdata_path.join("compute_ctl_temp_override.conf");
+    let mut file = File::create(path)?;
+    write!(file, "{}", options)?;
+    Ok(())
+}
+
+/// Remove the file compute_ctl_temp_override.conf from the pgdata directory.
+pub fn compute_ctl_temp_override_remove(pgdata_path: &Path) -> Result<()> {
+    let path = pgdata_path.join("compute_ctl_temp_override.conf");
+    std::fs::remove_file(path)?;
    Ok(())
 }
--- a/compute_tools/src/extension_server.rs
+++ b/compute_tools/src/extension_server.rs
@@ -71,17 +71,16 @@ More specifically, here is an example ext_index.json
    }
 }
 */
-use anyhow::Context;
 use anyhow::{self, Result};
+use anyhow::{bail, Context};
+use bytes::Bytes;
 use compute_api::spec::RemoteExtSpec;
+use regex::Regex;
 use remote_storage::*;
-use serde_json;
-use std::io::Read;
-use std::num::{NonZeroU32, NonZeroUsize};
+use reqwest::StatusCode;
 use std::path::Path;
 use std::str;
 use tar::Archive;
-use tokio::io::AsyncReadExt;
 use tracing::info;
 use tracing::log::warn;
 use zstd::stream::read::Decoder;
@@ -106,12 +105,28 @@ fn get_pg_config(argument: &str, pgbin: &str) -> String {

 pub fn get_pg_version(pgbin: &str) -> String {
    // pg_config --version returns a (platform specific) human readable string
-    // such as "PostgreSQL 15.4". We parse this to v14/v15
+    // such as "PostgreSQL 15.4". We parse this to v14/v15/v16 etc.
    let human_version = get_pg_config("--version", pgbin);
-    if human_version.contains("15") {
-        return "v15".to_string();
-    } else if human_version.contains("14") {
-        return "v14".to_string();
+    return parse_pg_version(&human_version).to_string();
+}
+
+fn parse_pg_version(human_version: &str) -> &str {
+    // Normal releases have version strings like "PostgreSQL 15.4". But there
+    // are also pre-release versions like "PostgreSQL 17devel" or "PostgreSQL
+    // 16beta2" or "PostgreSQL 17rc1". And with the --with-extra-version
+    // configure option, you can tack any string to the version number,
+    // e.g. "PostgreSQL 15.4foobar".
+    match Regex::new(r"^PostgreSQL (?<major>\d+).+")
+        .unwrap()
+        .captures(human_version)
+    {
+        Some(captures) if captures.len() == 2 => match &captures["major"] {
+            "14" => return "v14",
+            "15" => return "v15",
+            "16" => return "v16",
+            _ => {}
+        },
+        _ => {}
    }
    panic!("Unsuported postgres version {human_version}");
 }
@@ -121,23 +136,31 @@ pub fn get_pg_version(pgbin: &str) -> String {
 pub async fn download_extension(
    ext_name: &str,
    ext_path: &RemotePath,
-    remote_storage: &GenericRemoteStorage,
+    ext_remote_storage: &str,
    pgbin: &str,
 ) -> Result<u64> {
    info!("Download extension {:?} from {:?}", ext_name, ext_path);
-    let mut download = remote_storage.download(ext_path).await?;
-    let mut download_buffer = Vec::new();
-    download
-        .download_stream
-        .read_to_end(&mut download_buffer)
-        .await?;
+
+    // TODO add retry logic
+    let download_buffer =
+        match download_extension_tar(ext_remote_storage, &ext_path.to_string()).await {
+            Ok(buffer) => buffer,
+            Err(error_message) => {
+                return Err(anyhow::anyhow!(
+                    "error downloading extension {:?}: {:?}",
+                    ext_name,
+                    error_message
+                ));
+            }
+        };
+
    let download_size = download_buffer.len() as u64;
+    info!("Download size {:?}", download_size);
    // it's unclear whether it is more performant to decompress into memory or not
    // TODO: decompressing into memory can be avoided
-    let mut decoder = Decoder::new(download_buffer.as_slice())?;
-    let mut decompress_buffer = Vec::new();
-    decoder.read_to_end(&mut decompress_buffer)?;
-    let mut archive = Archive::new(decompress_buffer.as_slice());
+    let decoder = Decoder::new(download_buffer.as_ref())?;
+    let mut archive = Archive::new(decoder);
+
    let unzip_dest = pgbin
        .strip_suffix("/bin/postgres")
        .expect("bad pgbin")
@@ -180,7 +203,19 @@ pub async fn download_extension(
 // Create extension control files from spec
 pub fn create_control_files(remote_extensions: &RemoteExtSpec, pgbin: &str) {
    let local_sharedir = Path::new(&get_pg_config("--sharedir", pgbin)).join("extension");
-    for ext_data in remote_extensions.extension_data.values() {
+    for (ext_name, ext_data) in remote_extensions.extension_data.iter() {
+        // Check if extension is present in public or custom.
+        // If not, then it is not allowed to be used by this compute.
+        if let Some(public_extensions) = &remote_extensions.public_extensions {
+            if !public_extensions.contains(ext_name) {
+                if let Some(custom_extensions) = &remote_extensions.custom_extensions {
+                    if !custom_extensions.contains(ext_name) {
+                        continue; // skip this extension, it is not allowed
+                    }
+                }
+            }
+        }
+
        for (control_name, control_content) in &ext_data.control_data {
            let control_path = local_sharedir.join(control_name);
            if !control_path.exists() {
@@ -193,29 +228,69 @@ pub fn create_control_files(remote_extensions: &RemoteExtSpec, pgbin: &str) {
    }
 }

-// This function initializes the necessary structs to use remote storage
-pub fn init_remote_storage(remote_ext_config: &str) -> anyhow::Result<GenericRemoteStorage> {
-    #[derive(Debug, serde::Deserialize)]
-    struct RemoteExtJson {
-        bucket: String,
-        region: String,
-        endpoint: Option<String>,
-        prefix: Option<String>,
-    }
-    let remote_ext_json = serde_json::from_str::<RemoteExtJson>(remote_ext_config)?;
+// Send a request to the extension storage proxy, e.g.
+// curl http://pg-ext-s3-gateway/latest/v15/extensions/anon.tar.zst
+// using HTTP GET,
+// and return the response body as bytes.
+//
+async fn download_extension_tar(ext_remote_storage: &str, ext_path: &str) -> Result<Bytes> {
+    let uri = format!("{}/{}", ext_remote_storage, ext_path);

-    let config = S3Config {
-        bucket_name: remote_ext_json.bucket,
-        bucket_region: remote_ext_json.region,
-        prefix_in_bucket: remote_ext_json.prefix,
-        endpoint: remote_ext_json.endpoint,
-        concurrency_limit: NonZeroUsize::new(100).expect("100 != 0"),
-        max_keys_per_list_response: None,
-    };
-    let config = RemoteStorageConfig {
-        max_concurrent_syncs: NonZeroUsize::new(100).expect("100 != 0"),
-        max_sync_errors: NonZeroU32::new(100).expect("100 != 0"),
-        storage: RemoteStorageKind::AwsS3(config),
-    };
-    GenericRemoteStorage::from_config(&config)
+    info!("Download extension {:?} from uri {:?}", ext_path, uri);
+
+    let resp = reqwest::get(uri).await?;
+
+    match resp.status() {
+        StatusCode::OK => match resp.bytes().await {
+            Ok(resp) => {
+                info!("Download extension {:?} completed successfully", ext_path);
+                Ok(resp)
+            }
+            Err(e) => bail!("could not deserialize remote extension response: {}", e),
+        },
+        StatusCode::SERVICE_UNAVAILABLE => bail!("remote extension is temporarily unavailable"),
+        _ => bail!(
+            "unexpected remote extension response status code: {}",
+            resp.status()
+        ),
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::parse_pg_version;
+
+    #[test]
+    fn test_parse_pg_version() {
+        assert_eq!(parse_pg_version("PostgreSQL 15.4"), "v15");
+        assert_eq!(parse_pg_version("PostgreSQL 15.14"), "v15");
+        assert_eq!(
+            parse_pg_version("PostgreSQL 15.4 (Ubuntu 15.4-0ubuntu0.23.04.1)"),
+            "v15"
+        );
+
+        assert_eq!(parse_pg_version("PostgreSQL 14.15"), "v14");
+        assert_eq!(parse_pg_version("PostgreSQL 14.0"), "v14");
+        assert_eq!(
+            parse_pg_version("PostgreSQL 14.9 (Debian 14.9-1.pgdg120+1"),
+            "v14"
+        );
+
+        assert_eq!(parse_pg_version("PostgreSQL 16devel"), "v16");
+        assert_eq!(parse_pg_version("PostgreSQL 16beta1"), "v16");
+        assert_eq!(parse_pg_version("PostgreSQL 16rc2"), "v16");
+        assert_eq!(parse_pg_version("PostgreSQL 16extra"), "v16");
+    }
+
+    #[test]
+    #[should_panic]
+    fn test_parse_pg_unsupported_version() {
+        parse_pg_version("PostgreSQL 13.14");
+    }
+
+    #[test]
+    #[should_panic]
+    fn test_parse_pg_incorrect_version_format() {
+        parse_pg_version("PostgreSQL 14");
+    }
 }
--- a/compute_tools/src/http/api.rs
+++ b/compute_tools/src/http/api.rs
@@ -1,4 +1,6 @@
 use std::convert::Infallible;
+use std::net::IpAddr;
+use std::net::Ipv6Addr;
 use std::net::SocketAddr;
 use std::sync::Arc;
 use std::thread;
@@ -121,7 +123,7 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
            }
        }

-        // download extension files from S3 on demand
+        // download extension files from remote extension storage on demand
        (&Method::POST, route) if route.starts_with("/extension_server/") => {
            info!("serving {:?} POST request", route);
            info!("req.uri {:?}", req.uri());
@@ -169,7 +171,12 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
                    }
                };

-                remote_extensions.get_ext(&filename, is_library)
+                remote_extensions.get_ext(
+                    &filename,
+                    is_library,
+                    &compute.build_tag,
+                    &compute.pgversion,
+                )
            };

            match ext {
@@ -220,7 +227,7 @@ async fn handle_configure_request(

        let parsed_spec = match ParsedSpec::try_from(spec) {
            Ok(ps) => ps,
-            Err(msg) => return Err((msg, StatusCode::PRECONDITION_FAILED)),
+            Err(msg) => return Err((msg, StatusCode::BAD_REQUEST)),
        };

        // XXX: wrap state update under lock in code blocks. Otherwise,
@@ -293,7 +300,9 @@ fn render_json_error(e: &str, status: StatusCode) -> Response<Body> {
 // Main Hyper HTTP server function that runs it and blocks waiting on it forever.
 #[tokio::main]
 async fn serve(port: u16, state: Arc<ComputeNode>) {
-    let addr = SocketAddr::from(([0, 0, 0, 0], port));
+    // this usually binds to both IPv4 and IPv6 on linux
+    // see e.g. https://github.com/rust-lang/rust/pull/34440
+    let addr = SocketAddr::new(IpAddr::from(Ipv6Addr::UNSPECIFIED), port);

    let make_service = make_service_fn(move |_conn| {
        let state = state.clone();
--- a/compute_tools/src/http/openapi_spec.yaml
+++ b/compute_tools/src/http/openapi_spec.yaml
@@ -156,17 +156,17 @@ paths:
                description: Error text or 'OK' if download succeeded.
                example: "OK"
        400:
-        description: Request is invalid.
-        content:
-          application/json:
-            schema:
-              $ref: "#/components/schemas/GenericError"
+          description: Request is invalid.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/GenericError"
        500:
-        description: Extension download request failed.
-        content:
-          application/json:
-            schema:
-              $ref: "#/components/schemas/GenericError"
+          description: Extension download request failed.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/GenericError"

 components:
  securitySchemes:
--- a/compute_tools/src/lib.rs
+++ b/compute_tools/src/lib.rs
@@ -1,7 +1,7 @@
-//!
 //! Various tools and helpers to handle cluster / compute node (Postgres)
 //! configuration.
-//!
+#![deny(unsafe_code)]
+#![deny(clippy::undocumented_unsafe_blocks)]
 pub mod checker;
 pub mod config;
 pub mod configurator;
--- a/compute_tools/src/monitor.rs
+++ b/compute_tools/src/monitor.rs
@@ -1,5 +1,5 @@
 use std::sync::Arc;
-use std::{thread, time};
+use std::{thread, time::Duration};

 use chrono::{DateTime, Utc};
 use postgres::{Client, NoTls};
@@ -7,7 +7,7 @@ use tracing::{debug, info};

 use crate::compute::ComputeNode;

-const MONITOR_CHECK_INTERVAL: u64 = 500; // milliseconds
+const MONITOR_CHECK_INTERVAL: Duration = Duration::from_millis(500);

 // Spin in a loop and figure out the last activity time in the Postgres.
 // Then update it in the shared state. This function never errors out.
@@ -17,13 +17,12 @@ fn watch_compute_activity(compute: &ComputeNode) {
    let connstr = compute.connstr.as_str();
    // Define `client` outside of the loop to reuse existing connection if it's active.
    let mut client = Client::connect(connstr, NoTls);
-    let timeout = time::Duration::from_millis(MONITOR_CHECK_INTERVAL);

    info!("watching Postgres activity at {}", connstr);

    loop {
        // Should be outside of the write lock to allow others to read while we sleep.
-        thread::sleep(timeout);
+        thread::sleep(MONITOR_CHECK_INTERVAL);

        match &mut client {
            Ok(cli) => {
--- a/compute_tools/src/params.rs
+++ b/compute_tools/src/params.rs
@@ -6,4 +6,4 @@ pub const DEFAULT_LOG_LEVEL: &str = "info";
 //   https://www.postgresql.org/docs/15/auth-password.html
 //
 // So it's safe to set md5 here, as `control-plane` anyway uses SCRAM for all roles.
-pub const PG_HBA_ALL_MD5: &str = "host\tall\t\tall\t\t0.0.0.0/0\t\tmd5";
+pub const PG_HBA_ALL_MD5: &str = "host\tall\t\tall\t\tall\t\tmd5";
--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -1,3 +1,4 @@
+use std::collections::HashMap;
 use std::fmt::Write;
 use std::fs;
 use std::fs::File;
@@ -192,11 +193,16 @@ impl Escaping for PgIdent {
 /// Build a list of existing Postgres roles
 pub fn get_existing_roles(xact: &mut Transaction<'_>) -> Result<Vec<Role>> {
    let postgres_roles = xact
-        .query("SELECT rolname, rolpassword FROM pg_catalog.pg_authid", &[])?
+        .query(
+            "SELECT rolname, rolpassword, rolreplication, rolbypassrls FROM pg_catalog.pg_authid",
+            &[],
+        )?
        .iter()
        .map(|row| Role {
            name: row.get("rolname"),
            encrypted_password: row.get("rolpassword"),
+            replication: Some(row.get("rolreplication")),
+            bypassrls: Some(row.get("rolbypassrls")),
            options: None,
        })
        .collect();
@@ -205,22 +211,37 @@ pub fn get_existing_roles(xact: &mut Transaction<'_>) -> Result<Vec<Role>> {
 }

 /// Build a list of existing Postgres databases
-pub fn get_existing_dbs(client: &mut Client) -> Result<Vec<Database>> {
-    let postgres_dbs = client
+pub fn get_existing_dbs(client: &mut Client) -> Result<HashMap<String, Database>> {
+    // `pg_database.datconnlimit = -2` means that the database is in the
+    // invalid state. See:
+    //   https://github.com/postgres/postgres/commit/a4b4cc1d60f7e8ccfcc8ff8cb80c28ee411ad9a9
+    let postgres_dbs: Vec<Database> = client
        .query(
-            "SELECT datname, datdba::regrole::text as owner
-               FROM pg_catalog.pg_database;",
+            "SELECT
+                datname AS name,
+                datdba::regrole::text AS owner,
+                NOT datallowconn AS restrict_conn,
+                datconnlimit = - 2 AS invalid
+            FROM
+                pg_catalog.pg_database;",
            &[],
        )?
        .iter()
        .map(|row| Database {
-            name: row.get("datname"),
+            name: row.get("name"),
            owner: row.get("owner"),
+            restrict_conn: row.get("restrict_conn"),
+            invalid: row.get("invalid"),
            options: None,
        })
        .collect();

-    Ok(postgres_dbs)
+    let dbs_map = postgres_dbs
+        .iter()
+        .map(|db| (db.name.clone(), db.clone()))
+        .collect::<HashMap<_, _>>();
+
+    Ok(dbs_map)
 }

 /// Wait for Postgres to become ready to accept connections. It's ready to
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -13,7 +13,7 @@ use crate::params::PG_HBA_ALL_MD5;
 use crate::pg_helpers::*;

 use compute_api::responses::{ControlPlaneComputeStatus, ControlPlaneSpecResponse};
-use compute_api::spec::{ComputeSpec, Database, PgIdent, Role};
+use compute_api::spec::{ComputeSpec, PgIdent, Role};

 // Do control plane request and return response if any. In case of error it
 // returns a bool flag indicating whether it makes sense to retry the request
@@ -24,7 +24,7 @@ fn do_control_plane_request(
 ) -> Result<ControlPlaneSpecResponse, (bool, String)> {
    let resp = reqwest::blocking::Client::new()
        .get(uri)
-        .header("Authorization", jwt)
+        .header("Authorization", format!("Bearer {}", jwt))
        .send()
        .map_err(|e| {
            (
@@ -68,7 +68,7 @@ pub fn get_spec_from_control_plane(
    base_uri: &str,
    compute_id: &str,
 ) -> Result<Option<ComputeSpec>> {
-    let cp_uri = format!("{base_uri}/management/api/v2/computes/{compute_id}/spec");
+    let cp_uri = format!("{base_uri}/compute/api/v2/computes/{compute_id}/spec");
    let jwt: String = match std::env::var("NEON_CONTROL_PLANE_TOKEN") {
        Ok(v) => v,
        Err(_) => "".to_string(),
@@ -118,19 +118,6 @@ pub fn get_spec_from_control_plane(
    spec
 }

-/// It takes cluster specification and does the following:
-/// - Serialize cluster config and put it into `postgresql.conf` completely rewriting the file.
-/// - Update `pg_hba.conf` to allow external connections.
-pub fn handle_configuration(spec: &ComputeSpec, pgdata_path: &Path) -> Result<()> {
-    // File `postgresql.conf` is no longer included into `basebackup`, so just
-    // always write all config into it creating new file.
-    config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec, None)?;
-
-    update_pg_hba(pgdata_path)?;
-
-    Ok(())
-}
-
 /// Check `pg_hba.conf` and update if needed to allow external connections.
 pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> {
    // XXX: consider making it a part of spec.json
@@ -161,6 +148,38 @@ pub fn add_standby_signal(pgdata_path: &Path) -> Result<()> {
    Ok(())
 }

+/// Compute could be unexpectedly shut down, for example, during the
+/// database dropping. This leaves the database in the invalid state,
+/// which prevents new db creation with the same name. This function
+/// will clean it up before proceeding with catalog updates. All
+/// possible future cleanup operations may go here too.
+#[instrument(skip_all)]
+pub fn cleanup_instance(client: &mut Client) -> Result<()> {
+    let existing_dbs = get_existing_dbs(client)?;
+
+    for (_, db) in existing_dbs {
+        if db.invalid {
+            // After recent commit in Postgres, interrupted DROP DATABASE
+            // leaves the database in the invalid state. According to the
+            // commit message, the only option for user is to drop it again.
+            // See:
+            //   https://github.com/postgres/postgres/commit/a4b4cc1d60f7e8ccfcc8ff8cb80c28ee411ad9a9
+            //
+            // The Postgres Neon extension is written so that a db is de-registered
+            // in the control plane metadata only after it is dropped. So there is
+            // a chance that it still thinks that db should exist. This means
+            // that it will be re-created by `handle_databases()`. Yet, it's fine
+            // as user can just repeat drop (in vanilla Postgres they would need
+            // to do the same, btw).
+            let query = format!("DROP DATABASE IF EXISTS {}", db.name.pg_quote());
+            info!("dropping invalid database {}", db.name);
+            client.execute(query.as_str(), &[])?;
+        }
+    }
+
+    Ok(())
+}
+
 /// Given a cluster spec json and open transaction it handles roles creation,
 /// deletion and update.
 #[instrument(skip_all)]
@@ -233,6 +252,8 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
        let action = if let Some(r) = pg_role {
            if (r.encrypted_password.is_none() && role.encrypted_password.is_some())
                || (r.encrypted_password.is_some() && role.encrypted_password.is_none())
+                || !r.bypassrls.unwrap_or(false)
+                || !r.replication.unwrap_or(false)
            {
                RoleAction::Update
            } else if let Some(pg_pwd) = &r.encrypted_password {
@@ -264,13 +285,14 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
        match action {
            RoleAction::None => {}
            RoleAction::Update => {
-                let mut query: String = format!("ALTER ROLE {} ", name.pg_quote());
+                let mut query: String =
+                    format!("ALTER ROLE {} BYPASSRLS REPLICATION", name.pg_quote());
                query.push_str(&role.to_pg_options());
                xact.execute(query.as_str(), &[])?;
            }
            RoleAction::Create => {
                let mut query: String = format!(
-                    "CREATE ROLE {} CREATEROLE CREATEDB BYPASSRLS IN ROLE neon_superuser",
+                    "CREATE ROLE {} CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser",
                    name.pg_quote()
                );
                info!("role create query: '{}'", &query);
@@ -379,13 +401,13 @@ fn reassign_owned_objects(spec: &ComputeSpec, connstr: &str, role_name: &PgIdent
 /// which together provide us idempotency.
 #[instrument(skip_all)]
 pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
-    let existing_dbs: Vec<Database> = get_existing_dbs(client)?;
+    let existing_dbs = get_existing_dbs(client)?;

    // Print a list of existing Postgres databases (only in debug mode)
    if span_enabled!(Level::INFO) {
        info!("postgres databases:");
-        for r in &existing_dbs {
-            info!("    {}:{}", r.name, r.owner);
+        for (dbname, db) in &existing_dbs {
+            info!("    {}:{}", dbname, db.owner);
        }
    }

@@ -439,8 +461,7 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
                "rename_db" => {
                    let new_name = op.new_name.as_ref().unwrap();

-                    // XXX: with a limited number of roles it is fine, but consider making it a HashMap
-                    if existing_dbs.iter().any(|r| r.name == op.name) {
+                    if existing_dbs.get(&op.name).is_some() {
                        let query: String = format!(
                            "ALTER DATABASE {} RENAME TO {}",
                            op.name.pg_quote(),
@@ -457,14 +478,12 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
    }

    // Refresh Postgres databases info to handle possible renames
-    let existing_dbs: Vec<Database> = get_existing_dbs(client)?;
+    let existing_dbs = get_existing_dbs(client)?;

    info!("cluster spec databases:");
    for db in &spec.cluster.databases {
        let name = &db.name;
-
-        // XXX: with a limited number of databases it is fine, but consider making it a HashMap
-        let pg_db = existing_dbs.iter().find(|r| r.name == *name);
+        let pg_db = existing_dbs.get(name);

        enum DatabaseAction {
            None,
@@ -530,13 +549,32 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
 /// Grant CREATE ON DATABASE to the database owner and do some other alters and grants
 /// to allow users creating trusted extensions and re-creating `public` schema, for example.
 #[instrument(skip_all)]
-pub fn handle_grants(spec: &ComputeSpec, connstr: &str) -> Result<()> {
-    info!("cluster spec grants:");
+pub fn handle_grants(spec: &ComputeSpec, client: &mut Client, connstr: &str) -> Result<()> {
+    info!("modifying database permissions");
+    let existing_dbs = get_existing_dbs(client)?;

    // Do some per-database access adjustments. We'd better do this at db creation time,
    // but CREATE DATABASE isn't transactional. So we cannot create db + do some grants
    // atomically.
    for db in &spec.cluster.databases {
+        match existing_dbs.get(&db.name) {
+            Some(pg_db) => {
+                if pg_db.restrict_conn || pg_db.invalid {
+                    info!(
+                        "skipping grants for db {} (invalid: {}, connections not allowed: {})",
+                        db.name, pg_db.invalid, pg_db.restrict_conn
+                    );
+                    continue;
+                }
+            }
+            None => {
+                bail!(
+                    "database {} doesn't exist in Postgres after handle_databases()",
+                    db.name
+                );
+            }
+        }
+
        let mut conf = Config::from_str(connstr)?;
        conf.dbname(&db.name);

@@ -575,6 +613,11 @@ pub fn handle_grants(spec: &ComputeSpec, connstr: &str) -> Result<()> {

        // Explicitly grant CREATE ON SCHEMA PUBLIC to the web_access user.
        // This is needed because since postgres 15 this privilege is removed by default.
+        // TODO: web_access hasn't been created for almost 1 year. It could be that we have
+        // active users of 1 year old projects, but hopefully not, so check it and
+        // remove this code if possible. The worst thing that could happen is that
+        // user won't be able to use public schema in NEW databases created in the
+        // very OLD project.
        let grant_query = "DO $$\n\
                BEGIN\n\
                    IF EXISTS(\n\
@@ -618,3 +661,33 @@ pub fn handle_extensions(spec: &ComputeSpec, client: &mut Client) -> Result<()>

    Ok(())
 }
+
+/// Run CREATE and ALTER EXTENSION neon UPDATE for postgres database
+#[instrument(skip_all)]
+pub fn handle_extension_neon(client: &mut Client) -> Result<()> {
+    info!("handle extension neon");
+
+    let mut query = "CREATE SCHEMA IF NOT EXISTS neon";
+    client.simple_query(query)?;
+
+    query = "CREATE EXTENSION IF NOT EXISTS neon WITH SCHEMA neon";
+    info!("create neon extension with query: {}", query);
+    client.simple_query(query)?;
+
+    query = "UPDATE pg_extension SET extrelocatable = true WHERE extname = 'neon'";
+    client.simple_query(query)?;
+
+    query = "ALTER EXTENSION neon SET SCHEMA neon";
+    info!("alter neon extension schema with query: {}", query);
+    client.simple_query(query)?;
+
+    // this will be a no-op if extension is already up to date,
+    // which may happen in two cases:
+    // - extension was just installed
+    // - extension was already installed and is up to date
+    let query = "ALTER EXTENSION neon UPDATE";
+    info!("update neon extension schema with query: {}", query);
+    client.simple_query(query)?;
+
+    Ok(())
+}
--- a/compute_tools/tests/pg_helpers_tests.rs
+++ b/compute_tools/tests/pg_helpers_tests.rs
@@ -28,7 +28,7 @@ mod pg_helpers_tests {
        assert_eq!(
            spec.cluster.settings.as_pg_settings(),
            r#"fsync = off
-wal_level = replica
+wal_level = logical
 hot_standby = on
 neon.safekeepers = '127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501'
 wal_log_hints = on
--- a/control_plane/Cargo.toml
+++ b/control_plane/Cargo.toml
@@ -6,12 +6,15 @@ license.workspace = true

 [dependencies]
 anyhow.workspace = true
+camino.workspace = true
 clap.workspace = true
 comfy-table.workspace = true
 git-version.workspace = true
 nix.workspace = true
 once_cell.workspace = true
 postgres.workspace = true
+hex.workspace = true
+hyper.workspace = true
 regex.workspace = true
 reqwest = { workspace = true, features = ["blocking", "json"] }
 serde.workspace = true
@@ -20,6 +23,7 @@ serde_with.workspace = true
 tar.workspace = true
 thiserror.workspace = true
 toml.workspace = true
+tokio.workspace = true
 url.workspace = true
 # Note: Do not directly depend on pageserver or safekeeper; use pageserver_api or safekeeper_api
 # instead, so that recompile times are better.
--- a/control_plane/simple.conf
+++ b/control_plane/simple.conf
@@ -1,6 +1,7 @@
 # Minimal neon environment with one safekeeper. This is equivalent to the built-in
 # defaults that you get with no --config
-[pageserver]
+[[pageservers]]
+id=1
 listen_pg_addr = '127.0.0.1:64000'
 listen_http_addr = '127.0.0.1:9898'
 pg_auth_type = 'Trust'
--- a/control_plane/src/attachment_service.rs
+++ b/control_plane/src/attachment_service.rs
@@ -0,0 +1,137 @@
+use crate::{background_process, local_env::LocalEnv};
+use anyhow::anyhow;
+use camino::Utf8PathBuf;
+use serde::{Deserialize, Serialize};
+use std::{path::PathBuf, process::Child};
+use utils::id::{NodeId, TenantId};
+
+pub struct AttachmentService {
+    env: LocalEnv,
+    listen: String,
+    path: PathBuf,
+    client: reqwest::blocking::Client,
+}
+
+const COMMAND: &str = "attachment_service";
+
+#[derive(Serialize, Deserialize)]
+pub struct AttachHookRequest {
+    pub tenant_id: TenantId,
+    pub node_id: Option<NodeId>,
+}
+
+#[derive(Serialize, Deserialize)]
+pub struct AttachHookResponse {
+    pub gen: Option<u32>,
+}
+
+#[derive(Serialize, Deserialize)]
+pub struct InspectRequest {
+    pub tenant_id: TenantId,
+}
+
+#[derive(Serialize, Deserialize)]
+pub struct InspectResponse {
+    pub attachment: Option<(u32, NodeId)>,
+}
+
+impl AttachmentService {
+    pub fn from_env(env: &LocalEnv) -> Self {
+        let path = env.base_data_dir.join("attachments.json");
+
+        // Makes no sense to construct this if pageservers aren't going to use it: assume
+        // pageservers have control plane API set
+        let listen_url = env.control_plane_api.clone().unwrap();
+
+        let listen = format!(
+            "{}:{}",
+            listen_url.host_str().unwrap(),
+            listen_url.port().unwrap()
+        );
+
+        Self {
+            env: env.clone(),
+            path,
+            listen,
+            client: reqwest::blocking::ClientBuilder::new()
+                .build()
+                .expect("Failed to construct http client"),
+        }
+    }
+
+    fn pid_file(&self) -> Utf8PathBuf {
+        Utf8PathBuf::from_path_buf(self.env.base_data_dir.join("attachment_service.pid"))
+            .expect("non-Unicode path")
+    }
+
+    pub fn start(&self) -> anyhow::Result<Child> {
+        let path_str = self.path.to_string_lossy();
+
+        background_process::start_process(
+            COMMAND,
+            &self.env.base_data_dir,
+            &self.env.attachment_service_bin(),
+            ["-l", &self.listen, "-p", &path_str],
+            [],
+            background_process::InitialPidFile::Create(&self.pid_file()),
+            // TODO: a real status check
+            || Ok(true),
+        )
+    }
+
+    pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
+        background_process::stop_process(immediate, COMMAND, &self.pid_file())
+    }
+
+    /// Call into the attach_hook API, for use before handing out attachments to pageservers
+    pub fn attach_hook(
+        &self,
+        tenant_id: TenantId,
+        pageserver_id: NodeId,
+    ) -> anyhow::Result<Option<u32>> {
+        use hyper::StatusCode;
+
+        let url = self
+            .env
+            .control_plane_api
+            .clone()
+            .unwrap()
+            .join("attach-hook")
+            .unwrap();
+
+        let request = AttachHookRequest {
+            tenant_id,
+            node_id: Some(pageserver_id),
+        };
+
+        let response = self.client.post(url).json(&request).send()?;
+        if response.status() != StatusCode::OK {
+            return Err(anyhow!("Unexpected status {}", response.status()));
+        }
+
+        let response = response.json::<AttachHookResponse>()?;
+        Ok(response.gen)
+    }
+
+    pub fn inspect(&self, tenant_id: TenantId) -> anyhow::Result<Option<(u32, NodeId)>> {
+        use hyper::StatusCode;
+
+        let url = self
+            .env
+            .control_plane_api
+            .clone()
+            .unwrap()
+            .join("inspect")
+            .unwrap();
+
+        let request = InspectRequest { tenant_id };
+
+        let response = self.client.post(url).json(&request).send()?;
+        if response.status() != StatusCode::OK {
+            return Err(anyhow!("Unexpected status {}", response.status()));
+        }
+
+        let response = response.json::<InspectResponse>()?;
+        Ok(response.attachment)
+    }
+}
--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -16,12 +16,13 @@ use std::ffi::OsStr;
 use std::io::Write;
 use std::os::unix::prelude::AsRawFd;
 use std::os::unix::process::CommandExt;
-use std::path::{Path, PathBuf};
+use std::path::Path;
 use std::process::{Child, Command};
 use std::time::Duration;
 use std::{fs, io, thread};

 use anyhow::Context;
+use camino::{Utf8Path, Utf8PathBuf};
 use nix::errno::Errno;
 use nix::fcntl::{FcntlArg, FdFlag};
 use nix::sys::signal::{kill, Signal};
@@ -45,9 +46,9 @@ const NOTICE_AFTER_RETRIES: u64 = 50;
 /// it itself.
 pub enum InitialPidFile<'t> {
    /// Create a pidfile, to allow future CLI invocations to manipulate the process.
-    Create(&'t Path),
+    Create(&'t Utf8Path),
    /// The process will create the pidfile itself, need to wait for that event.
-    Expect(&'t Path),
+    Expect(&'t Utf8Path),
 }

 /// Start a background child process using the parameters given.
@@ -85,7 +86,7 @@ where
        .stdout(process_log_file)
        .stderr(same_file_for_stderr)
        .args(args);
-    let filled_cmd = fill_aws_secrets_vars(fill_rust_env_vars(background_command));
+    let filled_cmd = fill_remote_storage_secrets_vars(fill_rust_env_vars(background_command));
    filled_cmd.envs(envs);

    let pid_file_to_check = match initial_pid_file {
@@ -137,7 +138,11 @@ where
 }

 /// Stops the process, using the pid file given. Returns Ok also if the process is already not running.
-pub fn stop_process(immediate: bool, process_name: &str, pid_file: &Path) -> anyhow::Result<()> {
+pub fn stop_process(
+    immediate: bool,
+    process_name: &str,
+    pid_file: &Utf8Path,
+) -> anyhow::Result<()> {
    let pid = match pid_file::read(pid_file)
        .with_context(|| format!("read pid_file {pid_file:?}"))?
    {
@@ -233,11 +238,13 @@ fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command {
    filled_cmd
 }

-fn fill_aws_secrets_vars(mut cmd: &mut Command) -> &mut Command {
+fn fill_remote_storage_secrets_vars(mut cmd: &mut Command) -> &mut Command {
    for env_key in [
        "AWS_ACCESS_KEY_ID",
        "AWS_SECRET_ACCESS_KEY",
        "AWS_SESSION_TOKEN",
+        "AZURE_STORAGE_ACCOUNT",
+        "AZURE_STORAGE_ACCESS_KEY",
    ] {
        if let Ok(value) = std::env::var(env_key) {
            cmd = cmd.env(env_key, value);
@@ -252,10 +259,10 @@ fn fill_aws_secrets_vars(mut cmd: &mut Command) -> &mut Command {
 ///    will remain held until the cmd exits.
 fn pre_exec_create_pidfile<P>(cmd: &mut Command, path: P) -> &mut Command
 where
-    P: Into<PathBuf>,
+    P: Into<Utf8PathBuf>,
 {
-    let path: PathBuf = path.into();
-    // SAFETY
+    let path: Utf8PathBuf = path.into();
+    // SAFETY:
    // pre_exec is marked unsafe because it runs between fork and exec.
    // Why is that dangerous in various ways?
    // Long answer:  https://github.com/rust-lang/rust/issues/39575
@@ -311,7 +318,7 @@ where

 fn process_started<F>(
    pid: Pid,
-    pid_file_to_check: Option<&Path>,
+    pid_file_to_check: Option<&Utf8Path>,
    status_check: &F,
 ) -> anyhow::Result<bool>
 where
--- a/control_plane/src/bin/attachment_service.rs
+++ b/control_plane/src/bin/attachment_service.rs
@@ -0,0 +1,337 @@
+//! The attachment service mimics the aspects of the control plane API
+//! that are required for a pageserver to operate.
+//!
+//! This enables running & testing pageservers without a full-blown
+//! deployment of the Neon cloud platform.
+//!
+use anyhow::anyhow;
+use clap::Parser;
+use hex::FromHex;
+use hyper::StatusCode;
+use hyper::{Body, Request, Response};
+use pageserver_api::shard::TenantShardId;
+use serde::{Deserialize, Serialize};
+use std::path::{Path, PathBuf};
+use std::{collections::HashMap, sync::Arc};
+use utils::http::endpoint::request_span;
+use utils::logging::{self, LogFormat};
+use utils::signals::{ShutdownSignals, Signal};
+
+use utils::{
+    http::{
+        endpoint::{self},
+        error::ApiError,
+        json::{json_request, json_response},
+        RequestExt, RouterBuilder,
+    },
+    id::{NodeId, TenantId},
+    tcp_listener,
+};
+
+use pageserver_api::control_api::{
+    ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest, ValidateResponse,
+    ValidateResponseTenant,
+};
+
+use control_plane::attachment_service::{
+    AttachHookRequest, AttachHookResponse, InspectRequest, InspectResponse,
+};
+
+// Command-line arguments of the attachment service binary, parsed with clap's
+// derive API (see `Cli::parse()` in `main`). Both options are declared with
+// `#[arg(short, long)]`.
+#[derive(Parser)]
+#[command(author, version, about, long_about = None)]
+#[command(arg_required_else_help(true))]
+struct Cli {
+    /// Host and port to listen on, like `127.0.0.1:1234`
+    #[arg(short, long)]
+    listen: std::net::SocketAddr,
+
+    /// Path to the .json file to store state (will be created if it doesn't exist)
+    #[arg(short, long)]
+    path: PathBuf,
+}
+
+// The persistent state of each Tenant: one of these is stored per tenant in
+// `PersistentState::tenants` and serialized into the JSON state file.
+#[derive(Serialize, Deserialize, Clone)]
+struct TenantState {
+    // Currently attached pageserver; `None` when the tenant is not attached
+    // to any pageserver
+    pageserver: Option<NodeId>,
+
+    // Latest generation number: next time we attach, increment this
+    // and use the incremented number when attaching
+    generation: u32,
+}
+
+/// Serialize a `TenantId`-keyed map with its keys hex-encoded as strings.
+///
+/// Used through `#[serde(serialize_with = "to_hex_map")]`. Entries are streamed
+/// straight into the serializer, so values are serialized by reference: no value
+/// is cloned and no intermediate `HashMap<String, V>` is allocated.
+fn to_hex_map<S, V>(input: &HashMap<TenantId, V>, serializer: S) -> Result<S::Ok, S::Error>
+where
+    S: serde::Serializer,
+    V: Clone + Serialize,
+{
+    serializer.collect_map(input.iter().map(|(k, v)| (hex::encode(k), v)))
+}
+
+/// Deserialize a map whose keys are hex strings into a `TenantId`-keyed map.
+///
+/// Used through `#[serde(deserialize_with = "from_hex_map")]`. The first key that
+/// fails to parse as a `TenantId` aborts deserialization with a custom error.
+fn from_hex_map<'de, D, V>(deserializer: D) -> Result<HashMap<TenantId, V>, D::Error>
+where
+    D: serde::de::Deserializer<'de>,
+    V: Deserialize<'de>,
+{
+    let raw: HashMap<String, V> = Deserialize::deserialize(deserializer)?;
+
+    let mut tenants = HashMap::with_capacity(raw.len());
+    for (hex_key, value) in raw {
+        match TenantId::from_hex(hex_key) {
+            Ok(tenant_id) => {
+                tenants.insert(tenant_id, value);
+            }
+            Err(e) => return Err(serde::de::Error::custom(e)),
+        }
+    }
+    Ok(tenants)
+}
+
+// Top level state available to all HTTP handlers
+// (handlers reach it through the lock held in `State::inner`).
+#[derive(Serialize, Deserialize)]
+struct PersistentState {
+    // Per-tenant state. Keys go through `to_hex_map` when serializing and
+    // `from_hex_map` when deserializing, so the file holds hex-string keys.
+    #[serde(serialize_with = "to_hex_map", deserialize_with = "from_hex_map")]
+    tenants: HashMap<TenantId, TenantState>,
+
+    // Location of the state file. Excluded from the serialized form by
+    // `serde(skip)`; `load` and `load_or_new` set it from their `path` argument.
+    #[serde(skip)]
+    path: PathBuf,
+}
+
+impl PersistentState {
+    /// Serialize the state as JSON and write it to the file at `self.path`.
+    async fn save(&self) -> anyhow::Result<()> {
+        let serialized = serde_json::to_vec(self)?;
+        tokio::fs::write(&self.path, &serialized).await?;
+        Ok(())
+    }
+
+    /// Read and decode the state file at `path`, remembering `path` for later saves.
+    async fn load(path: &Path) -> anyhow::Result<Self> {
+        let raw = tokio::fs::read(path).await?;
+        let mut state: Self = serde_json::from_slice(&raw)?;
+        state.path = path.to_owned();
+        Ok(state)
+    }
+
+    /// Load the state file at `path`, or start with an empty state if the file
+    /// does not exist yet. Any other load failure panics.
+    async fn load_or_new(path: &Path) -> Self {
+        let e = match Self::load(path).await {
+            Ok(loaded) => {
+                tracing::info!("Loaded state file at {}", path.display());
+                return loaded;
+            }
+            Err(e) => e,
+        };
+
+        // Only a missing file is an acceptable reason to start from scratch.
+        let file_missing = matches!(
+            e.downcast_ref::<std::io::Error>(),
+            Some(io_err) if io_err.kind() == std::io::ErrorKind::NotFound
+        );
+        if !file_missing {
+            panic!("Failed to load state from '{}': {e:#} (maybe your .neon/ dir was written by an older version?)", path.display())
+        }
+
+        tracing::info!("Will create state file at {}", path.display());
+        Self {
+            tenants: HashMap::new(),
+            path: path.to_owned(),
+        }
+    }
+}
+
+/// State available to HTTP request handlers
+///
+/// Clones share the same lock-protected `PersistentState` through the `Arc`.
+#[derive(Clone)]
+struct State {
+    inner: Arc<tokio::sync::RwLock<PersistentState>>,
+}
+
+impl State {
+    /// Wrap `persistent_state` in a shared async read/write lock.
+    fn new(persistent_state: PersistentState) -> State {
+        let guarded = tokio::sync::RwLock::new(persistent_state);
+        State {
+            inner: Arc::new(guarded),
+        }
+    }
+}
+
+/// Fetch the `State` that `make_router` attached to the router as shared data.
+///
+/// Panics if the request carries no `Arc<State>`.
+#[inline(always)]
+fn get_state(request: &Request<Body>) -> &State {
+    let shared: &Arc<State> = request
+        .data::<Arc<State>>()
+        .expect("unknown state type");
+    shared.as_ref()
+}
+
+/// Pageserver calls into this on startup, to learn which tenants it should attach
+///
+/// Every tenant currently assigned to the calling node has its generation
+/// incremented; the state is then saved and the new generations are returned.
+async fn handle_re_attach(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    let reattach_req = json_request::<ReAttachRequest>(&mut req).await?;
+    let node_id = reattach_req.node_id;
+
+    let shared = get_state(&req).inner.clone();
+    let mut locked = shared.write().await;
+
+    let tenants = locked
+        .tenants
+        .iter_mut()
+        .filter(|(_, tenant_state)| tenant_state.pageserver == Some(node_id))
+        .map(|(tenant_id, tenant_state)| {
+            tenant_state.generation += 1;
+            ReAttachResponseTenant {
+                // TODO(sharding): make this shard-aware
+                id: TenantShardId::unsharded(*tenant_id),
+                gen: tenant_state.generation,
+            }
+        })
+        .collect();
+    let response = ReAttachResponse { tenants };
+
+    locked.save().await.map_err(ApiError::InternalServerError)?;
+
+    json_response(StatusCode::OK, response)
+}
+
+/// Pageserver calls into this before doing deletions, to confirm that it still
+/// holds the latest generation for the tenants with deletions enqueued
+///
+/// Tenants that this service has no record of are left out of the response.
+async fn handle_validate(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    let validate_req = json_request::<ValidateRequest>(&mut req).await?;
+
+    let locked = get_state(&req).inner.read().await;
+
+    let tenants = validate_req
+        .tenants
+        .into_iter()
+        .filter_map(|req_tenant| {
+            // TODO(sharding): make this shard-aware
+            let tenant_state = locked.tenants.get(&req_tenant.id.tenant_id)?;
+            let valid = tenant_state.generation == req_tenant.gen;
+            tracing::info!(
+                "handle_validate: {}(gen {}): valid={valid} (latest {})",
+                req_tenant.id,
+                req_tenant.gen,
+                tenant_state.generation
+            );
+            Some(ValidateResponseTenant {
+                id: req_tenant.id,
+                valid,
+            })
+        })
+        .collect();
+
+    json_response(StatusCode::OK, ValidateResponse { tenants })
+}
+/// Call into this before attaching a tenant to a pageserver, to acquire a generation number
+/// (in the real control plane this is unnecessary, because the same program is managing
+///  generation numbers and doing attachments).
+///
+/// A request without a `node_id` clears the tenant's pageserver: the generation is
+/// left as it was and the response carries no generation.
+async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    let attach_req = json_request::<AttachHookRequest>(&mut req).await?;
+    let tenant_id = attach_req.tenant_id;
+    let node_id = attach_req.node_id;
+
+    let shared = get_state(&req).inner.clone();
+    let mut locked = shared.write().await;
+
+    // A tenant seen for the first time starts out at generation 0.
+    let tenant_state = locked.tenants.entry(tenant_id).or_insert(TenantState {
+        pageserver: node_id,
+        generation: 0,
+    });
+
+    match (node_id, tenant_state.pageserver) {
+        (Some(attaching_pageserver), _) => {
+            tenant_state.generation += 1;
+            tracing::info!(
+                tenant_id = %tenant_id,
+                ps_id = %attaching_pageserver,
+                generation = %tenant_state.generation,
+                "issuing",
+            );
+        }
+        (None, Some(ps_id)) => {
+            tracing::info!(
+                tenant_id = %tenant_id,
+                %ps_id,
+                generation = %tenant_state.generation,
+                "dropping",
+            );
+        }
+        (None, None) => {
+            tracing::info!(
+                tenant_id = %tenant_id,
+                "no-op: tenant already has no pageserver");
+        }
+    }
+    tenant_state.pageserver = node_id;
+    let generation = tenant_state.generation;
+
+    tracing::info!(
+        "handle_attach_hook: tenant {} set generation {}, pageserver {}",
+        tenant_id,
+        generation,
+        node_id.unwrap_or(utils::id::NodeId(0xfffffff))
+    );
+
+    locked.save().await.map_err(ApiError::InternalServerError)?;
+
+    let response = AttachHookResponse {
+        gen: node_id.map(|_| generation),
+    };
+    json_response(StatusCode::OK, response)
+}
+
+/// Report a tenant's current attachment as a `(generation, pageserver)` pair, or
+/// nothing when the tenant is unknown or has no pageserver.
+///
+/// This handler only reads state, so it takes the shared read lock rather than
+/// the exclusive write lock and does not block concurrent readers.
+async fn handle_inspect(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    let inspect_req = json_request::<InspectRequest>(&mut req).await?;
+
+    let state = get_state(&req).inner.clone();
+    let locked = state.read().await;
+    let tenant_state = locked.tenants.get(&inspect_req.tenant_id);
+
+    json_response(
+        StatusCode::OK,
+        InspectResponse {
+            attachment: tenant_state.and_then(|s| s.pageserver.map(|ps| (s.generation, ps))),
+        },
+    )
+}
+
+/// Build the HTTP router: each endpoint is a POST route whose handler is wrapped
+/// in `request_span`, and the shared `State` is attached as router data.
+fn make_router(persistent_state: PersistentState) -> RouterBuilder<hyper::Body, ApiError> {
+    let shared_state = Arc::new(State::new(persistent_state));
+
+    endpoint::make_router()
+        .data(shared_state)
+        .post("/re-attach", |req| request_span(req, handle_re_attach))
+        .post("/validate", |req| request_span(req, handle_validate))
+        .post("/attach-hook", |req| request_span(req, handle_attach_hook))
+        .post("/inspect", |req| request_span(req, handle_inspect))
+}
+
+/// Entry point: initialize logging, load (or create) the state file, start the
+/// HTTP server on the requested address, then handle termination signals.
+#[tokio::main]
+async fn main() -> anyhow::Result<()> {
+    logging::init(
+        LogFormat::Plain,
+        logging::TracingErrorLayerEnablement::Disabled,
+        logging::Output::Stdout,
+    )?;
+
+    let args = Cli::parse();
+    tracing::info!(
+        "Starting, state at {}, listening on {}",
+        args.path.to_string_lossy(),
+        args.listen
+    );
+
+    // Panics if the state file exists but cannot be read or decoded.
+    let persistent_state = PersistentState::load_or_new(&args.path).await;
+
+    let http_listener = tcp_listener::bind(args.listen)?;
+    let router = make_router(persistent_state)
+        .build()
+        .map_err(|err| anyhow!(err))?;
+    let service = utils::http::RouterService::new(router).unwrap();
+    let server = hyper::Server::from_tcp(http_listener)?.serve(service);
+
+    tracing::info!("Serving on {0}", args.listen);
+
+    // The server runs on its own task; the join handle is dropped, so the
+    // task is never awaited here.
+    tokio::task::spawn(server);
+
+    // NOTE(review): presumably this call blocks while waiting for signals — confirm
+    // against `utils::signals::ShutdownSignals`.
+    ShutdownSignals::handle(|signal| match signal {
+        Signal::Interrupt | Signal::Terminate | Signal::Quit => {
+            tracing::info!("Got {}. Terminating", signal.name());
+            // We're just a test helper: no graceful shutdown.
+            std::process::exit(0);
+        }
+    })?;
+
+    Ok(())
+}
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -8,15 +8,17 @@
 use anyhow::{anyhow, bail, Context, Result};
 use clap::{value_parser, Arg, ArgAction, ArgMatches, Command};
 use compute_api::spec::ComputeMode;
+use control_plane::attachment_service::AttachmentService;
 use control_plane::endpoint::ComputeControlPlane;
 use control_plane::local_env::LocalEnv;
-use control_plane::pageserver::PageServerNode;
+use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR};
 use control_plane::safekeeper::SafekeeperNode;
+use control_plane::tenant_migration::migrate_tenant;
 use control_plane::{broker, local_env};
 use pageserver_api::models::TimelineInfo;
 use pageserver_api::{
-    DEFAULT_HTTP_LISTEN_ADDR as DEFAULT_PAGESERVER_HTTP_ADDR,
-    DEFAULT_PG_LISTEN_ADDR as DEFAULT_PAGESERVER_PG_ADDR,
+    DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT,
+    DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT,
 };
 use postgres_backend::AuthType;
 use safekeeper_api::{
@@ -43,27 +45,44 @@ project_git_version!(GIT_VERSION);

 const DEFAULT_PG_VERSION: &str = "15";

-fn default_conf() -> String {
-    format!(
+const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/";
+
+fn default_conf(num_pageservers: u16) -> String {
+    let mut template = format!(
        r#"
 # Default built-in configuration, defined in main.rs
+control_plane_api = '{DEFAULT_PAGESERVER_CONTROL_PLANE_API}'
+
 [broker]
 listen_addr = '{DEFAULT_BROKER_ADDR}'

-[pageserver]
-id = {DEFAULT_PAGESERVER_ID}
-listen_pg_addr = '{DEFAULT_PAGESERVER_PG_ADDR}'
-listen_http_addr = '{DEFAULT_PAGESERVER_HTTP_ADDR}'
-pg_auth_type = '{trust_auth}'
-http_auth_type = '{trust_auth}'
-
 [[safekeepers]]
 id = {DEFAULT_SAFEKEEPER_ID}
 pg_port = {DEFAULT_SAFEKEEPER_PG_PORT}
 http_port = {DEFAULT_SAFEKEEPER_HTTP_PORT}
+
 "#,
-        trust_auth = AuthType::Trust,
-    )
+    );
+
+    for i in 0..num_pageservers {
+        let pageserver_id = NodeId(DEFAULT_PAGESERVER_ID.0 + i as u64);
+        let pg_port = DEFAULT_PAGESERVER_PG_PORT + i;
+        let http_port = DEFAULT_PAGESERVER_HTTP_PORT + i;
+
+        template += &format!(
+            r#"
+[[pageservers]]
+id = {pageserver_id}
+listen_pg_addr = '127.0.0.1:{pg_port}'
+listen_http_addr = '127.0.0.1:{http_port}'
+pg_auth_type = '{trust_auth}'
+http_auth_type = '{trust_auth}'
+"#,
+            trust_auth = AuthType::Trust,
+        )
+    }
+
+    template
 }

 ///
@@ -107,8 +126,10 @@ fn main() -> Result<()> {
            "start" => handle_start_all(sub_args, &env),
            "stop" => handle_stop_all(sub_args, &env),
            "pageserver" => handle_pageserver(sub_args, &env),
+            "attachment_service" => handle_attachment_service(sub_args, &env),
            "safekeeper" => handle_safekeeper(sub_args, &env),
            "endpoint" => handle_endpoint(sub_args, &env),
+            "mappings" => handle_mappings(sub_args, &mut env),
            "pg" => bail!("'pg' subcommand has been renamed to 'endpoint'"),
            _ => bail!("unexpected subcommand {sub_name}"),
        };
@@ -252,7 +273,7 @@ fn get_timeline_infos(
    env: &local_env::LocalEnv,
    tenant_id: &TenantId,
 ) -> Result<HashMap<TimelineId, TimelineInfo>> {
-    Ok(PageServerNode::from_env(env)
+    Ok(get_default_pageserver(env)
        .timeline_list(tenant_id)?
        .into_iter()
        .map(|timeline_info| (timeline_info.timeline_id, timeline_info))
@@ -287,6 +308,9 @@ fn parse_timeline_id(sub_match: &ArgMatches) -> anyhow::Result<Option<TimelineId
 }

 fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
+    let num_pageservers = init_match
+        .get_one::<u16>("num-pageservers")
+        .expect("num-pageservers arg has a default");
    // Create config file
    let toml_file: String = if let Some(config_path) = init_match.get_one::<PathBuf>("config") {
        // load and parse the file
@@ -298,7 +322,7 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
        })?
    } else {
        // Built-in default config
-        default_conf()
+        default_conf(*num_pageservers)
    };

    let pg_version = init_match
@@ -312,18 +336,34 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
    env.init(pg_version, force)
        .context("Failed to initialize neon repository")?;

+    // Create remote storage location for default LocalFs remote storage
+    std::fs::create_dir_all(env.base_data_dir.join(PAGESERVER_REMOTE_STORAGE_DIR))?;
+
    // Initialize pageserver, create initial tenant and timeline.
-    let pageserver = PageServerNode::from_env(&env);
-    pageserver
-        .initialize(&pageserver_config_overrides(init_match))
-        .unwrap_or_else(|e| {
-            eprintln!("pageserver init failed: {e:?}");
-            exit(1);
-        });
+    for ps_conf in &env.pageservers {
+        PageServerNode::from_env(&env, ps_conf)
+            .initialize(&pageserver_config_overrides(init_match))
+            .unwrap_or_else(|e| {
+                eprintln!("pageserver init failed: {e:?}");
+                exit(1);
+            });
+    }

    Ok(env)
 }

+/// Returns the default pageserver: the first one listed in the local environment.
+///
+/// CLI tenant/timeline operations are sent to this pageserver by default. Typical
+/// interactive use runs a single pageserver; placement across multiple pageservers
+/// is managed by python test code rather than this CLI.
+fn get_default_pageserver(env: &local_env::LocalEnv) -> PageServerNode {
+    match env.pageservers.first() {
+        Some(ps_conf) => PageServerNode::from_env(env, ps_conf),
+        None => panic!("Config is validated to contain at least one pageserver"),
+    }
+}
+
 fn pageserver_config_overrides(init_match: &ArgMatches) -> Vec<&str> {
    init_match
        .get_many::<String>("pageserver-config-override")
@@ -334,7 +374,7 @@ fn pageserver_config_overrides(init_match: &ArgMatches) -> Vec<&str> {
 }

 fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> anyhow::Result<()> {
-    let pageserver = PageServerNode::from_env(env);
+    let pageserver = get_default_pageserver(env);
    match tenant_match.subcommand() {
        Some(("list", _)) => {
            for t in pageserver.tenant_list()? {
@@ -342,13 +382,25 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
            }
        }
        Some(("create", create_match)) => {
-            let initial_tenant_id = parse_tenant_id(create_match)?;
            let tenant_conf: HashMap<_, _> = create_match
                .get_many::<String>("config")
                .map(|vals| vals.flat_map(|c| c.split_once(':')).collect())
                .unwrap_or_default();
-            let new_tenant_id = pageserver.tenant_create(initial_tenant_id, tenant_conf)?;
-            println!("tenant {new_tenant_id} successfully created on the pageserver");
+
+            // If tenant ID was not specified, generate one
+            let tenant_id = parse_tenant_id(create_match)?.unwrap_or_else(TenantId::generate);
+
+            let generation = if env.control_plane_api.is_some() {
+                // We must register the tenant with the attachment service, so
+                // that when the pageserver restarts, it will be re-attached.
+                let attachment_service = AttachmentService::from_env(env);
+                attachment_service.attach_hook(tenant_id, pageserver.conf.id)?
+            } else {
+                None
+            };
+
+            pageserver.tenant_create(tenant_id, generation, tenant_conf)?;
+            println!("tenant {tenant_id} successfully created on the pageserver");

            // Create an initial timeline for the new tenant
            let new_timeline_id = parse_timeline_id(create_match)?;
@@ -358,28 +410,29 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
                .context("Failed to parse postgres version from the argument string")?;

            let timeline_info = pageserver.timeline_create(
-                new_tenant_id,
+                tenant_id,
                new_timeline_id,
                None,
                None,
                Some(pg_version),
+                None,
            )?;
            let new_timeline_id = timeline_info.timeline_id;
            let last_record_lsn = timeline_info.last_record_lsn;

            env.register_branch_mapping(
                DEFAULT_BRANCH_NAME.to_string(),
-                new_tenant_id,
+                tenant_id,
                new_timeline_id,
            )?;

            println!(
-                "Created an initial timeline '{new_timeline_id}' at Lsn {last_record_lsn} for tenant: {new_tenant_id}",
+                "Created an initial timeline '{new_timeline_id}' at Lsn {last_record_lsn} for tenant: {tenant_id}",
            );

            if create_match.get_flag("set-default") {
-                println!("Setting tenant {new_tenant_id} as a default one");
-                env.default_tenant_id = Some(new_tenant_id);
+                println!("Setting tenant {tenant_id} as a default one");
+                env.default_tenant_id = Some(tenant_id);
            }
        }
        Some(("set-default", set_default_match)) => {
@@ -400,6 +453,15 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
                .with_context(|| format!("Tenant config failed for tenant with id {tenant_id}"))?;
            println!("tenant {tenant_id} successfully configured on the pageserver");
        }
+        Some(("migrate", matches)) => {
+            let tenant_id = get_tenant_id(matches, env)?;
+            let new_pageserver = get_pageserver(env, matches)?;
+            let new_pageserver_id = new_pageserver.conf.id;
+
+            migrate_tenant(env, tenant_id, new_pageserver)?;
+            println!("tenant {tenant_id} migrated to {}", new_pageserver_id);
+        }
+
        Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name),
        None => bail!("no tenant subcommand provided"),
    }
@@ -407,7 +469,7 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
 }

 fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Result<()> {
-    let pageserver = PageServerNode::from_env(env);
+    let pageserver = get_default_pageserver(env);

    match timeline_match.subcommand() {
        Some(("list", list_match)) => {
@@ -426,8 +488,16 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
                .copied()
                .context("Failed to parse postgres version from the argument string")?;

-            let timeline_info =
-                pageserver.timeline_create(tenant_id, None, None, None, Some(pg_version))?;
+            let new_timeline_id_opt = parse_timeline_id(create_match)?;
+
+            let timeline_info = pageserver.timeline_create(
+                tenant_id,
+                new_timeline_id_opt,
+                None,
+                None,
+                Some(pg_version),
+                None,
+            )?;
            let new_timeline_id = timeline_info.timeline_id;

            let last_record_lsn = timeline_info.last_record_lsn;
@@ -484,6 +554,7 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
                None,
                pg_version,
                ComputeMode::Primary,
+                DEFAULT_PAGESERVER_ID,
            )?;
            println!("Done");
        }
@@ -513,6 +584,7 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
                start_lsn,
                Some(ancestor_timeline_id),
                None,
+                None,
            )?;
            let new_timeline_id = timeline_info.timeline_id;

@@ -537,14 +609,11 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
        Some(ep_subcommand_data) => ep_subcommand_data,
        None => bail!("no endpoint subcommand provided"),
    };
-
    let mut cplane = ComputeControlPlane::load(env.clone())?;

-    // All subcommands take an optional --tenant-id option
-    let tenant_id = get_tenant_id(sub_args, env)?;
-
    match sub_name {
        "list" => {
+            let tenant_id = get_tenant_id(sub_args, env)?;
            let timeline_infos = get_timeline_infos(env, &tenant_id).unwrap_or_else(|e| {
                eprintln!("Failed to load timeline info: {}", e);
                HashMap::new()
@@ -604,6 +673,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
            println!("{table}");
        }
        "create" => {
+            let tenant_id = get_tenant_id(sub_args, env)?;
            let branch_name = sub_args
                .get_one::<String>("branch-name")
                .map(|s| s.as_str())
@@ -634,6 +704,13 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                .copied()
                .unwrap_or(false);

+            let pageserver_id =
+                if let Some(id_str) = sub_args.get_one::<String>("endpoint-pageserver-id") {
+                    NodeId(id_str.parse().context("while parsing pageserver id")?)
+                } else {
+                    DEFAULT_PAGESERVER_ID
+                };
+
            let mode = match (lsn, hot_standby) {
                (Some(lsn), false) => ComputeMode::Static(lsn),
                (None, true) => ComputeMode::Replica,
@@ -641,6 +718,18 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                (Some(_), true) => anyhow::bail!("cannot specify both lsn and hot-standby"),
            };

+            match (mode, hot_standby) {
+                (ComputeMode::Static(_), true) => {
+                    bail!("Cannot start a node in hot standby mode when it is already configured as a static replica")
+                }
+                (ComputeMode::Primary, true) => {
+                    bail!("Cannot start a node as a hot standby replica, it is already configured as primary node")
+                }
+                _ => {}
+            }
+
+            cplane.check_conflicting_endpoints(mode, tenant_id, timeline_id)?;
+
            cplane.new_endpoint(
                &endpoint_id,
                tenant_id,
@@ -649,15 +738,21 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                http_port,
                pg_version,
                mode,
+                pageserver_id,
            )?;
        }
        "start" => {
-            let pg_port: Option<u16> = sub_args.get_one::<u16>("pg-port").copied();
-            let http_port: Option<u16> = sub_args.get_one::<u16>("http-port").copied();
            let endpoint_id = sub_args
                .get_one::<String>("endpoint_id")
                .ok_or_else(|| anyhow!("No endpoint ID was provided to start"))?;

+            let pageserver_id =
+                if let Some(id_str) = sub_args.get_one::<String>("endpoint-pageserver-id") {
+                    NodeId(id_str.parse().context("while parsing pageserver id")?)
+                } else {
+                    DEFAULT_PAGESERVER_ID
+                };
+
            let remote_ext_config = sub_args.get_one::<String>("remote-ext-config");

            // If --safekeepers argument is given, use only the listed safekeeper nodes.
@@ -675,78 +770,46 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                    env.safekeepers.iter().map(|sk| sk.id).collect()
                };

-            let endpoint = cplane.endpoints.get(endpoint_id.as_str());
+            let endpoint = cplane
+                .endpoints
+                .get(endpoint_id.as_str())
+                .ok_or_else(|| anyhow::anyhow!("endpoint {endpoint_id} not found"))?;

-            let auth_token = if matches!(env.pageserver.pg_auth_type, AuthType::NeonJWT) {
-                let claims = Claims::new(Some(tenant_id), Scope::Tenant);
+            cplane.check_conflicting_endpoints(
+                endpoint.mode,
+                endpoint.tenant_id,
+                endpoint.timeline_id,
+            )?;
+
+            let ps_conf = env.get_pageserver_conf(pageserver_id)?;
+            let auth_token = if matches!(ps_conf.pg_auth_type, AuthType::NeonJWT) {
+                let claims = Claims::new(Some(endpoint.tenant_id), Scope::Tenant);

                Some(env.generate_auth_token(&claims)?)
            } else {
                None
            };

-            let hot_standby = sub_args
-                .get_one::<bool>("hot-standby")
-                .copied()
-                .unwrap_or(false);
-
-            if let Some(endpoint) = endpoint {
-                match (&endpoint.mode, hot_standby) {
-                    (ComputeMode::Static(_), true) => {
-                        bail!("Cannot start a node in hot standby mode when it is already configured as a static replica")
-                    }
-                    (ComputeMode::Primary, true) => {
-                        bail!("Cannot start a node as a hot standby replica, it is already configured as primary node")
-                    }
-                    _ => {}
-                }
-                println!("Starting existing endpoint {endpoint_id}...");
-                endpoint.start(&auth_token, safekeepers, remote_ext_config)?;
-            } else {
-                let branch_name = sub_args
-                    .get_one::<String>("branch-name")
-                    .map(|s| s.as_str())
-                    .unwrap_or(DEFAULT_BRANCH_NAME);
-                let timeline_id = env
-                    .get_branch_timeline_id(branch_name, tenant_id)
-                    .ok_or_else(|| {
-                        anyhow!("Found no timeline id for branch name '{branch_name}'")
-                    })?;
-                let lsn = sub_args
-                    .get_one::<String>("lsn")
-                    .map(|lsn_str| Lsn::from_str(lsn_str))
-                    .transpose()
-                    .context("Failed to parse Lsn from the request")?;
-                let pg_version = sub_args
-                    .get_one::<u32>("pg-version")
-                    .copied()
-                    .context("Failed to `pg-version` from the argument string")?;
-
-                let mode = match (lsn, hot_standby) {
-                    (Some(lsn), false) => ComputeMode::Static(lsn),
-                    (None, true) => ComputeMode::Replica,
-                    (None, false) => ComputeMode::Primary,
-                    (Some(_), true) => anyhow::bail!("cannot specify both lsn and hot-standby"),
+            println!("Starting existing endpoint {endpoint_id}...");
+            endpoint.start(&auth_token, safekeepers, remote_ext_config)?;
+        }
+        "reconfigure" => {
+            let endpoint_id = sub_args
+                .get_one::<String>("endpoint_id")
+                .ok_or_else(|| anyhow!("No endpoint ID provided to reconfigure"))?;
+            let endpoint = cplane
+                .endpoints
+                .get(endpoint_id.as_str())
+                .with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?;
+            let pageserver_id =
+                if let Some(id_str) = sub_args.get_one::<String>("endpoint-pageserver-id") {
+                    Some(NodeId(
+                        id_str.parse().context("while parsing pageserver id")?,
+                    ))
+                } else {
+                    None
                };
-
-                // when used with custom port this results in non obvious behaviour
-                // port is remembered from first start command, i e
-                // start --port X
-                // stop
-                // start <-- will also use port X even without explicit port argument
-                println!("Starting new endpoint {endpoint_id} (PostgreSQL v{pg_version}) on timeline {timeline_id} ...");
-
-                let ep = cplane.new_endpoint(
-                    endpoint_id,
-                    tenant_id,
-                    timeline_id,
-                    pg_port,
-                    http_port,
-                    pg_version,
-                    mode,
-                )?;
-                ep.start(&auth_token, safekeepers, remote_ext_config)?;
-            }
+            endpoint.reconfigure(pageserver_id)?;
        }
        "stop" => {
            let endpoint_id = sub_args
@@ -767,52 +830,141 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
    Ok(())
 }

-fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
-    let pageserver = PageServerNode::from_env(env);
+fn handle_mappings(sub_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Result<()> {
+    let (sub_name, sub_args) = match sub_match.subcommand() {
+        Some(ep_subcommand_data) => ep_subcommand_data,
+        None => bail!("no mappings subcommand provided"),
+    };

+    match sub_name {
+        "map" => {
+            let branch_name = sub_args
+                .get_one::<String>("branch-name")
+                .expect("branch-name argument missing");
+
+            let tenant_id = sub_args
+                .get_one::<String>("tenant-id")
+                .map(|x| TenantId::from_str(x))
+                .expect("tenant-id argument missing")
+                .expect("malformed tenant-id arg");
+
+            let timeline_id = sub_args
+                .get_one::<String>("timeline-id")
+                .map(|x| TimelineId::from_str(x))
+                .expect("timeline-id argument missing")
+                .expect("malformed timeline-id arg");
+
+            env.register_branch_mapping(branch_name.to_owned(), tenant_id, timeline_id)?;
+
+            Ok(())
+        }
+        other => unimplemented!("mappings subcommand {other}"),
+    }
+}
+
+fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result<PageServerNode> {
+    let node_id = if let Some(id_str) = args.get_one::<String>("pageserver-id") {
+        NodeId(id_str.parse().context("while parsing pageserver id")?)
+    } else {
+        DEFAULT_PAGESERVER_ID
+    };
+
+    Ok(PageServerNode::from_env(
+        env,
+        env.get_pageserver_conf(node_id)?,
+    ))
+}
+
+fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
    match sub_match.subcommand() {
-        Some(("start", start_match)) => {
-            if let Err(e) = pageserver.start(&pageserver_config_overrides(start_match)) {
+        Some(("start", subcommand_args)) => {
+            if let Err(e) = get_pageserver(env, subcommand_args)?
+                .start(&pageserver_config_overrides(subcommand_args))
+            {
                eprintln!("pageserver start failed: {e}");
                exit(1);
            }
        }

+        Some(("stop", subcommand_args)) => {
+            let immediate = subcommand_args
+                .get_one::<String>("stop-mode")
+                .map(|s| s.as_str())
+                == Some("immediate");
+
+            if let Err(e) = get_pageserver(env, subcommand_args)?.stop(immediate) {
+                eprintln!("pageserver stop failed: {}", e);
+                exit(1);
+            }
+        }
+
+        Some(("restart", subcommand_args)) => {
+            let pageserver = get_pageserver(env, subcommand_args)?;
+            //TODO what shutdown strategy should we use here?
+            if let Err(e) = pageserver.stop(false) {
+                eprintln!("pageserver stop failed: {}", e);
+                exit(1);
+            }
+
+            if let Err(e) = pageserver.start(&pageserver_config_overrides(subcommand_args)) {
+                eprintln!("pageserver start failed: {e}");
+                exit(1);
+            }
+        }
+
+        Some(("migrate", subcommand_args)) => {
+            let pageserver = get_pageserver(env, subcommand_args)?;
+            //TODO what shutdown strategy should we use here?
+            if let Err(e) = pageserver.stop(false) {
+                eprintln!("pageserver stop failed: {}", e);
+                exit(1);
+            }
+
+            if let Err(e) = pageserver.start(&pageserver_config_overrides(subcommand_args)) {
+                eprintln!("pageserver start failed: {e}");
+                exit(1);
+            }
+        }
+
+        Some(("status", subcommand_args)) => {
+            match get_pageserver(env, subcommand_args)?.check_status() {
+                Ok(_) => println!("Page server is up and running"),
+                Err(err) => {
+                    eprintln!("Page server is not available: {}", err);
+                    exit(1);
+                }
+            }
+        }
+
+        Some((sub_name, _)) => bail!("Unexpected pageserver subcommand '{}'", sub_name),
+        None => bail!("no pageserver subcommand provided"),
+    }
+    Ok(())
+}
+
+fn handle_attachment_service(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
+    let svc = AttachmentService::from_env(env);
+    match sub_match.subcommand() {
+        Some(("start", _start_match)) => {
+            if let Err(e) = svc.start() {
+                eprintln!("start failed: {e}");
+                exit(1);
+            }
+        }
+
        Some(("stop", stop_match)) => {
            let immediate = stop_match
                .get_one::<String>("stop-mode")
                .map(|s| s.as_str())
                == Some("immediate");

-            if let Err(e) = pageserver.stop(immediate) {
-                eprintln!("pageserver stop failed: {}", e);
+            if let Err(e) = svc.stop(immediate) {
+                eprintln!("stop failed: {}", e);
                exit(1);
            }
        }
-
-        Some(("restart", restart_match)) => {
-            //TODO what shutdown strategy should we use here?
-            if let Err(e) = pageserver.stop(false) {
-                eprintln!("pageserver stop failed: {}", e);
-                exit(1);
-            }
-
-            if let Err(e) = pageserver.start(&pageserver_config_overrides(restart_match)) {
-                eprintln!("pageserver start failed: {e}");
-                exit(1);
-            }
-        }
-
-        Some(("status", _)) => match PageServerNode::from_env(env).check_status() {
-            Ok(_) => println!("Page server is up and running"),
-            Err(err) => {
-                eprintln!("Page server is not available: {}", err);
-                exit(1);
-            }
-        },
-
-        Some((sub_name, _)) => bail!("Unexpected pageserver subcommand '{}'", sub_name),
-        None => bail!("no pageserver subcommand provided"),
+        Some((sub_name, _)) => bail!("Unexpected attachment_service subcommand '{}'", sub_name),
+        None => bail!("no attachment_service subcommand provided"),
    }
    Ok(())
 }
@@ -897,11 +1049,23 @@ fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow

    broker::start_broker_process(env)?;

-    let pageserver = PageServerNode::from_env(env);
-    if let Err(e) = pageserver.start(&pageserver_config_overrides(sub_match)) {
-        eprintln!("pageserver {} start failed: {:#}", env.pageserver.id, e);
-        try_stop_all(env, true);
-        exit(1);
+    // Only start the attachment service if the pageserver is configured to need it
+    if env.control_plane_api.is_some() {
+        let attachment_service = AttachmentService::from_env(env);
+        if let Err(e) = attachment_service.start() {
+            eprintln!("attachment_service start failed: {:#}", e);
+            try_stop_all(env, true);
+            exit(1);
+        }
+    }
+
+    for ps_conf in &env.pageservers {
+        let pageserver = PageServerNode::from_env(env, ps_conf);
+        if let Err(e) = pageserver.start(&pageserver_config_overrides(sub_match)) {
+            eprintln!("pageserver {} start failed: {:#}", ps_conf.id, e);
+            try_stop_all(env, true);
+            exit(1);
+        }
    }

    for node in env.safekeepers.iter() {
@@ -925,8 +1089,6 @@ fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<
 }

 fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
-    let pageserver = PageServerNode::from_env(env);
-
    // Stop all endpoints
    match ComputeControlPlane::load(env.clone()) {
        Ok(cplane) => {
@@ -941,8 +1103,11 @@ fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
        }
    }

-    if let Err(e) = pageserver.stop(immediate) {
-        eprintln!("pageserver {} stop failed: {:#}", env.pageserver.id, e);
+    for ps_conf in &env.pageservers {
+        let pageserver = PageServerNode::from_env(env, ps_conf);
+        if let Err(e) = pageserver.stop(immediate) {
+            eprintln!("pageserver {} stop failed: {:#}", ps_conf.id, e);
+        }
    }

    for node in env.safekeepers.iter() {
@@ -955,6 +1120,13 @@ fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
    if let Err(e) = broker::stop_broker_process(env) {
        eprintln!("neon broker stop failed: {e:#}");
    }
+
+    if env.control_plane_api.is_some() {
+        let attachment_service = AttachmentService::from_env(env);
+        if let Err(e) = attachment_service.stop(immediate) {
+            eprintln!("attachment service stop failed: {e:#}");
+        }
+    }
 }

 fn cli() -> Command {
@@ -969,6 +1141,17 @@ fn cli() -> Command {

    let safekeeper_id_arg = Arg::new("id").help("safekeeper id").required(false);

+    // --id, when using a pageserver command
+    let pageserver_id_arg = Arg::new("pageserver-id")
+        .long("id")
+        .global(true)
+        .help("pageserver id")
+        .required(false);
+    // --pageserver-id when using a non-pageserver command
+    let endpoint_pageserver_id_arg = Arg::new("endpoint-pageserver-id")
+        .long("pageserver-id")
+        .required(false);
+
    let safekeeper_extra_opt_arg = Arg::new("safekeeper-extra-opt")
        .short('e')
        .long("safekeeper-extra-opt")
@@ -1029,7 +1212,7 @@ fn cli() -> Command {
    let remote_ext_config_args = Arg::new("remote-ext-config")
        .long("remote-ext-config")
        .num_args(1)
-        .help("Configure the S3 bucket that we search for extensions in.")
+        .help("Configure the remote extensions storage proxy gateway to request for extensions.")
        .required(false);

    let lsn_arg = Arg::new("lsn")
@@ -1050,6 +1233,13 @@ fn cli() -> Command {
        .help("Force initialization even if the repository is not empty")
        .required(false);

+    let num_pageservers_arg = Arg::new("num-pageservers")
+        .value_parser(value_parser!(u16))
+        .long("num-pageservers")
+        .help("How many pageservers to create (default 1)")
+        .required(false)
+        .default_value("1");
+
    Command::new("Neon CLI")
        .arg_required_else_help(true)
        .version(GIT_VERSION)
@@ -1057,6 +1247,7 @@ fn cli() -> Command {
            Command::new("init")
                .about("Initialize a new Neon repository, preparing configs for services to start with")
                .arg(pageserver_config_args.clone())
+                .arg(num_pageservers_arg.clone())
                .arg(
                    Arg::new("config")
                        .long("config")
@@ -1084,6 +1275,7 @@ fn cli() -> Command {
            .subcommand(Command::new("create")
                .about("Create a new blank timeline")
                .arg(tenant_id_arg.clone())
+                .arg(timeline_id_arg.clone())
                .arg(branch_name_arg.clone())
                .arg(pg_version_arg.clone())
            )
@@ -1127,16 +1319,37 @@ fn cli() -> Command {
            .subcommand(Command::new("config")
                .arg(tenant_id_arg.clone())
                .arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false)))
+            .subcommand(Command::new("migrate")
+                .about("Migrate a tenant from one pageserver to another")
+                .arg(tenant_id_arg.clone())
+                .arg(pageserver_id_arg.clone()))
        )
        .subcommand(
            Command::new("pageserver")
                .arg_required_else_help(true)
                .about("Manage pageserver")
+                .arg(pageserver_id_arg)
                .subcommand(Command::new("status"))
+                .subcommand(Command::new("start")
+                    .about("Start local pageserver")
+                    .arg(pageserver_config_args.clone())
+                )
+                .subcommand(Command::new("stop")
+                    .about("Stop local pageserver")
+                    .arg(stop_mode_arg.clone())
+                )
+                .subcommand(Command::new("restart")
+                    .about("Restart local pageserver")
+                    .arg(pageserver_config_args.clone())
+                )
+        )
+        .subcommand(
+            Command::new("attachment_service")
+                .arg_required_else_help(true)
+                .about("Manage attachment_service")
                .subcommand(Command::new("start").about("Start local pageserver").arg(pageserver_config_args.clone()))
                .subcommand(Command::new("stop").about("Stop local pageserver")
                            .arg(stop_mode_arg.clone()))
-                .subcommand(Command::new("restart").about("Restart local pageserver").arg(pageserver_config_args.clone()))
        )
        .subcommand(
            Command::new("safekeeper")
@@ -1172,6 +1385,7 @@ fn cli() -> Command {
                    .arg(lsn_arg.clone())
                    .arg(pg_port_arg.clone())
                    .arg(http_port_arg.clone())
+                    .arg(endpoint_pageserver_id_arg.clone())
                    .arg(
                        Arg::new("config-only")
                            .help("Don't do basebackup, create endpoint directory with only config files")
@@ -1183,21 +1397,19 @@ fn cli() -> Command {
                .subcommand(Command::new("start")
                    .about("Start postgres.\n If the endpoint doesn't exist yet, it is created.")
                    .arg(endpoint_id_arg.clone())
-                    .arg(tenant_id_arg.clone())
-                    .arg(branch_name_arg)
-                    .arg(timeline_id_arg)
-                    .arg(lsn_arg)
-                    .arg(pg_port_arg)
-                    .arg(http_port_arg)
-                    .arg(pg_version_arg)
-                    .arg(hot_standby_arg)
+                    .arg(endpoint_pageserver_id_arg.clone())
                    .arg(safekeepers_arg)
                    .arg(remote_ext_config_args)
                )
+                .subcommand(Command::new("reconfigure")
+                            .about("Reconfigure the endpoint")
+                            .arg(endpoint_pageserver_id_arg)
+                            .arg(endpoint_id_arg.clone())
+                            .arg(tenant_id_arg.clone())
+                )
                .subcommand(
                    Command::new("stop")
                    .arg(endpoint_id_arg)
-                    .arg(tenant_id_arg)
                    .arg(
                        Arg::new("destroy")
                            .help("Also delete data directory (now optional, should be default in future)")
@@ -1208,6 +1420,18 @@ fn cli() -> Command {
                )

        )
+        .subcommand(
+            Command::new("mappings")
+                .arg_required_else_help(true)
+                .about("Manage neon_local branch name mappings")
+                .subcommand(
+                    Command::new("map")
+                        .about("Create new mapping which cannot exist already")
+                        .arg(branch_name_arg.clone())
+                        .arg(tenant_id_arg.clone())
+                        .arg(timeline_id_arg.clone())
+                )
+        )
        // Obsolete old name for 'endpoint'. We now just print an error if it's used.
        .subcommand(
            Command::new("pg")
--- a/control_plane/src/broker.rs
+++ b/control_plane/src/broker.rs
@@ -7,7 +7,7 @@
 //! ```
 use anyhow::Context;

-use std::path::PathBuf;
+use camino::Utf8PathBuf;

 use crate::{background_process, local_env};

@@ -30,7 +30,7 @@ pub fn start_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
        || {
            let url = broker.client_url();
            let status_url = url.join("status").with_context(|| {
-                format!("Failed to append /status path to broker endpoint {url}",)
+                format!("Failed to append /status path to broker endpoint {url}")
            })?;
            let request = client
                .get(status_url)
@@ -50,6 +50,7 @@ pub fn stop_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
    background_process::stop_process(true, "storage_broker", &storage_broker_pid_file_path(env))
 }

-fn storage_broker_pid_file_path(env: &local_env::LocalEnv) -> PathBuf {
-    env.base_data_dir.join("storage_broker.pid")
+fn storage_broker_pid_file_path(env: &local_env::LocalEnv) -> Utf8PathBuf {
+    Utf8PathBuf::from_path_buf(env.base_data_dir.join("storage_broker.pid"))
+        .expect("non-Unicode path")
 }
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -45,8 +45,8 @@ use std::sync::Arc;
 use std::time::Duration;

 use anyhow::{anyhow, bail, Context, Result};
+use compute_api::spec::RemoteExtSpec;
 use serde::{Deserialize, Serialize};
-use serde_with::{serde_as, DisplayFromStr};
 use utils::id::{NodeId, TenantId, TimelineId};

 use crate::local_env::LocalEnv;
@@ -57,19 +57,17 @@ use compute_api::responses::{ComputeState, ComputeStatus};
 use compute_api::spec::{Cluster, ComputeMode, ComputeSpec};

 // contents of a endpoint.json file
-#[serde_as]
 #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
 pub struct EndpointConf {
    endpoint_id: String,
-    #[serde_as(as = "DisplayFromStr")]
    tenant_id: TenantId,
-    #[serde_as(as = "DisplayFromStr")]
    timeline_id: TimelineId,
    mode: ComputeMode,
    pg_port: u16,
    http_port: u16,
    pg_version: u32,
    skip_pg_catalog_updates: bool,
+    pageserver_id: NodeId,
 }

 //
@@ -82,19 +80,16 @@ pub struct ComputeControlPlane {
    pub endpoints: BTreeMap<String, Arc<Endpoint>>,

    env: LocalEnv,
-    pageserver: Arc<PageServerNode>,
 }

 impl ComputeControlPlane {
    // Load current endpoints from the endpoints/ subdirectories
    pub fn load(env: LocalEnv) -> Result<ComputeControlPlane> {
-        let pageserver = Arc::new(PageServerNode::from_env(&env));
-
        let mut endpoints = BTreeMap::default();
        for endpoint_dir in std::fs::read_dir(env.endpoints_path())
            .with_context(|| format!("failed to list {}", env.endpoints_path().display()))?
        {
-            let ep = Endpoint::from_dir_entry(endpoint_dir?, &env, &pageserver)?;
+            let ep = Endpoint::from_dir_entry(endpoint_dir?, &env)?;
            endpoints.insert(ep.endpoint_id.clone(), Arc::new(ep));
        }

@@ -102,7 +97,6 @@ impl ComputeControlPlane {
            base_port: 55431,
            endpoints,
            env,
-            pageserver,
        })
    }

@@ -125,20 +119,30 @@ impl ComputeControlPlane {
        http_port: Option<u16>,
        pg_version: u32,
        mode: ComputeMode,
+        pageserver_id: NodeId,
    ) -> Result<Arc<Endpoint>> {
        let pg_port = pg_port.unwrap_or_else(|| self.get_port());
        let http_port = http_port.unwrap_or_else(|| self.get_port() + 1);
+        let pageserver =
+            PageServerNode::from_env(&self.env, self.env.get_pageserver_conf(pageserver_id)?);
+
        let ep = Arc::new(Endpoint {
            endpoint_id: endpoint_id.to_owned(),
            pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), pg_port),
            http_address: SocketAddr::new("127.0.0.1".parse().unwrap(), http_port),
            env: self.env.clone(),
-            pageserver: Arc::clone(&self.pageserver),
+            pageserver,
            timeline_id,
            mode,
            tenant_id,
            pg_version,
-            skip_pg_catalog_updates: false,
+            // We don't set up roles and databases in the spec locally, so we don't need
+            // to do catalog updates. Catalog updates also create the availability-check
+            // data, yet we have tests that check that the size and db dump are the same
+            // before and after start. So skip catalog updates; this way we effectively
+            // test the case of waking up an idle compute, where catalog updates are
+            // also skipped in the cloud.
+            skip_pg_catalog_updates: true,
        });

        ep.create_endpoint_dir()?;
@@ -152,7 +156,8 @@ impl ComputeControlPlane {
                http_port,
                pg_port,
                pg_version,
-                skip_pg_catalog_updates: false,
+                skip_pg_catalog_updates: true,
+                pageserver_id,
            })?,
        )?;
        std::fs::write(
@@ -165,6 +170,30 @@ impl ComputeControlPlane {

        Ok(ep)
    }
+
+    pub fn check_conflicting_endpoints(
+        &self,
+        mode: ComputeMode,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+    ) -> Result<()> {
+        if matches!(mode, ComputeMode::Primary) {
+            // this check is not complete, as you could have a concurrent attempt at
+            // creating another primary, both reading the state before checking it here,
+            // but it's better than nothing.
+            let mut duplicates = self.endpoints.iter().filter(|(_k, v)| {
+                v.tenant_id == tenant_id
+                    && v.timeline_id == timeline_id
+                    && v.mode == mode
+                    && v.status() != "stopped"
+            });
+
+            if let Some((key, _)) = duplicates.next() {
+                bail!("attempting to create a duplicate primary endpoint on tenant {tenant_id}, timeline {timeline_id}: endpoint {key:?} exists already. please don't do this, it is not supported.");
+            }
+        }
+        Ok(())
+    }
 }

 ///////////////////////////////////////////////////////////////////////////////
@@ -187,18 +216,14 @@ pub struct Endpoint {
    // These are not part of the endpoint as such, but the environment
    // the endpoint runs in.
    pub env: LocalEnv,
-    pageserver: Arc<PageServerNode>,
+    pageserver: PageServerNode,

    // Optimizations
    skip_pg_catalog_updates: bool,
 }

 impl Endpoint {
-    fn from_dir_entry(
-        entry: std::fs::DirEntry,
-        env: &LocalEnv,
-        pageserver: &Arc<PageServerNode>,
-    ) -> Result<Endpoint> {
+    fn from_dir_entry(entry: std::fs::DirEntry, env: &LocalEnv) -> Result<Endpoint> {
        if !entry.file_type()?.is_dir() {
            anyhow::bail!(
                "Endpoint::from_dir_entry failed: '{}' is not a directory",
@@ -214,12 +239,15 @@ impl Endpoint {
        let conf: EndpointConf =
            serde_json::from_slice(&std::fs::read(entry.path().join("endpoint.json"))?)?;

+        let pageserver =
+            PageServerNode::from_env(env, env.get_pageserver_conf(conf.pageserver_id)?);
+
        Ok(Endpoint {
            pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.pg_port),
            http_address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.http_port),
            endpoint_id,
            env: env.clone(),
-            pageserver: Arc::clone(pageserver),
+            pageserver,
            timeline_id: conf.timeline_id,
            mode: conf.mode,
            tenant_id: conf.tenant_id,
@@ -247,7 +275,7 @@ impl Endpoint {
        conf.append("shared_buffers", "1MB");
        conf.append("fsync", "off");
        conf.append("max_connections", "100");
-        conf.append("wal_level", "replica");
+        conf.append("wal_level", "logical");
        // wal_sender_timeout is the maximum time to wait for WAL replication.
        // It also defines how often the walreciever will send a feedback message to the wal sender.
        conf.append("wal_sender_timeout", "5s");
@@ -408,18 +436,34 @@ impl Endpoint {
            );
        }

-        // Also wait for the compute_ctl process to die. It might have some cleanup
-        // work to do after postgres stops, like syncing safekeepers, etc.
-        //
+        Ok(())
+    }
+
+    fn wait_for_compute_ctl_to_exit(&self) -> Result<()> {
        // TODO use background_process::stop_process instead
        let pidfile_path = self.endpoint_path().join("compute_ctl.pid");
        let pid: u32 = std::fs::read_to_string(pidfile_path)?.parse()?;
        let pid = nix::unistd::Pid::from_raw(pid as i32);
        crate::background_process::wait_until_stopped("compute_ctl", pid)?;
-
        Ok(())
    }

+    fn read_postgresql_conf(&self) -> Result<String> {
+        // Slurp the endpoints/<endpoint id>/postgresql.conf file into
+        // memory. We will include it in the spec file that we pass to
+        // `compute_ctl`, and `compute_ctl` will write it to the postgresql.conf
+        // in the data directory.
+        let postgresql_conf_path = self.endpoint_path().join("postgresql.conf");
+        match std::fs::read(&postgresql_conf_path) {
+            Ok(content) => Ok(String::from_utf8(content)?),
+            Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok("".to_string()),
+            Err(e) => Err(anyhow::Error::new(e).context(format!(
+                "failed to read config file in {}",
+                postgresql_conf_path.to_str().unwrap()
+            ))),
+        }
+    }
+
    pub fn start(
        &self,
        auth_token: &Option<String>,
@@ -430,21 +474,7 @@ impl Endpoint {
            anyhow::bail!("The endpoint is already running");
        }

-        // Slurp the endpoints/<endpoint id>/postgresql.conf file into
-        // memory. We will include it in the spec file that we pass to
-        // `compute_ctl`, and `compute_ctl` will write it to the postgresql.conf
-        // in the data directory.
-        let postgresql_conf_path = self.endpoint_path().join("postgresql.conf");
-        let postgresql_conf = match std::fs::read(&postgresql_conf_path) {
-            Ok(content) => String::from_utf8(content)?,
-            Err(e) if e.kind() == std::io::ErrorKind::NotFound => "".to_string(),
-            Err(e) => {
-                return Err(anyhow::Error::new(e).context(format!(
-                    "failed to read config file in {}",
-                    postgresql_conf_path.to_str().unwrap()
-                )))
-            }
-        };
+        let postgresql_conf = self.read_postgresql_conf()?;

        // We always start the compute node from scratch, so if the Postgres
        // data dir exists from a previous launch, remove it first.
@@ -472,11 +502,24 @@ impl Endpoint {
            }
        }

+        // Check for the file remote_extensions_spec.json; if it is present, read it and
+        // pass it to compute_ctl. A missing or unparsable file is treated as "no spec".
+        let remote_extensions_spec_path = self.endpoint_path().join("remote_extensions_spec.json");
+        let remote_extensions_spec = std::fs::File::open(remote_extensions_spec_path);
+        let remote_extensions: Option<RemoteExtSpec>;
+
+        if let Ok(spec_file) = remote_extensions_spec {
+            remote_extensions = serde_json::from_reader(spec_file).ok();
+        } else {
+            remote_extensions = None;
+        };
+
        // Create spec file
        let spec = ComputeSpec {
            skip_pg_catalog_updates: self.skip_pg_catalog_updates,
            format_version: 1.0,
            operation_uuid: None,
+            features: vec![],
            cluster: Cluster {
                cluster_id: None, // project ID: not used
                name: None,       // project name: not used
@@ -493,7 +536,7 @@ impl Endpoint {
            pageserver_connstring: Some(pageserver_connstring),
            safekeeper_connstrings,
            storage_auth_token: auth_token.clone(),
-            remote_extensions: None,
+            remote_extensions,
        };
        let spec_path = self.endpoint_path().join("spec.json");
        std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
@@ -615,6 +658,61 @@ impl Endpoint {
        }
    }

+    pub fn reconfigure(&self, pageserver_id: Option<NodeId>) -> Result<()> {
+        let mut spec: ComputeSpec = {
+            let spec_path = self.endpoint_path().join("spec.json");
+            let file = std::fs::File::open(spec_path)?;
+            serde_json::from_reader(file)?
+        };
+
+        let postgresql_conf = self.read_postgresql_conf()?;
+        spec.cluster.postgresql_conf = Some(postgresql_conf);
+
+        if let Some(pageserver_id) = pageserver_id {
+            let endpoint_config_path = self.endpoint_path().join("endpoint.json");
+            let mut endpoint_conf: EndpointConf = {
+                let file = std::fs::File::open(&endpoint_config_path)?;
+                serde_json::from_reader(file)?
+            };
+            endpoint_conf.pageserver_id = pageserver_id;
+            std::fs::write(
+                endpoint_config_path,
+                serde_json::to_string_pretty(&endpoint_conf)?,
+            )?;
+
+            let pageserver =
+                PageServerNode::from_env(&self.env, self.env.get_pageserver_conf(pageserver_id)?);
+            let ps_http_conf = &pageserver.pg_connection_config;
+            let (host, port) = (ps_http_conf.host(), ps_http_conf.port());
+            spec.pageserver_connstring = Some(format!("postgresql://no_user@{host}:{port}"));
+        }
+
+        let client = reqwest::blocking::Client::new();
+        let response = client
+            .post(format!(
+                "http://{}:{}/configure",
+                self.http_address.ip(),
+                self.http_address.port()
+            ))
+            .body(format!(
+                "{{\"spec\":{}}}",
+                serde_json::to_string_pretty(&spec)?
+            ))
+            .send()?;
+
+        let status = response.status();
+        if !(status.is_client_error() || status.is_server_error()) {
+            Ok(())
+        } else {
+            let url = response.url().to_owned();
+            let msg = match response.text() {
+                Ok(err_body) => format!("Error: {}", err_body),
+                Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url),
+            };
+            Err(anyhow::anyhow!(msg))
+        }
+    }
+
    pub fn stop(&self, destroy: bool) -> Result<()> {
        // If we are going to destroy data directory,
        // use immediate shutdown mode, otherwise,
@@ -623,15 +721,25 @@ impl Endpoint {
        // Postgres is always started from scratch, so stop
        // without destroy only used for testing and debugging.
        //
+        self.pg_ctl(
+            if destroy {
+                &["-m", "immediate", "stop"]
+            } else {
+                &["stop"]
+            },
+            &None,
+        )?;
+
+        // Also wait for the compute_ctl process to die. It might have some cleanup
+        // work to do after postgres stops, like syncing safekeepers, etc.
+        //
+        self.wait_for_compute_ctl_to_exit()?;
        if destroy {
-            self.pg_ctl(&["-m", "immediate", "stop"], &None)?;
            println!(
                "Destroying postgres data directory '{}'",
                self.pgdata().to_str().unwrap()
            );
            std::fs::remove_dir_all(self.endpoint_path())?;
-        } else {
-            self.pg_ctl(&["stop"], &None)?;
        }
        Ok(())
    }
--- a/control_plane/src/lib.rs
+++ b/control_plane/src/lib.rs
@@ -1,12 +1,12 @@
-//
-// Local control plane.
-//
-// Can start, configure and stop postgres instances running as a local processes.
-//
-// Intended to be used in integration tests and in CLI tools for
-// local installations.
-//
+//! Local control plane.
+//!
+//! Can start, configure and stop postgres instances running as a local processes.
+//!
+//! Intended to be used in integration tests and in CLI tools for
+//! local installations.
+#![deny(clippy::undocumented_unsafe_blocks)]

+pub mod attachment_service;
 mod background_process;
 pub mod broker;
 pub mod endpoint;
@@ -14,3 +14,4 @@ pub mod local_env;
 pub mod pageserver;
 pub mod postgresql_conf;
 pub mod safekeeper;
+pub mod tenant_migration;
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -8,7 +8,6 @@ use anyhow::{bail, ensure, Context};
 use postgres_backend::AuthType;
 use reqwest::Url;
 use serde::{Deserialize, Serialize};
-use serde_with::{serde_as, DisplayFromStr};
 use std::collections::HashMap;
 use std::env;
 use std::fs;
@@ -33,7 +32,6 @@ pub const DEFAULT_PG_VERSION: u32 = 15;
 // to 'neon_local init --config=<path>' option. See control_plane/simple.conf for
 // an example.
 //
-#[serde_as]
 #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
 pub struct LocalEnv {
    // Base directory for all the nodes (the pageserver, safekeepers and
@@ -59,7 +57,6 @@ pub struct LocalEnv {
    // Default tenant ID to use with the 'neon_local' command line utility, when
    // --tenant_id is not explicitly specified.
    #[serde(default)]
-    #[serde_as(as = "Option<DisplayFromStr>")]
    pub default_tenant_id: Option<TenantId>,

    // used to issue tokens during e.g pg start
@@ -68,17 +65,22 @@ pub struct LocalEnv {

    pub broker: NeonBroker,

-    pub pageserver: PageServerConf,
+    /// This Vec must always contain at least one pageserver
+    pub pageservers: Vec<PageServerConf>,

    #[serde(default)]
    pub safekeepers: Vec<SafekeeperConf>,

+    // Control plane location: if None, we will not run attachment_service.  If set, this will
+    // be propagated into each pageserver's configuration.
+    #[serde(default)]
+    pub control_plane_api: Option<Url>,
+
    /// Keep human-readable aliases in memory (and persist them to config), to hide ZId hex strings from the user.
    #[serde(default)]
    // A `HashMap<String, HashMap<TenantId, TimelineId>>` would be more appropriate here,
    // but deserialization into a generic toml object as `toml::Value::try_from` fails with an error.
    // https://toml.io/en/v1.0.0 does not contain a concept of "a table inside another table".
-    #[serde_as(as = "HashMap<_, Vec<(DisplayFromStr, DisplayFromStr)>>")]
    branch_name_mappings: HashMap<String, Vec<(TenantId, TimelineId)>>,
 }

@@ -176,32 +178,28 @@ impl LocalEnv {
    pub fn pg_distrib_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
        let path = self.pg_distrib_dir.clone();

+        #[allow(clippy::manual_range_patterns)]
        match pg_version {
-            14 => Ok(path.join(format!("v{pg_version}"))),
-            15 => Ok(path.join(format!("v{pg_version}"))),
+            14 | 15 | 16 => Ok(path.join(format!("v{pg_version}"))),
            _ => bail!("Unsupported postgres version: {}", pg_version),
        }
    }

    pub fn pg_bin_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
-        match pg_version {
-            14 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")),
-            15 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")),
-            _ => bail!("Unsupported postgres version: {}", pg_version),
-        }
+        Ok(self.pg_distrib_dir(pg_version)?.join("bin"))
    }
    pub fn pg_lib_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
-        match pg_version {
-            14 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")),
-            15 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")),
-            _ => bail!("Unsupported postgres version: {}", pg_version),
-        }
+        Ok(self.pg_distrib_dir(pg_version)?.join("lib"))
    }

    pub fn pageserver_bin(&self) -> PathBuf {
        self.neon_distrib_dir.join("pageserver")
    }

+    pub fn attachment_service_bin(&self) -> PathBuf {
+        self.neon_distrib_dir.join("attachment_service")
+    }
+
    pub fn safekeeper_bin(&self) -> PathBuf {
        self.neon_distrib_dir.join("safekeeper")
    }
@@ -214,15 +212,23 @@ impl LocalEnv {
        self.base_data_dir.join("endpoints")
    }

-    // TODO: move pageserver files into ./pageserver
-    pub fn pageserver_data_dir(&self) -> PathBuf {
-        self.base_data_dir.clone()
+    pub fn pageserver_data_dir(&self, pageserver_id: NodeId) -> PathBuf {
+        self.base_data_dir
+            .join(format!("pageserver_{pageserver_id}"))
    }

    pub fn safekeeper_data_dir(&self, data_dir_name: &str) -> PathBuf {
        self.base_data_dir.join("safekeepers").join(data_dir_name)
    }

+    pub fn get_pageserver_conf(&self, id: NodeId) -> anyhow::Result<&PageServerConf> {
+        if let Some(conf) = self.pageservers.iter().find(|node| node.id == id) {
+            Ok(conf)
+        } else {
+            bail!("could not find pageserver {id}")
+        }
+    }
+
    pub fn register_branch_mapping(
        &mut self,
        branch_name: String,
@@ -299,6 +305,10 @@ impl LocalEnv {
            env.neon_distrib_dir = env::current_exe()?.parent().unwrap().to_owned();
        }

+        if env.pageservers.is_empty() {
+            anyhow::bail!("Configuration must contain at least one pageserver");
+        }
+
        env.base_data_dir = base_path();

        Ok(env)
@@ -331,7 +341,7 @@ impl LocalEnv {
        // We read that in, in `create_config`, and fill any missing defaults. Then it's saved
        // to .neon/config. TODO: We lose any formatting and comments along the way, which is
        // a bit sad.
-        let mut conf_content = r#"# This file describes a locale deployment of the page server
+        let mut conf_content = r#"# This file describes a local deployment of the page server
 # and safekeeeper node. It is read by the 'neon_local' command-line
 # utility.
 "#
@@ -461,9 +471,9 @@ impl LocalEnv {
    }

    fn auth_keys_needed(&self) -> bool {
-        self.pageserver.pg_auth_type == AuthType::NeonJWT
-            || self.pageserver.http_auth_type == AuthType::NeonJWT
-            || self.safekeepers.iter().any(|sk| sk.auth_enabled)
+        self.pageservers.iter().any(|ps| {
+            ps.pg_auth_type == AuthType::NeonJWT || ps.http_auth_type == AuthType::NeonJWT
+        }) || self.safekeepers.iter().any(|sk| sk.auth_enabled)
    }
 }

--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -11,10 +11,15 @@ use std::io::{BufReader, Write};
 use std::num::NonZeroU64;
 use std::path::PathBuf;
 use std::process::{Child, Command};
+use std::time::Duration;
 use std::{io, result};

 use anyhow::{bail, Context};
-use pageserver_api::models::{self, TenantInfo, TimelineInfo};
+use camino::Utf8PathBuf;
+use pageserver_api::models::{
+    self, LocationConfig, TenantInfo, TenantLocationConfigRequest, TimelineInfo,
+};
+use pageserver_api::shard::TenantShardId;
 use postgres_backend::AuthType;
 use postgres_connection::{parse_host_port, PgConnectionConfig};
 use reqwest::blocking::{Client, RequestBuilder, Response};
@@ -27,8 +32,12 @@ use utils::{
    lsn::Lsn,
 };

+use crate::local_env::PageServerConf;
 use crate::{background_process, local_env::LocalEnv};

+/// Directory within .neon which will be used by default for LocalFs remote storage.
+pub const PAGESERVER_REMOTE_STORAGE_DIR: &str = "local_fs_remote_storage/pageserver";
+
 #[derive(Error, Debug)]
 pub enum PageserverHttpError {
    #[error("Reqwest error: {0}")]
@@ -76,43 +85,42 @@ impl ResponseErrorMessageExt for Response {
 #[derive(Debug)]
 pub struct PageServerNode {
    pub pg_connection_config: PgConnectionConfig,
+    pub conf: PageServerConf,
    pub env: LocalEnv,
    pub http_client: Client,
    pub http_base_url: String,
 }

 impl PageServerNode {
-    pub fn from_env(env: &LocalEnv) -> PageServerNode {
-        let (host, port) = parse_host_port(&env.pageserver.listen_pg_addr)
-            .expect("Unable to parse listen_pg_addr");
+    pub fn from_env(env: &LocalEnv, conf: &PageServerConf) -> PageServerNode {
+        let (host, port) =
+            parse_host_port(&conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
        let port = port.unwrap_or(5432);
        Self {
            pg_connection_config: PgConnectionConfig::new_host_port(host, port),
+            conf: conf.clone(),
            env: env.clone(),
            http_client: Client::new(),
-            http_base_url: format!("http://{}/v1", env.pageserver.listen_http_addr),
+            http_base_url: format!("http://{}/v1", conf.listen_http_addr),
        }
    }

-    // pageserver conf overrides defined by neon_local configuration.
-    fn neon_local_overrides(&self) -> Vec<String> {
-        let id = format!("id={}", self.env.pageserver.id);
+    /// Merge overrides provided by the user on the command line with our default overrides derived from neon_local configuration.
+    ///
+    /// These all end up on the command line of the `pageserver` binary.
+    fn neon_local_overrides(&self, cli_overrides: &[&str]) -> Vec<String> {
+        let id = format!("id={}", self.conf.id);
        // FIXME: the paths should be shell-escaped to handle paths with spaces, quotas etc.
        let pg_distrib_dir_param = format!(
            "pg_distrib_dir='{}'",
            self.env.pg_distrib_dir_raw().display()
        );

-        let http_auth_type_param =
-            format!("http_auth_type='{}'", self.env.pageserver.http_auth_type);
-        let listen_http_addr_param = format!(
-            "listen_http_addr='{}'",
-            self.env.pageserver.listen_http_addr
-        );
+        let http_auth_type_param = format!("http_auth_type='{}'", self.conf.http_auth_type);
+        let listen_http_addr_param = format!("listen_http_addr='{}'", self.conf.listen_http_addr);

-        let pg_auth_type_param = format!("pg_auth_type='{}'", self.env.pageserver.pg_auth_type);
-        let listen_pg_addr_param =
-            format!("listen_pg_addr='{}'", self.env.pageserver.listen_pg_addr);
+        let pg_auth_type_param = format!("pg_auth_type='{}'", self.conf.pg_auth_type);
+        let listen_pg_addr_param = format!("listen_pg_addr='{}'", self.conf.listen_pg_addr);

        let broker_endpoint_param = format!("broker_endpoint='{}'", self.env.broker.client_url());

@@ -126,34 +134,52 @@ impl PageServerNode {
            broker_endpoint_param,
        ];

-        if self.env.pageserver.http_auth_type != AuthType::Trust
-            || self.env.pageserver.pg_auth_type != AuthType::Trust
-        {
-            overrides.push("auth_validation_public_key_path='auth_public_key.pem'".to_owned());
+        if let Some(control_plane_api) = &self.env.control_plane_api {
+            overrides.push(format!(
+                "control_plane_api='{}'",
+                control_plane_api.as_str()
+            ));
        }
+
+        if !cli_overrides
+            .iter()
+            .any(|c| c.starts_with("remote_storage"))
+        {
+            overrides.push(format!(
+                "remote_storage={{local_path='../{PAGESERVER_REMOTE_STORAGE_DIR}'}}"
+            ));
+        }
+
+        if self.conf.http_auth_type != AuthType::Trust || self.conf.pg_auth_type != AuthType::Trust
+        {
+            // Keys are generated in the toplevel repo dir, pageservers' workdirs
+            // are one level below that, so refer to keys with ../
+            overrides.push("auth_validation_public_key_path='../auth_public_key.pem'".to_owned());
+        }
+
+        // Apply the user-provided overrides
+        overrides.extend(cli_overrides.iter().map(|&c| c.to_owned()));
+
        overrides
    }

    /// Initializes a pageserver node by creating its config with the overrides provided.
    pub fn initialize(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
        // First, run `pageserver --init` and wait for it to write a config into FS and exit.
-        self.pageserver_init(config_overrides).with_context(|| {
-            format!(
-                "Failed to run init for pageserver node {}",
-                self.env.pageserver.id,
-            )
-        })
+        self.pageserver_init(config_overrides)
+            .with_context(|| format!("Failed to run init for pageserver node {}", self.conf.id))
    }

    pub fn repo_path(&self) -> PathBuf {
-        self.env.pageserver_data_dir()
+        self.env.pageserver_data_dir(self.conf.id)
    }

    /// The pid file is created by the pageserver process, with its pid stored inside.
    /// Other pageservers cannot lock the same file and overwrite it for as long as the current
    /// pageserver runs. (Unless someone removes the file manually; never do that!)
-    fn pid_file(&self) -> PathBuf {
-        self.repo_path().join("pageserver.pid")
+    fn pid_file(&self) -> Utf8PathBuf {
+        Utf8PathBuf::from_path_buf(self.repo_path().join("pageserver.pid"))
+            .expect("non-Unicode path")
    }

    pub fn start(&self, config_overrides: &[&str]) -> anyhow::Result<Child> {
@@ -162,7 +188,7 @@ impl PageServerNode {

    fn pageserver_init(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
        let datadir = self.repo_path();
-        let node_id = self.env.pageserver.id;
+        let node_id = self.conf.id;
        println!(
            "Initializing pageserver node {} at '{}' in {:?}",
            node_id,
@@ -171,6 +197,10 @@ impl PageServerNode {
        );
        io::stdout().flush()?;

+        if !datadir.exists() {
+            std::fs::create_dir(&datadir)?;
+        }
+
        let datadir_path_str = datadir.to_str().with_context(|| {
            format!("Cannot start pageserver node {node_id} in path that has no string representation: {datadir:?}")
        })?;
@@ -195,13 +225,10 @@ impl PageServerNode {
    }

    fn start_node(&self, config_overrides: &[&str], update_config: bool) -> anyhow::Result<Child> {
-        let mut overrides = self.neon_local_overrides();
-        overrides.extend(config_overrides.iter().map(|&c| c.to_owned()));
-
        let datadir = self.repo_path();
        print!(
            "Starting pageserver node {} at '{}' in {:?}",
-            self.env.pageserver.id,
+            self.conf.id,
            self.pg_connection_config.raw_address(),
            datadir
        );
@@ -210,7 +237,7 @@ impl PageServerNode {
        let datadir_path_str = datadir.to_str().with_context(|| {
            format!(
                "Cannot start pageserver node {} in path that has no string representation: {:?}",
-                self.env.pageserver.id, datadir,
+                self.conf.id, datadir,
            )
        })?;
        let mut args = self.pageserver_basic_args(config_overrides, datadir_path_str);
@@ -240,8 +267,7 @@ impl PageServerNode {
    ) -> Vec<Cow<'a, str>> {
        let mut args = vec![Cow::Borrowed("-D"), Cow::Borrowed(datadir_path_str)];

-        let mut overrides = self.neon_local_overrides();
-        overrides.extend(config_overrides.iter().map(|&c| c.to_owned()));
+        let overrides = self.neon_local_overrides(config_overrides);
        for config_override in overrides {
            args.push(Cow::Borrowed("-c"));
            args.push(Cow::Owned(config_override));
@@ -254,7 +280,7 @@ impl PageServerNode {
        // FIXME: why is this tied to pageserver's auth type? Whether or not the safekeeper
        // needs a token, and how to generate that token, seems independent to whether
        // the pageserver requires a token in incoming requests.
-        Ok(if self.env.pageserver.http_auth_type != AuthType::Trust {
+        Ok(if self.conf.http_auth_type != AuthType::Trust {
            // Generate a token to connect from the pageserver to a safekeeper
            let token = self
                .env
@@ -279,7 +305,7 @@ impl PageServerNode {

    pub fn page_server_psql_client(&self) -> anyhow::Result<postgres::Client> {
        let mut config = self.pg_connection_config.clone();
-        if self.env.pageserver.pg_auth_type == AuthType::NeonJWT {
+        if self.conf.pg_auth_type == AuthType::NeonJWT {
            let token = self
                .env
                .generate_auth_token(&Claims::new(None, Scope::PageServerApi))?;
@@ -290,7 +316,7 @@ impl PageServerNode {

    fn http_request<U: IntoUrl>(&self, method: Method, url: U) -> anyhow::Result<RequestBuilder> {
        let mut builder = self.http_client.request(method, url);
-        if self.env.pageserver.http_auth_type == AuthType::NeonJWT {
+        if self.conf.http_auth_type == AuthType::NeonJWT {
            let token = self
                .env
                .generate_auth_token(&Claims::new(None, Scope::PageServerApi))?;
@@ -316,7 +342,8 @@ impl PageServerNode {

    pub fn tenant_create(
        &self,
-        new_tenant_id: Option<TenantId>,
+        new_tenant_id: TenantId,
+        generation: Option<u32>,
        settings: HashMap<&str, &str>,
    ) -> anyhow::Result<TenantId> {
        let mut settings = settings.clone();
@@ -382,11 +409,9 @@ impl PageServerNode {
                .context("Failed to parse 'gc_feedback' as bool")?,
        };

-        // If tenant ID was not specified, generate one
-        let new_tenant_id = new_tenant_id.unwrap_or(TenantId::generate());
-
        let request = models::TenantCreateRequest {
-            new_tenant_id,
+            new_tenant_id: TenantShardId::unsharded(new_tenant_id),
+            generation,
            config,
        };
        if !settings.is_empty() {
@@ -494,6 +519,32 @@ impl PageServerNode {
        Ok(())
    }

+    pub fn location_config(
+        &self,
+        tenant_id: TenantId,
+        config: LocationConfig,
+        flush_ms: Option<Duration>,
+    ) -> anyhow::Result<()> {
+        let req_body = TenantLocationConfigRequest { tenant_id, config };
+
+        let path = format!(
+            "{}/tenant/{}/location_config",
+            self.http_base_url, tenant_id
+        );
+        let path = if let Some(flush_ms) = flush_ms {
+            format!("{}?flush_ms={}", path, flush_ms.as_millis())
+        } else {
+            path
+        };
+
+        self.http_request(Method::PUT, path)?
+            .json(&req_body)
+            .send()?
+            .error_from_body()?;
+
+        Ok(())
+    }
+
    pub fn timeline_list(&self, tenant_id: &TenantId) -> anyhow::Result<Vec<TimelineInfo>> {
        let timeline_infos: Vec<TimelineInfo> = self
            .http_request(
@@ -514,6 +565,7 @@ impl PageServerNode {
        ancestor_start_lsn: Option<Lsn>,
        ancestor_timeline_id: Option<TimelineId>,
        pg_version: Option<u32>,
+        existing_initdb_timeline_id: Option<TimelineId>,
    ) -> anyhow::Result<TimelineInfo> {
        // If timeline ID was not specified, generate one
        let new_timeline_id = new_timeline_id.unwrap_or(TimelineId::generate());
@@ -527,6 +579,7 @@ impl PageServerNode {
            ancestor_start_lsn,
            ancestor_timeline_id,
            pg_version,
+            existing_initdb_timeline_id,
        })
        .send()?
        .error_from_body()?
--- a/control_plane/src/safekeeper.rs
+++ b/control_plane/src/safekeeper.rs
@@ -11,6 +11,7 @@ use std::process::Child;
 use std::{io, result};

 use anyhow::Context;
+use camino::Utf8PathBuf;
 use postgres_connection::PgConnectionConfig;
 use reqwest::blocking::{Client, RequestBuilder, Response};
 use reqwest::{IntoUrl, Method};
@@ -97,8 +98,9 @@ impl SafekeeperNode {
        SafekeeperNode::datadir_path_by_id(&self.env, self.id)
    }

-    pub fn pid_file(&self) -> PathBuf {
-        self.datadir_path().join("safekeeper.pid")
+    pub fn pid_file(&self) -> Utf8PathBuf {
+        Utf8PathBuf::from_path_buf(self.datadir_path().join("safekeeper.pid"))
+            .expect("non-Unicode path")
    }

    pub fn start(&self, extra_opts: Vec<String>) -> anyhow::Result<Child> {
--- a/control_plane/src/tenant_migration.rs
+++ b/control_plane/src/tenant_migration.rs
@@ -0,0 +1,197 @@
+//!
+//! Functionality for migrating tenants across pageservers: unlike most of neon_local, this code
+//! isn't scoped to a particular physical service, as it needs to update compute endpoints to
+//! point to the new pageserver.
+//!
+use crate::local_env::LocalEnv;
+use crate::{
+    attachment_service::AttachmentService, endpoint::ComputeControlPlane,
+    pageserver::PageServerNode,
+};
+use pageserver_api::models::{
+    LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig,
+};
+use std::collections::HashMap;
+use std::time::Duration;
+use utils::{
+    id::{TenantId, TimelineId},
+    lsn::Lsn,
+};
+
+/// Given an attached pageserver, retrieve the LSN for all timelines
+fn get_lsns(
+    tenant_id: TenantId,
+    pageserver: &PageServerNode,
+) -> anyhow::Result<HashMap<TimelineId, Lsn>> {
+    let timelines = pageserver.timeline_list(&tenant_id)?;
+    Ok(timelines
+        .into_iter()
+        .map(|t| (t.timeline_id, t.last_record_lsn))
+        .collect())
+}
+
+/// Wait for the timeline LSNs on `pageserver` to catch up with or overtake
+/// `baseline`.
+fn await_lsn(
+    tenant_id: TenantId,
+    pageserver: &PageServerNode,
+    baseline: HashMap<TimelineId, Lsn>,
+) -> anyhow::Result<()> {
+    loop {
+        let latest = match get_lsns(tenant_id, pageserver) {
+            Ok(l) => l,
+            Err(e) => {
+                println!(
+                    "🕑 Can't get LSNs on pageserver {} yet, waiting ({e})",
+                    pageserver.conf.id
+                );
+                std::thread::sleep(Duration::from_millis(500));
+                continue;
+            }
+        };
+
+        let mut any_behind: bool = false;
+        for (timeline_id, baseline_lsn) in &baseline {
+            match latest.get(timeline_id) {
+                Some(latest_lsn) => {
+                    println!("🕑 LSN origin {baseline_lsn} vs destination {latest_lsn}");
+                    if latest_lsn < baseline_lsn {
+                        any_behind = true;
+                    }
+                }
+                None => {
+                    // Expected timeline isn't yet visible on migration destination.
+                    // (IRL we would have to account for timeline deletion, but this
+                    //  is just a test helper)
+                    any_behind = true;
+                }
+            }
+        }
+
+        if !any_behind {
+            println!("✅ LSN caught up.  Proceeding...");
+            break;
+        } else {
+            std::thread::sleep(Duration::from_millis(500));
+        }
+    }
+
+    Ok(())
+}
+
+/// This function spans multiple services, to demonstrate live migration of a tenant
+/// between pageservers:
+///  - Coordinate attach/secondary/detach on pageservers
+///  - call into attachment_service for generations
+///  - reconfigure compute endpoints to point to new attached pageserver
+pub fn migrate_tenant(
+    env: &LocalEnv,
+    tenant_id: TenantId,
+    dest_ps: PageServerNode,
+) -> anyhow::Result<()> {
+    // Get a new generation
+    let attachment_service = AttachmentService::from_env(env);
+
+    fn build_location_config(
+        mode: LocationConfigMode,
+        generation: Option<u32>,
+        secondary_conf: Option<LocationConfigSecondary>,
+    ) -> LocationConfig {
+        LocationConfig {
+            mode,
+            generation,
+            secondary_conf,
+            tenant_conf: TenantConfig::default(),
+            shard_number: 0,
+            shard_count: 0,
+            shard_stripe_size: 0,
+        }
+    }
+
+    let previous = attachment_service.inspect(tenant_id)?;
+    let mut baseline_lsns = None;
+    if let Some((generation, origin_ps_id)) = &previous {
+        let origin_ps = PageServerNode::from_env(env, env.get_pageserver_conf(*origin_ps_id)?);
+
+        if origin_ps_id == &dest_ps.conf.id {
+            println!("🔁 Already attached to {origin_ps_id}, freshening...");
+            let gen = attachment_service.attach_hook(tenant_id, dest_ps.conf.id)?;
+            let dest_conf = build_location_config(LocationConfigMode::AttachedSingle, gen, None);
+            dest_ps.location_config(tenant_id, dest_conf, None)?;
+            println!("✅ Migration complete");
+            return Ok(());
+        }
+
+        println!("🔁 Switching origin pageserver {origin_ps_id} to stale mode");
+
+        let stale_conf =
+            build_location_config(LocationConfigMode::AttachedStale, Some(*generation), None);
+        origin_ps.location_config(tenant_id, stale_conf, Some(Duration::from_secs(10)))?;
+
+        baseline_lsns = Some(get_lsns(tenant_id, &origin_ps)?);
+    }
+
+    let gen = attachment_service.attach_hook(tenant_id, dest_ps.conf.id)?;
+    let dest_conf = build_location_config(LocationConfigMode::AttachedMulti, gen, None);
+
+    println!("🔁 Attaching to pageserver {}", dest_ps.conf.id);
+    dest_ps.location_config(tenant_id, dest_conf, None)?;
+
+    if let Some(baseline) = baseline_lsns {
+        println!("🕑 Waiting for LSN to catch up...");
+        await_lsn(tenant_id, &dest_ps, baseline)?;
+    }
+
+    let cplane = ComputeControlPlane::load(env.clone())?;
+    for (endpoint_name, endpoint) in &cplane.endpoints {
+        if endpoint.tenant_id == tenant_id {
+            println!(
+                "🔁 Reconfiguring endpoint {} to use pageserver {}",
+                endpoint_name, dest_ps.conf.id
+            );
+            endpoint.reconfigure(Some(dest_ps.conf.id))?;
+        }
+    }
+
+    for other_ps_conf in &env.pageservers {
+        if other_ps_conf.id == dest_ps.conf.id {
+            continue;
+        }
+
+        let other_ps = PageServerNode::from_env(env, other_ps_conf);
+        let other_ps_tenants = other_ps.tenant_list()?;
+
+        // Check if this tenant is attached
+        let found = other_ps_tenants
+            .into_iter()
+            .map(|t| t.id)
+            .any(|i| i == tenant_id);
+        if !found {
+            continue;
+        }
+
+        // Downgrade to a secondary location
+        let secondary_conf = build_location_config(
+            LocationConfigMode::Secondary,
+            None,
+            Some(LocationConfigSecondary { warm: true }),
+        );
+
+        println!(
+            "💤 Switching to secondary mode on pageserver {}",
+            other_ps.conf.id
+        );
+        other_ps.location_config(tenant_id, secondary_conf, None)?;
+    }
+
+    println!(
+        "🔁 Switching to AttachedSingle mode on pageserver {}",
+        dest_ps.conf.id
+    );
+    let dest_conf = build_location_config(LocationConfigMode::AttachedSingle, gen, None);
+    dest_ps.location_config(tenant_id, dest_conf, None)?;
+
+    println!("✅ Migration complete");
+
+    Ok(())
+}
--- a/deny.toml
+++ b/deny.toml
@@ -23,7 +23,7 @@ vulnerability = "deny"
 unmaintained = "warn"
 yanked = "warn"
 notice = "warn"
-ignore = ["RUSTSEC-2023-0052"]
+ignore = []

 # This section is considered when running `cargo deny check licenses`
 # More documentation for the licenses section can be found here:
@@ -74,10 +74,30 @@ highlight = "all"
 workspace-default-features = "allow"
 external-default-features = "allow"
 allow = []
-deny = []
+
 skip = []
 skip-tree = []

+[[bans.deny]]
+# we use tokio, the same rationale applies for async-{io,waker,global-executor,executor,channel,lock}, smol
+# if you find yourself here while adding a dependency, try "default-features = false", ask around on #rust
+name = "async-std"
+
+[[bans.deny]]
+name = "async-io"
+
+[[bans.deny]]
+name = "async-waker"
+
+[[bans.deny]]
+name = "async-global-executor"
+
+[[bans.deny]]
+name = "async-executor"
+
+[[bans.deny]]
+name = "smol"
+
 # This section is considered when running `cargo deny check sources`.
 # More documentation about the 'sources' section can be found here:
 # https://embarkstudios.github.io/cargo-deny/checks/sources/cfg.html
--- a/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json
+++ b/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json
@@ -25,7 +25,7 @@
            },
            {
                "name": "wal_level",
-                "value": "replica",
+                "value": "logical",
                "vartype": "enum"
            },
            {
--- a/docker-compose/docker_compose_test.sh
+++ b/docker-compose/docker_compose_test.sh
@@ -30,7 +30,7 @@ cleanup() {
 echo "clean up containers if exists"
 cleanup

-for pg_version in 14 15; do
+for pg_version in 14 15 16; do
    echo "start containers (pg_version=$pg_version)."
    PG_VERSION=$pg_version docker compose -f $COMPOSE_FILE up --build -d

--- a/docs/error-handling.md
+++ b/docs/error-handling.md
@@ -188,11 +188,60 @@ that.

 ## Error message style

+### PostgreSQL extensions
+
 PostgreSQL has a style guide for writing error messages:

 https://www.postgresql.org/docs/current/error-style-guide.html

 Follow that guide when writing error messages in the PostgreSQL
-extension. We don't follow it strictly in the pageserver and
-safekeeper, but the advice in the PostgreSQL style guide is generally
-good, and you can't go wrong by following it.
+extensions.
+
+### Neon Rust code
+
+#### Anyhow Context
+
+When adding anyhow `context()`, use form `present-tense-verb+action`.
+
+Example:
+- Bad: `file.metadata().context("could not get file metadata")?;`
+- Good: `file.metadata().context("get file metadata")?;`
+
+#### Logging Errors
+
+When logging any error `e`, use `could not {e:#}` or `failed to {e:#}`.
+
+If `e` is an `anyhow` error and you want to log the backtrace that it contains,
+use `{e:?}` instead of `{e:#}`.
+
+#### Rationale
+
+The `{:#}` ("alternate Display") of an `anyhow` error chain is the concatenation of the contexts, separated by `: `.
+
+For example, the Rust code at the end of this section will result in the output
+```
+ERROR  failed to list users: load users from server: parse response: invalid json
+```
+
+This is more concise / less noisy than what happens if you do `.context("could not ...")?` at each level, i.e.:
+
+```
+ERROR  could not list users: could not load users from server: could not parse response: invalid json
+```
+
+
+```rust
+fn main() {
+  match list_users().context("list users") {
+    Ok(_) => ...,
+    Err(e) => tracing::error!("failed to {e:#}"),
+  }
+}
+fn list_users() {
+  http_get_users().context("load users from server")?;
+}
+fn http_get_users() {
+  let response = client....?;
+  response.parse().context("parse response")?; // fails with serde error "invalid json"
+}
+```
--- a/docs/pageserver-services.md
+++ b/docs/pageserver-services.md
@@ -96,6 +96,16 @@ prefix_in_bucket = '/test_prefix/'

 `AWS_SECRET_ACCESS_KEY` and `AWS_ACCESS_KEY_ID` env variables can be used to specify the S3 credentials if needed.

+or
+
+```toml
+[remote_storage]
+container_name = 'some-container-name'
+container_region = 'us-east'
+prefix_in_container = '/test-prefix/'
+```
+
+`AZURE_STORAGE_ACCOUNT` and `AZURE_STORAGE_ACCESS_KEY` env variables can be used to specify the Azure credentials if needed.

 ## Repository background tasks

--- a/docs/rfcs/023-the-state-of-pageserver-tenant-relocation.md
+++ b/docs/rfcs/023-the-state-of-pageserver-tenant-relocation.md
@@ -177,7 +177,7 @@ I e during migration create_branch can be called on old pageserver and newly cre

 The difference of simplistic approach from one described above is that it calls ignore on source tenant first and then calls attach on target pageserver. Approach above does it in opposite order thus opening a possibility for race conditions we strive to avoid.

-The approach largely follows this guide: <https://github.com/neondatabase/cloud/wiki/Cloud:-Ad-hoc-tenant-relocation>
+The approach largely follows this guide: <https://www.notion.so/neondatabase/Cloud-Ad-hoc-tenant-relocation-f687474f7bfc42269e6214e3acba25c7>

 The happy path sequence:

--- a/docs/rfcs/027-crash-consistent-layer-map-through-index-part.md
+++ b/docs/rfcs/027-crash-consistent-layer-map-through-index-part.md
@@ -0,0 +1,281 @@
+
+# Crash-Consistent Layer Map Updates By Leveraging `index_part.json`
+
+* Created on: Aug 23, 2023
+* Author: Christian Schwarz
+
+## Summary
+
+This RFC describes a simple scheme to make layer map updates crash consistent by leveraging the `index_part.json` in remote storage.
+Without such a mechanism, crashes can induce certain edge cases in which broadly held assumptions about system invariants don't hold.
+
+## Motivation
+
+### Background
+
+We can currently easily make complex, atomic updates to the layer map by means of an RwLock.
+If we crash or restart pageserver, we reconstruct the layer map from:
+1. local timeline directory contents
+2. remote `index_part.json` contents.
+
+The function that is responsible for this is called `Timeline::load_layer_map()`.
+The reconciliation process's behavior is the following:
+* local-only files will become part of the layer map as local-only layers and rescheduled for upload
+* For a file name that, by its name, is present locally and in the remote `index_part.json`, but where the local file has a different size (future: checksum) than the remote file, we will delete the local file and leave the remote file as a `RemoteLayer` in the layer map.
+
+### The Problem
+
+There are cases where we need to make an atomic update to the layer map that involves **more than one layer**.
+The best example is compaction, where we need to insert the L1 layers generated from the L0 layers, and remove the L0 layers.
+As stated above, making the update to the layer map in an atomic way is trivial.
+But, there is no system call API to make an atomic update to a directory that involves more than one file rename and deletion.
+Currently, we issue the system calls one by one and hope we don't crash.
+
+What happens if we crash and restart in the middle of that system call sequence?
+We will reconstruct the layer map according to the reconciliation process, taking as input whatever transitory state the timeline directory ended up in.
+
+We cannot roll back or complete the timeline directory update during which we crashed, because we keep no record of the changes we plan to make.
+
+### Problem's Implications For Compaction
+
+The implications of the above are primarily problematic for compaction.
+Specifically, the part of it that compacts L0 layers into L1 layers.
+
+Remember that compaction takes a set of L0 layers and reshuffles the delta records in them into L1 layer files.
+Once the L1 layer files are written to disk, it atomically removes the L0 layers from the layer map and adds the L1 layers to the layer map.
+It then deletes the L0 layers locally, and schedules an upload of the L1 layers and an updated index part.
+
+If we crash before deleting L0s, but after writing out L1s, the next compaction after restart will re-digest the L0s and produce new L1s.
+This means the compaction after restart will **overwrite** the previously written L1s.
+Currently we also schedule an S3 upload of the overwritten L1.
+
+If the compaction algorithm doesn't change between the two compaction runs, is deterministic, and uses the same set of L0s as input, then the second run will produce identical L1s and the overwrites will go unnoticed.
+
+*However*:
+1. the file size of the overwritten L1s may not be identical, and
+2. the bit pattern of the overwritten L1s may not be identical, and,
+3. in the future, we may want to make the compaction code non-deterministic, influenced by past access patterns, or otherwise change it, resulting in L1 overwrites with a different set of delta records than before the overwrite
+
+The items above are a problem for the [split-brain protection RFC](https://github.com/neondatabase/neon/pull/4919) because it assumes that layer files in S3 are only ever deleted, but never replaced (overPUTted).
+
+For example, if an unresponsive node A becomes active again after control plane has relocated the tenant to a new node B, the node A may overwrite some L1s.
+But node B based its world view on the version of node A's `index_part.json` from _before_ the overwrite.
+That earlier `index_part.json` contained the file size of the pre-overwrite L1.
+If the overwritten L1 has a different file size, node B will refuse to read data from the overwritten L1.
+Effectively, the data in the L1 has become inaccessible to node B.
+If node B already uploaded an index part itself, all subsequent attachments will use node B's index part, and run into the same problem.
+
+If we ever introduce checksums instead of checking just the file size, then a mismatching bit pattern (2) will cause similar problems.
+
+In case of (1) and (2), where we know that the logical content of the layers is still the same, we can recover by manually patching the `index_part.json` of the new node to the overwritten L1's file size / checksum.
+
+But if (3) ever happens, the logical content may be different, and, we could have truly lost data.
+
+Given the above considerations, we should avoid making correctness of split-brain protection dependent on overwrites preserving _logical_ layer file contents.
+**It is a much cleaner separation of concerns to require that layer files are truly immutable in S3, i.e., PUT once and then only DELETEd, never overwritten (overPUTted).**
+
+## Design
+
+Instead of reconciling a layer map from local timeline directory contents and remote index part, this RFC proposes to view the remote index part as authoritative during timeline load.
+Local layer files will be recognized if they match what's listed in remote index part, and removed otherwise.
+
+During **timeline load**, the only thing that matters is the remote index part content.
+Essentially, timeline load becomes much like attach, except we don't need to prefix-list the remote timelines.
+The local timeline dir's `metadata` file does not matter.
+The layer files in the local timeline dir are seen as a nice-to-have cache of layer files that are in the remote index part.
+Any layer files in the local timeline dir that aren't in the remote index part are removed during startup.
+The `Timeline::load_layer_map()` no longer "merges" local timeline dir contents with the remote index part.
+Instead, it treats the remote index part as the authoritative layer map.
+If the local timeline dir contains a layer that is in the remote index part, that's nice, and we'll re-use it if file size (and in the future, check sum) match what's stated in the index part.
+If it doesn't match, we remove the file from the local timeline dir.
+
+After load, **at runtime**, nothing changes compared to what we did before this RFC.
+The procedure for single- and multi-object changes is reproduced here for reference:
+* For any new layers that the change adds:
+  * Write them to a temporary location.
+  * While holding layer map lock:
+    * Move them to the final location.
+    * Insert into layer map.
+* Make the S3 changes.
+  We won't reproduce the remote timeline client method calls here because these are subject to change.
+  Instead we reproduce the sequence of s3 changes that must result for a given single-/multi-object change:
+    * PUT layer files inserted by the change.
+    * PUT an index part that has insertions and deletions of the change.
+    * DELETE the layer files that are deleted by the change.
+
+Note that it is safe for the DELETE to be deferred arbitrarily.
+* If it never happens, we leak the object, but, that's not a correctness concern.
+* As of #4938, we don't schedule the remote timeline client operation for deletion immediately, but, only when we drop the `LayerInner`.
+* With the [split-brain protection RFC](https://github.com/neondatabase/neon/pull/4919), the deletions will be written to deletion queue for processing when it's safe to do so (see the RFC for details).
+
+## How This Solves The Problem
+
+If we crash before we've finished the S3 changes, then timeline load will reset layer map to the state that's in the S3 index part.
+The S3 change sequence above is obviously crash-consistent.
+If we crash before the index part PUT, then we leak the inserted layer files to S3.
+If we crash after the index part PUT, we leak the to-be-DELETEd layer files to S3.
+Leaking is fine, it's a pre-existing condition and not addressed in this RFC.
+
+Multi-object changes that previously created and removed files in timeline dir are now atomic because the layer map updates are atomic and crash consistent:
+* atomic layer map update at runtime, currently by using an RwLock in write mode
+* atomic `index_part.json` update in S3, as per guarantee that S3 PUT is atomic
+* local timeline dir state:
+  * irrelevant for layer map content => irrelevant for atomic updates / crash consistency
+  * if we crash after index part PUT, local layer files will be used, so, no on-demand downloads needed for them
+  * if we crash before index part PUT, local layer files will be deleted
+
+## Trade-Offs
+
+### Fundamental
+
+If we crash before finishing the index part PUT, we lose all the work that hasn't reached the S3 `index_part.json`:
+* wal ingest: we lose not-yet-uploaded L0s; load on the **safekeepers** + work for pageserver
+* compaction: we lose the entire compaction iteration work; need to re-do it again
+* gc: no change to what we have today
+
+If the work is still deemed necessary after restart, the restarted pageserver will re-do this work.
+The amount of work to re-do is capped by how far the S3 changes lag behind the local changes.
+Assuming upload queue allows for unlimited queue depth (that's what it does today), this means:
+* on-demand downloads that were needed to do the work: are likely still present, not lost
+* wal ingest: currently unbounded
+* L0 => L1 compaction: CPU time proportional to `O(sum(L0 size))` and upload work proportional to `O(sum(L1 size))`
+  * Compaction threshold is 10 L0s and each L0 can be up to 256M in size. Target size for L1 is 128M.
+  * In practice, most L0s are tiny due to the 10-minute `DEFAULT_CHECKPOINT_TIMEOUT`.
+* image layer generation: CPU time `O(sum(input data))` + upload work `O(sum(new image layer size))`
+  * I have no intuition how expensive / long-running it is in reality.
+* gc: `update_gc_info` work (not substantial, AFAIK)
+
+To limit the amount of lost upload work, and ingest work, we can limit the upload queue depth (see suggestions in the next sub-section).
+However, to limit the amount of lost CPU work, we would need a way to make the compaction/image-layer-generation algorithms interruptible & resumable.
+We aren't there yet, the need for it is tracked by ([#4580](https://github.com/neondatabase/neon/issues/4580)).
+However, this RFC is not constraining the design space either.
+
+### Practical
+
+#### Pageserver Restarts
+
+Pageserver crashes are very rare; it would likely be acceptable to re-do the lost work in that case.
+However, regular pageserver restarts happen frequently, e.g., during weekly deploys.
+
+In general, pageserver restart faces the problem of tenants that "take too long" to shut down.
+They are a problem because other tenants that shut down quickly are unavailable while we wait for the slow tenants to shut down.
+We currently allot 10 seconds for graceful shutdown until we SIGKILL the pageserver process (as per `pageserver.service` unit file).
+A longer budget would expose tenants that are done early to a longer downtime.
+A short budget would risk throwing away more work that'd have to be re-done after restart.
+
+In the context of this RFC, killing the process would mean losing the work that hasn't made it to S3.
+We can mitigate this problem as follows:
+0. initially, by accepting that we need to do the work again
+1. short-term, introducing measures to cap the amount of in-flight work:
+
+   - cap upload queue length, use backpressure to slow down compaction
+   - disabling compaction/image-layer-generation X minutes before `systemctl restart pageserver`
+   - introducing a read-only shutdown state for tenants that are fast to shut down;
+     that state would be equivalent to the state of a tenant in hot standby / readonly mode.
+
+2. mid term, by not restarting pageserver in place, but using [*seamless tenant migration*](https://github.com/neondatabase/neon/pull/5029) to drain a pageserver's tenants before we restart it.
+
+#### `disk_consistent_lsn` can go backwards
+
+`disk_consistent_lsn` can go backwards across restarts if we crash before we've finished the index part PUT.
+Nobody should care about it, because the only thing that matters is `remote_consistent_lsn`.
+Compute certainly doesn't care about `disk_consistent_lsn`.
+
+
+## Side-Effects Of This Design
+
+* local `metadata` is basically reduced to a cache of which timelines exist for this tenant; i.e., we can avoid a `ListObjects` request for a tenant's timelines during tenant load.
+
+## Limitations
+
+Multi-object changes that span multiple timelines aren't covered by this RFC.
+That's fine because we currently don't need them, as evidenced by the absence
+of a Pageserver operation that holds multiple timelines' layer map lock at a time.
+
+## Impacted components
+
+Primarily pageservers.
+
+Safekeepers will experience more load when we need to re-ingest WAL because we've thrown away work.
+No changes to safekeepers are needed.
+
+## Alternatives considered
+
+### Alternative 1: WAL
+
+We could have a local WAL for timeline dir changes, as proposed here https://github.com/neondatabase/neon/issues/4418 and partially implemented here https://github.com/neondatabase/neon/pull/4422 .
+The WAL would be used to
+1. make multi-object changes atomic
+2. replace `reconcile_with_remote()` reconciliation: scheduling of layer upload would be part of WAL replay.
+
+The WAL is appealing in a local-first world, but, it's much more complex than the design described above:
+* New on-disk state to get right.
+* Forward- and backward-compatibility development costs in the future.
+
+### Alternative 2: Flow Everything Through `index_part.json`
+
+We could have gone to the other extreme and **only** update the layer map whenever we've PUT `index_part.json`.
+I.e., layer map would always be the last-persisted S3 state.
+That's axiomatically beautiful, not least because it fully separates the layer file production and consumption path (=> [layer file spreading proposal](https://www.notion.so/neondatabase/One-Pager-Layer-File-Spreading-Christian-eb6b64182a214e11b3fceceee688d843?pvs=4)).
+And it might make hot standbys / read-only pageservers less of a special case in the future.
+
+But, I have some uncertainties with regard to WAL ingestion, because it needs to be able to do some reads for the logical size feedback to safekeepers.
+
+And it's silly that we wouldn't be able to use the results of compaction or image layer generation before we're done with the upload.
+
+Lastly, a temporarily clogged-up upload queue (e.g. S3 is down) shouldn't immediately render ingestion unavailable.
+
+### Alternative 3: Sequence Numbers For Layers
+
+Instead of what's proposed in this RFC, we could use unique numbers to identify layer files:
+
+```
+# before
+tenants/$tenant/timelines/$timeline/$key_and_lsn_range
+# after
+tenants/$tenant/timelines/$timeline/$layer_file_id-$key_and_lsn_range
+```
+
+To guarantee uniqueness, the unique number is a sequence number, stored in `index_part.json`.
+
+This alternative does not solve atomic layer map updates.
+In our crash-during-compaction scenario above, the compaction run after the crash will not overwrite the L1s, but write/PUT new files with new sequence numbers.
+In fact, this alternative makes it worse because the data is now duplicated in the not-overwritten and overwritten L1 layer files.
+We'd need to write a deduplication pass that checks if perfectly overlapping layers have identical contents.
+
+However, this alternative is appealing because it systematically prevents overwrites at a lower level than this RFC.
+
+So, this alternative is sufficient for the needs of the split-brain safety RFC (immutable layer files locally and in S3).
+But it doesn't solve the problems with crash-during-compaction outlined earlier in this RFC, and in fact, makes it much more acute.
+The proposed design in this RFC addresses both.
+
+So, if this alternative sounds appealing, we should implement the proposal in this RFC first, then implement this alternative on top.
+That way, we avoid a phase where the crash-during-compaction problem is acute.
+
+## Related issues
+
+- https://github.com/neondatabase/neon/issues/4749
+- https://github.com/neondatabase/neon/issues/4418
+  - https://github.com/neondatabase/neon/pull/4422
+- https://github.com/neondatabase/neon/issues/5077
+- https://github.com/neondatabase/neon/issues/4088
+  - (re)resolutions:
+    - https://github.com/neondatabase/neon/pull/4696
+    - https://github.com/neondatabase/neon/pull/4094
+      - https://neondb.slack.com/archives/C033QLM5P7D/p1682519017949719
+
+Note that the test case introduced in https://github.com/neondatabase/neon/pull/4696/files#diff-13114949d1deb49ae394405d4c49558adad91150ba8a34004133653a8a5aeb76 will produce L1s with the same logical content, but, as outlined in the last paragraph of the _Problem Statement_ section above, we don't want to make that  assumption in order to fix the problem.
+
+
+## Implementation Plan
+
+1. Remove support for `remote_storage=None`, because we now rely on the existence of an index part.
+
+    - The nasty part here is to fix all the tests that fiddle with the local timeline directory.
+      Possibly they are just irrelevant with this change, but, each case will require inspection.
+
+2. Implement the design above.
+
+    - Initially, ship without the mitigations for restart and accept we will do some work twice.
+    - Measure the impact and implement one of the mitigations.
+
--- a/docs/rfcs/028-pageserver-migration.md
+++ b/docs/rfcs/028-pageserver-migration.md
@@ -0,0 +1,599 @@
+# Seamless tenant migration
+
+- Author: john@neon.tech
+- Created on 2023-08-11
+- Implemented on ..
+
+## Summary
+
+The preceding [generation numbers RFC](025-generation-numbers.md) may be thought of as "making tenant
+migration safe". Following that,
+this RFC is about how those migrations are to be done:
+
+1. Seamlessly (without interruption to client availability)
+2. Quickly (enabling faster operations)
+3. Efficiently (minimizing I/O and $ cost)
+
+These points are in priority order: if we have to sacrifice
+efficiency to make a migration seamless for clients, we will
+do so, etc.
+
+This is accomplished by introducing two high level changes:
+
+- A dual-attached state for tenants, used in a control-plane-orchestrated
+  migration procedure that preserves availability during a migration.
+- Warm secondary locations for tenants, where on-disk content is primed
+  for a fast migration of the tenant from its current attachment to this
+  secondary location.
+
+## Motivation
+
+Migrating tenants between pageservers is essential to operating a service
+at scale, in several contexts:
+
+1. Responding to a pageserver node failure by migrating tenants to other pageservers
+2. Balancing load and capacity across pageservers, for example when a user expands their
+   database and they need to migrate to a pageserver with more capacity.
+3. Restarting pageservers for upgrades and maintenance
+
+The current steps for migration are:
+
+- detach from old node; skip if old node is dead; (the [skip part is still WIP](https://github.com/neondatabase/cloud/issues/5426)).
+- attach to new node
+- re-configure endpoints to use the new node
+
+Once [generation numbers](025-generation-numbers.md) are implemented,
+the detach step is no longer critical for correctness. So, we can
+
+- attach to a new node,
+- re-configure endpoints to use the new node, and then
+- detach from the old node.
+
+However, this still does not meet our seamless/fast/efficient goals:
+
+- Not fast: The new node will have to download potentially large amounts
+  of data from S3, which may take many minutes.
+- Not seamless: If we attach to a new pageserver before detaching an old one,
+  the new one might delete some objects that interrupt availability of reads on the old one.
+- Not efficient: the old pageserver will continue uploading
+  S3 content during the migration that will never be read.
+
+The user expectations for availability are:
+
+- For planned maintenance, there should be zero availability
+  gap. This expectation is fulfilled by this RFC.
+- For unplanned changes (e.g. node failures), there should be
+  minimal availability gap. This RFC provides the _mechanism_
+  to fail over quickly, but does not provide the failure _detection_
+  nor failover _policy_.
+
+## Non Goals
+
+- Defining service tiers with different storage strategies: the same
+  level of HA & overhead will apply to all tenants. This doesn't rule out
+  adding such tiers in future.
+- Enabling pageserver failover in the absence of a control plane: the control
+  plane will remain the source of truth for what should be attached where.
+- Totally avoiding availability gaps on unplanned migrations during
+  a failure (we expect a small, bounded window of
+  read unavailability of very recent LSNs)
+- Workload balancing: this RFC defines the mechanism for moving tenants
+  around, not the higher level logic for deciding who goes where.
+- Defining all possible configuration flows for tenants: the migration process
+  defined in this RFC demonstrates the sufficiency of the pageserver API, but
+  is not the only kind of configuration change the control plane will ever do.
+  The APIs defined here should let the control plane move tenants around in
+  whatever way is needed while preserving data safety and read availability.
+
+## Impacted components
+
+Pageserver, control plane
+
+## Terminology
+
+- **Attachment**: a tenant is _attached_ to a pageserver if it has
+  been issued a generation number, and is running an instance of
+  the `Tenant` type, ingesting the WAL, and available to serve
+  page reads.
+- **Location**: locations are a superset of attachments. A location
+  is a combination of a tenant and a pageserver. We may _attach_ at a _location_.
+
+- **Secondary location**: a location which is not currently attached.
+- **Warm secondary location**: a location which is not currently attached, but is endeavoring to maintain a warm local cache of layers. We avoid calling this a _warm standby_ to avoid confusion with similar postgres features.
+
+## Implementation (high level)
+
+### Warm secondary locations
+
+To enable faster migrations, we will identify at least one _secondary location_
+for each tenant. This secondary location will keep a warm cache of layers
+for the tenant, so that if it is later attached, it can catch up with the
+latest LSN quickly: rather than downloading everything, it only has to replay
+the recent part of the WAL to advance from the `remote_consistent_lsn` to the
+most recent LSN in the WAL.
+
+The control plane is responsible for selecting secondary locations, and
+calling into pageservers to configure tenants into a secondary mode at this
+new location, as well as attaching the tenant in its existing primary location.
+
+The attached pageserver for a tenant will publish a [layer heatmap](#layer-heatmap)
+to advise secondaries of which layers should be downloaded.
+
+### Location modes
+
+Currently, we consider a tenant to be in one of two states on a pageserver:
+
+- Attached: active `Tenant` object, and layers on local disk
+- Detached: no layers on local disk, no runtime state.
+
+We will extend this with finer-grained modes, whose purpose will become
+clear in later sections:
+
+- **AttachedSingle**: equivalent to the existing attached state.
+- **AttachedMulti**: like AttachedSingle, holds an up to date generation, but
+  does not do deletions.
+- **AttachedStale**: like AttachedSingle, but holds a stale generation
+  and does not do any remote storage operations.
+- **Secondary**: keep local state on disk, periodically update from S3.
+- **Detached**: equivalent to existing detached state.
+
+To control these finer grained states, a new pageserver API endpoint will be added.
+
+### Cutover procedure
+
+Define old location and new location as "Node A" and "Node B". Consider
+the case where both nodes are available, and Node B was previously configured
+as a secondary location for the tenant we are migrating.
+
+The cutover procedure is orchestrated by the control plane, calling into
+the pageservers' APIs:
+
+1. Call to Node A requesting it to flush to S3 and enter AttachedStale state
+2. Increment generation, and call to Node B requesting it to enter AttachedMulti
+   state with the new generation.
+3. Call to Node B, requesting it to download the latest hot layers from remote storage,
+   according to the latest heatmap flushed by Node A.
+4. Wait for Node B's WAL ingestion to catch up with node A's
+5. Update endpoints to use node B instead of node A
+6. Call to node B requesting it to enter state AttachedSingle.
+7. Call to node A requesting it to enter state Secondary
+
+The following table summarizes how the state of the system advances:
+
+|     Step      |     Node A     |     Node B     | Node used by endpoints |
+| :-----------: | :------------: | :------------: | :--------------------: |
+| 1 (_initial_) | AttachedSingle |   Secondary    |           A            |
+|       2       | AttachedStale  | AttachedMulti  |           A            |
+|       3       | AttachedStale  | AttachedMulti  |           A            |
+|       4       | AttachedStale  | AttachedMulti  |           A            |
+| 5 (_cutover_) | AttachedStale  | AttachedMulti  |           B            |
+|       6       | AttachedStale  | AttachedSingle |           B            |
+|  7 (_final_)  |   Secondary    | AttachedSingle |           B            |
+
+The procedure described for a clean handover from a live node to a secondary
+is also used for failure cases and for migrations to a location that is not
+configured as a secondary, by simply skipping irrelevant steps, as described in
+the following sections.
+
+#### Migration from an unresponsive node
+
+If node A is unavailable, then all calls into
+node A are skipped and we don't wait for B to catch up before
+updating the endpoints to use B.
+
+#### Migration to a location that is not a secondary
+
+If node B is initially in Detached state, the procedure is identical. Since Node B
+is coming from a Detached state rather than Secondary, the download of layers and
+catch up with WAL will take much longer.
+
+We might do this if:
+
+- Attached and secondary locations are both critically low on disk, and we need
+  to migrate to a third node with more resources available.
+- We are migrating a tenant which does not use secondary locations to save on cost.
+
+#### Permanent migration away from a node
+
+In the final step of the migration, we generally request the original node to enter a Secondary
+state. This is typical if we are doing a planned migration during maintenance, or to
+balance CPU/network load away from a node.
+
+One might also want to permanently migrate away: this can be done by simply removing the secondary
+location after the migration is complete, or as an optimization by substituting the Detached state
+for the Secondary state in the final step.
+
+#### Cutover diagram
+
+```mermaid
+sequenceDiagram
+participant CP as Control plane
+participant A as Node A
+participant B as Node B
+participant E as Endpoint
+
+CP->>A: PUT Flush & go to AttachedStale
+note right of A: A continues to ingest WAL
+CP->>B: PUT AttachedMulti
+CP->>B: PUT Download layers from latest heatmap
+note right of B: B downloads from S3
+loop Poll until download complete
+CP->>B: GET download status
+end
+activate B
+note right of B: B ingests WAL
+loop Poll until catch up
+CP->>B: GET visible WAL
+CP->>A: GET visible WAL
+end
+deactivate B
+CP->>E: Configure to use Node B
+E->>B: Connect for reads
+CP->>B: PUT AttachedSingle
+CP->>A: PUT Secondary
+```
+
+#### Cutover from an unavailable pageserver
+
+This case is far simpler: we may skip straight to our intended
+end state.
+
+```mermaid
+sequenceDiagram
+participant A as Node A
+participant CP as Control plane
+participant B as Node B
+participant E as Endpoint
+
+note right of A: Node A offline
+activate A
+CP->>B: PUT AttachedSingle
+CP->>E: Configure to use Node B
+E->>B: Connect for reads
+deactivate A
+```
+
+## Implementation (detail)
+
+### Purpose of AttachedMulti, AttachedStale
+
+#### AttachedMulti
+
+Ordinarily, an attached pageserver whose generation is the latest may delete
+layers at will (e.g. during compaction). If a previous generation pageserver
+is also still attached, and in use by endpoints, then this layer deletion could
+lead to a loss of availability for the endpoint when reading from the previous
+generation pageserver.
+
+The _AttachedMulti_ state simply disables deletions. These will be enqueued
+in `RemoteTimelineClient` until the control plane transitions the
+node into AttachedSingle, which unblocks deletions.  Other remote storage operations
+such as uploads are not blocked.
+
+AttachedMulti is not required for data safety, only to preserve availability
+on pageservers running with stale generations.
+
+A node enters AttachedMulti only when explicitly asked to by the control plane. It should
+only remain in this state for the duration of a migration.
+
+If a control plane bug leaves
+the node in AttachedMulti for a long time, then we must avoid unbounded memory use from enqueued
+deletions. This may be accomplished simply, by dropping enqueued deletions when some modest
+threshold of delayed deletions (e.g. 10k layers per tenant) is reached. As with all deletions,
+it is safe to skip them, and the leaked objects will be eventually cleaned up by scrub or
+by timeline deletion.
+
+During AttachedMulti, the Tenant is free to drop layers from local disk in response to
+disk pressure: only the deletion of remote layers is blocked.
+
+#### AttachedStale
+
+Currently, a pageserver with a stale generation number will continue to
+upload layers, but be prevented from completing deletions. This is safe, but inefficient: layers uploaded by this stale generation
+will not be read back by future generations of pageservers.
+
+The _AttachedStale_ state disables S3 uploads. The stale pageserver
+will continue to ingest the WAL and write layers to local disk, but will not
+do any uploads to S3.
+
+A node may enter AttachedStale in two ways:
+
+- Explicitly, when control plane calls into the node at the start of a migration.
+- Implicitly, when the node tries to validate some deletions and discovers
+  that its generation is stale.
+
+The AttachedStale state also disables sending consumption metrics from
+that location: it is interpreted as an indication that some other pageserver
+is already attached or is about to be attached, and that new pageserver will
+be responsible for sending consumption metrics.
+
+#### Disk Pressure & AttachedStale
+
+Over long periods of time, a tenant location in AttachedStale will accumulate data
+on local disk, as it cannot evict any layers written since it entered the
+AttachedStale state. We rely on the control plane to revert the location to
+Secondary or Detached at the end of a migration.
+
+This scenario is particularly noteworthy when evacuating all tenants on a pageserver:
+since _all_ the attached tenants will go into AttachedStale, we will be doing no
+uploads at all, therefore ingested data will cause disk usage to increase continuously.
+Under nominal conditions, the available disk space on pageservers should be sufficient
+to complete the evacuation before this becomes a problem, but we must also handle
+the case where we hit a low disk situation while in this state.
+
+The concept of disk pressure already exists in the pageserver: the `disk_usage_eviction_task`
+touches each Tenant when it determines that a low-disk condition requires
+some layer eviction. Having selected layers for eviction, the eviction
+task calls `Timeline::evict_layers`.
+
+**Safety**: If evict_layers is called while in AttachedStale state, and some of the to-be-evicted
+layers are not yet uploaded to S3, then the block on uploads will be lifted. This
+will result in leaking some objects once a migration is complete, but will enable
+the node to manage its disk space properly: if a node is left with some tenants
+in AttachedStale indefinitely due to a network partition or control plane bug,
+these tenants will not cause a full disk condition.
+
+### Warm secondary updates
+
+#### Layer heatmap
+
+The secondary location's job is to serve reads **with the same quality of service as the original location
+was serving them around the time of a migration**. This does not mean the secondary
+location needs the whole set of layers: inactive layers that might soon
+be evicted on the attached pageserver need not be downloaded by the
+secondary. A totally idle tenant only needs to maintain enough on-disk
+state to enable a fast cold start (i.e. the most recent image layers are
+typically sufficient).
+
+To enable this, we introduce the concept of a _layer heatmap_, which
+acts as an advisory input to secondary locations to decide which
+layers to download from S3.
+
+#### Attached pageserver
+
+The attached pageserver, if in state AttachedSingle, periodically
+uploads a serialized heat map to S3. It may skip this if there
+is no change since the last time it uploaded (e.g. if the tenant
+is totally idle).
+
+Additionally, when the tenant is flushed to remote storage prior to a migration
+(the first step in [cutover procedure](#cutover-procedure)), 
+the heatmap is written out. This enables a future attached pageserver
+to get an up to date view when deciding which layers to download.
+
+#### Secondary location behavior
+
+Secondary warm locations run a simple loop, implemented separately from
+the main `Tenant` type, which represents attached tenants:
+
+- Download the layer heatmap
+- Select any "hot enough" layers to download, if there is sufficient
+  free disk space.
+- Download layers, if they were not previously evicted (see below)
+- Download the latest index_part.json
+- Check if any layers currently on disk are no longer referenced by
+  IndexPart & delete them
+
+Note that the heatmap is only advisory: if a secondary location has plenty
+of disk space, it may choose to retain layers that aren't referenced
+by the heatmap, as long as they are still referenced by the IndexPart. Conversely,
+if a node is very low on disk space, it might opt to raise the heat threshold required
+to download a layer, until more disk space is available.
+
+#### Secondary locations & disk pressure
+
+Secondary locations are subject to eviction on disk pressure, just as
+attached locations are.  For eviction purposes, the access time of a
+layer in a secondary location will be the access time given in the heatmap,
+rather than the literal time at which the local layer file was accessed.
+
+The heatmap will indicate which layers are in local storage on the attached
+location.  The secondary will always attempt to get back to having that
+set of layers on disk, but to avoid flapping, it will remember the access
+time of the layer it was most recently asked to evict, and layers whose
+access time is below that will not be re-downloaded.
+
+The resulting behavior is that after a layer is evicted from a secondary
+location, it is only re-downloaded once the attached pageserver accesses
+the layer and uploads a heatmap reflecting that access time.  On a pageserver
+restart, the secondary location will attempt to download all layers in
+the heatmap again, if they are not on local disk.
+
+This behavior will be slightly different when secondary locations are
+used for "low energy tenants", but that is beyond the scope of this RFC.
+
+### Location configuration API
+
+Currently, the `/tenant/<tenant_id>/config` API defines various
+tunables like compaction settings, which apply to the tenant irrespective
+of which pageserver it is running on.
+
+A new "location config" structure will be introduced, which defines
+configuration which is per-tenant, but local to a particular pageserver,
+such as the attachment mode and whether it is a secondary.
+
+The pageserver will expose a new per-tenant API for setting
+the state: `/tenant/<tenant_id>/location/config`.
+
+Body content:
+
+```
+{
+  state: 'enum{Detached, Secondary, AttachedSingle, AttachedMulti, AttachedStale}',
+  generation: Option<u32>,
+  configuration: Option<TenantConfig>,
+  flush: bool
+}
+```
+
+Existing `/attach` and `/detach` endpoints will have the same
+behavior as calling `/location/config` with `AttachedSingle` and `Detached`
+states respectively. These endpoints will be deprecated and later
+removed.
+
+The generation attribute is mandatory for entering `AttachedSingle` or
+`AttachedMulti`.
+
+The configuration attribute is mandatory when entering any state other
+than `Detached`. This configuration is the same as the body for
+the existing `/tenant/<tenant_id>/config` endpoint.
+
+The `flush` argument indicates whether the pageservers should flush
+to S3 before proceeding: this only has any effect if the node is
+currently in AttachedSingle or AttachedMulti. This is used
+during the first phase of migration, when transitioning the
+old pageserver to AttachedStale.
+
+The `/re-attach` API response will be extended to include a `state` as
+well as a `generation`, enabling the pageserver to enter the
+correct state for each tenant on startup.
+
+### Database schema for locations
+
+A new table `ProjectLocation`:
+
+- pageserver_id: int
+- tenant_id: TenantId
+- generation: `Option<int>`
+- state: `enum(Secondary, AttachedSingle, AttachedMulti)`
+
+Notes:
+
+- It is legal for a Project to have zero `ProjectLocation`s
+- The `pageserver` column in `Project` now means "to which pageserver should
+  endpoints connect", rather than simply which pageserver is attached.
+- The `generation` column in `Project` remains, and is incremented and used
+  to set the generation of `ProjectLocation` rows when they are set into
+  an attached state.
+- The `Detached` state is implicitly represented as the absence of
+  a `ProjectLocation`.
+
+### Executing migrations
+
+Migrations will be implemented as Go functions, within the
+existing `Operation` framework in the control plane. These
+operations are persistent, such that they will always keep
+trying until completion: this property is important to avoid
+leaving garbage behind on pageservers, such as AttachedStale
+locations.
+
+### Recovery from failures during migration
+
+During migration, the control plane may encounter failures of either
+the original or new pageserver, or both:
+
+- If the original fails, skip past waiting for the new pageserver
+  to catch up, and put it into AttachedSingle immediately.
+- If the new node fails, put the old pageserver into Secondary
+  and then back into AttachedSingle (this has the effect of
+  retaining on-disk state and granting it a fresh generation number).
+- If both nodes fail, keep trying until one of them is available
+  again.
+
+### Control plane -> Pageserver reconciliation
+
+A migration may be done while the old node is unavailable,
+in which case the old node may still be running in an AttachedStale
+state.
+
+In this case, it is undesirable to have the migration `Operation`
+stay alive until the old node eventually comes back online
+and can be cleaned up. To handle this, the control plane
+should run a background reconciliation process to compare
+a pageserver's attachments with the database, and clean up
+any that shouldn't be there any more.
+
+Note that there will be no work to do if the old node was really
+offline, as during startup it will call into `/re-attach` and
+be updated that way. The reconciliation will only be needed
+if the node was unavailable but still running.
+
+## Alternatives considered
+
+### Only enabling secondary locations for tenants on a higher service tier
+
+This will make sense in future, especially for tiny databases that may be
+downloaded from S3 in milliseconds when needed.
+
+However, it is not wise to do it immediately, because pageservers contain
+a mixture of higher and lower tier workloads. If we had 1 tenant with
+a secondary location and 9 without, then those other 9 tenants will do
+a lot of I/O as they try to recover from S3, which may degrade the
+service of the tenant which had a secondary location.
+
+Until we segregate tenants of different service tiers onto different pageserver
+nodes, or implement & test QoS to ensure that tenants with secondaries are
+not harmed by tenants without, we should use the same failover approach
+for all the tenants.
+
+### Hot secondary locations (continuous WAL replay)
+
+Instead of secondary locations populating their caches from S3, we could
+have them consume the WAL from safekeepers. The downsides of this would be:
+
+- Double load on safekeepers, which are a less scalable service than S3
+- Secondary locations' on-disk state would end up subtly different to
+  the remote state, which would make synchronizing with S3 more complex/expensive
+  when going into attached state.
+
+The downside of only updating secondary locations from S3 is that we will
+have a delay during migration from replaying the LSN range between what's
+in S3 and what's in the pageserver. This range will be very small on
+planned migrations, as we have the old pageserver flush to S3 immediately
+before attaching the new pageserver. On unplanned migrations (old pageserver
+is unavailable), the range of LSNs to replay is bounded by the flush frequency
+on the old pageserver. However, the migration doesn't have to wait for the
+replay: it's just that not-yet-replayed LSNs will be unavailable for read
+until the new pageserver catches up.
+
+We expect that pageserver reads of the most recent LSNs will be relatively
+rare, as for an active endpoint those pages will usually still be in the postgres
+page cache: this leads us to prefer synchronizing from S3 on secondary
+locations, rather than consuming the WAL from safekeepers.
+
+### Cold secondary locations
+
+It is not functionally necessary to keep warm caches on secondary locations at all. However, if we do not, then
+we would experience a de-facto availability loss in unplanned migrations, as reads to the new node would take an extremely long time (many seconds, perhaps minutes).
+
+Warm caches on secondary locations are necessary to meet
+our availability goals.
+
+### Pageserver-granularity failover
+
+Instead of migrating tenants individually, we could have entire spare nodes,
+and on a node death, move all its work to one of these spares.
+
+This approach is avoided for several reasons:
+
+- we would still need fine-grained tenant migration for other
+  purposes such as balancing load
+- by sharing the spare capacity over many peers rather than one spare node,
+  these peers may use the capacity for other purposes, until it is needed
+  to handle migrated tenants. e.g. for keeping a deeper cache of their
+  attached tenants.
+
+### Readonly during migration
+
+We could simplify migrations by making both previous and new nodes go into a
+readonly state, then flush remote content from the previous node, then activate
+attachment on the secondary node.
+
+The downside to this approach is a potentially large gap in readability of
+recent LSNs while loading data onto the new node. To avoid this, it is worthwhile
+to incur the extra cost of double-replaying the WAL onto old and new nodes' local
+storage during a migration.
+
+### Peer-to-peer pageserver communication
+
+Rather than uploading the heatmap to S3, attached pageservers could make it
+available to peers.
+
+Currently, pageservers have no peer to peer communication, so adding this
+for heatmaps would incur significant overhead in deployment and configuration
+of the service, and ensuring that when a new pageserver is deployed, other
+pageservers are updated to be aware of it.
+
+As well as simplifying implementation, putting heatmaps in S3 will be useful
+for future analytics purposes -- gathering aggregated statistics on activity
+patterns across many tenants may be done directly from data in S3.
--- a/docs/rfcs/029-pageserver-wal-disaster-recovery.md
+++ b/docs/rfcs/029-pageserver-wal-disaster-recovery.md
@@ -0,0 +1,205 @@
+# Name
+
+Created on: 2023-09-08
+Author: Arpad Müller
+
+## Summary
+
+Enable the pageserver to recover from data corruption events by implementing
+a feature to re-apply historic WAL records in parallel to the already occurring
+WAL replay.
+
+The feature is outside of the user-visible backup and history story, and only
+serves as a second-level backup for the case that there is a bug in the
+pageservers that corrupted the served pages.
+
+The RFC proposes the addition of two new features:
+* recover a broken branch from WAL (downtime is allowed)
+* a test recovery system to recover random branches to make sure recovery works
+
+## Motivation
+
+The historic WAL is currently stored in S3 even after it has been replayed by
+the pageserver and thus been integrated into the pageserver's storage system.
+This is done to defend from data corruption failures inside the pageservers.
+
+However, application of this WAL in the disaster recovery setting is currently
+very manual and we want to automate this to make it easier.
+
+### Use cases
+
+There are various use cases for this feature, like:
+
+* The main motivation is replaying in the instance of pageservers corrupting
+  data.
+* We might want to, beyond the user-visible history features, through our
+  support channels and upon customer request, in select instances, recover
+  historic versions beyond the range of history that we officially support.
+* Running the recovery process in the background for random tenant timelines
+  to figure out if there was a corruption of data (we would compare with what
+  the pageserver stores for the "official" timeline).
+* Using the WAL to arrive at historic pages we can then back up to S3 so that
+  WAL itself can be discarded, or at least not used for future replays.
+  Again, this sounds a lot like what the pageserver is already doing, but the
+  point is to provide a fallback to the service provided by the pageserver.
+
+## Design
+
+### Design constraints
+
+The main design constraint is that the feature needs to be *simple* enough that
+the number of bugs is as low, and reliability as high, as possible: the main
+goal of this endeavour is to achieve higher correctness than the pageserver.
+
+For the background process, we cannot afford a downtime of the timeline that is
+being cloned, as we don't want to restrict ourselves to offline tenants only.
+In the scenario where we want to recover from disasters or roll back to a
+historic lsn through support staff, downtimes are more affordable, and
+inevitable if the original had been subject to the corruption. Ideally, the
+two code paths would share code, so the solution would be designed for not
+requiring downtimes.
+
+### API endpoint changes
+
+This RFC proposes two API endpoint changes in the safekeeper and the
+pageserver.
+
+Recall that the pageserver's timeline creation API endpoint is at this URL:
+
+```
+/v1/tenant/{tenant_id}/timeline/
+```
+
+Where `{tenant_id}` is the ID of the tenant the timeline is created for,
+and specified as part of the URL. The timeline ID is passed via the POST
+request body as the only required parameter `new_timeline_id`.
+
+This proposal adds one optional parameter called
+`existing_initdb_timeline_id` to the request's json body. If the parameter
+is not specified, behaviour should be as existing, so the pageserver runs
+initdb.
+If the parameter is specified, it is expected to point to a timeline ID.
+In fact that ID might match `new_timeline_id`, what's important is that
+S3 storage contains a matching initdb under the URL matching the given
+tenant and timeline.
+
+Having both `ancestor_timeline_id` and `existing_initdb_timeline_id`
+specified is illegal and will result in an HTTP error. This feature is
+only meant for the "main" branch that doesn't have any ancestors
+of its own, as only here initdb is relevant.
+
+For the safekeeper, we propose the addition of the following copy endpoint:
+
+```
+/v1/tenant/{tenant_id}/timeline/{source_timeline_id}/copy
+```
+It is meant for POST requests with json, and takes the two URL parameters
+`tenant_id` and `source_timeline_id`. The json request body contains
+the two required parameters `target_timeline_id` and `until_lsn`.
+
+After invoking, the copy endpoint starts a copy process of the WAL from
+the source ID to the target ID. The lsn is updated according to the
+progress of the API call.
+
+### Higher level features
+
+We want the API changes to support the following higher level features:
+
+* recovery-after-corruption DR of the main timeline of a tenant. This
+  feature allows for downtime.
+* test DR of the main timeline into a special copy timeline. This feature
+  is meant to run against selected production tenants in the background,
+  without the user noticing, so it does not allow for downtime.
+
+The recovery-after-corruption DR only needs the pageserver changes.
+It works as follows:
+
+* delete the timeline from the pageservers via timeline deletion API
+* re-create it via timeline creation API (same ID as before) and set
+  `existing_initdb_timeline_id` to the same timeline ID
+
+The test DR also requires the copy primitive and works as follows:
+
+* copy the WAL of the timeline to a new place
+* create a new timeline for the tenant
+
+## Non Goals
+
+At the risk of being repetitive, the main goal of this feature is to be a
+backup method, so reliability is very important. This implies that other
+aspects like performance or space reduction are less important.
+
+### Corrupt WAL
+
+The process suggested by this RFC assumes that the WAL is free of corruption.
+In some instances, corruption can make it into WAL, like for example when
+higher level components like postgres or the application first read corrupt
+data, and then execute a write with data derived from that earlier read. That
+written data might then contain the corruption.
+
+Common use cases can hit this quite easily. For example, an application reads
+some counter, increments it, and then writes the new counter value to the
+database.
+On a lower level, the compute might put FPIs (Full Page Images) into the WAL,
+which have corrupt data for rows unrelated to the write operation at hand.
+
+Separating corrupt writes from non-corrupt ones is a hard problem in general,
+and if the application was involved in making the corrupt write, a recovery
+would also involve the application. Therefore, corruption that has made it into
+the WAL is outside of the scope of this feature. However, the WAL replay can be
+issued to right before the point in time where the corruption occurred. Then the
+data loss is isolated to post-corruption writes only.
+
+## Impacted components (e.g. pageserver, safekeeper, console, etc)
+
+Most changes would happen to the pageservers.
+For the higher level features, maybe other components like the console would
+be involved.
+
+We need to make sure that the shadow timelines are not subject to the usual
+limits and billing we apply to existing timelines.
+
+## Proposed implementation
+
+The first problem to keep in mind is the reproducibility of `initdb`.
+So an initial step would be to upload `initdb` snapshots to S3.
+
+After that, we'd have the endpoint spawn a background process which
+performs the replay of the WAL to that new timeline. This process should
+follow the existing workflows as closely as possible, just using the
+WAL records of a different timeline.
+
+The timeline created will be in a special state that solely looks for WAL
+entries of the timeline it is trying to copy. Once the target LSN is reached,
+it turns into a normal timeline that also accepts writes to its own
+timeline ID.
+
+### Scalability
+
+For now we want to run this entire process on a single node, and as
+it is by nature linear, it's hard to parallelize. However, for the
+verification workloads, we can easily start the WAL replay in parallel
+for different points in time. This is valuable especially for tenants
+with large WAL records.
+
+Compare this with the tricks to make addition circuits execute with
+lower latency by making them perform the addition for both possible
+values of the carry bit, and then, in a second step, taking the
+result for the carry bit that was actually obtained.
+
+The other scalability dimension to consider is the WAL length, which
+is a growing question as tenants accumulate changes. There are
+possible approaches to this, including creating snapshots of the
+page files and uploading them to S3, but if we do this for every single
+branch, we lose the cheap branching property.
+
+### Implementation by component
+
+The proposed changes for the various components of the neon architecture
+are written up in this notion page:
+
+https://www.notion.so/neondatabase/Pageserver-disaster-recovery-one-pager-4ecfb5df16ce4f6bbfc3817ed1a6cbb2
+
+### Unresolved questions
+
+none known (outside of the mentioned ones).
--- a/docs/updating-postgres.md
+++ b/docs/updating-postgres.md
@@ -0,0 +1,108 @@
+# Updating Postgres
+
+## Minor Versions
+
+When upgrading to a new minor version of Postgres, please follow these steps:
+
+_Example: 15.4 is the new minor version to upgrade to from 15.3._
+
+1. Clone the Neon Postgres repository if you have not done so already.
+
+    ```shell
+    git clone git@github.com:neondatabase/postgres.git
+    ```
+
+1. Add the Postgres upstream remote.
+
+    ```shell
+    git remote add upstream https://git.postgresql.org/git/postgresql.git
+    ```
+
+1. Create a new branch based on the stable branch you are updating.
+
+    ```shell
+    git checkout -b my-branch REL_15_STABLE_neon
+    ```
+
+1. Tag the last commit on the stable branch you are updating.
+
+    ```shell
+    git tag REL_15_3_neon
+    ```
+
+1. Push the new tag to the Neon Postgres repository.
+
+    ```shell
+    git push origin REL_15_3_neon
+    ```
+
+1. Find the release tags you're looking for. They are of the form `REL_X_Y`.
+
+1. Rebase the branch you created on the tag and resolve any conflicts.
+
+    ```shell
+    git fetch upstream REL_15_4
+    git rebase REL_15_4
+    ```
+
+1. Run the Postgres test suite to make sure our commits have not affected
+Postgres in a negative way.
+
+    ```shell
+    make check
+    # OR
+    meson test -C builddir
+    ```
+
+1. Push your branch to the Neon Postgres repository.
+
+    ```shell
+    git push origin my-branch
+    ```
+
+1. Clone the Neon repository if you have not done so already.
+
+    ```shell
+    git clone git@github.com:neondatabase/neon.git
+    ```
+
+1. Create a new branch.
+
+1. Change the `revisions.json` file to point at the HEAD of your Postgres
+branch.
+
+1. Update the Git submodule.
+
+    ```shell
+    git submodule set-branch --branch my-branch vendor/postgres-v15
+    git submodule update --remote vendor/postgres-v15
+    ```
+
+1. Run the Neon test suite to make sure that Neon is still good to go on this
+minor Postgres release.
+
+    ```shell
+    ./scripts/poetry -k pg15
+    ```
+
+1. Commit your changes.
+
+1. Create a pull request, and wait for CI to go green.
+
+1. Force push the rebased Postgres branches into the Neon Postgres repository.
+
+    ```shell
+    git push --force origin my-branch:REL_15_STABLE_neon
+    ```
+
+    It may require disabling various branch protections.
+
+1. Update your Neon PR to point at the branches.
+
+    ```shell
+    git submodule set-branch --branch REL_15_STABLE_neon vendor/postgres-v15
+    git commit --amend --no-edit
+    git push --force origin
+    ```
+
+1. Merge the pull request after getting approval(s) and CI completion.
--- a/libs/compute_api/src/lib.rs
+++ b/libs/compute_api/src/lib.rs
@@ -1,3 +1,5 @@
+#![deny(unsafe_code)]
+#![deny(clippy::undocumented_unsafe_blocks)]
 pub mod requests;
 pub mod responses;
 pub mod spec;
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -6,7 +6,6 @@
 use std::collections::HashMap;

 use serde::{Deserialize, Serialize};
-use serde_with::{serde_as, DisplayFromStr};
 use utils::id::{TenantId, TimelineId};
 use utils::lsn::Lsn;

@@ -19,7 +18,6 @@ pub type PgIdent = String;

 /// Cluster spec or configuration represented as an optional number of
 /// delta operations + final cluster state description.
-#[serde_as]
 #[derive(Clone, Debug, Default, Deserialize, Serialize)]
 pub struct ComputeSpec {
    pub format_version: f32,
@@ -28,6 +26,13 @@ pub struct ComputeSpec {
    // but we don't use it for anything. Serde will ignore missing fields when
    // deserializing it.
    pub operation_uuid: Option<String>,
+
+    /// Compute features to enable. These feature flags are provided, when we
+    /// know all the details about client's compute, so they cannot be used
+    /// to change `Empty` compute behavior.
+    #[serde(default)]
+    pub features: Vec<ComputeFeature>,
+
    /// Expected cluster state at the end of transition process.
    pub cluster: Cluster,
    pub delta_operations: Option<Vec<DeltaOp>>,
@@ -50,12 +55,12 @@ pub struct ComputeSpec {
    // these, and instead set the "neon.tenant_id", "neon.timeline_id",
    // etc. GUCs in cluster.settings. TODO: Once the control plane has been
    // updated to fill these fields, we can make these non optional.
-    #[serde_as(as = "Option<DisplayFromStr>")]
    pub tenant_id: Option<TenantId>,
-    #[serde_as(as = "Option<DisplayFromStr>")]
+
    pub timeline_id: Option<TimelineId>,
-    #[serde_as(as = "Option<DisplayFromStr>")]
+
    pub pageserver_connstring: Option<String>,
+
    #[serde(default)]
    pub safekeeper_connstrings: Vec<String>,

@@ -70,6 +75,19 @@ pub struct ComputeSpec {
    pub remote_extensions: Option<RemoteExtSpec>,
 }

+/// Feature flag to signal `compute_ctl` to enable certain experimental functionality.
+#[derive(Serialize, Clone, Copy, Debug, Deserialize, PartialEq, Eq)]
+#[serde(rename_all = "snake_case")]
+pub enum ComputeFeature {
+    // XXX: Add more feature flags here.
+
+    // This is a special feature flag that is used to represent unknown feature flags.
+    // Basically all unknown to enum flags are represented as this one. See unit test
+    // `parse_unknown_features()` for more details.
+    #[serde(other)]
+    UnknownFeature,
+}
+
 #[derive(Clone, Debug, Default, Deserialize, Serialize)]
 pub struct RemoteExtSpec {
    pub public_extensions: Option<Vec<String>>,
@@ -89,6 +107,8 @@ impl RemoteExtSpec {
        &self,
        ext_name: &str,
        is_library: bool,
+        build_tag: &str,
+        pg_major_version: &str,
    ) -> anyhow::Result<(String, RemotePath)> {
        let mut real_ext_name = ext_name;
        if is_library {
@@ -104,11 +124,32 @@ impl RemoteExtSpec {
                .ok_or(anyhow::anyhow!("library {} is not found", lib_raw_name))?;
        }

+        // Check if extension is present in public or custom.
+        // If not, then it is not allowed to be used by this compute.
+        if let Some(public_extensions) = &self.public_extensions {
+            if !public_extensions.contains(&real_ext_name.to_string()) {
+                if let Some(custom_extensions) = &self.custom_extensions {
+                    if !custom_extensions.contains(&real_ext_name.to_string()) {
+                        return Err(anyhow::anyhow!("extension {} is not found", real_ext_name));
+                    }
+                }
+            }
+        }
+
        match self.extension_data.get(real_ext_name) {
-            Some(ext_data) => Ok((
-                real_ext_name.to_string(),
-                RemotePath::from_string(&ext_data.archive_path)?,
-            )),
+            Some(_ext_data) => {
+                // Construct the path to the extension archive
+                // BUILD_TAG/PG_MAJOR_VERSION/extensions/EXTENSION_NAME.tar.zst
+                //
+                // Keep it in sync with path generation in
+                // https://github.com/neondatabase/build-custom-extensions/tree/main
+                let archive_path_str =
+                    format!("{build_tag}/{pg_major_version}/extensions/{real_ext_name}.tar.zst");
+                Ok((
+                    real_ext_name.to_string(),
+                    RemotePath::from_string(&archive_path_str)?,
+                ))
+            }
            None => Err(anyhow::anyhow!(
                "real_ext_name {} is not found",
                real_ext_name
@@ -117,14 +158,13 @@ impl RemoteExtSpec {
    }
 }

-#[serde_as]
 #[derive(Clone, Copy, Debug, Default, Eq, PartialEq, Deserialize, Serialize)]
 pub enum ComputeMode {
    /// A read-write node
    #[default]
    Primary,
    /// A read-only node, pinned at a particular LSN
-    Static(#[serde_as(as = "DisplayFromStr")] Lsn),
+    Static(Lsn),
    /// A read-only node that follows the tip of the branch in hot standby mode
    ///
    /// Future versions may want to distinguish between replicas with hot standby
@@ -167,6 +207,8 @@ pub struct DeltaOp {
 pub struct Role {
    pub name: PgIdent,
    pub encrypted_password: Option<String>,
+    pub replication: Option<bool>,
+    pub bypassrls: Option<bool>,
    pub options: GenericOptions,
 }

@@ -177,6 +219,12 @@ pub struct Database {
    pub name: PgIdent,
    pub owner: PgIdent,
    pub options: GenericOptions,
+    // These are derived flags, not present in the spec file.
+    // They are never set by the control plane.
+    #[serde(skip_deserializing, default)]
+    pub restrict_conn: bool,
+    #[serde(skip_deserializing, default)]
+    pub invalid: bool,
 }

 /// Common type representing both SQL statement params with or without value,
@@ -201,7 +249,10 @@ mod tests {
    #[test]
    fn parse_spec_file() {
        let file = File::open("tests/cluster_spec.json").unwrap();
-        let _spec: ComputeSpec = serde_json::from_reader(file).unwrap();
+        let spec: ComputeSpec = serde_json::from_reader(file).unwrap();
+
+        // Features list defaults to empty vector.
+        assert!(spec.features.is_empty());
    }

    #[test]
@@ -213,4 +264,22 @@ mod tests {
        ob.insert("unknown_field_123123123".into(), "hello".into());
        let _spec: ComputeSpec = serde_json::from_value(json).unwrap();
    }
+
+    #[test]
+    fn parse_unknown_features() {
+        // Test that unknown feature flags do not cause any errors.
+        let file = File::open("tests/cluster_spec.json").unwrap();
+        let mut json: serde_json::Value = serde_json::from_reader(file).unwrap();
+        let ob = json.as_object_mut().unwrap();
+
+        // Add unknown feature flags.
+        let features = vec!["foo_bar_feature", "baz_feature"];
+        ob.insert("features".into(), features.into());
+
+        let spec: ComputeSpec = serde_json::from_value(json).unwrap();
+
+        assert!(spec.features.len() == 2);
+        assert!(spec.features.contains(&ComputeFeature::UnknownFeature));
+        assert_eq!(spec.features, vec![ComputeFeature::UnknownFeature; 2]);
+    }
 }
--- a/libs/compute_api/tests/cluster_spec.json
+++ b/libs/compute_api/tests/cluster_spec.json
@@ -76,7 +76,7 @@
            },
            {
                "name": "wal_level",
-                "value": "replica",
+                "value": "logical",
                "vartype": "enum"
            },
            {
--- a/libs/consumption_metrics/src/lib.rs
+++ b/libs/consumption_metrics/src/lib.rs
@@ -1,11 +1,11 @@
-//!
 //! Shared code for consumption metics collection
-//!
+#![deny(unsafe_code)]
+#![deny(clippy::undocumented_unsafe_blocks)]
 use chrono::{DateTime, Utc};
 use rand::Rng;
-use serde::Serialize;
+use serde::{Deserialize, Serialize};

-#[derive(Serialize, Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)]
+#[derive(Serialize, serde::Deserialize, Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)]
 #[serde(tag = "type")]
 pub enum EventType {
    #[serde(rename = "absolute")]
@@ -27,7 +27,8 @@ impl EventType {
    }

    pub fn incremental_timerange(&self) -> Option<std::ops::Range<&DateTime<Utc>>> {
-        // these can most likely be thought of as Range or RangeFull
+        // these can most likely be thought of as Range or RangeFull, at least pageserver creates
+        // incremental ranges where the stop and next start are equal.
        use EventType::*;
        match self {
            Incremental {
@@ -41,15 +42,25 @@ impl EventType {
    pub fn is_incremental(&self) -> bool {
        matches!(self, EventType::Incremental { .. })
    }
+
+    /// Returns the absolute time, or for incremental ranges, the stop time.
+    pub fn recorded_at(&self) -> &DateTime<Utc> {
+        use EventType::*;
+
+        match self {
+            Absolute { time } => time,
+            Incremental { stop_time, .. } => stop_time,
+        }
+    }
 }

-#[derive(Serialize, Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
-pub struct Event<Extra> {
+#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
+pub struct Event<Extra, Metric> {
    #[serde(flatten)]
    #[serde(rename = "type")]
    pub kind: EventType,

-    pub metric: &'static str,
+    pub metric: Metric,
    pub idempotency_key: String,
    pub value: u64,

@@ -58,19 +69,45 @@ pub struct Event<Extra> {
 }

 pub fn idempotency_key(node_id: &str) -> String {
-    format!(
-        "{}-{}-{:04}",
-        Utc::now(),
-        node_id,
-        rand::thread_rng().gen_range(0..=9999)
-    )
+    IdempotencyKey::generate(node_id).to_string()
+}
+
+/// Downstream users will use these to detect upload retries.
+pub struct IdempotencyKey<'a> {
+    now: chrono::DateTime<Utc>,
+    node_id: &'a str,
+    nonce: u16,
+}
+
+impl std::fmt::Display for IdempotencyKey<'_> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}-{}-{:04}", self.now, self.node_id, self.nonce)
+    }
+}
+
+impl<'a> IdempotencyKey<'a> {
+    pub fn generate(node_id: &'a str) -> Self {
+        IdempotencyKey {
+            now: Utc::now(),
+            node_id,
+            nonce: rand::thread_rng().gen_range(0..=9999),
+        }
+    }
+
+    pub fn for_tests(now: DateTime<Utc>, node_id: &'a str, nonce: u16) -> Self {
+        IdempotencyKey {
+            now,
+            node_id,
+            nonce,
+        }
+    }
 }

 pub const CHUNK_SIZE: usize = 1000;

 // Just a wrapper around a slice of events
 // to serialize it as `{"events" : [ ] }
-#[derive(serde::Serialize)]
+#[derive(serde::Serialize, serde::Deserialize)]
 pub struct EventChunk<'a, T: Clone> {
    pub events: std::borrow::Cow<'a, [T]>,
 }
--- a/libs/metrics/src/lib.rs
+++ b/libs/metrics/src/lib.rs
@@ -2,6 +2,7 @@
 //! make sure that we use the same dep version everywhere.
 //! Otherwise, we might not see all metrics registered via
 //! a default registry.
+#![deny(clippy::undocumented_unsafe_blocks)]
 use once_cell::sync::Lazy;
 use prometheus::core::{AtomicU64, Collector, GenericGauge, GenericGaugeVec};
 pub use prometheus::opts;
@@ -89,14 +90,14 @@ pub const DISK_WRITE_SECONDS_BUCKETS: &[f64] = &[
    0.000_050, 0.000_100, 0.000_500, 0.001, 0.003, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5,
 ];

-pub fn set_build_info_metric(revision: &str) {
+pub fn set_build_info_metric(revision: &str, build_tag: &str) {
    let metric = register_int_gauge_vec!(
        "libmetrics_build_info",
        "Build/version information",
-        &["revision"]
+        &["revision", "build_tag"]
    )
    .expect("Failed to register build info metric");
-    metric.with_label_values(&[revision]).set(1);
+    metric.with_label_values(&[revision, build_tag]).set(1);
 }

 // Records I/O stats in a "cross-platform" way.
--- a/libs/metrics/src/wrappers.rs
+++ b/libs/metrics/src/wrappers.rs
@@ -1,6 +1,6 @@
 use std::io::{Read, Result, Write};

-/// A wrapper for an object implementing [Read](std::io::Read)
+/// A wrapper for an object implementing [Read]
 /// which allows a closure to observe the amount of bytes read.
 /// This is useful in conjunction with metrics (e.g. [IntCounter](crate::IntCounter)).
 ///
@@ -51,17 +51,17 @@ impl<'a, T> CountedReader<'a, T> {
        }
    }

-    /// Get an immutable reference to the underlying [Read](std::io::Read) implementor
+    /// Get an immutable reference to the underlying [Read] implementor
    pub fn inner(&self) -> &T {
        &self.reader
    }

-    /// Get a mutable reference to the underlying [Read](std::io::Read) implementor
+    /// Get a mutable reference to the underlying [Read] implementor
    pub fn inner_mut(&mut self) -> &mut T {
        &mut self.reader
    }

-    /// Consume the wrapper and return the underlying [Read](std::io::Read) implementor
+    /// Consume the wrapper and return the underlying [Read] implementor
    pub fn into_inner(self) -> T {
        self.reader
    }
@@ -75,7 +75,7 @@ impl<T: Read> Read for CountedReader<'_, T> {
    }
 }

-/// A wrapper for an object implementing [Write](std::io::Write)
+/// A wrapper for an object implementing [Write]
 /// which allows a closure to observe the amount of bytes written.
 /// This is useful in conjunction with metrics (e.g. [IntCounter](crate::IntCounter)).
 ///
@@ -122,17 +122,17 @@ impl<'a, T> CountedWriter<'a, T> {
        }
    }

-    /// Get an immutable reference to the underlying [Write](std::io::Write) implementor
+    /// Get an immutable reference to the underlying [Write] implementor
    pub fn inner(&self) -> &T {
        &self.writer
    }

-    /// Get a mutable reference to the underlying [Write](std::io::Write) implementor
+    /// Get a mutable reference to the underlying [Write] implementor
    pub fn inner_mut(&mut self) -> &mut T {
        &mut self.writer
    }

-    /// Consume the wrapper and return the underlying [Write](std::io::Write) implementor
+    /// Consume the wrapper and return the underlying [Write] implementor
    pub fn into_inner(self) -> T {
        self.writer
    }
--- a/libs/pageserver_api/Cargo.toml
+++ b/libs/pageserver_api/Cargo.toml
@@ -17,5 +17,10 @@ postgres_ffi.workspace = true
 enum-map.workspace = true
 strum.workspace = true
 strum_macros.workspace = true
+hex.workspace = true
+thiserror.workspace = true

 workspace_hack.workspace = true
+
+[dev-dependencies]
+bincode.workspace = true
--- a/libs/pageserver_api/src/control_api.rs
+++ b/libs/pageserver_api/src/control_api.rs
@@ -0,0 +1,47 @@
+//! Types in this file are for pageserver's upward-facing API calls to the control plane,
+//! required for acquiring and validating tenant generation numbers.
+//!
+//! See docs/rfcs/025-generation-numbers.md
+
+use serde::{Deserialize, Serialize};
+use utils::id::NodeId;
+
+use crate::shard::TenantShardId;
+
+#[derive(Serialize, Deserialize)]
+pub struct ReAttachRequest {
+    pub node_id: NodeId,
+}
+
+#[derive(Serialize, Deserialize)]
+pub struct ReAttachResponseTenant {
+    pub id: TenantShardId,
+    pub gen: u32,
+}
+
+#[derive(Serialize, Deserialize)]
+pub struct ReAttachResponse {
+    pub tenants: Vec<ReAttachResponseTenant>,
+}
+
+#[derive(Serialize, Deserialize)]
+pub struct ValidateRequestTenant {
+    pub id: TenantShardId,
+    pub gen: u32,
+}
+
+#[derive(Serialize, Deserialize)]
+pub struct ValidateRequest {
+    pub tenants: Vec<ValidateRequestTenant>,
+}
+
+#[derive(Serialize, Deserialize)]
+pub struct ValidateResponse {
+    pub tenants: Vec<ValidateResponseTenant>,
+}
+
+#[derive(Serialize, Deserialize)]
+pub struct ValidateResponseTenant {
+    pub id: TenantShardId,
+    pub valid: bool,
+}
--- a/libs/pageserver_api/src/key.rs
+++ b/libs/pageserver_api/src/key.rs
@@ -0,0 +1,146 @@
+use anyhow::{bail, Result};
+use byteorder::{ByteOrder, BE};
+use serde::{Deserialize, Serialize};
+use std::fmt;
+
+/// Key used in the Repository kv-store.
+///
+/// The Repository treats this as an opaque struct, but see the code in pgdatadir_mapping.rs
+/// for what we actually store in these fields.
+#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize)]
+pub struct Key {
+    pub field1: u8,
+    pub field2: u32,
+    pub field3: u32,
+    pub field4: u32,
+    pub field5: u8,
+    pub field6: u32,
+}
+
+pub const KEY_SIZE: usize = 18;
+
+impl Key {
+    /// 'field2' is used to store tablespaceid for relations and small enum numbers for other relish.
+    /// As long as Neon does not support tablespace (because of lack of access to local file system),
+    /// we can assume that only some predefined namespace OIDs are used which can fit in u16
+    pub fn to_i128(&self) -> i128 {
+        assert!(self.field2 < 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222);
+        (((self.field1 & 0xf) as i128) << 120)
+            | (((self.field2 & 0xFFFF) as i128) << 104)
+            | ((self.field3 as i128) << 72)
+            | ((self.field4 as i128) << 40)
+            | ((self.field5 as i128) << 32)
+            | self.field6 as i128
+    }
+
+    pub const fn from_i128(x: i128) -> Self {
+        Key {
+            field1: ((x >> 120) & 0xf) as u8,
+            field2: ((x >> 104) & 0xFFFF) as u32,
+            field3: (x >> 72) as u32,
+            field4: (x >> 40) as u32,
+            field5: (x >> 32) as u8,
+            field6: x as u32,
+        }
+    }
+
+    pub fn next(&self) -> Key {
+        self.add(1)
+    }
+
+    pub fn add(&self, x: u32) -> Key {
+        let mut key = *self;
+
+        let r = key.field6.overflowing_add(x);
+        key.field6 = r.0;
+        if r.1 {
+            let r = key.field5.overflowing_add(1);
+            key.field5 = r.0;
+            if r.1 {
+                let r = key.field4.overflowing_add(1);
+                key.field4 = r.0;
+                if r.1 {
+                    let r = key.field3.overflowing_add(1);
+                    key.field3 = r.0;
+                    if r.1 {
+                        let r = key.field2.overflowing_add(1);
+                        key.field2 = r.0;
+                        if r.1 {
+                            let r = key.field1.overflowing_add(1);
+                            key.field1 = r.0;
+                            assert!(!r.1);
+                        }
+                    }
+                }
+            }
+        }
+        key
+    }
+
+    pub fn from_slice(b: &[u8]) -> Self {
+        Key {
+            field1: b[0],
+            field2: u32::from_be_bytes(b[1..5].try_into().unwrap()),
+            field3: u32::from_be_bytes(b[5..9].try_into().unwrap()),
+            field4: u32::from_be_bytes(b[9..13].try_into().unwrap()),
+            field5: b[13],
+            field6: u32::from_be_bytes(b[14..18].try_into().unwrap()),
+        }
+    }
+
+    pub fn write_to_byte_slice(&self, buf: &mut [u8]) {
+        buf[0] = self.field1;
+        BE::write_u32(&mut buf[1..5], self.field2);
+        BE::write_u32(&mut buf[5..9], self.field3);
+        BE::write_u32(&mut buf[9..13], self.field4);
+        buf[13] = self.field5;
+        BE::write_u32(&mut buf[14..18], self.field6);
+    }
+}
+
+impl fmt::Display for Key {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(
+            f,
+            "{:02X}{:08X}{:08X}{:08X}{:02X}{:08X}",
+            self.field1, self.field2, self.field3, self.field4, self.field5, self.field6
+        )
+    }
+}
+
+impl Key {
+    pub const MIN: Key = Key {
+        field1: u8::MIN,
+        field2: u32::MIN,
+        field3: u32::MIN,
+        field4: u32::MIN,
+        field5: u8::MIN,
+        field6: u32::MIN,
+    };
+    pub const MAX: Key = Key {
+        field1: u8::MAX,
+        field2: u32::MAX,
+        field3: u32::MAX,
+        field4: u32::MAX,
+        field5: u8::MAX,
+        field6: u32::MAX,
+    };
+
+    pub fn from_hex(s: &str) -> Result<Self> {
+        if s.len() != 36 {
+            bail!("parse error");
+        }
+        Ok(Key {
+            field1: u8::from_str_radix(&s[0..2], 16)?,
+            field2: u32::from_str_radix(&s[2..10], 16)?,
+            field3: u32::from_str_radix(&s[10..18], 16)?,
+            field4: u32::from_str_radix(&s[18..26], 16)?,
+            field5: u8::from_str_radix(&s[26..28], 16)?,
+            field6: u32::from_str_radix(&s[28..36], 16)?,
+        })
+    }
+}
+
+pub fn is_rel_block_key(key: &Key) -> bool {
+    key.field1 == 0x00 && key.field4 != 0
+}
--- a/libs/pageserver_api/src/lib.rs
+++ b/libs/pageserver_api/src/lib.rs
@@ -1,8 +1,13 @@
+#![deny(unsafe_code)]
+#![deny(clippy::undocumented_unsafe_blocks)]
 use const_format::formatcp;

 /// Public API types
+pub mod control_api;
+pub mod key;
 pub mod models;
 pub mod reltag;
+pub mod shard;

 pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000;
 pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -6,7 +6,7 @@ use std::{

 use byteorder::{BigEndian, ReadBytesExt};
 use serde::{Deserialize, Serialize};
-use serde_with::{serde_as, DisplayFromStr};
+use serde_with::serde_as;
 use strum_macros;
 use utils::{
    completion,
@@ -15,7 +15,7 @@ use utils::{
    lsn::Lsn,
 };

-use crate::reltag::RelTag;
+use crate::{reltag::RelTag, shard::TenantShardId};
 use anyhow::bail;
 use bytes::{BufMut, Bytes, BytesMut};

@@ -109,7 +109,6 @@ impl TenantState {
            // So, return `Maybe` while Attaching, making Console wait for the attach task to finish.
            Self::Attaching | Self::Activating(ActivatingFrom::Attaching) => Maybe,
            // tenant mgr startup distinguishes attaching from loading via marker file.
-            // If it's loading, there is no attach marker file, i.e., attach had finished in the past.
            Self::Loading | Self::Activating(ActivatingFrom::Loading) => Attached,
            // We only reach Active after successful load / attach.
            // So, call atttachment status Attached.
@@ -174,30 +173,37 @@ pub enum TimelineState {
    Broken { reason: String, backtrace: String },
 }

-#[serde_as]
 #[derive(Serialize, Deserialize)]
 pub struct TimelineCreateRequest {
-    #[serde_as(as = "DisplayFromStr")]
    pub new_timeline_id: TimelineId,
    #[serde(default)]
-    #[serde_as(as = "Option<DisplayFromStr>")]
    pub ancestor_timeline_id: Option<TimelineId>,
    #[serde(default)]
-    #[serde_as(as = "Option<DisplayFromStr>")]
+    pub existing_initdb_timeline_id: Option<TimelineId>,
+    #[serde(default)]
    pub ancestor_start_lsn: Option<Lsn>,
    pub pg_version: Option<u32>,
 }

-#[serde_as]
 #[derive(Serialize, Deserialize, Debug)]
 #[serde(deny_unknown_fields)]
 pub struct TenantCreateRequest {
-    #[serde_as(as = "DisplayFromStr")]
-    pub new_tenant_id: TenantId,
+    pub new_tenant_id: TenantShardId,
+    #[serde(default)]
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub generation: Option<u32>,
    #[serde(flatten)]
    pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
 }

+#[derive(Deserialize, Debug)]
+#[serde(deny_unknown_fields)]
+pub struct TenantLoadRequest {
+    #[serde(default)]
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub generation: Option<u32>,
+}
+
 impl std::ops::Deref for TenantCreateRequest {
    type Target = TenantConfig;

@@ -206,6 +212,8 @@ impl std::ops::Deref for TenantCreateRequest {
    }
 }

+/// An alternative representation of `pageserver::tenant::TenantConf` with
+/// simpler types.
 #[derive(Serialize, Deserialize, Debug, Default)]
 pub struct TenantConfig {
    pub checkpoint_distance: Option<u64>,
@@ -231,30 +239,68 @@ pub struct TenantConfig {
    pub gc_feedback: Option<bool>,
 }

-#[serde_as]
+/// A flattened analog of a `pageserver::tenant::LocationMode`, which
+/// lists out all possible states (and the virtual "Detached" state)
+/// in a flat form rather than using rust-style enums.
+#[derive(Serialize, Deserialize, Debug)]
+pub enum LocationConfigMode {
+    AttachedSingle,
+    AttachedMulti,
+    AttachedStale,
+    Secondary,
+    Detached,
+}
+
+#[derive(Serialize, Deserialize, Debug)]
+pub struct LocationConfigSecondary {
+    pub warm: bool,
+}
+
+/// An alternative representation of `pageserver::tenant::LocationConf`,
+/// for use in external-facing APIs.
+#[derive(Serialize, Deserialize, Debug)]
+pub struct LocationConfig {
+    pub mode: LocationConfigMode,
+    /// If attaching, in what generation?
+    #[serde(default)]
+    pub generation: Option<u32>,
+    #[serde(default)]
+    pub secondary_conf: Option<LocationConfigSecondary>,
+
+    // Shard parameters: if shard_count is nonzero, then other shard_* fields
+    // must be set accurately.
+    #[serde(default)]
+    pub shard_number: u8,
+    #[serde(default)]
+    pub shard_count: u8,
+    #[serde(default)]
+    pub shard_stripe_size: u32,
+
+    // Custom storage configuration for the tenant, if any.
+    // (Configuration for mode `Secondary` is carried by `secondary_conf` above.)
+    pub tenant_conf: TenantConfig,
+}
+
 #[derive(Serialize, Deserialize)]
 #[serde(transparent)]
-pub struct TenantCreateResponse(#[serde_as(as = "DisplayFromStr")] pub TenantId);
+pub struct TenantCreateResponse(pub TenantId);

 #[derive(Serialize)]
 pub struct StatusResponse {
    pub id: NodeId,
 }

-impl TenantCreateRequest {
-    pub fn new(new_tenant_id: TenantId) -> TenantCreateRequest {
-        TenantCreateRequest {
-            new_tenant_id,
-            config: TenantConfig::default(),
-        }
-    }
+#[derive(Serialize, Deserialize, Debug)]
+#[serde(deny_unknown_fields)]
+pub struct TenantLocationConfigRequest {
+    pub tenant_id: TenantId,
+    #[serde(flatten)]
+    pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it
 }

-#[serde_as]
 #[derive(Serialize, Deserialize, Debug)]
 #[serde(deny_unknown_fields)]
 pub struct TenantConfigRequest {
-    #[serde_as(as = "DisplayFromStr")]
    pub tenant_id: TenantId,
    #[serde(flatten)]
    pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
@@ -270,37 +316,22 @@ impl std::ops::Deref for TenantConfigRequest {

 impl TenantConfigRequest {
    pub fn new(tenant_id: TenantId) -> TenantConfigRequest {
-        let config = TenantConfig {
-            checkpoint_distance: None,
-            checkpoint_timeout: None,
-            compaction_target_size: None,
-            compaction_period: None,
-            compaction_threshold: None,
-            gc_horizon: None,
-            gc_period: None,
-            image_creation_threshold: None,
-            pitr_interval: None,
-            walreceiver_connect_timeout: None,
-            lagging_wal_timeout: None,
-            max_lsn_wal_lag: None,
-            trace_read_requests: None,
-            eviction_policy: None,
-            min_resident_size_override: None,
-            evictions_low_residence_duration_metric_threshold: None,
-            gc_feedback: None,
-        };
+        let config = TenantConfig::default();
        TenantConfigRequest { tenant_id, config }
    }
 }

-#[derive(Debug, Serialize, Deserialize)]
+#[derive(Debug, Deserialize)]
 pub struct TenantAttachRequest {
+    #[serde(default)]
    pub config: TenantAttachConfig,
+    #[serde(default)]
+    pub generation: Option<u32>,
 }

 /// Newtype to enforce deny_unknown_fields on TenantConfig for
 /// its usage inside `TenantAttachRequest`.
-#[derive(Debug, Serialize, Deserialize)]
+#[derive(Debug, Serialize, Deserialize, Default)]
 #[serde(deny_unknown_fields)]
 pub struct TenantAttachConfig {
    #[serde(flatten)]
@@ -324,10 +355,8 @@ pub enum TenantAttachmentStatus {
    Failed { reason: String },
 }

-#[serde_as]
 #[derive(Serialize, Deserialize, Clone)]
 pub struct TenantInfo {
-    #[serde_as(as = "DisplayFromStr")]
    pub id: TenantId,
    // NB: intentionally not part of OpenAPI, we don't want to commit to a specific set of TenantState's
    pub state: TenantState,
@@ -338,29 +367,27 @@ pub struct TenantInfo {
 }

 /// This represents the output of the "timeline_detail" and "timeline_list" API calls.
-#[serde_as]
 #[derive(Debug, Serialize, Deserialize, Clone)]
 pub struct TimelineInfo {
-    #[serde_as(as = "DisplayFromStr")]
    pub tenant_id: TenantId,
-    #[serde_as(as = "DisplayFromStr")]
    pub timeline_id: TimelineId,

-    #[serde_as(as = "Option<DisplayFromStr>")]
    pub ancestor_timeline_id: Option<TimelineId>,
-    #[serde_as(as = "Option<DisplayFromStr>")]
    pub ancestor_lsn: Option<Lsn>,
-    #[serde_as(as = "DisplayFromStr")]
    pub last_record_lsn: Lsn,
-    #[serde_as(as = "Option<DisplayFromStr>")]
    pub prev_record_lsn: Option<Lsn>,
-    #[serde_as(as = "DisplayFromStr")]
    pub latest_gc_cutoff_lsn: Lsn,
-    #[serde_as(as = "DisplayFromStr")]
    pub disk_consistent_lsn: Lsn,
-    #[serde_as(as = "DisplayFromStr")]
+
+    /// The LSN that we have successfully uploaded to remote storage
    pub remote_consistent_lsn: Lsn,
-    pub current_logical_size: Option<u64>, // is None when timeline is Unloaded
+
+    /// The LSN that we are advertising to safekeepers
+    pub remote_consistent_lsn_visible: Lsn,
+
+    pub current_logical_size: u64,
+    pub current_logical_size_is_accurate: bool,
+
    /// Sum of the size of all layer files.
    /// If a layer is present in both local FS and S3, it counts only once.
    pub current_physical_size: Option<u64>, // is None when timeline is Unloaded
@@ -369,13 +396,14 @@ pub struct TimelineInfo {
    pub timeline_dir_layer_file_size_sum: Option<u64>,

    pub wal_source_connstr: Option<String>,
-    #[serde_as(as = "Option<DisplayFromStr>")]
    pub last_received_msg_lsn: Option<Lsn>,
    /// the timestamp (in microseconds) of the last received message
    pub last_received_msg_ts: Option<u128>,
    pub pg_version: u32,

    pub state: TimelineState,
+
+    pub walreceiver_status: String,
 }

 #[derive(Debug, Clone, Serialize)]
@@ -464,23 +492,13 @@ pub struct LayerAccessStats {
    pub residence_events_history: HistoryBufferWithDropCounter<LayerResidenceEvent, 16>,
 }

-#[serde_as]
 #[derive(Debug, Clone, Serialize)]
 #[serde(tag = "kind")]
 pub enum InMemoryLayerInfo {
-    Open {
-        #[serde_as(as = "DisplayFromStr")]
-        lsn_start: Lsn,
-    },
-    Frozen {
-        #[serde_as(as = "DisplayFromStr")]
-        lsn_start: Lsn,
-        #[serde_as(as = "DisplayFromStr")]
-        lsn_end: Lsn,
-    },
+    Open { lsn_start: Lsn },
+    Frozen { lsn_start: Lsn, lsn_end: Lsn },
 }

-#[serde_as]
 #[derive(Debug, Clone, Serialize)]
 #[serde(tag = "kind")]
 pub enum HistoricLayerInfo {
@@ -488,9 +506,7 @@ pub enum HistoricLayerInfo {
        layer_file_name: String,
        layer_file_size: u64,

-        #[serde_as(as = "DisplayFromStr")]
        lsn_start: Lsn,
-        #[serde_as(as = "DisplayFromStr")]
        lsn_end: Lsn,
        remote: bool,
        access_stats: LayerAccessStats,
@@ -499,7 +515,6 @@ pub enum HistoricLayerInfo {
        layer_file_name: String,
        layer_file_size: u64,

-        #[serde_as(as = "DisplayFromStr")]
        lsn_start: Lsn,
        remote: bool,
        access_stats: LayerAccessStats,
--- a/libs/pageserver_api/src/reltag.rs
+++ b/libs/pageserver_api/src/reltag.rs
@@ -22,9 +22,9 @@ use postgres_ffi::Oid;
 /// [See more related comments here](https:///github.com/postgres/postgres/blob/99c5852e20a0987eca1c38ba0c09329d4076b6a0/src/include/storage/relfilenode.h#L57).
 ///
 // FIXME: should move 'forknum' as last field to keep this consistent with Postgres.
-// Then we could replace the custo Ord and PartialOrd implementations below with
-// deriving them.
-#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy, Serialize, Deserialize)]
+// Then we could replace the custom Ord and PartialOrd implementations below with
+// deriving them. This will require changes in walredoproc.c.
+#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy, Serialize)]
 pub struct RelTag {
    pub forknum: u8,
    pub spcnode: Oid,
@@ -40,21 +40,9 @@ impl PartialOrd for RelTag {

 impl Ord for RelTag {
    fn cmp(&self, other: &Self) -> Ordering {
-        let mut cmp = self.spcnode.cmp(&other.spcnode);
-        if cmp != Ordering::Equal {
-            return cmp;
-        }
-        cmp = self.dbnode.cmp(&other.dbnode);
-        if cmp != Ordering::Equal {
-            return cmp;
-        }
-        cmp = self.relnode.cmp(&other.relnode);
-        if cmp != Ordering::Equal {
-            return cmp;
-        }
-        cmp = self.forknum.cmp(&other.forknum);
-
-        cmp
+        // Custom ordering where we put forknum to the end of the list
+        let other_tup = (other.spcnode, other.dbnode, other.relnode, other.forknum);
+        (self.spcnode, self.dbnode, self.relnode, self.forknum).cmp(&other_tup)
    }
 }

--- a/libs/pageserver_api/src/shard.rs
+++ b/libs/pageserver_api/src/shard.rs
@@ -0,0 +1,757 @@
+use std::{ops::RangeInclusive, str::FromStr};
+
+use crate::key::{is_rel_block_key, Key};
+use hex::FromHex;
+use serde::{Deserialize, Serialize};
+use thiserror;
+use utils::id::TenantId;
+
+#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
+pub struct ShardNumber(pub u8);
+
+#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
+pub struct ShardCount(pub u8);
+
+impl ShardCount {
+    pub const MAX: Self = Self(u8::MAX);
+}
+
+impl ShardNumber {
+    pub const MAX: Self = Self(u8::MAX);
+}
+
+/// TenantShardId identifies the units of work for the Pageserver.
+///
+/// These are written as `<tenant_id>-<shard number><shard-count>`, for example:
+///
+///   # The second shard in a two-shard tenant
+///   072f1291a5310026820b2fe4b2968934-0102
+///
+/// Historically, tenants could not have multiple shards, and were identified
+/// by TenantId.  To support this, TenantShardId has a special legacy
+/// mode where `shard_count` is equal to zero: this represents a single-sharded
+/// tenant which should be written as a TenantId with no suffix.
+///
+/// The human-readable encoding of TenantShardId, such as used in API URLs,
+/// is both forward and backward compatible: a legacy TenantId can be
+/// decoded as a TenantShardId, and when re-encoded it will be parseable
+/// as a TenantId.
+///
+/// Note that the binary encoding is _not_ backward compatible, because
+/// at the time sharding is introduced, there are no existing binary structures
+/// containing TenantId that we need to handle.
+#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
+pub struct TenantShardId {
+    pub tenant_id: TenantId,
+    pub shard_number: ShardNumber,
+    pub shard_count: ShardCount,
+}
+
+impl TenantShardId {
+    pub fn unsharded(tenant_id: TenantId) -> Self {
+        Self {
+            tenant_id,
+            shard_number: ShardNumber(0),
+            shard_count: ShardCount(0),
+        }
+    }
+
+    /// The range of all TenantShardId that belong to a particular TenantId.  This is useful when
+    /// you have a BTreeMap of TenantShardId, and are querying by TenantId.
+    pub fn tenant_range(tenant_id: TenantId) -> RangeInclusive<Self> {
+        RangeInclusive::new(
+            Self {
+                tenant_id,
+                shard_number: ShardNumber(0),
+                shard_count: ShardCount(0),
+            },
+            Self {
+                tenant_id,
+                shard_number: ShardNumber::MAX,
+                shard_count: ShardCount::MAX,
+            },
+        )
+    }
+
+    pub fn shard_slug(&self) -> impl std::fmt::Display + '_ {
+        ShardSlug(self)
+    }
+}
+
+/// Formatting helper
+struct ShardSlug<'a>(&'a TenantShardId);
+
+impl<'a> std::fmt::Display for ShardSlug<'a> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(
+            f,
+            "{:02x}{:02x}",
+            self.0.shard_number.0, self.0.shard_count.0
+        )
+    }
+}
+
+impl std::fmt::Display for TenantShardId {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        if self.shard_count != ShardCount(0) {
+            write!(f, "{}-{}", self.tenant_id, self.shard_slug())
+        } else {
+            // Legacy case (shard_count == 0) -- format as just the tenant id.  Note that this
+            // is distinct from the normal single shard case (shard count == 1).
+            self.tenant_id.fmt(f)
+        }
+    }
+}
+
+impl std::fmt::Debug for TenantShardId {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        // Debug is the same as Display: the compact hex representation
+        write!(f, "{}", self)
+    }
+}
+
+impl std::str::FromStr for TenantShardId {
+    type Err = hex::FromHexError;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        // Expect format: 16 byte TenantId, '-', 1 byte shard number, 1 byte shard count
+        if s.len() == 32 {
+            // Legacy case: no shard specified
+            Ok(Self {
+                tenant_id: TenantId::from_str(s)?,
+                shard_number: ShardNumber(0),
+                shard_count: ShardCount(0),
+            })
+        } else if s.len() == 37 {
+            let bytes = s.as_bytes();
+            let tenant_id = TenantId::from_hex(&bytes[0..32])?;
+            let mut shard_parts: [u8; 2] = [0u8; 2];
+            hex::decode_to_slice(&bytes[33..37], &mut shard_parts)?;
+            Ok(Self {
+                tenant_id,
+                shard_number: ShardNumber(shard_parts[0]),
+                shard_count: ShardCount(shard_parts[1]),
+            })
+        } else {
+            Err(hex::FromHexError::InvalidStringLength)
+        }
+    }
+}
+
+impl From<[u8; 18]> for TenantShardId {
+    fn from(b: [u8; 18]) -> Self {
+        let tenant_id_bytes: [u8; 16] = b[0..16].try_into().unwrap();
+
+        Self {
+            tenant_id: TenantId::from(tenant_id_bytes),
+            shard_number: ShardNumber(b[16]),
+            shard_count: ShardCount(b[17]),
+        }
+    }
+}
+
+/// For use within the context of a particular tenant, when we need to know which
+/// shard we're dealing with, but do not need to know the full ShardIdentity (because
+/// we won't be doing any page->shard mapping), and do not need to know the fully qualified
+/// TenantShardId.
+#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy)]
+pub struct ShardIndex {
+    pub shard_number: ShardNumber,
+    pub shard_count: ShardCount,
+}
+
+impl ShardIndex {
+    pub fn new(number: ShardNumber, count: ShardCount) -> Self {
+        Self {
+            shard_number: number,
+            shard_count: count,
+        }
+    }
+    pub fn unsharded() -> Self {
+        Self {
+            shard_number: ShardNumber(0),
+            shard_count: ShardCount(0),
+        }
+    }
+
+    pub fn is_unsharded(&self) -> bool {
+        self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0)
+    }
+
+    /// For use in constructing remote storage paths: concatenate this with a TenantId
+    /// to get a fully qualified TenantShardId.
+    ///
+    /// Backward compat: this function returns an empty string if Self::is_unsharded, such
+    /// that the legacy pre-sharding remote key format is preserved.
+    pub fn get_suffix(&self) -> String {
+        if self.is_unsharded() {
+            "".to_string()
+        } else {
+            format!("-{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
+        }
+    }
+}
+
+impl std::fmt::Display for ShardIndex {
+    /// Writes four lowercase hex digits: the shard number, then the shard count.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let Self {
+            shard_number,
+            shard_count,
+        } = self;
+        write!(f, "{:02x}{:02x}", shard_number.0, shard_count.0)
+    }
+}
+
+impl std::fmt::Debug for ShardIndex {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        // Debug output deliberately mirrors Display: the compact hex representation.
+        std::fmt::Display::fmt(self, f)
+    }
+}
+
+impl std::str::FromStr for ShardIndex {
+    type Err = hex::FromHexError;
+
+    /// Parse exactly four hex characters: one byte of shard number followed by one
+    /// byte of shard count. Any other length is rejected with `InvalidStringLength`;
+    /// non-hex input surfaces the error reported by the `hex` crate.
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        if s.len() != 4 {
+            return Err(hex::FromHexError::InvalidStringLength);
+        }
+
+        let mut decoded = [0u8; 2];
+        hex::decode_to_slice(s, &mut decoded)?;
+        let [number, count] = decoded;
+
+        Ok(Self {
+            shard_number: ShardNumber(number),
+            shard_count: ShardCount(count),
+        })
+    }
+}
+
+impl From<[u8; 2]> for ShardIndex {
+    /// Unpack the two-byte binary form: `[shard_number, shard_count]`.
+    fn from(b: [u8; 2]) -> Self {
+        let [number, count] = b;
+        Self::new(ShardNumber(number), ShardCount(count))
+    }
+}
+
+impl Serialize for TenantShardId {
+    /// Human-readable serializers receive the Display string; all others receive a
+    /// packed `[u8; 18]`: 16 tenant id bytes, then shard number, then shard count.
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        if !serializer.is_human_readable() {
+            let mut buf = [0u8; 18];
+            buf[..16].copy_from_slice(&self.tenant_id.as_arr());
+            buf[16] = self.shard_number.0;
+            buf[17] = self.shard_count.0;
+
+            return buf.serialize(serializer);
+        }
+
+        serializer.collect_str(self)
+    }
+}
+
+impl<'de> Deserialize<'de> for TenantShardId {
+    /// Mirror of the Serialize impl: a string is requested from human-readable
+    /// deserializers and an 18-element tuple from all others.
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        /// Visitor handling both encodings. The flag only selects which description
+        /// `expecting` reports in error messages.
+        struct TenantShardIdVisitor {
+            human_readable: bool,
+        }
+
+        impl<'de> serde::de::Visitor<'de> for TenantShardIdVisitor {
+            type Value = TenantShardId;
+
+            fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
+                formatter.write_str(if self.human_readable {
+                    "value in form of hex string"
+                } else {
+                    "value in form of integer array([u8; 18])"
+                })
+            }
+
+            fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
+            where
+                E: serde::de::Error,
+            {
+                v.parse::<TenantShardId>().map_err(E::custom)
+            }
+
+            fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
+            where
+                A: serde::de::SeqAccess<'de>,
+            {
+                // Re-wrap the sequence so the fixed-size array impl does the reading.
+                let packed =
+                    <[u8; 18]>::deserialize(serde::de::value::SeqAccessDeserializer::new(seq))?;
+                Ok(TenantShardId::from(packed))
+            }
+        }
+
+        let human_readable = deserializer.is_human_readable();
+        let visitor = TenantShardIdVisitor { human_readable };
+        if human_readable {
+            deserializer.deserialize_str(visitor)
+        } else {
+            deserializer.deserialize_tuple(18, visitor)
+        }
+    }
+}
+
+/// Stripe size in number of pages
+#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
+pub struct ShardStripeSize(pub u32);
+
+/// Layout version: for future upgrades where we might change how the key->shard mapping works
+#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
+pub struct ShardLayout(u8);
+
+/// The layout assigned by `ShardIdentity::new` and `ShardIdentity::unsharded`.
+const LAYOUT_V1: ShardLayout = ShardLayout(1);
+/// ShardIdentity uses a magic layout value to indicate if it is unusable
+const LAYOUT_BROKEN: ShardLayout = ShardLayout(255);
+
+/// Default stripe size in pages: 256MiB divided by 8KiB page size.
+const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);
+
+/// The ShardIdentity contains the information needed to resolve a key to a shard
+/// (`get_shard_number`), and then check whether that shard is this one (`is_key_local`).
+#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
+pub struct ShardIdentity {
+    pub number: ShardNumber,
+    pub count: ShardCount,
+    // Stripe size handed to `key_to_shard_number` when mapping keys to shards.
+    stripe_size: ShardStripeSize,
+    // LAYOUT_V1 for usable identities; LAYOUT_BROKEN for those built by `broken()`.
+    layout: ShardLayout,
+}
+
+/// Validation failures reported by `ShardIdentity::new`.
+#[derive(thiserror::Error, Debug, PartialEq, Eq)]
+pub enum ShardConfigError {
+    /// The shard count was zero.
+    #[error("Invalid shard count")]
+    InvalidCount,
+    /// The shard number was not strictly less than the shard count.
+    #[error("Invalid shard number")]
+    InvalidNumber,
+    /// The stripe size was zero.
+    #[error("Invalid stripe size")]
+    InvalidStripeSize,
+}
+
+impl ShardIdentity {
+    /// An identity with number=0 count=0 is a "none" identity, which represents legacy
+    /// tenants.  Modern single-shard tenants should not use this: they should
+    /// have number=0 count=1.
+    pub fn unsharded() -> Self {
+        Self {
+            number: ShardNumber(0),
+            count: ShardCount(0),
+            stripe_size: DEFAULT_STRIPE_SIZE,
+            layout: LAYOUT_V1,
+        }
+    }
+
+    /// A broken instance of this type is only used for `TenantState::Broken` tenants,
+    /// which are constructed in code paths that don't have access to proper configuration.
+    ///
+    /// A ShardIdentity in this state may not be used for anything, and should not be persisted.
+    /// Enforcement is via assertions, to avoid making our interface fallible for this
+    /// edge case: it is the Tenant's responsibility to avoid trying to do any I/O when in a broken
+    /// state, and by extension to avoid trying to do any page->shard resolution.
+    pub fn broken(number: ShardNumber, count: ShardCount) -> Self {
+        Self {
+            number,
+            count,
+            stripe_size: DEFAULT_STRIPE_SIZE,
+            layout: LAYOUT_BROKEN,
+        }
+    }
+
+    /// True when both the shard number and the shard count are zero.
+    pub fn is_unsharded(&self) -> bool {
+        (self.number, self.count) == (ShardNumber(0), ShardCount(0))
+    }
+
+    /// Count must be nonzero, and number must be < count. To construct
+    /// the legacy case (count==0), use Self::unsharded instead.
+    ///
+    /// Checks run in this order: count, then number, then stripe size (which must be
+    /// nonzero); the first failing check determines the returned error.
+    pub fn new(
+        number: ShardNumber,
+        count: ShardCount,
+        stripe_size: ShardStripeSize,
+    ) -> Result<Self, ShardConfigError> {
+        if count.0 == 0 {
+            return Err(ShardConfigError::InvalidCount);
+        }
+        if number.0 >= count.0 {
+            return Err(ShardConfigError::InvalidNumber);
+        }
+        if stripe_size.0 == 0 {
+            return Err(ShardConfigError::InvalidStripeSize);
+        }
+
+        Ok(Self {
+            number,
+            count,
+            stripe_size,
+            layout: LAYOUT_V1,
+        })
+    }
+
+    /// Whether this identity carries the magic "broken" layout.
+    fn is_broken(&self) -> bool {
+        LAYOUT_BROKEN == self.layout
+    }
+
+    /// Resolve `key` to the shard number it maps to. Panics on a broken identity.
+    pub fn get_shard_number(&self, key: &Key) -> ShardNumber {
+        assert!(!self.is_broken());
+        key_to_shard_number(self.count, self.stripe_size, key)
+    }
+
+    /// Return true if the key should be ingested by this shard. Panics on a broken identity.
+    pub fn is_key_local(&self, key: &Key) -> bool {
+        assert!(!self.is_broken());
+        // Fewer than two shards means everything is local; shard-0-only keys are
+        // local to shard 0. Anything else is decided by the hash mapping.
+        let trivially_local =
+            self.count < ShardCount(2) || (key_is_shard0(key) && self.number == ShardNumber(0));
+        trivially_local || key_to_shard_number(self.count, self.stripe_size, key) == self.number
+    }
+
+    /// `-` followed by the hex shard number and count when the count is above zero;
+    /// otherwise an empty string.
+    pub fn shard_slug(&self) -> String {
+        let has_shards = self.count > ShardCount(0);
+        if has_shards {
+            format!("-{:02x}{:02x}", self.number.0, self.count.0)
+        } else {
+            String::new()
+        }
+    }
+
+    /// Convenience for checking if this identity is the 0th shard in a tenant,
+    /// for special cases on shard 0 such as ingesting relation sizes.
+    pub fn is_zero(&self) -> bool {
+        ShardNumber(0) == self.number
+    }
+}
+
+impl Serialize for ShardIndex {
+    /// Human-readable serializers receive the four-hex-digit Display string; all
+    /// others receive the two bytes `[shard_number, shard_count]`.
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        if !serializer.is_human_readable() {
+            // Binary encoding is not used in index_part.json, but is included in anticipation of
+            // switching various structures (e.g. inter-process communication, remote metadata) to
+            // more compact binary encodings in future.
+            let packed: [u8; 2] = [self.shard_number.0, self.shard_count.0];
+            return packed.serialize(serializer);
+        }
+
+        serializer.collect_str(self)
+    }
+}
+
+impl<'de> Deserialize<'de> for ShardIndex {
+    /// Mirror of the Serialize impl: a string is requested from human-readable
+    /// deserializers and a 2-element tuple from all others.
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        /// Visitor handling both encodings. The flag only selects which description
+        /// `expecting` reports in error messages.
+        struct ShardIndexVisitor {
+            human_readable: bool,
+        }
+
+        impl<'de> serde::de::Visitor<'de> for ShardIndexVisitor {
+            type Value = ShardIndex;
+
+            fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
+                formatter.write_str(if self.human_readable {
+                    "value in form of hex string"
+                } else {
+                    "value in form of integer array([u8; 2])"
+                })
+            }
+
+            fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
+            where
+                E: serde::de::Error,
+            {
+                v.parse::<ShardIndex>().map_err(E::custom)
+            }
+
+            fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
+            where
+                A: serde::de::SeqAccess<'de>,
+            {
+                // Re-wrap the sequence so the fixed-size array impl does the reading.
+                let packed =
+                    <[u8; 2]>::deserialize(serde::de::value::SeqAccessDeserializer::new(seq))?;
+                Ok(ShardIndex::from(packed))
+            }
+        }
+
+        let human_readable = deserializer.is_human_readable();
+        let visitor = ShardIndexVisitor { human_readable };
+        if human_readable {
+            deserializer.deserialize_str(visitor)
+        } else {
+            deserializer.deserialize_tuple(2, visitor)
+        }
+    }
+}
+
+/// Whether this key is always held on shard 0 (e.g. shard 0 holds all SLRU keys
+/// in order to be able to serve basebackup requests without peer communication).
+fn key_is_shard0(key: &Key) -> bool {
+    // Only relation pages are ever distributed to shards other than shard zero;
+    // everything else is stored on shard 0.  This guarantees that shard 0 can
+    // independently serve basebackup requests, and any request other than those for
+    // particular blocks in relations.
+    //
+    // A key therefore stays on shard 0 when either of these holds:
+    // - it is not a relation block key (is_rel_block_key includes only relations,
+    //   i.e. excludes SLRU data and all metadata), or
+    // - it is a relation size page, for which field6 is set to -1.
+    !is_rel_block_key(key) || key.field6 == 0xffff_ffff
+}
+
+/// Provide the same result as the function in postgres `hashfn.h` with the same name
+///
+/// Three xor-shift steps (16, 13, 16 bits) interleaved with two wrapping multiplications.
+fn murmurhash32(h: u32) -> u32 {
+    let h = (h ^ (h >> 16)).wrapping_mul(0x85eb_ca6b);
+    let h = (h ^ (h >> 13)).wrapping_mul(0xc2b2_ae35);
+    h ^ (h >> 16)
+}
+
+/// Provide the same result as the function in postgres `hashfn.h` with the same name
+///
+/// All additions wrap on overflow; the shifts discard bits moved out of the 32-bit word.
+fn hash_combine(a: u32, b: u32) -> u32 {
+    let mixed = b
+        .wrapping_add(0x9e37_79b9)
+        .wrapping_add(a << 6)
+        .wrapping_add(a >> 2);
+
+    a ^ mixed
+}
+
+/// Where a Key is to be distributed across shards, select the shard.  This function
+/// does not account for keys that should be broadcast across shards.
+///
+/// The hashing in this function must exactly match what we do in postgres smgr
+/// code.  The resulting distribution of pages is intended to preserve locality within
+/// `stripe_size` ranges of contiguous block numbers in the same relation, while otherwise
+/// distributing data pseudo-randomly.
+///
+/// The mapping of key to shard is not stable across changes to ShardCount: this is intentional
+/// and will be handled at higher levels when shards are split.
+fn key_to_shard_number(count: ShardCount, stripe_size: ShardStripeSize, key: &Key) -> ShardNumber {
+    // Fast path for un-sharded tenants or broadcast keys
+    if count < ShardCount(2) || key_is_shard0(key) {
+        return ShardNumber(0);
+    }
+
+    // field4 is the relNode; field6 is the block number, reduced to its stripe index.
+    let rel_node_hash = murmurhash32(key.field4);
+    let stripe_hash = murmurhash32(key.field6 / stripe_size.0);
+    let combined = hash_combine(rel_node_hash, stripe_hash);
+
+    // The remainder is below `count`, which is a u8, so narrowing cannot truncate.
+    ShardNumber((combined % u32::from(count.0)) as u8)
+}
+
+#[cfg(test)]
+mod tests {
+    use std::str::FromStr;
+
+    use bincode;
+    use utils::{id::TenantId, Hex};
+
+    use super::*;
+
+    const EXAMPLE_TENANT_ID: &str = "1f359dd625e519a1a4e8d7509690f6fc";
+
+    // Display/FromStr round trip: tenant id, a dash, then hex shard number and count.
+    #[test]
+    fn tenant_shard_id_string() -> Result<(), hex::FromHexError> {
+        let example = TenantShardId {
+            tenant_id: TenantId::from_str(EXAMPLE_TENANT_ID).unwrap(),
+            shard_count: ShardCount(10),
+            shard_number: ShardNumber(7),
+        };
+
+        let encoded = format!("{example}");
+
+        let expected = format!("{EXAMPLE_TENANT_ID}-070a");
+        assert_eq!(&encoded, &expected);
+
+        let decoded = TenantShardId::from_str(&encoded)?;
+
+        assert_eq!(example, decoded);
+
+        Ok(())
+    }
+
+    // bincode round trip: 16 tenant id bytes followed by shard number and shard count.
+    #[test]
+    fn tenant_shard_id_binary() -> Result<(), hex::FromHexError> {
+        let example = TenantShardId {
+            tenant_id: TenantId::from_str(EXAMPLE_TENANT_ID).unwrap(),
+            shard_count: ShardCount(10),
+            shard_number: ShardNumber(7),
+        };
+
+        let encoded = bincode::serialize(&example).unwrap();
+        let expected: [u8; 18] = [
+            0x1f, 0x35, 0x9d, 0xd6, 0x25, 0xe5, 0x19, 0xa1, 0xa4, 0xe8, 0xd7, 0x50, 0x96, 0x90,
+            0xf6, 0xfc, 0x07, 0x0a,
+        ];
+        assert_eq!(Hex(&encoded), Hex(&expected));
+
+        let decoded = bincode::deserialize(&encoded).unwrap();
+
+        assert_eq!(example, decoded);
+
+        Ok(())
+    }
+
+    #[test]
+    fn tenant_shard_id_backward_compat() -> Result<(), hex::FromHexError> {
+        // Test that TenantShardId can decode a TenantId in human
+        // readable form
+        let example = TenantId::from_str(EXAMPLE_TENANT_ID).unwrap();
+        let encoded = format!("{example}");
+
+        assert_eq!(&encoded, EXAMPLE_TENANT_ID);
+
+        let decoded = TenantShardId::from_str(&encoded)?;
+
+        assert_eq!(example, decoded.tenant_id);
+        assert_eq!(decoded.shard_count, ShardCount(0));
+        assert_eq!(decoded.shard_number, ShardNumber(0));
+
+        Ok(())
+    }
+
+    #[test]
+    fn tenant_shard_id_forward_compat() -> Result<(), hex::FromHexError> {
+        // Test that a legacy TenantShardId encodes into a form that
+        // can be decoded as TenantId
+        let example_tenant_id = TenantId::from_str(EXAMPLE_TENANT_ID).unwrap();
+        let example = TenantShardId::unsharded(example_tenant_id);
+        let encoded = format!("{example}");
+
+        assert_eq!(&encoded, EXAMPLE_TENANT_ID);
+
+        let decoded = TenantId::from_str(&encoded)?;
+
+        assert_eq!(example_tenant_id, decoded);
+
+        Ok(())
+    }
+
+    #[test]
+    fn tenant_shard_id_legacy_binary() -> Result<(), hex::FromHexError> {
+        // Unlike in human readable encoding, binary encoding does not
+        // do any special handling of legacy unsharded TenantIds: this test
+        // is equivalent to the main test for binary encoding, just verifying
+        // that the same behavior applies when we have used `unsharded()` to
+        // construct a TenantShardId.
+        let example = TenantShardId::unsharded(TenantId::from_str(EXAMPLE_TENANT_ID).unwrap());
+        let encoded = bincode::serialize(&example).unwrap();
+
+        let expected: [u8; 18] = [
+            0x1f, 0x35, 0x9d, 0xd6, 0x25, 0xe5, 0x19, 0xa1, 0xa4, 0xe8, 0xd7, 0x50, 0x96, 0x90,
+            0xf6, 0xfc, 0x00, 0x00,
+        ];
+        assert_eq!(Hex(&encoded), Hex(&expected));
+
+        let decoded = bincode::deserialize::<TenantShardId>(&encoded).unwrap();
+        assert_eq!(example, decoded);
+
+        Ok(())
+    }
+
+    #[test]
+    fn shard_identity_validation() -> Result<(), ShardConfigError> {
+        // Happy cases
+        ShardIdentity::new(ShardNumber(0), ShardCount(1), DEFAULT_STRIPE_SIZE)?;
+        ShardIdentity::new(ShardNumber(0), ShardCount(1), ShardStripeSize(1))?;
+        ShardIdentity::new(ShardNumber(254), ShardCount(255), ShardStripeSize(1))?;
+
+        // Rejected: zero count, number >= count, and zero stripe size
+        assert_eq!(
+            ShardIdentity::new(ShardNumber(0), ShardCount(0), DEFAULT_STRIPE_SIZE),
+            Err(ShardConfigError::InvalidCount)
+        );
+        assert_eq!(
+            ShardIdentity::new(ShardNumber(10), ShardCount(10), DEFAULT_STRIPE_SIZE),
+            Err(ShardConfigError::InvalidNumber)
+        );
+        assert_eq!(
+            ShardIdentity::new(ShardNumber(11), ShardCount(10), DEFAULT_STRIPE_SIZE),
+            Err(ShardConfigError::InvalidNumber)
+        );
+        assert_eq!(
+            ShardIdentity::new(ShardNumber(255), ShardCount(255), DEFAULT_STRIPE_SIZE),
+            Err(ShardConfigError::InvalidNumber)
+        );
+        assert_eq!(
+            ShardIdentity::new(ShardNumber(0), ShardCount(1), ShardStripeSize(0)),
+            Err(ShardConfigError::InvalidStripeSize)
+        );
+
+        Ok(())
+    }
+
+    // Display/FromStr round trip for ShardIndex: four hex digits, number then count.
+    #[test]
+    fn shard_index_human_encoding() -> Result<(), hex::FromHexError> {
+        let example = ShardIndex {
+            shard_number: ShardNumber(13),
+            shard_count: ShardCount(17),
+        };
+        let expected: String = "0d11".to_string();
+        let encoded = format!("{example}");
+        assert_eq!(&encoded, &expected);
+
+        let decoded = ShardIndex::from_str(&encoded)?;
+        assert_eq!(example, decoded);
+        Ok(())
+    }
+
+    // bincode round trip for ShardIndex: exactly two bytes, number then count.
+    #[test]
+    fn shard_index_binary_encoding() -> Result<(), hex::FromHexError> {
+        let example = ShardIndex {
+            shard_number: ShardNumber(13),
+            shard_count: ShardCount(17),
+        };
+        let expected: [u8; 2] = [0x0d, 0x11];
+
+        let encoded = bincode::serialize(&example).unwrap();
+        assert_eq!(Hex(&encoded), Hex(&expected));
+        let decoded = bincode::deserialize(&encoded).unwrap();
+        assert_eq!(example, decoded);
+
+        Ok(())
+    }
+
+    // These are only smoke tests to spot check that our implementation doesn't
+    // deviate from a few example values: not aiming to validate the overall
+    // hashing algorithm.
+    #[test]
+    fn murmur_hash() {
+        assert_eq!(murmurhash32(0), 0);
+
+        assert_eq!(hash_combine(0xb1ff3b40, 0), 0xfb7923c9);
+    }
+
+    // Spot check of the key->shard mapping for one relation block key with ten shards.
+    #[test]
+    fn shard_mapping() {
+        let key = Key {
+            field1: 0x00,
+            field2: 0x67f,
+            field3: 0x5,
+            field4: 0x400c,
+            field5: 0x00,
+            field6: 0x7d06,
+        };
+
+        let shard = key_to_shard_number(ShardCount(10), DEFAULT_STRIPE_SIZE, &key);
+        assert_eq!(shard, ShardNumber(8));
+    }
+}
--- a/libs/postgres_backend/src/lib.rs
+++ b/libs/postgres_backend/src/lib.rs
@@ -2,6 +2,8 @@
 //! To use, create PostgresBackend and run() it, passing the Handler
 //! implementation determining how to process the queries. Currently its API
 //! is rather narrow, but we can extend it once required.
+#![deny(unsafe_code)]
+#![deny(clippy::undocumented_unsafe_blocks)]
 use anyhow::Context;
 use bytes::Bytes;
 use futures::pin_mut;
@@ -15,12 +17,12 @@ use std::{fmt, io};
 use std::{future::Future, str::FromStr};
 use tokio::io::{AsyncRead, AsyncWrite};
 use tokio_rustls::TlsAcceptor;
-use tracing::{debug, error, info, trace};
+use tracing::{debug, error, info, trace, warn};

 use pq_proto::framed::{ConnectionError, Framed, FramedReader, FramedWriter};
 use pq_proto::{
-    BeMessage, FeMessage, FeStartupPacket, ProtocolError, SQLSTATE_INTERNAL_ERROR,
-    SQLSTATE_SUCCESSFUL_COMPLETION,
+    BeMessage, FeMessage, FeStartupPacket, ProtocolError, SQLSTATE_ADMIN_SHUTDOWN,
+    SQLSTATE_INTERNAL_ERROR, SQLSTATE_SUCCESSFUL_COMPLETION,
 };

 /// An error, occurred during query processing:
@@ -30,6 +32,14 @@ pub enum QueryError {
    /// The connection was lost while processing the query.
    #[error(transparent)]
    Disconnected(#[from] ConnectionError),
+    /// We were instructed to shutdown while processing the query
+    #[error("Shutting down")]
+    Shutdown,
+    /// Authentication failure
+    #[error("Unauthorized: {0}")]
+    Unauthorized(std::borrow::Cow<'static, str>),
+    #[error("Simulated Connection Error")]
+    SimulatedConnectionError,
    /// Some other error
    #[error(transparent)]
    Other(#[from] anyhow::Error),
@@ -44,7 +54,9 @@ impl From<io::Error> for QueryError {
 impl QueryError {
    pub fn pg_error_code(&self) -> &'static [u8; 5] {
        match self {
-            Self::Disconnected(_) => b"08006",         // connection failure
+            Self::Disconnected(_) | Self::SimulatedConnectionError => b"08006", // connection failure
+            Self::Shutdown => SQLSTATE_ADMIN_SHUTDOWN,
+            Self::Unauthorized(_) => SQLSTATE_INTERNAL_ERROR,
            Self::Other(_) => SQLSTATE_INTERNAL_ERROR, // internal error
        }
    }
@@ -238,6 +250,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> MaybeWriteOnly<IO> {
        }
    }

+    /// Cancellation safe as long as the underlying IO is cancellation safe.
    async fn shutdown(&mut self) -> io::Result<()> {
        match self {
            MaybeWriteOnly::Full(framed) => framed.shutdown().await,
@@ -389,14 +402,37 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
        shutdown_watcher: F,
    ) -> Result<(), QueryError>
    where
-        F: Fn() -> S,
+        F: Fn() -> S + Clone,
        S: Future,
    {
-        let ret = self.run_message_loop(handler, shutdown_watcher).await;
-        // socket might be already closed, e.g. if previously received error,
-        // so ignore result.
-        self.framed.shutdown().await.ok();
-        ret
+        let ret = self
+            .run_message_loop(handler, shutdown_watcher.clone())
+            .await;
+
+        tokio::select! {
+            _ = shutdown_watcher() => {
+                // do nothing; we were most likely already stopped by shutdown and will log it next.
+            }
+            _ = self.framed.shutdown() => {
+                // socket might be already closed, e.g. if previously received error,
+                // so ignore result.
+            },
+        }
+
+        match ret {
+            Ok(()) => Ok(()),
+            Err(QueryError::Shutdown) => {
+                info!("Stopped due to shutdown");
+                Ok(())
+            }
+            Err(QueryError::Disconnected(e)) => {
+                info!("Disconnected ({e:#})");
+                // Disconnection is not an error: we just use it that way internally to drop
+                // out of loops.
+                Ok(())
+            }
+            e => e,
+        }
    }

    async fn run_message_loop<F, S>(
@@ -416,15 +452,11 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
            _ = shutdown_watcher() => {
                // We were requested to shut down.
                tracing::info!("shutdown request received during handshake");
-                return Ok(())
+                return Err(QueryError::Shutdown)
            },

-            result = self.handshake(handler) => {
-                // Handshake complete.
-                result?;
-                if self.state == ProtoState::Closed {
-                    return Ok(()); // EOF during handshake
-                }
+            handshake_r = self.handshake(handler) => {
+                handshake_r?;
            }
        );

@@ -435,17 +467,34 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
            _ = shutdown_watcher() => {
                // We were requested to shut down.
                tracing::info!("shutdown request received in run_message_loop");
-                Ok(None)
+                return Err(QueryError::Shutdown)
            },
            msg = self.read_message() => { msg },
        )? {
            trace!("got message {:?}", msg);

            let result = self.process_message(handler, msg, &mut query_string).await;
-            self.flush().await?;
+            tokio::select!(
+                biased;
+                _ = shutdown_watcher() => {
+                    // We were requested to shut down.
+                    tracing::info!("shutdown request received during response flush");
+
+                    // If we exited process_message with a shutdown error, there may be
+                    // some valid response content in our transmit buffer: permit sending
+                    // this within a short timeout.  This is a best effort thing so we don't
+                    // care about the result.
+                    tokio::time::timeout(std::time::Duration::from_millis(500), self.flush()).await.ok();
+
+                    return Err(QueryError::Shutdown)
+                },
+                flush_r = self.flush() => {
+                    flush_r?;
+                }
+            );
+
            match result? {
                ProcessMsgResult::Continue => {
-                    self.flush().await?;
                    continue;
                }
                ProcessMsgResult::Break => break,
@@ -550,7 +599,9 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
                        self.peer_addr
                    );
                    self.state = ProtoState::Closed;
-                    return Ok(());
+                    return Err(QueryError::Disconnected(ConnectionError::Protocol(
+                        ProtocolError::Protocol("EOF during handshake".to_string()),
+                    )));
                }
            }
        }
@@ -565,7 +616,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {

                    if let Err(e) = handler.check_auth_jwt(self, jwt_response) {
                        self.write_message_noflush(&BeMessage::ErrorResponse(
-                            &e.to_string(),
+                            &short_error(&e),
                            Some(e.pg_error_code()),
                        ))?;
                        return Err(e);
@@ -589,7 +640,9 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
                        self.peer_addr
                    );
                    self.state = ProtoState::Closed;
-                    return Ok(());
+                    return Err(QueryError::Disconnected(ConnectionError::Protocol(
+                        ProtocolError::Protocol("EOF during auth".to_string()),
+                    )));
                }
            }
        }
@@ -683,12 +736,20 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {

                trace!("got query {query_string:?}");
                if let Err(e) = handler.process_query(self, query_string).await {
-                    log_query_error(query_string, &e);
-                    let short_error = short_error(&e);
-                    self.write_message_noflush(&BeMessage::ErrorResponse(
-                        &short_error,
-                        Some(e.pg_error_code()),
-                    ))?;
+                    match e {
+                        QueryError::Shutdown => return Ok(ProcessMsgResult::Break),
+                        QueryError::SimulatedConnectionError => {
+                            return Err(QueryError::SimulatedConnectionError)
+                        }
+                        e => {
+                            log_query_error(query_string, &e);
+                            let short_error = short_error(&e);
+                            self.write_message_noflush(&BeMessage::ErrorResponse(
+                                &short_error,
+                                Some(e.pg_error_code()),
+                            ))?;
+                        }
+                    }
                }
                self.write_message_noflush(&BeMessage::ReadyForQuery)?;
            }
@@ -913,6 +974,9 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> AsyncWrite for CopyDataWriter<'a, I
 pub fn short_error(e: &QueryError) -> String {
    match e {
        QueryError::Disconnected(connection_error) => connection_error.to_string(),
+        QueryError::Shutdown => "shutdown".to_string(),
+        QueryError::Unauthorized(_e) => "JWT authentication error".to_string(),
+        QueryError::SimulatedConnectionError => "simulated connection error".to_string(),
        QueryError::Other(e) => format!("{e:#}"),
    }
 }
@@ -929,6 +993,15 @@ fn log_query_error(query: &str, e: &QueryError) {
        QueryError::Disconnected(other_connection_error) => {
            error!("query handler for '{query}' failed with connection error: {other_connection_error:?}")
        }
+        QueryError::SimulatedConnectionError => {
+            error!("query handler for query '{query}' failed due to a simulated connection error")
+        }
+        QueryError::Shutdown => {
+            info!("query handler for '{query}' cancelled during tenant shutdown")
+        }
+        QueryError::Unauthorized(e) => {
+            warn!("query handler for '{query}' failed with authentication error: {e}");
+        }
        QueryError::Other(e) => {
            error!("query handler for '{query}' failed: {e:?}");
        }
--- a/libs/postgres_connection/src/lib.rs
+++ b/libs/postgres_connection/src/lib.rs
@@ -1,3 +1,5 @@
+#![deny(unsafe_code)]
+#![deny(clippy::undocumented_unsafe_blocks)]
 use anyhow::{bail, Context};
 use itertools::Itertools;
 use std::borrow::Cow;
--- a/libs/postgres_ffi/README.md
+++ b/libs/postgres_ffi/README.md
@@ -10,9 +10,11 @@ should be auto-generated too, but that's a TODO.
 The PostgreSQL on-disk file format is not portable across different
 CPU architectures and operating systems. It is also subject to change
 in each major PostgreSQL version. Currently, this module supports
-PostgreSQL v14 and v15: bindings and code that depends on them are version-specific.
-This code is organized in modules: `postgres_ffi::v14` and `postgres_ffi::v15`
-Version independend code is explicitly exported into shared `postgres_ffi`.
+PostgreSQL v14, v15 and v16: bindings and code that depends on them are
+version-specific.
+This code is organized in modules `postgres_ffi::v14`, `postgres_ffi::v15` and
+`postgres_ffi::v16`. Version independent code is explicitly exported into
+shared `postgres_ffi`.


 TODO: Currently, there is also some code that deals with WAL records
--- a/libs/postgres_ffi/build.rs
+++ b/libs/postgres_ffi/build.rs
@@ -56,7 +56,7 @@ fn main() -> anyhow::Result<()> {
        PathBuf::from("pg_install")
    };

-    for pg_version in &["v14", "v15"] {
+    for pg_version in &["v14", "v15", "v16"] {
        let mut pg_install_dir_versioned = pg_install_dir.join(pg_version);
        if pg_install_dir_versioned.is_relative() {
            let cwd = env::current_dir().context("Failed to get current_dir")?;
@@ -125,6 +125,7 @@ fn main() -> anyhow::Result<()> {
            .allowlist_var("PG_CONTROLFILEDATA_OFFSETOF_CRC")
            .allowlist_type("PageHeaderData")
            .allowlist_type("DBState")
+            .allowlist_type("RelMapFile")
            // Because structs are used for serialization, tell bindgen to emit
            // explicit padding fields.
            .explicit_padding(true)
--- a/libs/postgres_ffi/src/lib.rs
+++ b/libs/postgres_ffi/src/lib.rs
@@ -8,6 +8,7 @@
 // modules included with the postgres_ffi macro depend on the types of the specific version's
 // types, and trigger a too eager lint.
 #![allow(clippy::duplicate_mod)]
+#![deny(clippy::undocumented_unsafe_blocks)]

 use bytes::Bytes;
 use utils::bin_ser::SerializeError;
@@ -20,6 +21,7 @@ macro_rules! postgres_ffi {
            pub mod bindings {
                // bindgen generates bindings for a lot of stuff we don't need
                #![allow(dead_code)]
+                #![allow(clippy::undocumented_unsafe_blocks)]

                use serde::{Deserialize, Serialize};
                include!(concat!(
@@ -51,11 +53,59 @@ macro_rules! for_all_postgres_versions {
    ($macro:tt) => {
        $macro!(v14);
        $macro!(v15);
+        $macro!(v16);
    };
 }

 for_all_postgres_versions! { postgres_ffi }

+/// dispatch_pgversion
+///
+/// Run a code block in a context where the postgres_ffi bindings for a
+/// specific (supported) PostgreSQL version (14, 15 or 16) are `use`-ed in
+/// scope under the `pgv` identifier.
+/// If the provided version is not supported, we `panic!()`, unless the
+/// optional third argument was provided (in which case that expression is
+/// evaluated as the default handling instead).
+///
+/// Use like
+///
+/// dispatch_pgversion!(my_pgversion, { pgv::constants::XLOG_DBASE_CREATE })
+/// dispatch_pgversion!(my_pgversion, pgv::constants::XLOG_DBASE_CREATE)
+///
+/// Other uses are for macro-internal purposes only and strictly unsupported.
+///
+#[macro_export]
+macro_rules! dispatch_pgversion {
+    ($version:expr, $code:expr) => {
+        dispatch_pgversion!($version, $code, panic!("Unknown PostgreSQL version {}", $version))
+    };
+    ($version:expr, $code:expr, $invalid_pgver_handling:expr) => {
+        dispatch_pgversion!(
+            $version => $code,
+            default = $invalid_pgver_handling,
+            pgversions = [
+                14 : v14,
+                15 : v15,
+                16 : v16,
+            ]
+        )
+    };
+    ($pgversion:expr => $code:expr,
+     default = $default:expr,
+     pgversions = [$($sv:literal : $vsv:ident),+ $(,)?]) => {
+        match ($pgversion) {
+            $($sv => {
+                use $crate::$vsv as pgv;
+                $code
+            },)+
+            _ => {
+                $default
+            }
+        }
+    };
+}
+
 pub mod pg_constants;
 pub mod relfile_utils;

@@ -83,6 +133,7 @@ pub const MAX_SEND_SIZE: usize = XLOG_BLCKSZ * 16;

 // Export some version independent functions that are used outside of this mod
 pub use v14::xlog_utils::encode_logical_message;
+pub use v14::xlog_utils::from_pg_timestamp;
 pub use v14::xlog_utils::get_current_timestamp;
 pub use v14::xlog_utils::to_pg_timestamp;
 pub use v14::xlog_utils::XLogFileName;
@@ -90,13 +141,17 @@ pub use v14::xlog_utils::XLogFileName;
 pub use v14::bindings::DBState_DB_SHUTDOWNED;

+/// Tells whether the block-image flags in `bimg_info` mark the page image as
+/// compressed, according to the flag layout of PostgreSQL major `version`.
+///
+/// Unsupported versions are reported as an "Unknown version" error rather
+/// than a panic, matching `generate_pg_control` and the behaviour of the
+/// `match` this call replaces.
 pub fn bkpimage_is_compressed(bimg_info: u8, version: u32) -> anyhow::Result<bool> {
-    match version {
-        14 => Ok(bimg_info & v14::bindings::BKPIMAGE_IS_COMPRESSED != 0),
-        15 => Ok(bimg_info & v15::bindings::BKPIMAGE_COMPRESS_PGLZ != 0
-            || bimg_info & v15::bindings::BKPIMAGE_COMPRESS_LZ4 != 0
-            || bimg_info & v15::bindings::BKPIMAGE_COMPRESS_ZSTD != 0),
-        _ => anyhow::bail!("Unknown version {}", version),
-    }
+    dispatch_pgversion!(
+        version,
+        Ok(pgv::bindings::bkpimg_is_compressed(bimg_info)),
+        anyhow::bail!("Unknown version {}", version)
+    )
 }

 pub fn generate_wal_segment(
@@ -107,11 +152,11 @@ pub fn generate_wal_segment(
 ) -> Result<Bytes, SerializeError> {
    assert_eq!(segno, lsn.segment_number(WAL_SEGMENT_SIZE));

-    match pg_version {
-        14 => v14::xlog_utils::generate_wal_segment(segno, system_id, lsn),
-        15 => v15::xlog_utils::generate_wal_segment(segno, system_id, lsn),
-        _ => Err(SerializeError::BadInput),
-    }
+    dispatch_pgversion!(
+        pg_version,
+        pgv::xlog_utils::generate_wal_segment(segno, system_id, lsn),
+        Err(SerializeError::BadInput)
+    )
 }

 pub fn generate_pg_control(
@@ -120,11 +165,11 @@ pub fn generate_pg_control(
    lsn: Lsn,
    pg_version: u32,
 ) -> anyhow::Result<(Bytes, u64)> {
-    match pg_version {
-        14 => v14::xlog_utils::generate_pg_control(pg_control_bytes, checkpoint_bytes, lsn),
-        15 => v15::xlog_utils::generate_pg_control(pg_control_bytes, checkpoint_bytes, lsn),
-        _ => anyhow::bail!("Unknown version {}", pg_version),
-    }
+    dispatch_pgversion!(
+        pg_version,
+        pgv::xlog_utils::generate_pg_control(pg_control_bytes, checkpoint_bytes, lsn),
+        anyhow::bail!("Unknown version {}", pg_version)
+    )
 }

 // PG timeline is always 1, changing it doesn't have any useful meaning in Neon.
@@ -196,8 +241,6 @@ pub fn fsm_logical_to_physical(addr: BlockNumber) -> BlockNumber {
 }

 pub mod waldecoder {
-
-    use crate::{v14, v15};
    use bytes::{Buf, Bytes, BytesMut};
    use std::num::NonZeroU32;
    use thiserror::Error;
@@ -248,22 +291,17 @@ pub mod waldecoder {
        }

        pub fn poll_decode(&mut self) -> Result<Option<(Lsn, Bytes)>, WalDecodeError> {
-            match self.pg_version {
-                // This is a trick to support both versions simultaneously.
-                // See WalStreamDecoderHandler comments.
-                14 => {
-                    use self::v14::waldecoder_handler::WalStreamDecoderHandler;
+            dispatch_pgversion!(
+                self.pg_version,
+                {
+                    use pgv::waldecoder_handler::WalStreamDecoderHandler;
                    self.poll_decode_internal()
-                }
-                15 => {
-                    use self::v15::waldecoder_handler::WalStreamDecoderHandler;
-                    self.poll_decode_internal()
-                }
-                _ => Err(WalDecodeError {
+                },
+                Err(WalDecodeError {
                    msg: format!("Unknown version {}", self.pg_version),
                    lsn: self.lsn,
-                }),
-            }
+                })
+            )
        }
    }
 }
--- a/libs/postgres_ffi/src/pg_constants.rs
+++ b/libs/postgres_ffi/src/pg_constants.rs
@@ -137,9 +137,12 @@ pub const XLOG_HEAP_INSERT: u8 = 0x00;
 pub const XLOG_HEAP_DELETE: u8 = 0x10;
 pub const XLOG_HEAP_UPDATE: u8 = 0x20;
 pub const XLOG_HEAP_HOT_UPDATE: u8 = 0x40;
+pub const XLOG_HEAP_LOCK: u8 = 0x60;
 pub const XLOG_HEAP_INIT_PAGE: u8 = 0x80;
 pub const XLOG_HEAP2_VISIBLE: u8 = 0x40;
 pub const XLOG_HEAP2_MULTI_INSERT: u8 = 0x50;
+pub const XLOG_HEAP2_LOCK_UPDATED: u8 = 0x60;
+pub const XLH_LOCK_ALL_FROZEN_CLEARED: u8 = 0x01;
 pub const XLH_INSERT_ALL_FROZEN_SET: u8 = (1 << 5) as u8;
 pub const XLH_INSERT_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8;
 pub const XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8;
@@ -163,6 +166,20 @@ pub const RM_HEAP2_ID: u8 = 9;
 pub const RM_HEAP_ID: u8 = 10;
 pub const RM_LOGICALMSG_ID: u8 = 21;

+// from neon_rmgr.h
+pub const RM_NEON_ID: u8 = 134;
+
+pub const XLOG_NEON_HEAP_INIT_PAGE: u8 = 0x80;
+
+pub const XLOG_NEON_HEAP_INSERT: u8 = 0x00;
+pub const XLOG_NEON_HEAP_DELETE: u8 = 0x10;
+pub const XLOG_NEON_HEAP_UPDATE: u8 = 0x20;
+pub const XLOG_NEON_HEAP_HOT_UPDATE: u8 = 0x30;
+pub const XLOG_NEON_HEAP_LOCK: u8 = 0x40;
+pub const XLOG_NEON_HEAP_MULTI_INSERT: u8 = 0x50;
+
+pub const XLOG_NEON_HEAP_VISIBLE: u8 = 0x40;
+
 // from xlogreader.h
 pub const XLR_INFO_MASK: u8 = 0x0F;
 pub const XLR_RMGR_INFO_MASK: u8 = 0xF0;
@@ -203,6 +220,10 @@ pub const XLOG_CHECKPOINT_ONLINE: u8 = 0x10;
 pub const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001;
 pub const XLP_LONG_HEADER: u16 = 0x0002;

+/* From replication/slot.h */
+pub const REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN: usize = 4*4  /* offset of `slotdata` in ReplicationSlotOnDisk  */
+   + 64 /* NameData */  + 4*4;
+
 /* From fsm_internals.h */
 const FSM_NODES_PER_PAGE: usize = BLCKSZ as usize - SIZEOF_PAGE_HEADER_DATA - 4;
 const FSM_NON_LEAF_NODES_PER_PAGE: usize = BLCKSZ as usize / 2 - 1;
--- a/libs/postgres_ffi/src/pg_constants_v14.rs
+++ b/libs/postgres_ffi/src/pg_constants_v14.rs
@@ -3,3 +3,8 @@ pub const XLOG_DBASE_DROP: u8 = 0x10;

 pub const BKPIMAGE_IS_COMPRESSED: u8 = 0x02; /* page image is compressed */
 pub const BKPIMAGE_APPLY: u8 = 0x04; /* page image should be restored during replay */
+pub const SIZEOF_RELMAPFILE: usize = 512; /* sizeof(RelMapFile) in relmapper.c */
+
+pub fn bkpimg_is_compressed(bimg_info: u8) -> bool {
+    (bimg_info & BKPIMAGE_IS_COMPRESSED) != 0
+}
--- a/libs/postgres_ffi/src/pg_constants_v15.rs
+++ b/libs/postgres_ffi/src/pg_constants_v15.rs
@@ -1,10 +1,18 @@
 pub const XACT_XINFO_HAS_DROPPED_STATS: u32 = 1u32 << 8;

 pub const XLOG_DBASE_CREATE_FILE_COPY: u8 = 0x00;
-pub const XLOG_DBASE_CREATE_WAL_LOG: u8 = 0x00;
+pub const XLOG_DBASE_CREATE_WAL_LOG: u8 = 0x10;
 pub const XLOG_DBASE_DROP: u8 = 0x20;

 pub const BKPIMAGE_APPLY: u8 = 0x02; /* page image should be restored during replay */
 pub const BKPIMAGE_COMPRESS_PGLZ: u8 = 0x04; /* page image is compressed */
 pub const BKPIMAGE_COMPRESS_LZ4: u8 = 0x08; /* page image is compressed */
 pub const BKPIMAGE_COMPRESS_ZSTD: u8 = 0x10; /* page image is compressed */
+
+pub const SIZEOF_RELMAPFILE: usize = 512; /* sizeof(RelMapFile) in relmapper.c */
+
+pub fn bkpimg_is_compressed(bimg_info: u8) -> bool {
+    const ANY_COMPRESS_FLAG: u8 = BKPIMAGE_COMPRESS_PGLZ | BKPIMAGE_COMPRESS_LZ4 | BKPIMAGE_COMPRESS_ZSTD;
+
+    (bimg_info & ANY_COMPRESS_FLAG) != 0
+}
--- a/libs/postgres_ffi/src/pg_constants_v16.rs
+++ b/libs/postgres_ffi/src/pg_constants_v16.rs
@@ -0,0 +1,18 @@
+pub const XACT_XINFO_HAS_DROPPED_STATS: u32 = 1u32 << 8;
+
+pub const XLOG_DBASE_CREATE_FILE_COPY: u8 = 0x00;
+pub const XLOG_DBASE_CREATE_WAL_LOG: u8 = 0x10;
+pub const XLOG_DBASE_DROP: u8 = 0x20;
+
+pub const BKPIMAGE_APPLY: u8 = 0x02; /* page image should be restored during replay */
+pub const BKPIMAGE_COMPRESS_PGLZ: u8 = 0x04; /* page image is compressed */
+pub const BKPIMAGE_COMPRESS_LZ4: u8 = 0x08; /* page image is compressed */
+pub const BKPIMAGE_COMPRESS_ZSTD: u8 = 0x10; /* page image is compressed */
+
+pub const SIZEOF_RELMAPFILE: usize = 524; /* sizeof(RelMapFile) in relmapper.c */
+
+pub fn bkpimg_is_compressed(bimg_info: u8) -> bool {
+    const ANY_COMPRESS_FLAG: u8 = BKPIMAGE_COMPRESS_PGLZ | BKPIMAGE_COMPRESS_LZ4 | BKPIMAGE_COMPRESS_ZSTD;
+
+    (bimg_info & ANY_COMPRESS_FLAG) != 0
+}
--- a/libs/postgres_ffi/src/xlog_utils.rs
+++ b/libs/postgres_ffi/src/xlog_utils.rs
@@ -136,21 +136,42 @@ pub fn get_current_timestamp() -> TimestampTz {
    to_pg_timestamp(SystemTime::now())
 }

-pub fn to_pg_timestamp(time: SystemTime) -> TimestampTz {
-    const UNIX_EPOCH_JDATE: u64 = 2440588; /* == date2j(1970, 1, 1) */
-    const POSTGRES_EPOCH_JDATE: u64 = 2451545; /* == date2j(2000, 1, 1) */
+// Module to reduce the scope of the constants
+mod timestamp_conversions {
+    use std::time::Duration;
+
+    use super::*;
+
+    const UNIX_EPOCH_JDATE: u64 = 2440588; // == date2j(1970, 1, 1)
+    const POSTGRES_EPOCH_JDATE: u64 = 2451545; // == date2j(2000, 1, 1)
    const SECS_PER_DAY: u64 = 86400;
    const USECS_PER_SEC: u64 = 1000000;
-    match time.duration_since(SystemTime::UNIX_EPOCH) {
-        Ok(n) => {
-            ((n.as_secs() - ((POSTGRES_EPOCH_JDATE - UNIX_EPOCH_JDATE) * SECS_PER_DAY))
-                * USECS_PER_SEC
-                + n.subsec_micros() as u64) as i64
+    const SECS_DIFF_UNIX_TO_POSTGRES_EPOCH: u64 =
+        (POSTGRES_EPOCH_JDATE - UNIX_EPOCH_JDATE) * SECS_PER_DAY;
+
+    pub fn to_pg_timestamp(time: SystemTime) -> TimestampTz {
+        match time.duration_since(SystemTime::UNIX_EPOCH) {
+            Ok(n) => {
+                ((n.as_secs() - SECS_DIFF_UNIX_TO_POSTGRES_EPOCH) * USECS_PER_SEC
+                    + n.subsec_micros() as u64) as i64
+            }
+            Err(_) => panic!("SystemTime before UNIX EPOCH!"),
        }
-        Err(_) => panic!("SystemTime before UNIX EPOCH!"),
+    }
+
+    pub fn from_pg_timestamp(time: TimestampTz) -> SystemTime {
+        let time: u64 = time
+            .try_into()
+            .expect("timestamp before millenium (postgres epoch)");
+        let since_unix_epoch = time + SECS_DIFF_UNIX_TO_POSTGRES_EPOCH * USECS_PER_SEC;
+        SystemTime::UNIX_EPOCH
+            .checked_add(Duration::from_micros(since_unix_epoch))
+            .expect("SystemTime overflow")
    }
 }

+pub use timestamp_conversions::{from_pg_timestamp, to_pg_timestamp};
+
 // Returns (aligned) end_lsn of the last record in data_dir with WAL segments.
 // start_lsn must point to some previously known record boundary (beginning of
 // the next record). If no valid record after is found, start_lsn is returned
@@ -481,4 +502,24 @@ pub fn encode_logical_message(prefix: &str, message: &str) -> Vec<u8> {
    wal
 }

-// If you need to craft WAL and write tests for this module, put it at wal_craft crate.
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_ts_conversion() {
+        let now = SystemTime::now();
+        let round_trip = from_pg_timestamp(to_pg_timestamp(now));
+
+        let now_since = now.duration_since(SystemTime::UNIX_EPOCH).unwrap();
+        let round_trip_since = round_trip.duration_since(SystemTime::UNIX_EPOCH).unwrap();
+        assert_eq!(now_since.as_micros(), round_trip_since.as_micros());
+
+        let now_pg = get_current_timestamp();
+        let round_trip_pg = to_pg_timestamp(from_pg_timestamp(now_pg));
+
+        assert_eq!(now_pg, round_trip_pg);
+    }
+
+    // If you need to craft WAL and write tests for this module, put them in the wal_craft crate.
+}
--- a/libs/postgres_ffi/wal_craft/Cargo.toml
+++ b/libs/postgres_ffi/wal_craft/Cargo.toml
@@ -12,7 +12,7 @@ log.workspace = true
 once_cell.workspace = true
 postgres.workspace = true
 postgres_ffi.workspace = true
-tempfile.workspace = true
+camino-tempfile.workspace = true

 workspace_hack.workspace = true

--- a/libs/postgres_ffi/wal_craft/src/lib.rs
+++ b/libs/postgres_ffi/wal_craft/src/lib.rs
@@ -1,4 +1,5 @@
 use anyhow::{bail, ensure};
+use camino_tempfile::{tempdir, Utf8TempDir};
 use log::*;
 use postgres::types::PgLsn;
 use postgres::Client;
@@ -8,12 +9,12 @@ use std::cmp::Ordering;
 use std::path::{Path, PathBuf};
 use std::process::Command;
 use std::time::{Duration, Instant};
-use tempfile::{tempdir, TempDir};

 macro_rules! xlog_utils_test {
    ($version:ident) => {
        #[path = "."]
        mod $version {
+            #[allow(unused_imports)]
            pub use postgres_ffi::$version::wal_craft_test_export::*;
            #[allow(clippy::duplicate_mod)]
            #[cfg(test)]
@@ -33,7 +34,7 @@ pub struct Conf {

 pub struct PostgresServer {
    process: std::process::Child,
-    _unix_socket_dir: TempDir,
+    _unix_socket_dir: Utf8TempDir,
    client_config: postgres::Config,
 }

@@ -49,9 +50,9 @@ impl Conf {
    pub fn pg_distrib_dir(&self) -> anyhow::Result<PathBuf> {
        let path = self.pg_distrib_dir.clone();

+        #[allow(clippy::manual_range_patterns)]
        match self.pg_version {
-            14 => Ok(path.join(format!("v{}", self.pg_version))),
-            15 => Ok(path.join(format!("v{}", self.pg_version))),
+            14 | 15 | 16 => Ok(path.join(format!("v{}", self.pg_version))),
            _ => bail!("Unsupported postgres version: {}", self.pg_version),
        }
    }
@@ -250,11 +251,18 @@ fn craft_internal<C: postgres::GenericClient>(
    let (mut intermediate_lsns, last_lsn) = f(client, initial_lsn)?;
    let last_lsn = match last_lsn {
        None => client.pg_current_wal_insert_lsn()?,
-        Some(last_lsn) => match last_lsn.cmp(&client.pg_current_wal_insert_lsn()?) {
-            Ordering::Less => bail!("Some records were inserted after the crafted WAL"),
-            Ordering::Equal => last_lsn,
-            Ordering::Greater => bail!("Reported LSN is greater than insert_lsn"),
-        },
+        Some(last_lsn) => {
+            let insert_lsn = client.pg_current_wal_insert_lsn()?;
+            match last_lsn.cmp(&insert_lsn) {
+                Ordering::Less => bail!(
+                    "Some records were inserted after the crafted WAL: {} vs {}",
+                    last_lsn,
+                    insert_lsn
+                ),
+                Ordering::Equal => last_lsn,
+                Ordering::Greater => bail!("Reported LSN is greater than insert_lsn"),
+            }
+        }
    };
    if !intermediate_lsns.starts_with(&[initial_lsn]) {
        intermediate_lsns.insert(0, initial_lsn);
@@ -363,8 +371,9 @@ impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary {
        );
        ensure!(
            u64::from(after_xlog_switch) as usize % XLOG_BLCKSZ == XLOG_SIZE_OF_XLOG_SHORT_PHD,
-            "XLOG_SWITCH message ended not on page boundary: {}",
-            after_xlog_switch
+            "XLOG_SWITCH message ended not on page boundary: {}, offset = {}",
+            after_xlog_switch,
+            u64::from(after_xlog_switch) as usize % XLOG_BLCKSZ
        );
        Ok((vec![before_xlog_switch, after_xlog_switch], next_segment))
    }
--- a/libs/pq_proto/src/framed.rs
+++ b/libs/pq_proto/src/framed.rs
@@ -214,27 +214,24 @@ where
    }
 }

+/// Cancellation safe as long as the AsyncWrite is cancellation safe.
 async fn flush<S: AsyncWrite + Unpin>(
    stream: &mut S,
    write_buf: &mut BytesMut,
 ) -> Result<(), io::Error> {
    while write_buf.has_remaining() {
-        let bytes_written = stream.write(write_buf.chunk()).await?;
+        let bytes_written = stream.write_buf(write_buf).await?;
        if bytes_written == 0 {
            return Err(io::Error::new(
                ErrorKind::WriteZero,
                "failed to write message",
            ));
        }
-        // The advanced part will be garbage collected, likely during shifting
-        // data left on next attempt to write to buffer when free space is not
-        // enough.
-        write_buf.advance(bytes_written);
    }
-    write_buf.clear();
    stream.flush().await
 }

+/// Cancellation safe as long as the AsyncWrite is cancellation safe.
 async fn shutdown<S: AsyncWrite + Unpin>(
    stream: &mut S,
    write_buf: &mut BytesMut,
--- a/libs/pq_proto/src/lib.rs
+++ b/libs/pq_proto/src/lib.rs
@@ -1,6 +1,7 @@
 //! Postgres protocol messages serialization-deserialization. See
 //! <https://www.postgresql.org/docs/devel/protocol-message-formats.html>
 //! on message formats.
+#![deny(clippy::undocumented_unsafe_blocks)]

 pub mod framed;

@@ -288,10 +289,10 @@ impl FeStartupPacket {
        // We shouldn't advance `buf` as probably full message is not there yet,
        // so can't directly use Bytes::get_u32 etc.
        let len = (&buf[0..4]).read_u32::<BigEndian>().unwrap() as usize;
-        // The proposed replacement is `!(4..=MAX_STARTUP_PACKET_LENGTH).contains(&len)`
+        // The proposed replacement is `!(8..=MAX_STARTUP_PACKET_LENGTH).contains(&len)`
        // which is less readable
        #[allow(clippy::manual_range_contains)]
-        if len < 4 || len > MAX_STARTUP_PACKET_LENGTH {
+        if len < 8 || len > MAX_STARTUP_PACKET_LENGTH {
            return Err(ProtocolError::Protocol(format!(
                "invalid startup packet message length {}",
                len
@@ -670,6 +671,7 @@ pub fn read_cstr(buf: &mut Bytes) -> Result<Bytes, ProtocolError> {
 }

 pub const SQLSTATE_INTERNAL_ERROR: &[u8; 5] = b"XX000";
+pub const SQLSTATE_ADMIN_SHUTDOWN: &[u8; 5] = b"57P01";
 pub const SQLSTATE_SUCCESSFUL_COMPLETION: &[u8; 5] = b"00000";

 impl<'a> BeMessage<'a> {
@@ -959,7 +961,7 @@ mod tests {
        let make_params = |options| StartupMessageParams::new([("options", options)]);

        let params = StartupMessageParams::new([]);
-        assert!(matches!(params.options_escaped(), None));
+        assert!(params.options_escaped().is_none());

        let params = make_params("");
        assert!(split_options(&params).is_empty());
@@ -973,4 +975,10 @@ mod tests {
        let params = make_params("foo\\ bar \\ \\\\ baz\\  lol");
        assert_eq!(split_options(&params), ["foo bar", " \\", "baz ", "lol"]);
    }
+
+    #[test]
+    fn parse_fe_startup_packet_regression() {
+        let data = [0, 0, 0, 7, 0, 0, 0, 0];
+        FeStartupPacket::parse(&mut BytesMut::from_iter(data)).unwrap_err();
+    }
 }
--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -8,16 +8,19 @@ license.workspace = true
 anyhow.workspace = true
 async-trait.workspace = true
 once_cell.workspace = true
-aws-smithy-http.workspace = true
-aws-types.workspace = true
+aws-smithy-async.workspace = true
+aws-smithy-types.workspace = true
 aws-config.workspace = true
 aws-sdk-s3.workspace = true
 aws-credential-types.workspace = true
+bytes.workspace = true
+camino.workspace = true
 hyper = { workspace = true, features = ["stream"] }
+futures.workspace = true
 serde.workspace = true
 serde_json.workspace = true
 tokio = { workspace = true, features = ["sync", "fs", "io-util"] }
-tokio-util.workspace = true
+tokio-util = { workspace = true, features = ["compat"] }
 toml_edit.workspace = true
 tracing.workspace = true
 scopeguard.workspace = true
@@ -25,7 +28,15 @@ metrics.workspace = true
 utils.workspace = true
 pin-project-lite.workspace = true
 workspace_hack.workspace = true
+azure_core.workspace = true
+azure_identity.workspace = true
+azure_storage.workspace = true
+azure_storage_blobs.workspace = true
+futures-util.workspace = true
+http-types.workspace = true
+itertools.workspace = true

 [dev-dependencies]
-tempfile.workspace = true
+camino-tempfile.workspace = true
 test-context.workspace = true
+rand.workspace = true
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -0,0 +1,469 @@
+//! Azure Blob Storage wrapper
+
+use std::borrow::Cow;
+use std::collections::HashMap;
+use std::env;
+use std::num::NonZeroU32;
+use std::pin::Pin;
+use std::sync::Arc;
+
+use super::REMOTE_STORAGE_PREFIX_SEPARATOR;
+use anyhow::Result;
+use azure_core::request_options::{MaxResults, Metadata, Range};
+use azure_core::RetryOptions;
+use azure_identity::DefaultAzureCredential;
+use azure_storage::StorageCredentials;
+use azure_storage_blobs::prelude::ClientBuilder;
+use azure_storage_blobs::{blob::operations::GetBlobBuilder, prelude::ContainerClient};
+use bytes::Bytes;
+use futures::stream::Stream;
+use futures_util::StreamExt;
+use http_types::StatusCode;
+use tracing::debug;
+
+use crate::s3_bucket::RequestKind;
+use crate::{
+    AzureConfig, ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath,
+    RemoteStorage, StorageMetadata,
+};
+
+pub struct AzureBlobStorage {
+    client: ContainerClient,
+    prefix_in_container: Option<String>,
+    max_keys_per_list_response: Option<NonZeroU32>,
+    concurrency_limiter: ConcurrencyLimiter,
+}
+
+impl AzureBlobStorage {
+    pub fn new(azure_config: &AzureConfig) -> Result<Self> {
+        debug!(
+            "Creating azure remote storage for azure container {}",
+            azure_config.container_name
+        );
+
+        let account = env::var("AZURE_STORAGE_ACCOUNT").expect("missing AZURE_STORAGE_ACCOUNT");
+
+        // If the `AZURE_STORAGE_ACCESS_KEY` env var has an access key, use that,
+        // otherwise try the token based credentials.
+        let credentials = if let Ok(access_key) = env::var("AZURE_STORAGE_ACCESS_KEY") {
+            StorageCredentials::access_key(account.clone(), access_key)
+        } else {
+            let token_credential = DefaultAzureCredential::default();
+            StorageCredentials::token_credential(Arc::new(token_credential))
+        };
+
+        // we have an outer retry
+        let builder = ClientBuilder::new(account, credentials).retry(RetryOptions::none());
+
+        let client = builder.container_client(azure_config.container_name.to_owned());
+
+        let max_keys_per_list_response =
+            if let Some(limit) = azure_config.max_keys_per_list_response {
+                Some(
+                    NonZeroU32::new(limit as u32)
+                        .ok_or_else(|| anyhow::anyhow!("max_keys_per_list_response can't be 0"))?,
+                )
+            } else {
+                None
+            };
+
+        Ok(AzureBlobStorage {
+            client,
+            prefix_in_container: azure_config.prefix_in_container.to_owned(),
+            max_keys_per_list_response,
+            concurrency_limiter: ConcurrencyLimiter::new(azure_config.concurrency_limit.get()),
+        })
+    }
+
+    pub fn relative_path_to_name(&self, path: &RemotePath) -> String {
+        assert_eq!(std::path::MAIN_SEPARATOR, REMOTE_STORAGE_PREFIX_SEPARATOR);
+        let path_string = path
+            .get_path()
+            .as_str()
+            .trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR);
+        match &self.prefix_in_container {
+            Some(prefix) => {
+                if prefix.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
+                    prefix.clone() + path_string
+                } else {
+                    format!("{prefix}{REMOTE_STORAGE_PREFIX_SEPARATOR}{path_string}")
+                }
+            }
+            None => path_string.to_string(),
+        }
+    }
+
+    fn name_to_relative_path(&self, key: &str) -> RemotePath {
+        let relative_path =
+            match key.strip_prefix(self.prefix_in_container.as_deref().unwrap_or_default()) {
+                Some(stripped) => stripped,
+                // we rely on Azure to return properly prefixed paths
+                // for requests with a certain prefix
+                None => panic!(
+                    "Key {key} does not start with container prefix {:?}",
+                    self.prefix_in_container
+                ),
+            };
+        RemotePath(
+            relative_path
+                .split(REMOTE_STORAGE_PREFIX_SEPARATOR)
+                .collect(),
+        )
+    }
+
+    async fn download_for_builder(
+        &self,
+        builder: GetBlobBuilder,
+    ) -> Result<Download, DownloadError> {
+        let mut response = builder.into_stream();
+
+        let mut metadata = HashMap::new();
+        // TODO give proper streaming response instead of buffering into RAM
+        // https://github.com/neondatabase/neon/issues/5563
+
+        let mut bufs = Vec::new();
+        while let Some(part) = response.next().await {
+            let part = part.map_err(to_download_error)?;
+            if let Some(blob_meta) = part.blob.metadata {
+                metadata.extend(blob_meta.iter().map(|(k, v)| (k.to_owned(), v.to_owned())));
+            }
+            let data = part
+                .data
+                .collect()
+                .await
+                .map_err(|e| DownloadError::Other(e.into()))?;
+            bufs.push(data);
+        }
+        Ok(Download {
+            download_stream: Box::pin(futures::stream::iter(bufs.into_iter().map(Ok))),
+            metadata: Some(StorageMetadata(metadata)),
+        })
+    }
+
+    async fn permit(&self, kind: RequestKind) -> tokio::sync::SemaphorePermit<'_> {
+        self.concurrency_limiter
+            .acquire(kind)
+            .await
+            .expect("semaphore is never closed")
+    }
+}
+
+fn to_azure_metadata(metadata: StorageMetadata) -> Metadata {
+    let mut res = Metadata::new();
+    for (k, v) in metadata.0.into_iter() {
+        res.insert(k, v);
+    }
+    res
+}
+
+fn to_download_error(error: azure_core::Error) -> DownloadError {
+    if let Some(http_err) = error.as_http_error() {
+        match http_err.status() {
+            StatusCode::NotFound => DownloadError::NotFound,
+            StatusCode::BadRequest => DownloadError::BadInput(anyhow::Error::new(error)),
+            _ => DownloadError::Other(anyhow::Error::new(error)),
+        }
+    } else {
+        DownloadError::Other(error.into())
+    }
+}
+
+#[async_trait::async_trait]
+impl RemoteStorage for AzureBlobStorage {
+    async fn list(
+        &self,
+        prefix: Option<&RemotePath>,
+        mode: ListingMode,
+    ) -> anyhow::Result<Listing, DownloadError> {
+        // use the passed prefix, or fall back to prefix_in_container if none was given
+        let list_prefix = prefix
+            .map(|p| self.relative_path_to_name(p))
+            .or_else(|| self.prefix_in_container.clone())
+            .map(|mut p| {
+                // required to end with a separator
+                // otherwise request will return only the entry of a prefix
+                if matches!(mode, ListingMode::WithDelimiter)
+                    && !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR)
+                {
+                    p.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
+                }
+                p
+            });
+
+        let mut builder = self.client.list_blobs();
+
+        if let ListingMode::WithDelimiter = mode {
+            builder = builder.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string());
+        }
+
+        if let Some(prefix) = list_prefix {
+            builder = builder.prefix(Cow::from(prefix.to_owned()));
+        }
+
+        if let Some(limit) = self.max_keys_per_list_response {
+            builder = builder.max_results(MaxResults::new(limit));
+        }
+
+        let mut response = builder.into_stream();
+        let mut res = Listing::default();
+        while let Some(l) = response.next().await {
+            let entry = l.map_err(to_download_error)?;
+            let prefix_iter = entry
+                .blobs
+                .prefixes()
+                .map(|prefix| self.name_to_relative_path(&prefix.name));
+            res.prefixes.extend(prefix_iter);
+
+            let blob_iter = entry
+                .blobs
+                .blobs()
+                .map(|k| self.name_to_relative_path(&k.name));
+            res.keys.extend(blob_iter);
+        }
+        Ok(res)
+    }
+
+    async fn upload(
+        &self,
+        from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
+        data_size_bytes: usize,
+        to: &RemotePath,
+        metadata: Option<StorageMetadata>,
+    ) -> anyhow::Result<()> {
+        let _permit = self.permit(RequestKind::Put).await;
+        let blob_client = self.client.blob_client(self.relative_path_to_name(to));
+
+        let from: Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static>> =
+            Box::pin(from);
+
+        let from = NonSeekableStream::new(from, data_size_bytes);
+
+        let body = azure_core::Body::SeekableStream(Box::new(from));
+
+        let mut builder = blob_client.put_block_blob(body);
+
+        if let Some(metadata) = metadata {
+            builder = builder.metadata(to_azure_metadata(metadata));
+        }
+
+        let _response = builder.into_future().await?;
+
+        Ok(())
+    }
+
+    async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
+        let _permit = self.permit(RequestKind::Get).await;
+        let blob_client = self.client.blob_client(self.relative_path_to_name(from));
+
+        let builder = blob_client.get();
+
+        self.download_for_builder(builder).await
+    }
+
+    async fn download_byte_range(
+        &self,
+        from: &RemotePath,
+        start_inclusive: u64,
+        end_exclusive: Option<u64>,
+    ) -> Result<Download, DownloadError> {
+        let _permit = self.permit(RequestKind::Get).await;
+        let blob_client = self.client.blob_client(self.relative_path_to_name(from));
+
+        let mut builder = blob_client.get();
+
+        if let Some(end_exclusive) = end_exclusive {
+            builder = builder.range(Range::new(start_inclusive, end_exclusive));
+        } else {
+            // Open ranges are not supported by the SDK so we work around
+            // by setting the upper limit extremely high (but low enough
+            // to still be representable by signed 64 bit integers).
+            // TODO remove workaround once the SDK adds open range support
+            // https://github.com/Azure/azure-sdk-for-rust/issues/1438
+            let end_exclusive = u64::MAX / 4;
+            builder = builder.range(Range::new(start_inclusive, end_exclusive));
+        }
+
+        self.download_for_builder(builder).await
+    }
+
+    async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
+        let _permit = self.permit(RequestKind::Delete).await;
+        let blob_client = self.client.blob_client(self.relative_path_to_name(path));
+
+        let builder = blob_client.delete();
+
+        match builder.into_future().await {
+            Ok(_response) => Ok(()),
+            Err(e) => {
+                if let Some(http_err) = e.as_http_error() {
+                    if http_err.status() == StatusCode::NotFound {
+                        return Ok(());
+                    }
+                }
+                Err(anyhow::Error::new(e))
+            }
+        }
+    }
+
+    async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
+        // Permit is already obtained by inner delete function
+
+        // TODO batch requests are also not supported by the SDK
+        // https://github.com/Azure/azure-sdk-for-rust/issues/1068
+        // https://github.com/Azure/azure-sdk-for-rust/issues/1249
+        for path in paths {
+            self.delete(path).await?;
+        }
+        Ok(())
+    }
+}
+
+pin_project_lite::pin_project! {
+    /// Hack to work around not being able to stream once with azure sdk.
+    ///
+    /// Azure sdk clones streams around with the assumption that they are like
+    /// `Arc<tokio::fs::File>` (except not supporting tokio), however our streams are not like
+    /// that. For example for an `index_part.json` we just have a single chunk of [`Bytes`]
+    /// representing the whole serialized vec. It could be trivially cloneable and "semi-trivially"
+    /// seekable, but it is easier to just retry the whole request instead.
+    #[project = NonSeekableStreamProj]
+    enum NonSeekableStream<S> {
+        /// A stream wrapper's initial form.
+        ///
+        /// The Mutex exists so that `clone(&self)` can move the reader out. If the sdk changes
+        /// to do less than 1 clone before first request, then this must be changed.
+        Initial {
+            inner: std::sync::Mutex<Option<tokio_util::compat::Compat<tokio_util::io::StreamReader<S, Bytes>>>>,
+            len: usize,
+        },
+        /// The actually readable variant, produced by the first clone of the Initial variant.
+        ///
+        /// The sdk currently always clones once, even without retry policy.
+        Actual {
+            #[pin]
+            inner: tokio_util::compat::Compat<tokio_util::io::StreamReader<S, Bytes>>,
+            len: usize,
+            read_any: bool,
+        },
+        /// Unreadable result of any further clone; most likely unneeded, but kept in case more clones are added.
+        Cloned {
+            len_was: usize,
+        }
+    }
+}
+
+impl<S> NonSeekableStream<S>
+where
+    S: Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
+{
+    fn new(inner: S, len: usize) -> NonSeekableStream<S> {
+        use tokio_util::compat::TokioAsyncReadCompatExt;
+
+        let inner = tokio_util::io::StreamReader::new(inner).compat();
+        let inner = Some(inner);
+        let inner = std::sync::Mutex::new(inner);
+        NonSeekableStream::Initial { inner, len }
+    }
+}
+
+impl<S> std::fmt::Debug for NonSeekableStream<S> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            Self::Initial { len, .. } => f.debug_struct("Initial").field("len", len).finish(),
+            Self::Actual { len, .. } => f.debug_struct("Actual").field("len", len).finish(),
+            Self::Cloned { len_was, .. } => f.debug_struct("Cloned").field("len", len_was).finish(),
+        }
+    }
+}
+
+impl<S> futures::io::AsyncRead for NonSeekableStream<S>
+where
+    S: Stream<Item = std::io::Result<Bytes>>,
+{
+    fn poll_read(
+        self: std::pin::Pin<&mut Self>,
+        cx: &mut std::task::Context<'_>,
+        buf: &mut [u8],
+    ) -> std::task::Poll<std::io::Result<usize>> {
+        match self.project() {
+            NonSeekableStreamProj::Actual {
+                inner, read_any, ..
+            } => {
+                *read_any = true;
+                inner.poll_read(cx, buf)
+            }
+            // NonSeekableStream::Initial does not support reading because it is just much easier
+            // to have the mutex in place where one does not poll the contents, or that's how it
+            // seemed originally. If there is a version upgrade which changes the cloning, then
+            // that support needs to be hacked in.
+            //
+            // including {self:?} into the message would be useful, but unsure how to unproject.
+            _ => std::task::Poll::Ready(Err(std::io::Error::new(
+                std::io::ErrorKind::Other,
+                "cloned or initial values cannot be read",
+            ))),
+        }
+    }
+}
+
+impl<S> Clone for NonSeekableStream<S> {
+    /// Weird clone implementation exists to support the sdk doing cloning before issuing the first
+    /// request, see type documentation.
+    fn clone(&self) -> Self {
+        use NonSeekableStream::*;
+
+        match self {
+            Initial { inner, len } => {
+                if let Some(inner) = inner.lock().unwrap().take() {
+                    Actual {
+                        inner,
+                        len: *len,
+                        read_any: false,
+                    }
+                } else {
+                    Self::Cloned { len_was: *len }
+                }
+            }
+            Actual { len, .. } => Cloned { len_was: *len },
+            Cloned { len_was } => Cloned { len_was: *len_was },
+        }
+    }
+}
+
+#[async_trait::async_trait]
+impl<S> azure_core::SeekableStream for NonSeekableStream<S>
+where
+    S: Stream<Item = std::io::Result<Bytes>> + Unpin + Send + Sync + 'static,
+{
+    async fn reset(&mut self) -> azure_core::error::Result<()> {
+        use NonSeekableStream::*;
+
+        let msg = match self {
+            Initial { inner, .. } => {
+                if inner.get_mut().unwrap().is_some() {
+                    return Ok(());
+                } else {
+                    "reset after first clone is not supported"
+                }
+            }
+            Actual { read_any, .. } if !*read_any => return Ok(()),
+            Actual { .. } => "reset after reading is not supported",
+            Cloned { .. } => "reset after second clone is not supported",
+        };
+        Err(azure_core::error::Error::new(
+            azure_core::error::ErrorKind::Io,
+            std::io::Error::new(std::io::ErrorKind::Other, msg),
+        ))
+    }
+
+    // Note: it is not documented if this should be the total or remaining length, total passes the
+    // tests.
+    fn len(&self) -> usize {
+        use NonSeekableStream::*;
+        match self {
+            Initial { len, .. } => *len,
+            Actual { len, .. } => *len,
+            Cloned { len_was, .. } => *len_was,
+        }
+    }
+}
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -4,55 +4,59 @@
 //! [`RemoteStorage`] trait a CRUD-like generic abstraction to use for adapting external storages with a few implementations:
 //!   * [`local_fs`] allows to use local file system as an external storage
 //!   * [`s3_bucket`] uses AWS S3 bucket as an external storage
+//!   * [`azure_blob`] allows to use Azure Blob storage as an external storage
 //!
+#![deny(unsafe_code)]
+#![deny(clippy::undocumented_unsafe_blocks)]
+
+mod azure_blob;
 mod local_fs;
 mod s3_bucket;
 mod simulate_failures;

-use std::{
-    collections::HashMap,
-    fmt::Debug,
-    num::{NonZeroU32, NonZeroUsize},
-    path::{Path, PathBuf, StripPrefixError},
-    pin::Pin,
-    sync::Arc,
-};
+use std::{collections::HashMap, fmt::Debug, num::NonZeroUsize, pin::Pin, sync::Arc};

 use anyhow::{bail, Context};
+use camino::{Utf8Path, Utf8PathBuf};

+use bytes::Bytes;
+use futures::stream::Stream;
 use serde::{Deserialize, Serialize};
-use tokio::io;
+use tokio::sync::Semaphore;
 use toml_edit::Item;
 use tracing::info;

-pub use self::{local_fs::LocalFs, s3_bucket::S3Bucket, simulate_failures::UnreliableWrapper};
+pub use self::{
+    azure_blob::AzureBlobStorage, local_fs::LocalFs, s3_bucket::S3Bucket,
+    simulate_failures::UnreliableWrapper,
+};
+use s3_bucket::RequestKind;

-/// How many different timelines can be processed simultaneously when synchronizing layers with the remote storage.
-/// During regular work, pageserver produces one layer file per timeline checkpoint, with bursts of concurrency
-/// during start (where local and remote timelines are compared and initial sync tasks are scheduled) and timeline attach.
-/// Both cases may trigger timeline download, that might download a lot of layers. This concurrency is limited by the clients internally, if needed.
-pub const DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS: usize = 50;
-pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10;
 /// Currently, sync happens with AWS S3, that has two limits on requests per second:
 /// ~200 RPS for IAM services
 /// <https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/UsingWithRDS.IAMDBAuth.html>
 /// ~3500 PUT/COPY/POST/DELETE or 5500 GET/HEAD S3 requests
 /// <https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/>
 pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100;
+/// We set this a little bit low as we currently buffer the entire file into RAM
+///
+/// Here, a limit of max 20k concurrent connections was noted.
+/// <https://learn.microsoft.com/en-us/answers/questions/1301863/is-there-any-limitation-to-concurrent-connections>
+pub const DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT: usize = 30;
 /// No limits on the client side, which currenltly means 1000 for AWS S3.
 /// <https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax>
 pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option<i32> = None;

-const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/';
-
-// From the S3 spec
+/// As defined in S3 docs
 pub const MAX_KEYS_PER_DELETE: usize = 1000;

+const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/';
+
 /// Path on the remote storage, relative to some inner prefix.
 /// The prefix is an implementation detail, that allows representing local paths
 /// as the remote ones, stripping the local storage prefix away.
 #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
-pub struct RemotePath(PathBuf);
+pub struct RemotePath(Utf8PathBuf);

 impl Serialize for RemotePath {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
@@ -69,18 +73,18 @@ impl<'de> Deserialize<'de> for RemotePath {
        D: serde::Deserializer<'de>,
    {
        let str = String::deserialize(deserializer)?;
-        Ok(Self(PathBuf::from(&str)))
+        Ok(Self(Utf8PathBuf::from(&str)))
    }
 }

 impl std::fmt::Display for RemotePath {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "{}", self.0.display())
+        std::fmt::Display::fmt(&self.0, f)
    }
 }

 impl RemotePath {
-    pub fn new(relative_path: &Path) -> anyhow::Result<Self> {
+    pub fn new(relative_path: &Utf8Path) -> anyhow::Result<Self> {
        anyhow::ensure!(
            relative_path.is_relative(),
            "Path {relative_path:?} is not relative"
@@ -89,34 +93,50 @@ impl RemotePath {
    }

    pub fn from_string(relative_path: &str) -> anyhow::Result<Self> {
-        Self::new(Path::new(relative_path))
+        Self::new(Utf8Path::new(relative_path))
    }

-    pub fn with_base(&self, base_path: &Path) -> PathBuf {
+    pub fn with_base(&self, base_path: &Utf8Path) -> Utf8PathBuf {
        base_path.join(&self.0)
    }

    pub fn object_name(&self) -> Option<&str> {
-        self.0.file_name().and_then(|os_str| os_str.to_str())
+        self.0.file_name()
    }

-    pub fn join(&self, segment: &Path) -> Self {
+    pub fn join(&self, segment: &Utf8Path) -> Self {
        Self(self.0.join(segment))
    }

-    pub fn get_path(&self) -> &PathBuf {
+    pub fn get_path(&self) -> &Utf8PathBuf {
        &self.0
    }

    pub fn extension(&self) -> Option<&str> {
-        self.0.extension()?.to_str()
+        self.0.extension()
    }

-    pub fn strip_prefix(&self, p: &RemotePath) -> Result<&Path, StripPrefixError> {
+    pub fn strip_prefix(&self, p: &RemotePath) -> Result<&Utf8Path, std::path::StripPrefixError> {
        self.0.strip_prefix(&p.0)
    }
 }

+/// We don't need callers to be able to pass arbitrary delimiters: just control
+/// whether listings will use a '/' separator or not.
+///
+/// The WithDelimiter mode will populate `prefixes` and `keys` in the result.  The
+/// NoDelimiter mode will only populate `keys`.
+pub enum ListingMode {
+    WithDelimiter,
+    NoDelimiter,
+}
+
+#[derive(Default)]
+pub struct Listing {
+    pub prefixes: Vec<RemotePath>,
+    pub keys: Vec<RemotePath>,
+}
+
 /// Storage (potentially remote) API to manage its state.
 /// This storage tries to be unaware of any layered repository context,
 /// providing basic CRUD operations for storage files.
@@ -129,8 +149,13 @@ pub trait RemoteStorage: Send + Sync + 'static {
    async fn list_prefixes(
        &self,
        prefix: Option<&RemotePath>,
-    ) -> Result<Vec<RemotePath>, DownloadError>;
-
+    ) -> Result<Vec<RemotePath>, DownloadError> {
+        let result = self
+            .list(prefix, ListingMode::WithDelimiter)
+            .await?
+            .prefixes;
+        Ok(result)
+    }
    /// Lists all files in directory "recursively"
    /// (not really recursively, because AWS has a flat namespace)
    /// Note: This is subtely different than list_prefixes,
@@ -142,12 +167,21 @@ pub trait RemoteStorage: Send + Sync + 'static {
    /// whereas,
    /// list_prefixes("foo/bar/") = ["cat", "dog"]
    /// See `test_real_s3.rs` for more details.
-    async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>>;
+    async fn list_files(&self, prefix: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
+        let result = self.list(prefix, ListingMode::NoDelimiter).await?.keys;
+        Ok(result)
+    }
+
+    async fn list(
+        &self,
+        prefix: Option<&RemotePath>,
+        _mode: ListingMode,
+    ) -> anyhow::Result<Listing, DownloadError>;

    /// Streams the local file contents into remote into the remote storage entry.
    async fn upload(
        &self,
-        from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
+        from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
        // S3 PUT request requires the content length to be specified,
        // otherwise it starts to fail with the concurrent connection count increasing.
        data_size_bytes: usize,
@@ -174,7 +208,7 @@ pub trait RemoteStorage: Send + Sync + 'static {
 }

 pub struct Download {
-    pub download_stream: Pin<Box<dyn io::AsyncRead + Unpin + Send + Sync>>,
+    pub download_stream: Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Unpin + Send + Sync>>,
    /// Extra key-value data, associated with the current remote file.
    pub metadata: Option<StorageMetadata>,
 }
@@ -193,8 +227,9 @@ pub enum DownloadError {
    BadInput(anyhow::Error),
    /// The file was not found in the remote storage.
    NotFound,
-    /// The client was shut down
-    Shutdown,
+    /// A cancellation token aborted the download, typically during
+    /// tenant detach or process shutdown.
+    Cancelled,
    /// The file was found in the remote storage, but the download failed.
    Other(anyhow::Error),
 }
@@ -205,8 +240,8 @@ impl std::fmt::Display for DownloadError {
            DownloadError::BadInput(e) => {
                write!(f, "Failed to download a remote file due to user input: {e}")
            }
+            DownloadError::Cancelled => write!(f, "Cancelled, shutting down"),
            DownloadError::NotFound => write!(f, "No file found for the remote object id given"),
-            DownloadError::Shutdown => write!(f, "Client shutting down"),
            DownloadError::Other(e) => write!(f, "Failed to download a remote file: {e:?}"),
        }
    }
@@ -220,10 +255,24 @@ impl std::error::Error for DownloadError {}
 pub enum GenericRemoteStorage {
    LocalFs(LocalFs),
    AwsS3(Arc<S3Bucket>),
+    AzureBlob(Arc<AzureBlobStorage>),
    Unreliable(Arc<UnreliableWrapper>),
 }

 impl GenericRemoteStorage {
+    pub async fn list(
+        &self,
+        prefix: Option<&RemotePath>,
+        mode: ListingMode,
+    ) -> anyhow::Result<Listing, DownloadError> {
+        match self {
+            Self::LocalFs(s) => s.list(prefix, mode).await,
+            Self::AwsS3(s) => s.list(prefix, mode).await,
+            Self::AzureBlob(s) => s.list(prefix, mode).await,
+            Self::Unreliable(s) => s.list(prefix, mode).await,
+        }
+    }
+
    // A function for listing all the files in a "directory"
    // Example:
    // list_files("foo/bar") = ["foo/bar/a.txt", "foo/bar/b.txt"]
@@ -231,6 +280,7 @@ impl GenericRemoteStorage {
        match self {
            Self::LocalFs(s) => s.list_files(folder).await,
            Self::AwsS3(s) => s.list_files(folder).await,
+            Self::AzureBlob(s) => s.list_files(folder).await,
            Self::Unreliable(s) => s.list_files(folder).await,
        }
    }
@@ -245,13 +295,14 @@ impl GenericRemoteStorage {
        match self {
            Self::LocalFs(s) => s.list_prefixes(prefix).await,
            Self::AwsS3(s) => s.list_prefixes(prefix).await,
+            Self::AzureBlob(s) => s.list_prefixes(prefix).await,
            Self::Unreliable(s) => s.list_prefixes(prefix).await,
        }
    }

    pub async fn upload(
        &self,
-        from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
+        from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
        data_size_bytes: usize,
        to: &RemotePath,
        metadata: Option<StorageMetadata>,
@@ -259,6 +310,7 @@ impl GenericRemoteStorage {
        match self {
            Self::LocalFs(s) => s.upload(from, data_size_bytes, to, metadata).await,
            Self::AwsS3(s) => s.upload(from, data_size_bytes, to, metadata).await,
+            Self::AzureBlob(s) => s.upload(from, data_size_bytes, to, metadata).await,
            Self::Unreliable(s) => s.upload(from, data_size_bytes, to, metadata).await,
        }
    }
@@ -267,22 +319,11 @@ impl GenericRemoteStorage {
        match self {
            Self::LocalFs(s) => s.download(from).await,
            Self::AwsS3(s) => s.download(from).await,
+            Self::AzureBlob(s) => s.download(from).await,
            Self::Unreliable(s) => s.download(from).await,
        }
    }

-    /// For small, simple downloads where caller doesn't want to handle the streaming: return the full body
-    pub async fn download_all(&self, from: &RemotePath) -> Result<Vec<u8>, DownloadError> {
-        let mut download = self.download(from).await?;
-
-        let mut bytes = Vec::new();
-        tokio::io::copy(&mut download.download_stream, &mut bytes)
-            .await
-            .with_context(|| format!("Failed to download body from {from}"))
-            .map_err(DownloadError::Other)?;
-        Ok(bytes)
-    }
-
    pub async fn download_byte_range(
        &self,
        from: &RemotePath,
@@ -298,6 +339,10 @@ impl GenericRemoteStorage {
                s.download_byte_range(from, start_inclusive, end_exclusive)
                    .await
            }
+            Self::AzureBlob(s) => {
+                s.download_byte_range(from, start_inclusive, end_exclusive)
+                    .await
+            }
            Self::Unreliable(s) => {
                s.download_byte_range(from, start_inclusive, end_exclusive)
                    .await
@@ -309,6 +354,7 @@ impl GenericRemoteStorage {
        match self {
            Self::LocalFs(s) => s.delete(path).await,
            Self::AwsS3(s) => s.delete(path).await,
+            Self::AzureBlob(s) => s.delete(path).await,
            Self::Unreliable(s) => s.delete(path).await,
        }
    }
@@ -317,6 +363,7 @@ impl GenericRemoteStorage {
        match self {
            Self::LocalFs(s) => s.delete_objects(paths).await,
            Self::AwsS3(s) => s.delete_objects(paths).await,
+            Self::AzureBlob(s) => s.delete_objects(paths).await,
            Self::Unreliable(s) => s.delete_objects(paths).await,
        }
    }
@@ -326,7 +373,7 @@ impl GenericRemoteStorage {
    pub fn from_config(storage_config: &RemoteStorageConfig) -> anyhow::Result<Self> {
        Ok(match &storage_config.storage {
            RemoteStorageKind::LocalFs(root) => {
-                info!("Using fs root '{}' as a remote storage", root.display());
+                info!("Using fs root '{root}' as a remote storage");
                Self::LocalFs(LocalFs::new(root.clone())?)
            }
            RemoteStorageKind::AwsS3(s3_config) => {
@@ -334,6 +381,11 @@ impl GenericRemoteStorage {
                      s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint);
                Self::AwsS3(Arc::new(S3Bucket::new(s3_config)?))
            }
+            RemoteStorageKind::AzureContainer(azure_config) => {
+                info!("Using azure container '{}' in region '{}' as a remote storage, prefix in container: '{:?}'",
+                      azure_config.container_name, azure_config.container_region, azure_config.prefix_in_container);
+                Self::AzureBlob(Arc::new(AzureBlobStorage::new(azure_config)?))
+            }
        })
    }

@@ -348,7 +400,7 @@ impl GenericRemoteStorage {
    /// this path is used for the remote object id conversion only.
    pub async fn upload_storage_object(
        &self,
-        from: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static,
+        from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
        from_size_bytes: usize,
        to: &RemotePath,
    ) -> anyhow::Result<()> {
@@ -381,10 +433,6 @@ pub struct StorageMetadata(HashMap<String, String>);
 /// External backup storage configuration, enough for creating a client for that storage.
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct RemoteStorageConfig {
-    /// Max allowed number of concurrent sync operations between the API user and the remote storage.
-    pub max_concurrent_syncs: NonZeroUsize,
-    /// Max allowed errors before the sync task is considered failed and evicted.
-    pub max_sync_errors: NonZeroU32,
    /// The storage connection configuration.
    pub storage: RemoteStorageKind,
 }
@@ -394,10 +442,13 @@ pub struct RemoteStorageConfig {
 pub enum RemoteStorageKind {
    /// Storage based on local file system.
    /// Specify a root folder to place all stored files into.
-    LocalFs(PathBuf),
+    LocalFs(Utf8PathBuf),
    /// AWS S3 based storage, storing all files in the S3 bucket
    /// specified by the config
    AwsS3(S3Config),
+    /// Azure Blob based storage, storing all files in the container
+    /// specified by the config
+    AzureContainer(AzureConfig),
 }

 /// AWS S3 bucket coordinates and access credentials to manage the bucket contents (read and write).
@@ -437,27 +488,53 @@ impl Debug for S3Config {
    }
 }

+/// Azure container coordinates and client-side limits used to manage the container contents (read and write).
+#[derive(Clone, PartialEq, Eq)]
+pub struct AzureConfig {
+    /// Name of the container to connect to.
+    pub container_name: String,
+    /// The region the container is located in.
+    pub container_region: String,
+    /// A "subfolder" in the container, to use the same container separately by multiple remote storage users at once.
+    pub prefix_in_container: Option<String>,
+    /// Azure has various limits on its API calls, we need not to exceed those.
+    /// See [`DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT`] for more details.
+    pub concurrency_limit: NonZeroUsize,
+    pub max_keys_per_list_response: Option<i32>,
+}
+
+impl Debug for AzureConfig {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("AzureConfig")
+            .field("container_name", &self.container_name)
+            .field("container_region", &self.container_region)
+            .field("prefix_in_container", &self.prefix_in_container)
+            .field("concurrency_limit", &self.concurrency_limit)
+            .field(
+                "max_keys_per_list_response",
+                &self.max_keys_per_list_response,
+            )
+            .finish()
+    }
+}
+
 impl RemoteStorageConfig {
    pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result<Option<RemoteStorageConfig>> {
        let local_path = toml.get("local_path");
        let bucket_name = toml.get("bucket_name");
        let bucket_region = toml.get("bucket_region");
+        let container_name = toml.get("container_name");
+        let container_region = toml.get("container_region");

-        let max_concurrent_syncs = NonZeroUsize::new(
-            parse_optional_integer("max_concurrent_syncs", toml)?
-                .unwrap_or(DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS),
-        )
-        .context("Failed to parse 'max_concurrent_syncs' as a positive integer")?;
-
-        let max_sync_errors = NonZeroU32::new(
-            parse_optional_integer("max_sync_errors", toml)?
-                .unwrap_or(DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS),
-        )
-        .context("Failed to parse 'max_sync_errors' as a positive integer")?;
+        let use_azure = container_name.is_some() && container_region.is_some();

+        let default_concurrency_limit = if use_azure {
+            DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT
+        } else {
+            DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT
+        };
        let concurrency_limit = NonZeroUsize::new(
-            parse_optional_integer("concurrency_limit", toml)?
-                .unwrap_or(DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT),
+            parse_optional_integer("concurrency_limit", toml)?.unwrap_or(default_concurrency_limit),
        )
        .context("Failed to parse 'concurrency_limit' as a positive integer")?;

@@ -466,40 +543,73 @@ impl RemoteStorageConfig {
                .context("Failed to parse 'max_keys_per_list_response' as a positive integer")?
                .or(DEFAULT_MAX_KEYS_PER_LIST_RESPONSE);

-        let storage = match (local_path, bucket_name, bucket_region) {
+        let endpoint = toml
+            .get("endpoint")
+            .map(|endpoint| parse_toml_string("endpoint", endpoint))
+            .transpose()?;
+
+        let storage = match (
+            local_path,
+            bucket_name,
+            bucket_region,
+            container_name,
+            container_region,
+        ) {
            // no 'local_path' nor 'bucket_name' options are provided, consider this remote storage disabled
-            (None, None, None) => return Ok(None),
-            (_, Some(_), None) => {
+            (None, None, None, None, None) => return Ok(None),
+            (_, Some(_), None, ..) => {
                bail!("'bucket_region' option is mandatory if 'bucket_name' is given ")
            }
-            (_, None, Some(_)) => {
+            (_, None, Some(_), ..) => {
                bail!("'bucket_name' option is mandatory if 'bucket_region' is given ")
            }
-            (None, Some(bucket_name), Some(bucket_region)) => RemoteStorageKind::AwsS3(S3Config {
-                bucket_name: parse_toml_string("bucket_name", bucket_name)?,
-                bucket_region: parse_toml_string("bucket_region", bucket_region)?,
-                prefix_in_bucket: toml
-                    .get("prefix_in_bucket")
-                    .map(|prefix_in_bucket| parse_toml_string("prefix_in_bucket", prefix_in_bucket))
-                    .transpose()?,
-                endpoint: toml
-                    .get("endpoint")
-                    .map(|endpoint| parse_toml_string("endpoint", endpoint))
-                    .transpose()?,
-                concurrency_limit,
-                max_keys_per_list_response,
-            }),
-            (Some(local_path), None, None) => RemoteStorageKind::LocalFs(PathBuf::from(
-                parse_toml_string("local_path", local_path)?,
-            )),
-            (Some(_), Some(_), _) => bail!("local_path and bucket_name are mutually exclusive"),
+            (None, Some(bucket_name), Some(bucket_region), ..) => {
+                RemoteStorageKind::AwsS3(S3Config {
+                    bucket_name: parse_toml_string("bucket_name", bucket_name)?,
+                    bucket_region: parse_toml_string("bucket_region", bucket_region)?,
+                    prefix_in_bucket: toml
+                        .get("prefix_in_bucket")
+                        .map(|prefix_in_bucket| {
+                            parse_toml_string("prefix_in_bucket", prefix_in_bucket)
+                        })
+                        .transpose()?,
+                    endpoint,
+                    concurrency_limit,
+                    max_keys_per_list_response,
+                })
+            }
+            (_, _, _, Some(_), None) => {
+                bail!("'container_region' option is mandatory if 'container_name' is given ")
+            }
+            (_, _, _, None, Some(_)) => {
+                bail!("'container_name' option is mandatory if 'container_region' is given ")
+            }
+            (None, None, None, Some(container_name), Some(container_region)) => {
+                RemoteStorageKind::AzureContainer(AzureConfig {
+                    container_name: parse_toml_string("container_name", container_name)?,
+                    container_region: parse_toml_string("container_region", container_region)?,
+                    prefix_in_container: toml
+                        .get("prefix_in_container")
+                        .map(|prefix_in_container| {
+                            parse_toml_string("prefix_in_container", prefix_in_container)
+                        })
+                        .transpose()?,
+                    concurrency_limit,
+                    max_keys_per_list_response,
+                })
+            }
+            (Some(local_path), None, None, None, None) => RemoteStorageKind::LocalFs(
+                Utf8PathBuf::from(parse_toml_string("local_path", local_path)?),
+            ),
+            (Some(_), Some(_), ..) => {
+                bail!("'local_path' and 'bucket_name' are mutually exclusive")
+            }
+            (Some(_), _, _, Some(_), Some(_)) => {
+                bail!("local_path and 'container_name' are mutually exclusive")
+            }
        };

-        Ok(Some(RemoteStorageConfig {
-            max_concurrent_syncs,
-            max_sync_errors,
-            storage,
-        }))
+        Ok(Some(RemoteStorageConfig { storage }))
    }
 }

@@ -528,29 +638,69 @@ fn parse_toml_string(name: &str, item: &Item) -> anyhow::Result<String> {
    Ok(s.to_string())
 }

+struct ConcurrencyLimiter {
+    // Every request to S3 can be throttled or cancelled, if a certain number of requests per second is exceeded.
+    // The same goes for IAM, which is queried before every S3 request, if enabled. IAM has an even lower RPS threshold.
+    // These semaphores help to ensure we don't exceed the thresholds: `write` for Put/Delete, `read` for Get/List.
+    write: Arc<Semaphore>,
+    read: Arc<Semaphore>,
+}
+
+impl ConcurrencyLimiter {
+    fn for_kind(&self, kind: RequestKind) -> &Arc<Semaphore> {
+        match kind {
+            RequestKind::Get => &self.read,
+            RequestKind::Put => &self.write,
+            RequestKind::List => &self.read,
+            RequestKind::Delete => &self.write,
+        }
+    }
+
+    async fn acquire(
+        &self,
+        kind: RequestKind,
+    ) -> Result<tokio::sync::SemaphorePermit<'_>, tokio::sync::AcquireError> {
+        self.for_kind(kind).acquire().await
+    }
+
+    async fn acquire_owned(
+        &self,
+        kind: RequestKind,
+    ) -> Result<tokio::sync::OwnedSemaphorePermit, tokio::sync::AcquireError> {
+        Arc::clone(self.for_kind(kind)).acquire_owned().await
+    }
+
+    fn new(limit: usize) -> ConcurrencyLimiter {
+        Self {
+            read: Arc::new(Semaphore::new(limit)),
+            write: Arc::new(Semaphore::new(limit)),
+        }
+    }
+}
+
 #[cfg(test)]
 mod tests {
    use super::*;

    #[test]
    fn test_object_name() {
-        let k = RemotePath::new(Path::new("a/b/c")).unwrap();
+        let k = RemotePath::new(Utf8Path::new("a/b/c")).unwrap();
        assert_eq!(k.object_name(), Some("c"));

-        let k = RemotePath::new(Path::new("a/b/c/")).unwrap();
+        let k = RemotePath::new(Utf8Path::new("a/b/c/")).unwrap();
        assert_eq!(k.object_name(), Some("c"));

-        let k = RemotePath::new(Path::new("a/")).unwrap();
+        let k = RemotePath::new(Utf8Path::new("a/")).unwrap();
        assert_eq!(k.object_name(), Some("a"));

        // XXX is it impossible to have an empty key?
-        let k = RemotePath::new(Path::new("")).unwrap();
+        let k = RemotePath::new(Utf8Path::new("")).unwrap();
        assert_eq!(k.object_name(), None);
    }

    #[test]
    fn rempte_path_cannot_be_created_from_absolute_ones() {
-        let err = RemotePath::new(Path::new("/")).expect_err("Should fail on absolute paths");
+        let err = RemotePath::new(Utf8Path::new("/")).expect_err("Should fail on absolute paths");
        assert_eq!(err.to_string(), "Path \"/\" is not relative");
    }
 }
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -4,23 +4,21 @@
 //! This storage used in tests, but can also be used in cases when a certain persistent
 //! volume is mounted to the local FS.

-use std::{
-    borrow::Cow,
-    future::Future,
-    io::ErrorKind,
-    path::{Path, PathBuf},
-    pin::Pin,
-};
+use std::{borrow::Cow, future::Future, io::ErrorKind, pin::Pin};

 use anyhow::{bail, ensure, Context};
+use bytes::Bytes;
+use camino::{Utf8Path, Utf8PathBuf};
+use futures::stream::Stream;
 use tokio::{
    fs,
    io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt},
 };
+use tokio_util::io::ReaderStream;
 use tracing::*;
 use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};

-use crate::{Download, DownloadError, RemotePath};
+use crate::{Download, DownloadError, Listing, ListingMode, RemotePath};

 use super::{RemoteStorage, StorageMetadata};

@@ -28,20 +26,20 @@ const LOCAL_FS_TEMP_FILE_SUFFIX: &str = "___temp";

 #[derive(Debug, Clone)]
 pub struct LocalFs {
-    storage_root: PathBuf,
+    storage_root: Utf8PathBuf,
 }

 impl LocalFs {
    /// Attempts to create local FS storage, along with its root directory.
    /// Storage root will be created (if does not exist) and transformed into an absolute path (if passed as relative).
-    pub fn new(mut storage_root: PathBuf) -> anyhow::Result<Self> {
+    pub fn new(mut storage_root: Utf8PathBuf) -> anyhow::Result<Self> {
        if !storage_root.exists() {
            std::fs::create_dir_all(&storage_root).with_context(|| {
                format!("Failed to create all directories in the given root path {storage_root:?}")
            })?;
        }
        if !storage_root.is_absolute() {
-            storage_root = storage_root.canonicalize().with_context(|| {
+            storage_root = storage_root.canonicalize_utf8().with_context(|| {
                format!("Failed to represent path {storage_root:?} as an absolute path")
            })?;
        }
@@ -50,7 +48,7 @@ impl LocalFs {
    }

    // mirrors S3Bucket::s3_object_to_relative_path
-    fn local_file_to_relative_path(&self, key: PathBuf) -> RemotePath {
+    fn local_file_to_relative_path(&self, key: Utf8PathBuf) -> RemotePath {
        let relative_path = key
            .strip_prefix(&self.storage_root)
            .expect("relative path must contain storage_root as prefix");
@@ -59,22 +57,18 @@ impl LocalFs {

    async fn read_storage_metadata(
        &self,
-        file_path: &Path,
+        file_path: &Utf8Path,
    ) -> anyhow::Result<Option<StorageMetadata>> {
        let metadata_path = storage_metadata_path(file_path);
        if metadata_path.exists() && metadata_path.is_file() {
            let metadata_string = fs::read_to_string(&metadata_path).await.with_context(|| {
-                format!(
-                    "Failed to read metadata from the local storage at '{}'",
-                    metadata_path.display()
-                )
+                format!("Failed to read metadata from the local storage at '{metadata_path}'")
            })?;

            serde_json::from_str(&metadata_string)
                .with_context(|| {
                    format!(
-                        "Failed to deserialize metadata from the local storage at '{}'",
-                        metadata_path.display()
+                        "Failed to deserialize metadata from the local storage at '{metadata_path}'",
                    )
                })
                .map(|metadata| Some(StorageMetadata(metadata)))
@@ -84,7 +78,7 @@ impl LocalFs {
    }

    #[cfg(test)]
-    async fn list(&self) -> anyhow::Result<Vec<RemotePath>> {
+    async fn list_all(&self) -> anyhow::Result<Vec<RemotePath>> {
        Ok(get_all_files(&self.storage_root, true)
            .await?
            .into_iter()
@@ -98,14 +92,97 @@ impl LocalFs {
            })
            .collect())
    }
+
+    /// Recursively lists every entry (files and directories alike) whose full path starts with
+    /// `folder` (or the storage root when `None`), mirroring the `list_files` for `s3_bucket`.
+    async fn list_recursive(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
+        let full_path = match folder {
+            Some(folder) => folder.with_base(&self.storage_root),
+            None => self.storage_root.clone(),
+        };
+
+        // If we were given a directory, we may use it as our starting point.
+        // Otherwise, we must go up to the first ancestor dir that exists.  This is because
+        // S3 object list prefixes can be arbitrary strings, but when reading
+        // the local filesystem we need a directory to start calling read_dir on.
+        let mut initial_dir = full_path.clone();
+        loop {
+            // Did we make it to the root?
+            if initial_dir.parent().is_none() {
+                anyhow::bail!("list_files: failed to find valid ancestor dir for {full_path}");
+            }
+
+            match fs::metadata(initial_dir.clone()).await {
+                Ok(meta) if meta.is_dir() => {
+                    // We found a directory, break
+                    break;
+                }
+                Ok(_meta) => {
+                    // It's not a directory: strip back to the parent
+                    initial_dir.pop();
+                }
+                Err(e) if e.kind() == ErrorKind::NotFound => {
+                    // It's not a file that exists: strip the prefix back to the parent directory
+                    initial_dir.pop();
+                }
+                Err(e) => {
+                    // Unexpected I/O error
+                    anyhow::bail!(e)
+                }
+            }
+        }
+        // Utf8PathBuf::starts_with only considers full path segments, but object prefixes are
+        // arbitrary strings, so the later starts_with check is done on the string form.
+        let prefix = full_path.as_str();
+
+        let mut files = vec![];
+        let mut directory_queue = vec![initial_dir];
+        while let Some(cur_folder) = directory_queue.pop() {
+            for entry in cur_folder.read_dir_utf8()? {
+                // Propagate per-entry errors: stopping silently would return a truncated listing.
+                let entry = entry?;
+                let file_name = entry.file_name();
+                let full_file_name = cur_folder.join(file_name);
+                if full_file_name.as_str().starts_with(prefix) {
+                    let file_remote_path = self.local_file_to_relative_path(full_file_name.clone());
+                    files.push(file_remote_path);
+                    if full_file_name.is_dir() {
+                        directory_queue.push(full_file_name);
+                    }
+                }
+            }
+        }
+
+        Ok(files)
+    }
 }

 #[async_trait::async_trait]
 impl RemoteStorage for LocalFs {
-    async fn list_prefixes(
+    async fn list(
        &self,
        prefix: Option<&RemotePath>,
-    ) -> Result<Vec<RemotePath>, DownloadError> {
+        mode: ListingMode,
+    ) -> Result<Listing, DownloadError> {
+        let mut result = Listing::default();
+
+        if let ListingMode::NoDelimiter = mode {
+            let keys = self
+                .list_recursive(prefix)
+                .await
+                .map_err(DownloadError::Other)?;
+
+            result.keys = keys
+                .into_iter()
+                .filter(|k| {
+                    let path = k.with_base(&self.storage_root);
+                    !path.is_dir()
+                })
+                .collect();
+
+            return Ok(result);
+        }
+
        let path = match prefix {
            Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)),
            None => Cow::Borrowed(&self.storage_root),
@@ -115,8 +192,6 @@ impl RemoteStorage for LocalFs {
            .await
            .map_err(DownloadError::Other)?;

-        let mut prefixes = Vec::with_capacity(prefixes_to_filter.len());
-
        // filter out empty directories to mirror s3 behavior.
        for prefix in prefixes_to_filter {
            if prefix.is_dir()
@@ -127,48 +202,27 @@ impl RemoteStorage for LocalFs {
                continue;
            }

-            prefixes.push(
-                prefix
-                    .strip_prefix(&self.storage_root)
-                    .context("Failed to strip prefix")
-                    .and_then(RemotePath::new)
-                    .expect(
-                        "We list files for storage root, hence should be able to remote the prefix",
-                    ),
-            )
-        }
+            let stripped = prefix
+                .strip_prefix(&self.storage_root)
+                .context("Failed to strip prefix")
+                .and_then(RemotePath::new)
+                .expect(
+                    "We list files for storage root, hence should be able to remote the prefix",
+                );

-        Ok(prefixes)
-    }
-
-    // recursively lists all files in a directory,
-    // mirroring the `list_files` for `s3_bucket`
-    async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
-        let full_path = match folder {
-            Some(folder) => folder.with_base(&self.storage_root),
-            None => self.storage_root.clone(),
-        };
-        let mut files = vec![];
-        let mut directory_queue = vec![full_path.clone()];
-
-        while let Some(cur_folder) = directory_queue.pop() {
-            let mut entries = fs::read_dir(cur_folder.clone()).await?;
-            while let Some(entry) = entries.next_entry().await? {
-                let file_name: PathBuf = entry.file_name().into();
-                let full_file_name = cur_folder.clone().join(&file_name);
-                let file_remote_path = self.local_file_to_relative_path(full_file_name.clone());
-                files.push(file_remote_path.clone());
-                if full_file_name.is_dir() {
-                    directory_queue.push(full_file_name);
-                }
+            if prefix.is_dir() {
+                result.prefixes.push(stripped);
+            } else {
+                result.keys.push(stripped);
            }
        }
-        Ok(files)
+
+        Ok(result)
    }

    async fn upload(
        &self,
-        data: impl io::AsyncRead + Unpin + Send + Sync + 'static,
+        data: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync,
        data_size_bytes: usize,
        to: &RemotePath,
        metadata: Option<StorageMetadata>,
@@ -196,22 +250,21 @@ impl RemoteStorage for LocalFs {
                .open(&temp_file_path)
                .await
                .with_context(|| {
-                    format!(
-                        "Failed to open target fs destination at '{}'",
-                        target_file_path.display()
-                    )
+                    format!("Failed to open target fs destination at '{target_file_path}'")
                })?,
        );

        let from_size_bytes = data_size_bytes as u64;
+        let data = tokio_util::io::StreamReader::new(data);
+        let data = std::pin::pin!(data);
        let mut buffer_to_read = data.take(from_size_bytes);

-        let bytes_read = io::copy(&mut buffer_to_read, &mut destination)
+        // alternatively we could just write the bytes to a file, but local_fs is a testing utility
+        let bytes_read = io::copy_buf(&mut buffer_to_read, &mut destination)
            .await
            .with_context(|| {
                format!(
-                    "Failed to upload file (write temp) to the local storage at '{}'",
-                    temp_file_path.display()
+                    "Failed to upload file (write temp) to the local storage at '{temp_file_path}'",
                )
            })?;

@@ -228,8 +281,7 @@ impl RemoteStorage for LocalFs {

        destination.flush().await.with_context(|| {
            format!(
-                "Failed to upload (flush temp) file to the local storage at '{}'",
-                temp_file_path.display()
+                "Failed to upload (flush temp) file to the local storage at '{temp_file_path}'",
            )
        })?;

@@ -237,8 +289,7 @@ impl RemoteStorage for LocalFs {
            .await
            .with_context(|| {
                format!(
-                    "Failed to upload (rename) file to the local storage at '{}'",
-                    target_file_path.display()
+                    "Failed to upload (rename) file to the local storage at '{target_file_path}'",
                )
            })?;

@@ -252,8 +303,7 @@ impl RemoteStorage for LocalFs {
            .await
            .with_context(|| {
                format!(
-                    "Failed to write metadata to the local storage at '{}'",
-                    storage_metadata_path.display()
+                    "Failed to write metadata to the local storage at '{storage_metadata_path}'",
                )
            })?;
        }
@@ -264,7 +314,7 @@ impl RemoteStorage for LocalFs {
    async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
        let target_path = from.with_base(&self.storage_root);
        if file_exists(&target_path).map_err(DownloadError::BadInput)? {
-            let source = io::BufReader::new(
+            let source = ReaderStream::new(
                fs::OpenOptions::new()
                    .read(true)
                    .open(&target_path)
@@ -304,16 +354,14 @@ impl RemoteStorage for LocalFs {
        }
        let target_path = from.with_base(&self.storage_root);
        if file_exists(&target_path).map_err(DownloadError::BadInput)? {
-            let mut source = io::BufReader::new(
-                fs::OpenOptions::new()
-                    .read(true)
-                    .open(&target_path)
-                    .await
-                    .with_context(|| {
-                        format!("Failed to open source file {target_path:?} to use in the download")
-                    })
-                    .map_err(DownloadError::Other)?,
-            );
+            let mut source = tokio::fs::OpenOptions::new()
+                .read(true)
+                .open(&target_path)
+                .await
+                .with_context(|| {
+                    format!("Failed to open source file {target_path:?} to use in the download")
+                })
+                .map_err(DownloadError::Other)?;
            source
                .seek(io::SeekFrom::Start(start_inclusive))
                .await
@@ -327,11 +375,13 @@ impl RemoteStorage for LocalFs {
            Ok(match end_exclusive {
                Some(end_exclusive) => Download {
                    metadata,
-                    download_stream: Box::pin(source.take(end_exclusive - start_inclusive)),
+                    download_stream: Box::pin(ReaderStream::new(
+                        source.take(end_exclusive - start_inclusive),
+                    )),
                },
                None => Download {
                    metadata,
-                    download_stream: Box::pin(source),
+                    download_stream: Box::pin(ReaderStream::new(source)),
                },
            })
        } else {
@@ -359,16 +409,16 @@ impl RemoteStorage for LocalFs {
    }
 }

-fn storage_metadata_path(original_path: &Path) -> PathBuf {
+fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf {
    path_with_suffix_extension(original_path, "metadata")
 }

 fn get_all_files<'a, P>(
    directory_path: P,
    recursive: bool,
-) -> Pin<Box<dyn Future<Output = anyhow::Result<Vec<PathBuf>>> + Send + Sync + 'a>>
+) -> Pin<Box<dyn Future<Output = anyhow::Result<Vec<Utf8PathBuf>>> + Send + Sync + 'a>>
 where
-    P: AsRef<Path> + Send + Sync + 'a,
+    P: AsRef<Utf8Path> + Send + Sync + 'a,
 {
    Box::pin(async move {
        let directory_path = directory_path.as_ref();
@@ -378,7 +428,13 @@ where
                let mut dir_contents = fs::read_dir(directory_path).await?;
                while let Some(dir_entry) = dir_contents.next_entry().await? {
                    let file_type = dir_entry.file_type().await?;
-                    let entry_path = dir_entry.path();
+                    let entry_path =
+                        Utf8PathBuf::from_path_buf(dir_entry.path()).map_err(|pb| {
+                            anyhow::Error::msg(format!(
+                                "non-Unicode path: {}",
+                                pb.to_string_lossy()
+                            ))
+                        })?;
                    if file_type.is_symlink() {
                        debug!("{entry_path:?} is a symlink, skipping")
                    } else if file_type.is_dir() {
@@ -401,13 +457,10 @@ where
    })
 }

-async fn create_target_directory(target_file_path: &Path) -> anyhow::Result<()> {
+async fn create_target_directory(target_file_path: &Utf8Path) -> anyhow::Result<()> {
    let target_dir = match target_file_path.parent() {
        Some(parent_dir) => parent_dir,
-        None => bail!(
-            "File path '{}' has no parent directory",
-            target_file_path.display()
-        ),
+        None => bail!("File path '{target_file_path}' has no parent directory"),
    };
    if !target_dir.exists() {
        fs::create_dir_all(target_dir).await?;
@@ -415,13 +468,9 @@ async fn create_target_directory(target_file_path: &Path) -> anyhow::Result<()>
    Ok(())
 }

-fn file_exists(file_path: &Path) -> anyhow::Result<bool> {
+fn file_exists(file_path: &Utf8Path) -> anyhow::Result<bool> {
    if file_path.exists() {
-        ensure!(
-            file_path.is_file(),
-            "file path '{}' is not a file",
-            file_path.display()
-        );
+        ensure!(file_path.is_file(), "file path '{file_path}' is not a file");
        Ok(true)
    } else {
        Ok(false)
@@ -432,17 +481,19 @@ fn file_exists(file_path: &Path) -> anyhow::Result<bool> {
 mod fs_tests {
    use super::*;

+    use bytes::Bytes;
+    use camino_tempfile::tempdir;
+    use futures_util::Stream;
    use std::{collections::HashMap, io::Write};
-    use tempfile::tempdir;

    async fn read_and_assert_remote_file_contents(
        storage: &LocalFs,
        #[allow(clippy::ptr_arg)]
-        // have to use &PathBuf due to `storage.local_path` parameter requirements
+        // have to use &Utf8PathBuf due to `storage.local_path` parameter requirements
        remote_storage_path: &RemotePath,
        expected_metadata: Option<&StorageMetadata>,
    ) -> anyhow::Result<String> {
-        let mut download = storage
+        let download = storage
            .download(remote_storage_path)
            .await
            .map_err(|e| anyhow::anyhow!("Download failed: {e}"))?;
@@ -451,13 +502,9 @@ mod fs_tests {
            "Unexpected metadata returned for the downloaded file"
        );

-        let mut contents = String::new();
-        download
-            .download_stream
-            .read_to_string(&mut contents)
-            .await
-            .context("Failed to read remote file contents into string")?;
-        Ok(contents)
+        let contents = aggregate(download.download_stream).await?;
+
+        String::from_utf8(contents).map_err(anyhow::Error::new)
    }

    #[tokio::test]
@@ -466,7 +513,7 @@ mod fs_tests {

        let target_path_1 = upload_dummy_file(&storage, "upload_1", None).await?;
        assert_eq!(
-            storage.list().await?,
+            storage.list_all().await?,
            vec![target_path_1.clone()],
            "Should list a single file after first upload"
        );
@@ -485,32 +532,34 @@ mod fs_tests {
    async fn upload_file_negatives() -> anyhow::Result<()> {
        let storage = create_storage()?;

-        let id = RemotePath::new(Path::new("dummy"))?;
-        let content = std::io::Cursor::new(b"12345");
+        let id = RemotePath::new(Utf8Path::new("dummy"))?;
+        let content = Bytes::from_static(b"12345");
+        let content = move || futures::stream::once(futures::future::ready(Ok(content.clone())));

        // Check that you get an error if the size parameter doesn't match the actual
        // size of the stream.
        storage
-            .upload(Box::new(content.clone()), 0, &id, None)
+            .upload(content(), 0, &id, None)
            .await
            .expect_err("upload with zero size succeeded");
        storage
-            .upload(Box::new(content.clone()), 4, &id, None)
+            .upload(content(), 4, &id, None)
            .await
            .expect_err("upload with too short size succeeded");
        storage
-            .upload(Box::new(content.clone()), 6, &id, None)
+            .upload(content(), 6, &id, None)
            .await
            .expect_err("upload with too large size succeeded");

        // Correct size is 5, this should succeed.
-        storage.upload(Box::new(content), 5, &id, None).await?;
+        storage.upload(content(), 5, &id, None).await?;

        Ok(())
    }

    fn create_storage() -> anyhow::Result<LocalFs> {
-        LocalFs::new(tempdir()?.path().to_owned())
+        let storage_root = tempdir()?.path().to_path_buf();
+        LocalFs::new(storage_root)
    }

    #[tokio::test]
@@ -527,7 +576,7 @@ mod fs_tests {
        );

        let non_existing_path = "somewhere/else";
-        match storage.download(&RemotePath::new(Path::new(non_existing_path))?).await {
+        match storage.download(&RemotePath::new(Utf8Path::new(non_existing_path))?).await {
            Err(DownloadError::NotFound) => {} // Should get NotFound for non existing keys
            other => panic!("Should get a NotFound error when downloading non-existing storage files, but got: {other:?}"),
        }
@@ -551,7 +600,7 @@ mod fs_tests {
        let uploaded_bytes = dummy_contents(upload_name).into_bytes();
        let (first_part_local, second_part_local) = uploaded_bytes.split_at(3);

-        let mut first_part_download = storage
+        let first_part_download = storage
            .download_byte_range(&upload_target, 0, Some(first_part_local.len() as u64))
            .await?;
        assert!(
@@ -559,21 +608,13 @@ mod fs_tests {
            "No metadata should be returned for no metadata upload"
        );

-        let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
-        io::copy(
-            &mut first_part_download.download_stream,
-            &mut first_part_remote,
-        )
-        .await?;
-        first_part_remote.flush().await?;
-        let first_part_remote = first_part_remote.into_inner().into_inner();
+        let first_part_remote = aggregate(first_part_download.download_stream).await?;
        assert_eq!(
-            first_part_local,
-            first_part_remote.as_slice(),
+            first_part_local, first_part_remote,
            "First part bytes should be returned when requested"
        );

-        let mut second_part_download = storage
+        let second_part_download = storage
            .download_byte_range(
                &upload_target,
                first_part_local.len() as u64,
@@ -585,17 +626,9 @@ mod fs_tests {
            "No metadata should be returned for no metadata upload"
        );

-        let mut second_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
-        io::copy(
-            &mut second_part_download.download_stream,
-            &mut second_part_remote,
-        )
-        .await?;
-        second_part_remote.flush().await?;
-        let second_part_remote = second_part_remote.into_inner().into_inner();
+        let second_part_remote = aggregate(second_part_download.download_stream).await?;
        assert_eq!(
-            second_part_local,
-            second_part_remote.as_slice(),
+            second_part_local, second_part_remote,
            "Second part bytes should be returned when requested"
        );

@@ -653,7 +686,7 @@ mod fs_tests {
        let upload_target = upload_dummy_file(&storage, upload_name, None).await?;

        storage.delete(&upload_target).await?;
-        assert!(storage.list().await?.is_empty());
+        assert!(storage.list_all().await?.is_empty());

        storage
            .delete(&upload_target)
@@ -685,17 +718,10 @@ mod fs_tests {
        let uploaded_bytes = dummy_contents(upload_name).into_bytes();
        let (first_part_local, _) = uploaded_bytes.split_at(3);

-        let mut partial_download_with_metadata = storage
+        let partial_download_with_metadata = storage
            .download_byte_range(&upload_target, 0, Some(first_part_local.len() as u64))
            .await?;
-        let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
-        io::copy(
-            &mut partial_download_with_metadata.download_stream,
-            &mut first_part_remote,
-        )
-        .await?;
-        first_part_remote.flush().await?;
-        let first_part_remote = first_part_remote.into_inner().into_inner();
+        let first_part_remote = aggregate(partial_download_with_metadata.download_stream).await?;
        assert_eq!(
            first_part_local,
            first_part_remote.as_slice(),
@@ -711,6 +737,43 @@ mod fs_tests {
        Ok(())
    }

+    #[tokio::test]
+    async fn list() -> anyhow::Result<()> {
+        // No delimiter: should recursively list everything
+        let storage = create_storage()?;
+        let child = upload_dummy_file(&storage, "grandparent/parent/child", None).await?;
+        let uncle = upload_dummy_file(&storage, "grandparent/uncle", None).await?;
+
+        let listing = storage.list(None, ListingMode::NoDelimiter).await?;
+        assert!(listing.prefixes.is_empty());
+        assert_eq!(listing.keys, [uncle.clone(), child.clone()].to_vec());
+
+        // Delimiter: should only go one deep
+        let listing = storage.list(None, ListingMode::WithDelimiter).await?;
+
+        assert_eq!(
+            listing.prefixes,
+            [RemotePath::from_string("timelines").unwrap()].to_vec()
+        );
+        assert!(listing.keys.is_empty());
+
+        // Delimiter & prefix
+        let listing = storage
+            .list(
+                Some(&RemotePath::from_string("timelines/some_timeline/grandparent").unwrap()),
+                ListingMode::WithDelimiter,
+            )
+            .await?;
+        assert_eq!(
+            listing.prefixes,
+            [RemotePath::from_string("timelines/some_timeline/grandparent/parent").unwrap()]
+                .to_vec()
+        );
+        assert_eq!(listing.keys, [uncle.clone()].to_vec());
+
+        Ok(())
+    }
+
    async fn upload_dummy_file(
        storage: &LocalFs,
        name: &str,
@@ -734,16 +797,16 @@ mod fs_tests {
                )
            })?;

-        storage
-            .upload(Box::new(file), size, &relative_path, metadata)
-            .await?;
+        let file = tokio_util::io::ReaderStream::new(file);
+
+        storage.upload(file, size, &relative_path, metadata).await?;
        Ok(relative_path)
    }

    async fn create_file_for_upload(
-        path: &Path,
+        path: &Utf8Path,
        contents: &str,
-    ) -> anyhow::Result<(io::BufReader<fs::File>, usize)> {
+    ) -> anyhow::Result<(fs::File, usize)> {
        std::fs::create_dir_all(path.parent().unwrap())?;
        let mut file_for_writing = std::fs::OpenOptions::new()
            .write(true)
@@ -753,7 +816,7 @@ mod fs_tests {
        drop(file_for_writing);
        let file_size = path.metadata()?.len() as usize;
        Ok((
-            io::BufReader::new(fs::OpenOptions::new().read(true).open(&path).await?),
+            fs::OpenOptions::new().read(true).open(&path).await?,
            file_size,
        ))
    }
@@ -763,8 +826,20 @@ mod fs_tests {
    }

    async fn list_files_sorted(storage: &LocalFs) -> anyhow::Result<Vec<RemotePath>> {
-        let mut files = storage.list().await?;
+        let mut files = storage.list_all().await?;
        files.sort_by(|a, b| a.0.cmp(&b.0));
        Ok(files)
    }
+
+    async fn aggregate(
+        stream: impl Stream<Item = std::io::Result<Bytes>>,
+    ) -> anyhow::Result<Vec<u8>> {
+        use futures::stream::StreamExt;
+        let mut out = Vec::new();
+        let mut stream = std::pin::pin!(stream);
+        while let Some(res) = stream.next().await {
+            out.extend_from_slice(&res?[..]);
+        }
+        Ok(out)
+    }
 }
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -4,43 +4,50 @@
 //! allowing multiple api users to independently work with the same S3 bucket, if
 //! their bucket prefixes are both specified and different.

-use std::sync::Arc;
+use std::{
+    borrow::Cow,
+    pin::Pin,
+    sync::Arc,
+    task::{Context, Poll},
+};

-use anyhow::Context;
+use anyhow::Context as _;
 use aws_config::{
    environment::credentials::EnvironmentVariableCredentialsProvider,
-    imds::credentials::ImdsCredentialsProvider, meta::credentials::CredentialsProviderChain,
-    provider_config::ProviderConfig, web_identity_token::WebIdentityTokenCredentialsProvider,
+    imds::credentials::ImdsCredentialsProvider,
+    meta::credentials::CredentialsProviderChain,
+    provider_config::ProviderConfig,
+    retry::{RetryConfigBuilder, RetryMode},
+    web_identity_token::WebIdentityTokenCredentialsProvider,
+    BehaviorVersion,
 };
-use aws_credential_types::cache::CredentialsCache;
+use aws_credential_types::provider::SharedCredentialsProvider;
 use aws_sdk_s3::{
-    config::{Config, Region},
+    config::{AsyncSleep, Builder, IdentityCache, Region, SharedAsyncSleep},
    error::SdkError,
    operation::get_object::GetObjectError,
-    primitives::ByteStream,
    types::{Delete, ObjectIdentifier},
    Client,
 };
-use aws_smithy_http::body::SdkBody;
-use hyper::{Body, StatusCode};
+use aws_smithy_async::rt::sleep::TokioSleep;
+
+use aws_smithy_types::body::SdkBody;
+use aws_smithy_types::byte_stream::ByteStream;
+use bytes::Bytes;
+use futures::stream::Stream;
+use hyper::Body;
 use scopeguard::ScopeGuard;
-use tokio::{
-    io::{self, AsyncRead},
-    sync::Semaphore,
-};
-use tokio_util::io::ReaderStream;
-use tracing::debug;

 use super::StorageMetadata;
 use crate::{
-    Download, DownloadError, RemotePath, RemoteStorage, S3Config, REMOTE_STORAGE_PREFIX_SEPARATOR,
+    ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage,
+    S3Config, MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR,
 };

-const MAX_DELETE_OBJECTS_REQUEST_SIZE: usize = 1000;
-
 pub(super) mod metrics;

-use self::metrics::{AttemptOutcome, RequestKind};
+use self::metrics::AttemptOutcome;
+pub(super) use self::metrics::RequestKind;

 /// AWS S3 storage.
 pub struct S3Bucket {
@@ -48,10 +55,7 @@ pub struct S3Bucket {
    bucket_name: String,
    prefix_in_bucket: Option<String>,
    max_keys_per_list_response: Option<i32>,
-    // Every request to S3 can be throttled or cancelled, if a certain number of requests per second is exceeded.
-    // Same goes to IAM, which is queried before every S3 request, if enabled. IAM has even lower RPS threshold.
-    // The helps to ensure we don't exceed the thresholds.
-    concurrency_limiter: Arc<Semaphore>,
+    concurrency_limiter: ConcurrencyLimiter,
 }

 #[derive(Default)]
@@ -63,7 +67,7 @@ struct GetObjectRequest {
 impl S3Bucket {
    /// Creates the S3 storage, errors if incorrect AWS S3 configuration provided.
    pub fn new(aws_config: &S3Config) -> anyhow::Result<Self> {
-        debug!(
+        tracing::debug!(
            "Creating s3 remote storage for S3 bucket {}",
            aws_config.bucket_name
        );
@@ -80,7 +84,6 @@ impl S3Bucket {
            // needed to access remote extensions bucket
            .or_else("token", {
                let provider_conf = ProviderConfig::without_region().with_region(region.clone());
-
                WebIdentityTokenCredentialsProvider::builder()
                    .configure(&provider_conf)
                    .build()
@@ -89,16 +92,31 @@ impl S3Bucket {
            .or_else("imds", ImdsCredentialsProvider::builder().build())
        };

-        let mut config_builder = Config::builder()
+        // AWS SDK requires us to specify how the RetryConfig should sleep when it wants to back off
+        let sleep_impl: Arc<dyn AsyncSleep> = Arc::new(TokioSleep::new());
+
+        // We do our own retries (see [`backoff::retry`]).  However, for the AWS SDK to enable rate limiting in response to throttling
+        // responses (e.g. 429 on too many ListObjectsv2 requests), we must provide a retry config.  We set it to use at most one
+        // attempt, and enable 'Adaptive' mode, which causes rate limiting to be enabled.
+        let mut retry_config = RetryConfigBuilder::new();
+        retry_config
+            .set_max_attempts(Some(1))
+            .set_mode(Some(RetryMode::Adaptive));
+
+        let mut config_builder = Builder::default()
+            .behavior_version(BehaviorVersion::v2023_11_09())
            .region(region)
-            .credentials_cache(CredentialsCache::lazy())
-            .credentials_provider(credentials_provider);
+            .identity_cache(IdentityCache::lazy().build())
+            .credentials_provider(SharedCredentialsProvider::new(credentials_provider))
+            .retry_config(retry_config.build())
+            .sleep_impl(SharedAsyncSleep::from(sleep_impl));

        if let Some(custom_endpoint) = aws_config.endpoint.clone() {
            config_builder = config_builder
                .endpoint_url(custom_endpoint)
                .force_path_style(true);
        }
+
        let client = Client::from_conf(config_builder.build());

        let prefix_in_bucket = aws_config.prefix_in_bucket.as_deref().map(|prefix| {
@@ -118,7 +136,7 @@ impl S3Bucket {
            bucket_name: aws_config.bucket_name.clone(),
            max_keys_per_list_response: aws_config.max_keys_per_list_response,
            prefix_in_bucket,
-            concurrency_limiter: Arc::new(Semaphore::new(aws_config.concurrency_limit.get())),
+            concurrency_limiter: ConcurrencyLimiter::new(aws_config.concurrency_limit.get()),
        })
    }

@@ -144,12 +162,11 @@ impl S3Bucket {
        assert_eq!(std::path::MAIN_SEPARATOR, REMOTE_STORAGE_PREFIX_SEPARATOR);
        let path_string = path
            .get_path()
-            .to_string_lossy()
-            .trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR)
-            .to_string();
+            .as_str()
+            .trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR);
        match &self.prefix_in_bucket {
-            Some(prefix) => prefix.clone() + "/" + &path_string,
-            None => path_string,
+            Some(prefix) => prefix.clone() + "/" + path_string,
+            None => path_string.to_string(),
        }
    }

@@ -157,7 +174,7 @@ impl S3Bucket {
        let started_at = start_counting_cancelled_wait(kind);
        let permit = self
            .concurrency_limiter
-            .acquire()
+            .acquire(kind)
            .await
            .expect("semaphore is never closed");

@@ -173,8 +190,7 @@ impl S3Bucket {
        let started_at = start_counting_cancelled_wait(kind);
        let permit = self
            .concurrency_limiter
-            .clone()
-            .acquire_owned()
+            .acquire_owned(kind)
            .await
            .expect("semaphore is never closed");

@@ -213,12 +229,15 @@ impl S3Bucket {
        match get_object {
            Ok(object_output) => {
                let metadata = object_output.metadata().cloned().map(StorageMetadata);
+
+                let body = object_output.body;
+                let body = ByteStreamAsStream::from(body);
+                let body = PermitCarrying::new(permit, body);
+                let body = TimedDownload::new(started_at, body);
+
                Ok(Download {
                    metadata,
-                    download_stream: Box::pin(io::BufReader::new(TimedDownload::new(
-                        started_at,
-                        RatelimitedAsyncRead::new(permit, object_output.body.into_async_read()),
-                    ))),
+                    download_stream: Box::pin(body),
                })
            }
            Err(SdkError::ServiceError(e)) if matches!(e.err(), GetObjectError::NoSuchKey(_)) => {
@@ -231,29 +250,55 @@ impl S3Bucket {
    }
 }

+pin_project_lite::pin_project! {
+    struct ByteStreamAsStream {
+        #[pin]
+        inner: aws_smithy_types::byte_stream::ByteStream
+    }
+}
+
+impl From<aws_smithy_types::byte_stream::ByteStream> for ByteStreamAsStream {
+    fn from(inner: aws_smithy_types::byte_stream::ByteStream) -> Self {
+        ByteStreamAsStream { inner }
+    }
+}
+
+impl Stream for ByteStreamAsStream {
+    type Item = std::io::Result<Bytes>;
+
+    fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
+        // this does the std::io::ErrorKind::Other conversion
+        self.project().inner.poll_next(cx).map_err(|x| x.into())
+    }
+
+    // cannot implement size_hint: inner.size_hint reports the remaining size in bytes, whereas
+    // Stream::size_hint is about the number of remaining items, so it cannot be forwarded as-is
+}
+
 pin_project_lite::pin_project! {
    /// An `AsyncRead` adapter which carries a permit for the lifetime of the value.
-    struct RatelimitedAsyncRead<S> {
+    struct PermitCarrying<S> {
        permit: tokio::sync::OwnedSemaphorePermit,
        #[pin]
        inner: S,
    }
 }

-impl<S: AsyncRead> RatelimitedAsyncRead<S> {
+impl<S> PermitCarrying<S> {
    fn new(permit: tokio::sync::OwnedSemaphorePermit, inner: S) -> Self {
-        RatelimitedAsyncRead { permit, inner }
+        Self { permit, inner }
    }
 }

-impl<S: AsyncRead> AsyncRead for RatelimitedAsyncRead<S> {
-    fn poll_read(
-        self: std::pin::Pin<&mut Self>,
-        cx: &mut std::task::Context<'_>,
-        buf: &mut io::ReadBuf<'_>,
-    ) -> std::task::Poll<std::io::Result<()>> {
-        let this = self.project();
-        this.inner.poll_read(cx, buf)
+impl<S: Stream<Item = std::io::Result<Bytes>>> Stream for PermitCarrying<S> {
+    type Item = <S as Stream>::Item;
+
+    fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
+        self.project().inner.poll_next(cx)
+    }
+
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        self.inner.size_hint()
    }
 }

@@ -273,7 +318,7 @@ pin_project_lite::pin_project! {
    }
 }

-impl<S: AsyncRead> TimedDownload<S> {
+impl<S> TimedDownload<S> {
    fn new(started_at: std::time::Instant, inner: S) -> Self {
        TimedDownload {
            started_at,
@@ -283,37 +328,38 @@ impl<S: AsyncRead> TimedDownload<S> {
    }
 }

-impl<S: AsyncRead> AsyncRead for TimedDownload<S> {
-    fn poll_read(
-        self: std::pin::Pin<&mut Self>,
-        cx: &mut std::task::Context<'_>,
-        buf: &mut io::ReadBuf<'_>,
-    ) -> std::task::Poll<std::io::Result<()>> {
+impl<S: Stream<Item = std::io::Result<Bytes>>> Stream for TimedDownload<S> {
+    type Item = <S as Stream>::Item;
+
+    fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
+        use std::task::ready;
+
        let this = self.project();
-        let before = buf.filled().len();
-        let read = std::task::ready!(this.inner.poll_read(cx, buf));

-        let read_eof = buf.filled().len() == before;
-
-        match read {
-            Ok(()) if read_eof => *this.outcome = AttemptOutcome::Ok,
-            Ok(()) => { /* still in progress */ }
-            Err(_) => *this.outcome = AttemptOutcome::Err,
+        let res = ready!(this.inner.poll_next(cx));
+        match &res {
+            Some(Ok(_)) => {}
+            Some(Err(_)) => *this.outcome = metrics::AttemptOutcome::Err,
+            None => *this.outcome = metrics::AttemptOutcome::Ok,
        }

-        std::task::Poll::Ready(read)
+        Poll::Ready(res)
+    }
+
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        self.inner.size_hint()
    }
 }

 #[async_trait::async_trait]
 impl RemoteStorage for S3Bucket {
-    /// See the doc for `RemoteStorage::list_prefixes`
-    /// Note: it wont include empty "directories"
-    async fn list_prefixes(
+    async fn list(
        &self,
        prefix: Option<&RemotePath>,
-    ) -> Result<Vec<RemotePath>, DownloadError> {
+        mode: ListingMode,
+    ) -> Result<Listing, DownloadError> {
        let kind = RequestKind::List;
+        let mut result = Listing::default();

        // get the passed prefix or if it is not set use prefix_in_bucket value
        let list_prefix = prefix
@@ -322,28 +368,33 @@ impl RemoteStorage for S3Bucket {
            .map(|mut p| {
                // required to end with a separator
                // otherwise request will return only the entry of a prefix
-                if !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
+                if matches!(mode, ListingMode::WithDelimiter)
+                    && !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR)
+                {
                    p.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
                }
                p
            });

-        let mut document_keys = Vec::new();
-
        let mut continuation_token = None;

        loop {
            let _guard = self.permit(kind).await;
            let started_at = start_measuring_requests(kind);

-            let fetch_response = self
+            let mut request = self
                .client
                .list_objects_v2()
                .bucket(self.bucket_name.clone())
                .set_prefix(list_prefix.clone())
                .set_continuation_token(continuation_token)
-                .delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string())
-                .set_max_keys(self.max_keys_per_list_response)
+                .set_max_keys(self.max_keys_per_list_response);
+
+            if let ListingMode::WithDelimiter = mode {
+                request = request.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string());
+            }
+
+            let response = request
                .send()
                .await
                .context("Failed to list S3 prefixes")
@@ -353,76 +404,40 @@ impl RemoteStorage for S3Bucket {

            metrics::BUCKET_METRICS
                .req_seconds
-                .observe_elapsed(kind, &fetch_response, started_at);
+                .observe_elapsed(kind, &response, started_at);

-            let fetch_response = fetch_response?;
+            let response = response?;

-            document_keys.extend(
-                fetch_response
-                    .common_prefixes
-                    .unwrap_or_default()
-                    .into_iter()
+            let keys = response.contents();
+            let empty = Vec::new();
+            let prefixes = response.common_prefixes.as_ref().unwrap_or(&empty);
+
+            tracing::debug!("list: {} prefixes, {} keys", prefixes.len(), keys.len());
+
+            for object in keys {
+                let object_path = object.key().expect("response does not contain a key");
+                let remote_path = self.s3_object_to_relative_path(object_path);
+                result.keys.push(remote_path);
+            }
+
+            result.prefixes.extend(
+                prefixes
+                    .iter()
                    .filter_map(|o| Some(self.s3_object_to_relative_path(o.prefix()?))),
            );

-            continuation_token = match fetch_response.next_continuation_token {
+            continuation_token = match response.next_continuation_token {
                Some(new_token) => Some(new_token),
                None => break,
            };
        }

-        Ok(document_keys)
-    }
-
-    /// See the doc for `RemoteStorage::list_files`
-    async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
-        let kind = RequestKind::List;
-
-        let folder_name = folder
-            .map(|p| self.relative_path_to_s3_object(p))
-            .or_else(|| self.prefix_in_bucket.clone());
-
-        // AWS may need to break the response into several parts
-        let mut continuation_token = None;
-        let mut all_files = vec![];
-        loop {
-            let _guard = self.permit(kind).await;
-            let started_at = start_measuring_requests(kind);
-
-            let response = self
-                .client
-                .list_objects_v2()
-                .bucket(self.bucket_name.clone())
-                .set_prefix(folder_name.clone())
-                .set_continuation_token(continuation_token)
-                .set_max_keys(self.max_keys_per_list_response)
-                .send()
-                .await
-                .context("Failed to list files in S3 bucket");
-
-            let started_at = ScopeGuard::into_inner(started_at);
-            metrics::BUCKET_METRICS
-                .req_seconds
-                .observe_elapsed(kind, &response, started_at);
-
-            let response = response?;
-
-            for object in response.contents().unwrap_or_default() {
-                let object_path = object.key().expect("response does not contain a key");
-                let remote_path = self.s3_object_to_relative_path(object_path);
-                all_files.push(remote_path);
-            }
-            match response.next_continuation_token {
-                Some(new_token) => continuation_token = Some(new_token),
-                None => break,
-            }
-        }
-        Ok(all_files)
+        Ok(result)
    }

    async fn upload(
        &self,
-        from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
+        from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
        from_size_bytes: usize,
        to: &RemotePath,
        metadata: Option<StorageMetadata>,
@@ -432,8 +447,8 @@ impl RemoteStorage for S3Bucket {

        let started_at = start_measuring_requests(kind);

-        let body = Body::wrap_stream(ReaderStream::new(from));
-        let bytes_stream = ByteStream::new(SdkBody::from(body));
+        let body = Body::wrap_stream(from);
+        let bytes_stream = ByteStream::new(SdkBody::from_body_0_4(body));

        let res = self
            .client
@@ -496,18 +511,22 @@ impl RemoteStorage for S3Bucket {
        for path in paths {
            let obj_id = ObjectIdentifier::builder()
                .set_key(Some(self.relative_path_to_s3_object(path)))
-                .build();
+                .build()?;
            delete_objects.push(obj_id);
        }

-        for chunk in delete_objects.chunks(MAX_DELETE_OBJECTS_REQUEST_SIZE) {
+        for chunk in delete_objects.chunks(MAX_KEYS_PER_DELETE) {
            let started_at = start_measuring_requests(kind);

            let resp = self
                .client
                .delete_objects()
                .bucket(self.bucket_name.clone())
-                .delete(Delete::builder().set_objects(Some(chunk.to_vec())).build())
+                .delete(
+                    Delete::builder()
+                        .set_objects(Some(chunk.to_vec()))
+                        .build()?,
+                )
                .send()
                .await;

@@ -522,6 +541,20 @@ impl RemoteStorage for S3Bucket {
                        .deleted_objects_total
                        .inc_by(chunk.len() as u64);
                    if let Some(errors) = resp.errors {
+                        // Log a bounded number of the errors within the response:
+                        // these requests can carry 1000 keys so logging each one
+                        // would be too verbose, especially as errors may lead us
+                        // to retry repeatedly.
+                        const LOG_UP_TO_N_ERRORS: usize = 10;
+                        for e in errors.iter().take(LOG_UP_TO_N_ERRORS) {
+                            tracing::warn!(
+                                "DeleteObjects key {} failed: {}: {}",
+                                e.key.as_ref().map(Cow::from).unwrap_or("".into()),
+                                e.code.as_ref().map(Cow::from).unwrap_or("".into()),
+                                e.message.as_ref().map(Cow::from).unwrap_or("".into())
+                            );
+                        }
+
                        return Err(anyhow::format_err!(
                            "Failed to delete {} objects",
                            errors.len()
@@ -529,16 +562,7 @@ impl RemoteStorage for S3Bucket {
                    }
                }
                Err(e) => {
-                    if let Some(r) = e.raw_response() {
-                        if r.http().status() == StatusCode::NOT_FOUND {
-                            // 404 is acceptable for deletions.  AWS S3 does not return this, but
-                            // some other implementations might (e.g. GCS XML API returns 404 on DeleteObject
-                            // to a missing key)
-                            continue;
-                        } else {
-                            return Err(anyhow::format_err!("DeleteObjects response error: {e}"));
-                        }
-                    }
+                    return Err(e.into());
                }
            }
        }
@@ -575,17 +599,17 @@ fn start_measuring_requests(

 #[cfg(test)]
 mod tests {
+    use camino::Utf8Path;
    use std::num::NonZeroUsize;
-    use std::path::Path;

    use crate::{RemotePath, S3Bucket, S3Config};

    #[test]
    fn relative_path() {
-        let all_paths = vec!["", "some/path", "some/path/"];
+        let all_paths = ["", "some/path", "some/path/"];
        let all_paths: Vec<RemotePath> = all_paths
            .iter()
-            .map(|x| RemotePath::new(Path::new(x)).expect("bad path"))
+            .map(|x| RemotePath::new(Utf8Path::new(x)).expect("bad path"))
            .collect();
        let prefixes = [
            None,
--- a/libs/remote_storage/src/s3_bucket/metrics.rs
+++ b/libs/remote_storage/src/s3_bucket/metrics.rs
@@ -6,7 +6,7 @@ use once_cell::sync::Lazy;
 pub(super) static BUCKET_METRICS: Lazy<BucketMetrics> = Lazy::new(Default::default);

 #[derive(Clone, Copy, Debug)]
-pub(super) enum RequestKind {
+pub(crate) enum RequestKind {
    Get = 0,
    Put = 1,
    Delete = 2,
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -1,11 +1,15 @@
 //! This module provides a wrapper around a real RemoteStorage implementation that
 //! causes the first N attempts at each upload or download operatio to fail. For
 //! testing purposes.
+use bytes::Bytes;
+use futures::stream::Stream;
 use std::collections::hash_map::Entry;
 use std::collections::HashMap;
 use std::sync::Mutex;

-use crate::{Download, DownloadError, RemotePath, RemoteStorage, StorageMetadata};
+use crate::{
+    Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, StorageMetadata,
+};

 pub struct UnreliableWrapper {
    inner: crate::GenericRemoteStorage,
@@ -95,9 +99,18 @@ impl RemoteStorage for UnreliableWrapper {
        self.inner.list_files(folder).await
    }

+    async fn list(
+        &self,
+        prefix: Option<&RemotePath>,
+        mode: ListingMode,
+    ) -> Result<Listing, DownloadError> {
+        self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))?;
+        self.inner.list(prefix, mode).await
+    }
+
    async fn upload(
        &self,
-        data: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static,
+        data: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
        // S3 PUT request requires the content length to be specified,
        // otherwise it starts to fail with the concurrent connection count increasing.
        data_size_bytes: usize,
--- a/libs/remote_storage/tests/test_real_azure.rs
+++ b/libs/remote_storage/tests/test_real_azure.rs
@@ -0,0 +1,643 @@
+use std::collections::HashSet;
+use std::env;
+use std::num::NonZeroUsize;
+use std::ops::ControlFlow;
+use std::path::PathBuf;
+use std::sync::Arc;
+use std::time::UNIX_EPOCH;
+
+use anyhow::Context;
+use bytes::Bytes;
+use camino::Utf8Path;
+use futures::stream::Stream;
+use once_cell::sync::OnceCell;
+use remote_storage::{
+    AzureConfig, Download, GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind,
+};
+use test_context::{test_context, AsyncTestContext};
+use tokio::task::JoinSet;
+use tracing::{debug, error, info};
+
+static LOGGING_DONE: OnceCell<()> = OnceCell::new();
+
+const ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_AZURE_REMOTE_STORAGE";
+
+const BASE_PREFIX: &str = "test";
+
+/// Tests that the Azure client can list all prefixes, even if the response comes paginated and requires multiple HTTP queries.
+/// Uses real Azure and requires [`ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME`] and related Azure cred env vars specified.
+/// See the client creation in [`create_azure_client`] for details on the required env vars.
+/// If real Azure tests are disabled, the test passes, skipping any real test run: currently, there's no way to mark the test ignored in runtime with the
+/// default test framework, see https://github.com/rust-lang/rust/issues/68007 for details.
+///
+/// First, the test creates a set of Azure blobs with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_azure_data`]
+/// where
+/// * `random_prefix_part` is set for the entire Azure client during the Azure client creation in [`create_azure_client`], to avoid multiple test runs interference
+/// * `base_prefix_str` is a common prefix to use in the client requests: we would want to ensure that the client is able to list nested prefixes inside the bucket
+///
+/// Then, verifies that the client does return correct prefixes when queried:
+/// * with no prefix, it lists everything after its `${random_prefix_part}/` — that should be `${base_prefix_str}` value only
+/// * with `${base_prefix_str}/` prefix, it lists every `sub_prefix_${i}`
+///
+/// With the real Azure enabled and `#[cfg(test)]` Rust configuration used, the Azure client test adds a `max-keys` param to limit the response keys.
+/// This way, we are able to test the pagination implicitly, by ensuring all results are returned from the remote storage and avoid uploading too many blobs to Azure.
+///
+/// Lastly, the test attempts to clean up and remove all uploaded Azure files.
+/// If any errors appear during the clean up, they get logged, but the test is not failed or stopped until clean up is finished.
+#[test_context(MaybeEnabledAzureWithTestBlobs)]
+#[tokio::test]
+async fn azure_pagination_should_work(
+    ctx: &mut MaybeEnabledAzureWithTestBlobs,
+) -> anyhow::Result<()> {
+    let ctx = match ctx {
+        MaybeEnabledAzureWithTestBlobs::Enabled(ctx) => ctx,
+        MaybeEnabledAzureWithTestBlobs::Disabled => return Ok(()),
+        MaybeEnabledAzureWithTestBlobs::UploadsFailed(e, _) => {
+            anyhow::bail!("Azure init failed: {e:?}")
+        }
+    };
+
+    let test_client = Arc::clone(&ctx.enabled.client);
+    let expected_remote_prefixes = ctx.remote_prefixes.clone();
+
+    let base_prefix = RemotePath::new(Utf8Path::new(ctx.enabled.base_prefix))
+        .context("common_prefix construction")?;
+    let root_remote_prefixes = test_client
+        .list_prefixes(None)
+        .await
+        .context("client list root prefixes failure")?
+        .into_iter()
+        .collect::<HashSet<_>>();
+    assert_eq!(
+        root_remote_prefixes, HashSet::from([base_prefix.clone()]),
+        "remote storage root prefixes list mismatches with the uploads. Returned prefixes: {root_remote_prefixes:?}"
+    );
+
+    let nested_remote_prefixes = test_client
+        .list_prefixes(Some(&base_prefix))
+        .await
+        .context("client list nested prefixes failure")?
+        .into_iter()
+        .collect::<HashSet<_>>();
+    let remote_only_prefixes = nested_remote_prefixes
+        .difference(&expected_remote_prefixes)
+        .collect::<HashSet<_>>();
+    let missing_uploaded_prefixes = expected_remote_prefixes
+        .difference(&nested_remote_prefixes)
+        .collect::<HashSet<_>>();
+    assert_eq!(
+        remote_only_prefixes.len() + missing_uploaded_prefixes.len(), 0,
+        "remote storage nested prefixes list mismatches with the uploads. Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}",
+    );
+
+    Ok(())
+}
+
+/// Tests that Azure client can list all files in a folder, even if the response comes paginated and requires multiple Azure queries.
+/// Uses real Azure and requires [`ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME`] and related Azure cred env vars specified. Test will skip real code and pass if env vars not set.
+/// See `azure_pagination_should_work` for more information.
+///
+/// First, create a set of Azure objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_azure_data`]
+/// Then performs the following queries:
+///    1. `list_files(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt`
+///    2. `list_files("folder1")`. This should return all files `random_prefix/folder1/blob_{i}.txt`
+#[test_context(MaybeEnabledAzureWithSimpleTestBlobs)]
+#[tokio::test]
+async fn azure_list_files_works(
+    ctx: &mut MaybeEnabledAzureWithSimpleTestBlobs,
+) -> anyhow::Result<()> {
+    let ctx = match ctx {
+        MaybeEnabledAzureWithSimpleTestBlobs::Enabled(ctx) => ctx,
+        MaybeEnabledAzureWithSimpleTestBlobs::Disabled => return Ok(()),
+        MaybeEnabledAzureWithSimpleTestBlobs::UploadsFailed(e, _) => {
+            anyhow::bail!("Azure init failed: {e:?}")
+        }
+    };
+    let test_client = Arc::clone(&ctx.enabled.client);
+    let base_prefix =
+        RemotePath::new(Utf8Path::new("folder1")).context("common_prefix construction")?;
+    let root_files = test_client
+        .list_files(None)
+        .await
+        .context("client list root files failure")?
+        .into_iter()
+        .collect::<HashSet<_>>();
+    assert_eq!(
+        root_files,
+        ctx.remote_blobs.clone(),
+        "remote storage list_files on root mismatches with the uploads."
+    );
+    let nested_remote_files = test_client
+        .list_files(Some(&base_prefix))
+        .await
+        .context("client list nested files failure")?
+        .into_iter()
+        .collect::<HashSet<_>>();
+    let trim_remote_blobs: HashSet<_> = ctx
+        .remote_blobs
+        .iter()
+        .map(|x| x.get_path())
+        .filter(|x| x.starts_with("folder1"))
+        .map(|x| RemotePath::new(x).expect("must be valid path"))
+        .collect();
+    assert_eq!(
+        nested_remote_files, trim_remote_blobs,
+        "remote storage list_files on subdirrectory mismatches with the uploads."
+    );
+    Ok(())
+}
+
+#[test_context(MaybeEnabledAzure)]
+#[tokio::test]
+async fn azure_delete_non_exising_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Result<()> {
+    let ctx = match ctx {
+        MaybeEnabledAzure::Enabled(ctx) => ctx,
+        MaybeEnabledAzure::Disabled => return Ok(()),
+    };
+
+    let path = RemotePath::new(Utf8Path::new(
+        format!("{}/for_sure_there_is_nothing_there_really", ctx.base_prefix).as_str(),
+    ))
+    .with_context(|| "RemotePath conversion")?;
+
+    ctx.client.delete(&path).await.expect("should succeed");
+
+    Ok(())
+}
+
+/// Uploads three blobs, batch-deletes two of them and checks that exactly one prefix is
+/// still listed, then removes the third blob.
+///
+/// Returns early without touching remote storage when the context is `Disabled`.
+#[test_context(MaybeEnabledAzure)]
+#[tokio::test]
+async fn azure_delete_objects_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Result<()> {
+    let MaybeEnabledAzure::Enabled(ctx) = ctx else {
+        return Ok(());
+    };
+
+    let path1 = RemotePath::new(Utf8Path::new(format!("{}/path1", ctx.base_prefix).as_str()))
+        .with_context(|| "RemotePath conversion")?;
+
+    let path2 = RemotePath::new(Utf8Path::new(format!("{}/path2", ctx.base_prefix).as_str()))
+        .with_context(|| "RemotePath conversion")?;
+
+    let path3 = RemotePath::new(Utf8Path::new(format!("{}/path3", ctx.base_prefix).as_str()))
+        .with_context(|| "RemotePath conversion")?;
+
+    let (data, len) = upload_stream("remote blob data1".as_bytes().into());
+    ctx.client.upload(data, len, &path1, None).await?;
+
+    let (data, len) = upload_stream("remote blob data2".as_bytes().into());
+    ctx.client.upload(data, len, &path2, None).await?;
+
+    let (data, len) = upload_stream("remote blob data3".as_bytes().into());
+    ctx.client.upload(data, len, &path3, None).await?;
+
+    ctx.client.delete_objects(&[path1, path2]).await?;
+
+    let prefixes = ctx.client.list_prefixes(None).await?;
+
+    // Remove the remaining blob before asserting: a failing assertion panics, and with the
+    // cleanup placed after it `path3` would be left behind in the container.
+    let cleanup_result = ctx.client.delete_objects(&[path3]).await;
+
+    assert_eq!(
+        prefixes.len(),
+        1,
+        "exactly one prefix should remain listed after deleting two of the three blobs"
+    );
+
+    // Only surface a cleanup error once the assertion above has had its say.
+    cleanup_result?;
+
+    Ok(())
+}
+
+/// Uploads one blob, reads it back through a plain download and several byte ranges,
+/// then deletes it.
+///
+/// The ranges cover: full with explicit end, partial with explicit end, end past the
+/// blob's real end, and open-ended from offsets 4 and 0. Returns early when the context
+/// is `Disabled`.
+#[test_context(MaybeEnabledAzure)]
+#[tokio::test]
+async fn azure_upload_download_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Result<()> {
+    /// Drains the download stream into memory and returns the received bytes.
+    async fn read_to_vec(download: Download) -> anyhow::Result<Vec<u8>> {
+        let mut reader = tokio_util::io::StreamReader::new(download.download_stream);
+        let mut received = Vec::new();
+        tokio::io::copy_buf(&mut reader, &mut received).await?;
+        Ok(received)
+    }
+
+    let ctx = match ctx {
+        MaybeEnabledAzure::Enabled(enabled) => enabled,
+        MaybeEnabledAzure::Disabled => return Ok(()),
+    };
+
+    let blob_key = format!("{}/file", ctx.base_prefix);
+    let path = RemotePath::new(Utf8Path::new(&blob_key))
+        .with_context(|| "RemotePath conversion")?;
+
+    let orig = bytes::Bytes::from_static(b"remote blob data here");
+    let (data, len) = wrap_stream(orig.clone());
+    ctx.client.upload(data, len, &path, None).await?;
+
+    // Normal download request
+    let whole = read_to_vec(ctx.client.download(&path).await?).await?;
+    assert_eq!(&whole, &orig);
+
+    // Full range (end specified)
+    let full_range = ctx
+        .client
+        .download_byte_range(&path, 0, Some(len as u64))
+        .await?;
+    let full_range = read_to_vec(full_range).await?;
+    assert_eq!(&full_range, &orig);
+
+    // Partial range (end specified); the expectation treats the end offset as exclusive
+    let middle = read_to_vec(ctx.client.download_byte_range(&path, 4, Some(10)).await?).await?;
+    assert_eq!(&middle, &orig[4..10]);
+
+    // Partial range (end beyond real end)
+    let past_end = ctx
+        .client
+        .download_byte_range(&path, 8, Some(len as u64 * 100))
+        .await?;
+    let past_end = read_to_vec(past_end).await?;
+    assert_eq!(&past_end, &orig[8..]);
+
+    // Partial range (end unspecified)
+    let open_tail = read_to_vec(ctx.client.download_byte_range(&path, 4, None).await?).await?;
+    assert_eq!(&open_tail, &orig[4..]);
+
+    // Full range (end unspecified)
+    let open_full = read_to_vec(ctx.client.download_byte_range(&path, 0, None).await?).await?;
+    assert_eq!(&open_full, &orig);
+
+    debug!("Cleanup: deleting file at path {path:?}");
+    let removal = ctx.client.delete(&path).await;
+    removal.with_context(|| format!("{path:?} removal"))?;
+
+    Ok(())
+}
+
+/// Initializes test logging through `LOGGING_DONE.get_or_init`, so the init closure is
+/// handed to the cell rather than run unconditionally by every test context.
+///
+/// Panics with "logging init failed" if `utils::logging::init` returns an error.
+fn ensure_logging_ready() {
+    use utils::logging::{self, LogFormat, Output, TracingErrorLayerEnablement};
+
+    LOGGING_DONE.get_or_init(|| {
+        logging::init(LogFormat::Test, TracingErrorLayerEnablement::Disabled, Output::Stdout)
+            .expect("logging init failed");
+    });
+}
+
+/// Context for tests that run against a real Azure container.
+struct EnabledAzure {
+    /// Storage client; it is `Arc`-cloned into the spawned upload and delete tasks.
+    client: Arc<GenericRemoteStorage>,
+    /// Path prefix the tests prepend to the keys they create.
+    base_prefix: &'static str,
+}
+
+impl EnabledAzure {
+    /// Builds the context around a client from `create_azure_client`, using `BASE_PREFIX`
+    /// as the base prefix.
+    ///
+    /// Panics with "Azure client creation failed" when the client cannot be created.
+    async fn setup(max_keys_in_list_response: Option<i32>) -> Self {
+        let creation = create_azure_client(max_keys_in_list_response);
+
+        Self {
+            client: creation
+                .context("Azure client creation")
+                .expect("Azure client creation failed"),
+            base_prefix: BASE_PREFIX,
+        }
+    }
+}
+
+/// Test context that is `Disabled` unless real Azure tests were requested via the environment.
+enum MaybeEnabledAzure {
+    /// Real Azure tests are on; carries the client and base prefix.
+    Enabled(EnabledAzure),
+    /// The enabling env variable is not set; tests using this context return immediately.
+    Disabled,
+}
+
+#[async_trait::async_trait]
+impl AsyncTestContext for MaybeEnabledAzure {
+    /// Yields `Enabled` when the variable named by
+    /// `ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME` can be read from the environment;
+    /// otherwise logs the reason and yields `Disabled`.
+    async fn setup() -> Self {
+        ensure_logging_ready();
+
+        match env::var(ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME) {
+            Ok(_) => Self::Enabled(EnabledAzure::setup(None).await),
+            Err(_) => {
+                info!(
+                    "`{}` env variable is not set, skipping the test",
+                    ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME
+                );
+                Self::Disabled
+            }
+        }
+    }
+}
+
+/// Test context for tests that need blobs pre-uploaded under per-blob sub-prefixes.
+enum MaybeEnabledAzureWithTestBlobs {
+    /// Every setup upload succeeded.
+    Enabled(AzureWithTestBlobs),
+    /// Real Azure tests were not requested; nothing was uploaded.
+    Disabled,
+    /// At least one setup upload failed; the context is kept so teardown can still clean up.
+    UploadsFailed(anyhow::Error, AzureWithTestBlobs),
+}
+
+/// An enabled Azure context plus the record of what setup managed to upload.
+struct AzureWithTestBlobs {
+    enabled: EnabledAzure,
+    /// Sub-prefixes under which blobs were uploaded.
+    remote_prefixes: HashSet<RemotePath>,
+    /// Full paths of the uploaded blobs; these are what teardown deletes.
+    remote_blobs: HashSet<RemotePath>,
+}
+
+#[async_trait::async_trait]
+impl AsyncTestContext for MaybeEnabledAzureWithTestBlobs {
+    /// Uploads the test blobs via `upload_azure_data` when real Azure tests are enabled.
+    ///
+    /// The upload count is `1 + 2 * max_keys_in_list_response`. When some uploads fail,
+    /// the partial results are kept in `UploadsFailed` so `teardown` can still delete them.
+    async fn setup() -> Self {
+        ensure_logging_ready();
+        if env::var(ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {
+            info!(
+                "`{}` env variable is not set, skipping the test",
+                ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME
+            );
+            return Self::Disabled;
+        }
+
+        let max_keys_in_list_response = 10;
+        let upload_tasks_count = 1 + (2 * usize::try_from(max_keys_in_list_response).unwrap());
+
+        let enabled = EnabledAzure::setup(Some(max_keys_in_list_response)).await;
+
+        let upload_outcome =
+            upload_azure_data(&enabled.client, enabled.base_prefix, upload_tasks_count).await;
+        let (uploads, all_uploaded) = match upload_outcome {
+            ControlFlow::Continue(uploads) => (uploads, true),
+            ControlFlow::Break(uploads) => (uploads, false),
+        };
+        let with_blobs = AzureWithTestBlobs {
+            enabled,
+            remote_prefixes: uploads.prefixes,
+            remote_blobs: uploads.blobs,
+        };
+
+        if all_uploaded {
+            info!("Remote objects created successfully");
+            Self::Enabled(with_blobs)
+        } else {
+            Self::UploadsFailed(
+                anyhow::anyhow!("One or multiple blobs failed to upload to Azure"),
+                with_blobs,
+            )
+        }
+    }
+
+    /// Deletes every blob recorded during `setup`; there is nothing to do for `Disabled`.
+    async fn teardown(self) {
+        let ctx = match self {
+            Self::Disabled => return,
+            Self::Enabled(ctx) | Self::UploadsFailed(_, ctx) => ctx,
+        };
+        cleanup(&ctx.enabled.client, ctx.remote_blobs).await;
+    }
+}
+
+// NOTE: the setups for the list_prefixes test and the list_files test are very similar.
+// However, they are not identical. The list_prefixes function is concerned with listing prefixes,
+// whereas the list_files function is concerned with listing files.
+// See `RemoteStorage::list_files` documentation for more details.
+/// Test context for tests that need blobs pre-uploaded in a flat `folder{n}/blob_{i}.txt` layout.
+enum MaybeEnabledAzureWithSimpleTestBlobs {
+    /// Every setup upload succeeded.
+    Enabled(AzureWithSimpleTestBlobs),
+    /// Real Azure tests were not requested; nothing was uploaded.
+    Disabled,
+    /// At least one setup upload failed; the context is kept so teardown can still clean up.
+    UploadsFailed(anyhow::Error, AzureWithSimpleTestBlobs),
+}
+/// An enabled Azure context plus the paths of the blobs setup managed to upload.
+struct AzureWithSimpleTestBlobs {
+    enabled: EnabledAzure,
+    /// Full paths of the uploaded blobs; these are what teardown deletes.
+    remote_blobs: HashSet<RemotePath>,
+}
+
+#[async_trait::async_trait]
+impl AsyncTestContext for MaybeEnabledAzureWithSimpleTestBlobs {
+    /// Uploads the test blobs via `upload_simple_azure_data` when real Azure tests are enabled.
+    ///
+    /// The upload count is `1 + 2 * max_keys_in_list_response`. When some uploads fail,
+    /// the partial results are kept in `UploadsFailed` so `teardown` can still delete them.
+    async fn setup() -> Self {
+        ensure_logging_ready();
+        if env::var(ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {
+            info!(
+                "`{}` env variable is not set, skipping the test",
+                ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME
+            );
+            return Self::Disabled;
+        }
+
+        let max_keys_in_list_response = 10;
+        let upload_tasks_count = 1 + (2 * usize::try_from(max_keys_in_list_response).unwrap());
+
+        let enabled = EnabledAzure::setup(Some(max_keys_in_list_response)).await;
+
+        let upload_outcome = upload_simple_azure_data(&enabled.client, upload_tasks_count).await;
+        let (remote_blobs, all_uploaded) = match upload_outcome {
+            ControlFlow::Continue(uploads) => (uploads, true),
+            ControlFlow::Break(uploads) => (uploads, false),
+        };
+        let with_blobs = AzureWithSimpleTestBlobs {
+            enabled,
+            remote_blobs,
+        };
+
+        if all_uploaded {
+            info!("Remote objects created successfully");
+            Self::Enabled(with_blobs)
+        } else {
+            Self::UploadsFailed(
+                anyhow::anyhow!("One or multiple blobs failed to upload to Azure"),
+                with_blobs,
+            )
+        }
+    }
+
+    /// Deletes every blob recorded during `setup`; there is nothing to do for `Disabled`.
+    async fn teardown(self) {
+        let ctx = match self {
+            Self::Disabled => return,
+            Self::Enabled(ctx) | Self::UploadsFailed(_, ctx) => ctx,
+        };
+        cleanup(&ctx.enabled.client, ctx.remote_blobs).await;
+    }
+}
+
+/// Creates a `GenericRemoteStorage` client for the Azure container named by the environment.
+///
+/// Reads `REMOTE_STORAGE_AZURE_CONTAINER` and `REMOTE_STORAGE_AZURE_REGION`; a missing
+/// variable is returned as an error. Each call computes its own `test_{millis}_{random}/`
+/// prefix inside the container.
+fn create_azure_client(
+    max_keys_per_list_response: Option<i32>,
+) -> anyhow::Result<Arc<GenericRemoteStorage>> {
+    use rand::Rng;
+
+    let container_name = env::var("REMOTE_STORAGE_AZURE_CONTAINER").context(
+        "`REMOTE_STORAGE_AZURE_CONTAINER` env var is not set, but real Azure tests are enabled",
+    )?;
+    let container_region = env::var("REMOTE_STORAGE_AZURE_REGION").context(
+        "`REMOTE_STORAGE_AZURE_REGION` env var is not set, but real Azure tests are enabled",
+    )?;
+
+    // Timestamps alone have collided between test runners (even at nanosecond resolution),
+    // so the millisecond part is only a debugging aid for finding the prefix later.
+    let since_epoch = std::time::SystemTime::now()
+        .duration_since(UNIX_EPOCH)
+        .context("random Azure test prefix part calculation")?;
+    let millis = since_epoch.as_millis();
+
+    // Two threads can observe the same timestamp, so a random part is added as well.
+    let random: u32 = rand::thread_rng().gen();
+
+    let prefix_in_container = format!("test_{millis}_{random:08x}/");
+    let storage = RemoteStorageKind::AzureContainer(AzureConfig {
+        container_name,
+        container_region,
+        prefix_in_container: Some(prefix_in_container),
+        concurrency_limit: NonZeroUsize::new(100).unwrap(),
+        max_keys_per_list_response,
+    });
+
+    let client = GenericRemoteStorage::from_config(&RemoteStorageConfig { storage })
+        .context("remote storage init")?;
+    Ok(Arc::new(client))
+}
+
+/// Paths collected by `upload_azure_data` for the uploads that succeeded.
+struct Uploads {
+    /// The `sub_prefix_{i}/` path of each uploaded blob.
+    prefixes: HashSet<RemotePath>,
+    /// The full path of each uploaded blob.
+    blobs: HashSet<RemotePath>,
+}
+
+/// Spawns `upload_tasks_count` concurrent uploads, task `i` writing `blob_{i}` under
+/// `{base_prefix_str}/sub_prefix_{i}/`.
+///
+/// Returns the prefixes and blob paths of the uploads that succeeded: as `Continue` when
+/// every task succeeded, as `Break` when at least one task failed or could not be joined.
+async fn upload_azure_data(
+    client: &Arc<GenericRemoteStorage>,
+    base_prefix_str: &'static str,
+    upload_tasks_count: usize,
+) -> ControlFlow<Uploads, Uploads> {
+    info!("Creating {upload_tasks_count} Azure files");
+    let mut upload_tasks = JoinSet::new();
+    for i in 1..=upload_tasks_count {
+        let task_client = Arc::clone(client);
+        upload_tasks.spawn(async move {
+            let prefix = format!("{base_prefix_str}/sub_prefix_{i}/");
+            let blob_prefix = RemotePath::new(Utf8Path::new(&prefix))
+                .with_context(|| format!("{prefix:?} to RemotePath conversion"))?;
+            let blob_name = format!("blob_{i}");
+            let blob_path = blob_prefix.join(Utf8Path::new(&blob_name));
+            debug!("Creating remote item {i} at path {blob_path:?}");
+
+            let payload = format!("remote blob data {i}").into_bytes();
+            let (data, len) = upload_stream(payload.into());
+            task_client.upload(data, len, &blob_path, None).await?;
+
+            Ok::<_, anyhow::Error>((blob_prefix, blob_path))
+        });
+    }
+
+    // Results are gathered as the tasks finish; a single failure only flips the flag so the
+    // remaining tasks are still awaited and their paths recorded.
+    let mut uploads = Uploads {
+        prefixes: HashSet::with_capacity(upload_tasks_count),
+        blobs: HashSet::with_capacity(upload_tasks_count),
+    };
+    let mut any_failed = false;
+    while let Some(joined) = upload_tasks.join_next().await {
+        let outcome = joined
+            .context("task join failed")
+            .and_then(|task_result| task_result.context("upload task failed"));
+        match outcome {
+            Ok((upload_prefix, upload_path)) => {
+                uploads.prefixes.insert(upload_prefix);
+                uploads.blobs.insert(upload_path);
+            }
+            Err(e) => {
+                error!("Upload task failed: {e:?}");
+                any_failed = true;
+            }
+        }
+    }
+
+    match any_failed {
+        true => ControlFlow::Break(uploads),
+        false => ControlFlow::Continue(uploads),
+    }
+}
+
+/// Deletes every path in `objects_to_delete` concurrently, one spawned task per object.
+///
+/// Best effort: failed deletions and join errors are logged with `error!` and not returned.
+async fn cleanup(client: &Arc<GenericRemoteStorage>, objects_to_delete: HashSet<RemotePath>) {
+    info!(
+        "Removing {} objects from the remote storage during cleanup",
+        objects_to_delete.len()
+    );
+    let mut delete_tasks = JoinSet::new();
+    for object_to_delete in objects_to_delete {
+        let task_client = Arc::clone(client);
+        delete_tasks.spawn(async move {
+            debug!("Deleting remote item at path {object_to_delete:?}");
+            let deletion = task_client.delete(&object_to_delete).await;
+            deletion.with_context(|| format!("{object_to_delete:?} removal"))
+        });
+    }
+
+    while let Some(joined) = delete_tasks.join_next().await {
+        match joined {
+            Ok(Ok(())) => {}
+            Ok(Err(e)) => error!("Delete task failed: {e:?}"),
+            Err(join_err) => error!("Delete task did not finish correctly: {join_err}"),
+        }
+    }
+}
+
+// Uploads files `folder{i / 7}/blob_{i}.txt` for `i` in `1..=upload_tasks_count`, one spawned
+// task per file. See test description for more details.
+//
+// Returns the paths of the uploads that succeeded: as `Continue` when every task succeeded,
+// as `Break` when at least one task failed or could not be joined.
+async fn upload_simple_azure_data(
+    client: &Arc<GenericRemoteStorage>,
+    upload_tasks_count: usize,
+) -> ControlFlow<HashSet<RemotePath>, HashSet<RemotePath>> {
+    info!("Creating {upload_tasks_count} Azure files");
+    let mut upload_tasks = JoinSet::new();
+    for i in 1..=upload_tasks_count {
+        let task_client = Arc::clone(client);
+        upload_tasks.spawn(async move {
+            let blob_path = PathBuf::from(format!("folder{}/blob_{}.txt", i / 7, i));
+            let utf8_path =
+                Utf8Path::from_path(blob_path.as_path()).expect("must be valid blob path");
+            // The closure below still sees the `PathBuf`; the shadowing binding only comes
+            // into scope once this statement is complete.
+            let blob_path = RemotePath::new(utf8_path)
+                .with_context(|| format!("{blob_path:?} to RemotePath conversion"))?;
+            debug!("Creating remote item {i} at path {blob_path:?}");
+
+            let payload = format!("remote blob data {i}").into_bytes();
+            let (data, len) = upload_stream(payload.into());
+            task_client.upload(data, len, &blob_path, None).await?;
+
+            Ok::<_, anyhow::Error>(blob_path)
+        });
+    }
+
+    let mut any_failed = false;
+    let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count);
+    while let Some(joined) = upload_tasks.join_next().await {
+        let outcome = joined
+            .context("task join failed")
+            .and_then(|task_result| task_result.context("upload task failed"));
+        match outcome {
+            Ok(upload_path) => {
+                uploaded_blobs.insert(upload_path);
+            }
+            Err(e) => {
+                error!("Upload task failed: {e:?}");
+                any_failed = true;
+            }
+        }
+    }
+
+    match any_failed {
+        true => ControlFlow::Break(uploaded_blobs),
+        false => ControlFlow::Continue(uploaded_blobs),
+    }
+}
+
+// FIXME: copy-pasted from test_real_s3; it is unclear how to share a module between test
+// files without it being compiled into a binary of its own.
+//
+// Converts `content` into `Bytes` (borrowed data via `Bytes::from_static`, owned data via
+// `Bytes::from`) and returns what `wrap_stream` produces for it.
+fn upload_stream(
+    content: std::borrow::Cow<'static, [u8]>,
+) -> (
+    impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
+    usize,
+) {
+    use std::borrow::Cow;
+
+    let payload = match content {
+        Cow::Borrowed(slice) => Bytes::from_static(slice),
+        Cow::Owned(buffer) => Bytes::from(buffer),
+    };
+
+    wrap_stream(payload)
+}
+
+/// Wraps `content` in a stream that yields it once as `Ok`, paired with its length in bytes.
+fn wrap_stream(
+    content: bytes::Bytes,
+) -> (
+    impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
+    usize,
+) {
+    // The length has to be read before `content` is moved into the stream.
+    let byte_count = content.len();
+    let single_chunk = futures::stream::once(futures::future::ready(Ok(content)));
+
+    (single_chunk, byte_count)
+}
--- a/libs/remote_storage/tests/test_real_s3.rs
+++ b/libs/remote_storage/tests/test_real_s3.rs
@@ -1,12 +1,15 @@
 use std::collections::HashSet;
 use std::env;
-use std::num::{NonZeroU32, NonZeroUsize};
+use std::num::NonZeroUsize;
 use std::ops::ControlFlow;
-use std::path::{Path, PathBuf};
+use std::path::PathBuf;
 use std::sync::Arc;
 use std::time::UNIX_EPOCH;

 use anyhow::Context;
+use bytes::Bytes;
+use camino::Utf8Path;
+use futures::stream::Stream;
 use once_cell::sync::OnceCell;
 use remote_storage::{
    GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config,
@@ -55,7 +58,7 @@ async fn s3_pagination_should_work(ctx: &mut MaybeEnabledS3WithTestBlobs) -> any
    let test_client = Arc::clone(&ctx.enabled.client);
    let expected_remote_prefixes = ctx.remote_prefixes.clone();

-    let base_prefix = RemotePath::new(Path::new(ctx.enabled.base_prefix))
+    let base_prefix = RemotePath::new(Utf8Path::new(ctx.enabled.base_prefix))
        .context("common_prefix construction")?;
    let root_remote_prefixes = test_client
        .list_prefixes(None)
@@ -108,7 +111,7 @@ async fn s3_list_files_works(ctx: &mut MaybeEnabledS3WithSimpleTestBlobs) -> any
    };
    let test_client = Arc::clone(&ctx.enabled.client);
    let base_prefix =
-        RemotePath::new(Path::new("folder1")).context("common_prefix construction")?;
+        RemotePath::new(Utf8Path::new("folder1")).context("common_prefix construction")?;
    let root_files = test_client
        .list_files(None)
        .await
@@ -129,9 +132,9 @@ async fn s3_list_files_works(ctx: &mut MaybeEnabledS3WithSimpleTestBlobs) -> any
    let trim_remote_blobs: HashSet<_> = ctx
        .remote_blobs
        .iter()
-        .map(|x| x.get_path().to_str().expect("must be valid name"))
+        .map(|x| x.get_path())
        .filter(|x| x.starts_with("folder1"))
-        .map(|x| RemotePath::new(Path::new(x)).expect("must be valid name"))
+        .map(|x| RemotePath::new(x).expect("must be valid path"))
        .collect();
    assert_eq!(
        nested_remote_files, trim_remote_blobs,
@@ -148,10 +151,9 @@ async fn s3_delete_non_exising_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result
        MaybeEnabledS3::Disabled => return Ok(()),
    };

-    let path = RemotePath::new(&PathBuf::from(format!(
-        "{}/for_sure_there_is_nothing_there_really",
-        ctx.base_prefix,
-    )))
+    let path = RemotePath::new(Utf8Path::new(
+        format!("{}/for_sure_there_is_nothing_there_really", ctx.base_prefix).as_str(),
+    ))
    .with_context(|| "RemotePath conversion")?;

    ctx.client.delete(&path).await.expect("should succeed");
@@ -167,32 +169,23 @@ async fn s3_delete_objects_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()>
        MaybeEnabledS3::Disabled => return Ok(()),
    };

-    let path1 = RemotePath::new(&PathBuf::from(format!("{}/path1", ctx.base_prefix,)))
+    let path1 = RemotePath::new(Utf8Path::new(format!("{}/path1", ctx.base_prefix).as_str()))
        .with_context(|| "RemotePath conversion")?;

-    let path2 = RemotePath::new(&PathBuf::from(format!("{}/path2", ctx.base_prefix,)))
+    let path2 = RemotePath::new(Utf8Path::new(format!("{}/path2", ctx.base_prefix).as_str()))
        .with_context(|| "RemotePath conversion")?;

-    let path3 = RemotePath::new(&PathBuf::from(format!("{}/path3", ctx.base_prefix,)))
+    let path3 = RemotePath::new(Utf8Path::new(format!("{}/path3", ctx.base_prefix).as_str()))
        .with_context(|| "RemotePath conversion")?;

-    let data1 = "remote blob data1".as_bytes();
-    let data1_len = data1.len();
-    let data2 = "remote blob data2".as_bytes();
-    let data2_len = data2.len();
-    let data3 = "remote blob data3".as_bytes();
-    let data3_len = data3.len();
-    ctx.client
-        .upload(std::io::Cursor::new(data1), data1_len, &path1, None)
-        .await?;
+    let (data, len) = upload_stream("remote blob data1".as_bytes().into());
+    ctx.client.upload(data, len, &path1, None).await?;

-    ctx.client
-        .upload(std::io::Cursor::new(data2), data2_len, &path2, None)
-        .await?;
+    let (data, len) = upload_stream("remote blob data2".as_bytes().into());
+    ctx.client.upload(data, len, &path2, None).await?;

-    ctx.client
-        .upload(std::io::Cursor::new(data3), data3_len, &path3, None)
-        .await?;
+    let (data, len) = upload_stream("remote blob data3".as_bytes().into());
+    ctx.client.upload(data, len, &path3, None).await?;

    ctx.client.delete_objects(&[path1, path2]).await?;

@@ -210,6 +203,7 @@ fn ensure_logging_ready() {
        utils::logging::init(
            utils::logging::LogFormat::Test,
            utils::logging::TracingErrorLayerEnablement::Disabled,
+            utils::logging::Output::Stdout,
        )
        .expect("logging init failed");
    });
@@ -378,21 +372,28 @@ impl AsyncTestContext for MaybeEnabledS3WithSimpleTestBlobs {
 fn create_s3_client(
    max_keys_per_list_response: Option<i32>,
 ) -> anyhow::Result<Arc<GenericRemoteStorage>> {
+    use rand::Rng;
+
    let remote_storage_s3_bucket = env::var("REMOTE_STORAGE_S3_BUCKET")
        .context("`REMOTE_STORAGE_S3_BUCKET` env var is not set, but real S3 tests are enabled")?;
    let remote_storage_s3_region = env::var("REMOTE_STORAGE_S3_REGION")
        .context("`REMOTE_STORAGE_S3_REGION` env var is not set, but real S3 tests are enabled")?;
-    let random_prefix_part = std::time::SystemTime::now()
+
+    // due to how time works, we've had test runners use the same nanos as bucket prefixes.
+    // millis is just a debugging aid for easier finding the prefix later.
+    let millis = std::time::SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .context("random s3 test prefix part calculation")?
-        .as_nanos();
+        .as_millis();
+
+    // because nanos can be the same for two threads so can millis, add randomness
+    let random = rand::thread_rng().gen::<u32>();
+
    let remote_storage_config = RemoteStorageConfig {
-        max_concurrent_syncs: NonZeroUsize::new(100).unwrap(),
-        max_sync_errors: NonZeroU32::new(5).unwrap(),
        storage: RemoteStorageKind::AwsS3(S3Config {
            bucket_name: remote_storage_s3_bucket,
            bucket_region: remote_storage_s3_region,
-            prefix_in_bucket: Some(format!("pagination_should_work_test_{random_prefix_part}/")),
+            prefix_in_bucket: Some(format!("test_{millis}_{random:08x}/")),
            endpoint: None,
            concurrency_limit: NonZeroUsize::new(100).unwrap(),
            max_keys_per_list_response,
@@ -418,17 +419,15 @@ async fn upload_s3_data(
    for i in 1..upload_tasks_count + 1 {
        let task_client = Arc::clone(client);
        upload_tasks.spawn(async move {
-            let prefix = PathBuf::from(format!("{base_prefix_str}/sub_prefix_{i}/"));
-            let blob_prefix = RemotePath::new(&prefix)
+            let prefix = format!("{base_prefix_str}/sub_prefix_{i}/");
+            let blob_prefix = RemotePath::new(Utf8Path::new(&prefix))
                .with_context(|| format!("{prefix:?} to RemotePath conversion"))?;
-            let blob_path = blob_prefix.join(Path::new(&format!("blob_{i}")));
+            let blob_path = blob_prefix.join(Utf8Path::new(&format!("blob_{i}")));
            debug!("Creating remote item {i} at path {blob_path:?}");

-            let data = format!("remote blob data {i}").into_bytes();
-            let data_len = data.len();
-            task_client
-                .upload(std::io::Cursor::new(data), data_len, &blob_path, None)
-                .await?;
+            let (data, data_len) =
+                upload_stream(format!("remote blob data {i}").into_bytes().into());
+            task_client.upload(data, data_len, &blob_path, None).await?;

            Ok::<_, anyhow::Error>((blob_prefix, blob_path))
        });
@@ -503,15 +502,15 @@ async fn upload_simple_s3_data(
        let task_client = Arc::clone(client);
        upload_tasks.spawn(async move {
            let blob_path = PathBuf::from(format!("folder{}/blob_{}.txt", i / 7, i));
-            let blob_path = RemotePath::new(&blob_path)
-                .with_context(|| format!("{blob_path:?} to RemotePath conversion"))?;
+            let blob_path = RemotePath::new(
+                Utf8Path::from_path(blob_path.as_path()).expect("must be valid blob path"),
+            )
+            .with_context(|| format!("{blob_path:?} to RemotePath conversion"))?;
            debug!("Creating remote item {i} at path {blob_path:?}");

-            let data = format!("remote blob data {i}").into_bytes();
-            let data_len = data.len();
-            task_client
-                .upload(std::io::Cursor::new(data), data_len, &blob_path, None)
-                .await?;
+            let (data, data_len) =
+                upload_stream(format!("remote blob data {i}").into_bytes().into());
+            task_client.upload(data, data_len, &blob_path, None).await?;

            Ok::<_, anyhow::Error>(blob_path)
        });
@@ -540,3 +539,30 @@ async fn upload_simple_s3_data(
        ControlFlow::Continue(uploaded_blobs)
    }
 }
+
+/// Converts `content` into `Bytes` and returns what `wrap_stream` produces for it.
+fn upload_stream(
+    content: std::borrow::Cow<'static, [u8]>,
+) -> (
+    impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
+    usize,
+) {
+    use std::borrow::Cow;
+    let payload = match content {
+        Cow::Borrowed(slice) => Bytes::from_static(slice),
+        Cow::Owned(buffer) => Bytes::from(buffer),
+    };
+    wrap_stream(payload)
+}
+
+/// Wraps `content` in a stream that yields it once as `Ok`, paired with its length in bytes.
+fn wrap_stream(
+    content: bytes::Bytes,
+) -> (
+    impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
+    usize,
+) {
+    let byte_count = content.len();
+    let single_chunk = futures::stream::once(futures::future::ready(Ok(content)));
+    (single_chunk, byte_count)
+}
--- a/Show More
+++ b/Show More