Add prefetch_enabled GUC to trigger Neon prefetch mechansim

Bump posgres version
2026-01-31 17:20:37 +00:00 · 2022-09-14 17:12:01 +03:00 · 2022-09-14 12:59:37 +03:00 · 2022-09-13 10:49:07 +03:00 · 2022-09-12 17:36:30 +03:00 · 2022-09-12 17:26:52 +03:00
288 changed files with 22896 additions and 11705 deletions
--- a/.dockerignore
+++ b/.dockerignore
@@ -1,18 +1,20 @@
-**/.git/
-**/__pycache__
-**/.pytest_cache
+*

-.git
-target
-tmp_check
-tmp_install
-tmp_check_cli
-test_output
-.vscode
-.neon
-integration_tests/.neon
-.mypy_cache
-
-Dockerfile
-.dockerignore
+!rust-toolchain.toml
+!Cargo.toml
+!Cargo.lock
+!Makefile

+!.cargo/
+!.config/
+!control_plane/
+!compute_tools/
+!libs/
+!pageserver/
+!pgxn/
+!proxy/
+!safekeeper/
+!vendor/postgres-v14/
+!vendor/postgres-v15/
+!workspace_hack/
+!neon_local/
--- a/.git-blame-ignore-revs
+++ b/.git-blame-ignore-revs
@@ -0,0 +1 @@
+4c2bb43775947775401cbb9d774823c5723a91f8
--- a/.github/ISSUE_TEMPLATE/bug-template.md
+++ b/.github/ISSUE_TEMPLATE/bug-template.md
@@ -0,0 +1,23 @@
+---
+name: Bug Template
+about: Used for describing bugs
+title: ''
+labels: t/bug
+assignees: ''
+
+---
+
+## Steps to reproduce
+
+
+## Expected result
+
+
+## Actual result
+
+
+## Environment
+
+
+## Logs, links
+- 
--- a/.github/ISSUE_TEMPLATE/epic-template.md
+++ b/.github/ISSUE_TEMPLATE/epic-template.md
@@ -0,0 +1,25 @@
+---
+name: Epic Template
+about: A set of related tasks contributing towards specific outcome, comprising of
+  more than 1 week of work.
+title: 'Epic: '
+labels: t/Epic
+assignees: ''
+
+---
+
+## Motivation
+
+
+## DoD
+
+
+## Implementation ideas
+
+
+## Tasks
+- [ ]
+
+
+## Other related tasks and Epics
+- 
--- a/.github/PULL_REQUEST_TEMPLATE/release-pr.md
+++ b/.github/PULL_REQUEST_TEMPLATE/release-pr.md
@@ -0,0 +1,20 @@
+## Release 202Y-MM-DD
+
+**NB: this PR must be merged only by 'Create a merge commit'!**
+
+### Checklist when preparing for release
+- [ ] Read or refresh [the release flow guide](https://github.com/neondatabase/cloud/wiki/Release:-general-flow)
+- [ ] Ask in the [cloud Slack channel](https://neondb.slack.com/archives/C033A2WE6BZ) that you are going to rollout the release. Any blockers?
+- [ ] Does this release contain any db migrations? Destructive ones? What is the rollback plan?
+
+<!-- List everything that should be done **before** release, any issues / setting changes / etc -->
+
+### Checklist after release
+- [ ] Based on the merged commits write release notes and open a PR into `website` repo ([example](https://github.com/neondatabase/website/pull/120/files))
+- [ ] Check [#dev-production-stream](https://neondb.slack.com/archives/C03F5SM1N02) Slack channel
+- [ ] Check [stuck projects page](https://console.neon.tech/admin/projects?sort=last_active&order=desc&stuck=true)
+- [ ] Check [recent operation failures](https://console.neon.tech/admin/operations?action=create_timeline%2Cstart_compute%2Cstop_compute%2Csuspend_compute%2Capply_config%2Cdelete_timeline%2Cdelete_tenant%2Ccreate_branch%2Ccheck_availability&sort=updated_at&order=desc&had_retries=some)
+- [ ] Check [cloud SLO dashboard](https://observer.zenith.tech/d/_oWcBMJ7k/cloud-slos?orgId=1)
+- [ ] Check [compute startup metrics dashboard](https://observer.zenith.tech/d/5OkYJEmVz/compute-startup-time)
+
+<!-- List everything that should be done **after** release, any admin UI configuration / Grafana dashboard / alert changes / setting changes / etc -->
--- a/.github/actions/allure-report/action.yml
+++ b/.github/actions/allure-report/action.yml
@@ -0,0 +1,221 @@
+name: 'Create Allure report'
+description: 'Create and publish Allure report'
+
+inputs:
+  action:
+    desctiption: 'generate or store'
+    required: true
+  build_type:
+    description: '`build_type` from run-python-test-set action'
+    required: true
+  test_selection:
+    description: '`test_selector` from run-python-test-set action'
+    required: false
+outputs:
+  report-url:
+    description: 'Allure report URL'
+    value: ${{ steps.generate-report.outputs.report-url }}
+
+runs:
+  using: "composite"
+  steps:
+    - name: Validate input parameters
+      shell: bash -euxo pipefail {0}
+      run: |
+        if [ "${{ inputs.action }}" != "store" ] && [ "${{ inputs.action }}" != "generate" ]; then
+          echo 2>&1 "Unknown inputs.action type '${{ inputs.action }}'; allowed 'generate' or 'store' only"
+          exit 1
+        fi
+
+        if [ -z "${{ inputs.test_selection }}" ] && [ "${{ inputs.action }}" == "store" ]; then
+          echo 2>&1 "inputs.test_selection must be set for 'store' action"
+          exit 2
+        fi
+
+    - name: Calculate key
+      id: calculate-key
+      shell: bash -euxo pipefail {0}
+      run: |
+        # TODO: for manually triggered workflows (via workflow_dispatch) we need to have a separate key
+
+        pr_number=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true)
+        if [ "${pr_number}" != "null" ]; then
+          key=pr-${pr_number}
+        elif [ "${GITHUB_REF}" = "refs/heads/main" ]; then
+          # Shortcut for a special branch
+          key=main
+        else
+          key=branch-$(echo ${GITHUB_REF#refs/heads/} | tr -c "[:alnum:]._-" "-")
+        fi
+        echo "::set-output name=KEY::${key}"
+
+    - uses: actions/setup-java@v3
+      if: ${{ inputs.action == 'generate' }}
+      with:
+        distribution: 'temurin'
+        java-version: '17'
+
+    - name: Install Allure
+      if: ${{ inputs.action == 'generate' }}
+      shell: bash -euxo pipefail {0}
+      run: |
+        if ! which allure; then
+          ALLURE_ZIP=allure-${ALLURE_VERSION}.zip
+          wget -q https://github.com/allure-framework/allure2/releases/download/${ALLURE_VERSION}/${ALLURE_ZIP}
+          echo "${ALLURE_ZIP_MD5}  ${ALLURE_ZIP}" | md5sum -c
+          unzip -q ${ALLURE_ZIP}
+          echo "$(pwd)/allure-${ALLURE_VERSION}/bin" >> $GITHUB_PATH
+          rm -f ${ALLURE_ZIP}
+        fi
+      env:
+        ALLURE_VERSION: 2.19.0
+        ALLURE_ZIP_MD5: ced21401a1a8b9dfb68cee9e4c210464
+
+    - name: Upload Allure results
+      if: ${{ inputs.action == 'store' }}
+      env:
+        REPORT_PREFIX: reports/${{ steps.calculate-key.outputs.KEY }}/${{ inputs.build_type }}
+        RAW_PREFIX: reports-raw/${{ steps.calculate-key.outputs.KEY }}/${{ inputs.build_type }}
+        TEST_OUTPUT: /tmp/test_output
+        BUCKET: neon-github-public-dev
+      shell: bash -euxo pipefail {0}
+      run: |
+        # Add metadata
+        cat <<EOF > $TEST_OUTPUT/allure/results/executor.json
+          {
+            "name": "GitHub Actions",
+            "type": "github",
+            "url": "https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/latest/index.html",
+            "buildOrder": ${GITHUB_RUN_ID},
+            "buildName": "GitHub Actions Run #${{ github.run_number }}/${GITHUB_RUN_ATTEMPT}",
+            "buildUrl": "${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}/attempts/${GITHUB_RUN_ATTEMPT}",
+            "reportUrl": "https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/${GITHUB_RUN_ID}/index.html",
+            "reportName": "Allure Report"
+          }
+        EOF
+        cat <<EOF > $TEST_OUTPUT/allure/results/environment.properties
+          TEST_SELECTION=${{ inputs.test_selection }}
+          BUILD_TYPE=${{ inputs.build_type }}
+        EOF
+
+        ARCHIVE="${GITHUB_RUN_ID}-${{ inputs.test_selection }}-${GITHUB_RUN_ATTEMPT}-$(date +%s).tar.zst"
+        ZSTD_NBTHREADS=0
+
+        tar -C ${TEST_OUTPUT}/allure/results -cf ${ARCHIVE} --zstd .
+        aws s3 mv --only-show-errors ${ARCHIVE} "s3://${BUCKET}/${RAW_PREFIX}/${ARCHIVE}"
+
+    # Potentially we could have several running build for the same key (for example for the main branch),  so we use improvised lock for this
+    - name: Acquire Allure lock
+      if: ${{ inputs.action == 'generate' }}
+      shell: bash -euxo pipefail {0}
+      env:
+        LOCK_FILE: reports/${{ steps.calculate-key.outputs.KEY }}/lock.txt
+        BUCKET: neon-github-public-dev
+      run: |
+        LOCK_TIMEOUT=300 # seconds
+
+        for _ in $(seq 1 5); do
+          for i in $(seq 1 ${LOCK_TIMEOUT}); do
+            LOCK_ADDED=$(aws s3api head-object --bucket neon-github-public-dev --key ${LOCK_FILE} | jq --raw-output '.LastModified' || true)
+            # `date --date="..."` is supported only by gnu date (i.e. it doesn't work on BSD/macOS)
+            if [ -z "${LOCK_ADDED}" ] || [ "$(( $(date +%s) - $(date --date="${LOCK_ADDED}" +%s) ))" -gt "${LOCK_TIMEOUT}" ]; then
+              break
+            fi
+            sleep 1
+          done
+          echo "${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}-${{ inputs.test_selection }}" > lock.txt
+          aws s3 mv --only-show-errors lock.txt "s3://${BUCKET}/${LOCK_FILE}"
+
+          # A double-check that exactly WE have acquired the lock
+          aws s3 cp --only-show-errors "s3://${BUCKET}/${LOCK_FILE}" ./lock.txt
+          if [ "$(cat lock.txt)" = "${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}-${{ inputs.test_selection }}" ]; then
+            break
+          fi
+        done
+
+    - name: Generate and publish final Allure report
+      if: ${{ inputs.action == 'generate' }}
+      id: generate-report
+      env:
+        REPORT_PREFIX: reports/${{ steps.calculate-key.outputs.KEY }}/${{ inputs.build_type }}
+        RAW_PREFIX: reports-raw/${{ steps.calculate-key.outputs.KEY }}/${{ inputs.build_type }}
+        TEST_OUTPUT: /tmp/test_output
+        BUCKET: neon-github-public-dev
+      shell: bash -euxo pipefail {0}
+      run: |
+        # Get previously uploaded data for this run
+        ZSTD_NBTHREADS=0
+
+        s3_filepaths=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${RAW_PREFIX}/${GITHUB_RUN_ID}- | jq --raw-output  '.Contents[].Key')
+        if [ -z "$s3_filepaths" ]; then
+          # There's no previously uploaded data for this run
+          exit 0
+        fi
+        for s3_filepath in ${s3_filepaths}; do
+          aws s3 cp --only-show-errors "s3://${BUCKET}/${s3_filepath}" "${TEST_OUTPUT}/allure/"
+
+          archive=${TEST_OUTPUT}/allure/$(basename $s3_filepath)
+          mkdir -p ${archive%.tar.zst}
+          tar -xf ${archive} -C ${archive%.tar.zst}
+          rm -f ${archive}
+        done
+
+        # Get history trend
+        aws s3 cp --recursive --only-show-errors "s3://${BUCKET}/${REPORT_PREFIX}/latest/history" "${TEST_OUTPUT}/allure/latest/history" || true
+
+        # Generate report
+        allure generate --clean --output $TEST_OUTPUT/allure/report $TEST_OUTPUT/allure/*
+
+        # Replace a logo link with a redirect to the latest version of the report
+        sed -i 's|<a href="." class=|<a href="https://'${BUCKET}'.s3.amazonaws.com/'${REPORT_PREFIX}'/latest/index.html" class=|g' $TEST_OUTPUT/allure/report/app.js
+
+        # Upload a history and the final report (in this particular order to not to have duplicated history in 2 places)
+        aws s3 mv --recursive --only-show-errors "${TEST_OUTPUT}/allure/report/history" "s3://${BUCKET}/${REPORT_PREFIX}/latest/history"
+        aws s3 mv --recursive --only-show-errors "${TEST_OUTPUT}/allure/report" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}"
+
+        REPORT_URL=https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/${GITHUB_RUN_ID}/index.html
+
+        # Generate redirect
+        cat <<EOF > ./index.html
+          <!DOCTYPE html>
+
+          <meta charset="utf-8">
+          <title>Redirecting to ${REPORT_URL}</title>
+          <meta http-equiv="refresh" content="0; URL=${REPORT_URL}">
+        EOF
+        aws s3 cp --only-show-errors ./index.html "s3://${BUCKET}/${REPORT_PREFIX}/latest/index.html"
+
+        echo "[Allure Report](${REPORT_URL})" >> ${GITHUB_STEP_SUMMARY}
+        echo "::set-output name=report-url::${REPORT_URL}"
+
+    - name: Release Allure lock
+      if: ${{ inputs.action == 'generate' && always() }}
+      shell: bash -euxo pipefail {0}
+      env:
+        LOCK_FILE: reports/${{ steps.calculate-key.outputs.KEY }}/lock.txt
+        BUCKET: neon-github-public-dev
+      run: |
+        aws s3 cp --only-show-errors "s3://${BUCKET}/${LOCK_FILE}" ./lock.txt || exit 0
+
+        if [ "$(cat lock.txt)" = "${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}-${{ inputs.test_selection }}" ]; then
+          aws s3 rm "s3://${BUCKET}/${LOCK_FILE}"
+        fi
+
+    - uses: actions/github-script@v6
+      if: ${{ inputs.action == 'generate' && always() }}
+      env:
+        REPORT_URL: ${{ steps.generate-report.outputs.report-url }}
+        BUILD_TYPE: ${{ inputs.build_type }}
+        SHA: ${{ github.event.pull_request.head.sha || github.sha }}
+      with:
+        script: |
+          const { REPORT_URL, BUILD_TYPE, SHA } = process.env
+
+          await github.rest.repos.createCommitStatus({
+            owner: context.repo.owner,
+            repo: context.repo.repo,
+            sha: `${SHA}`,
+            state: 'success',
+            target_url: `${REPORT_URL}`,
+            context: `Allure report / ${BUILD_TYPE}`,
+          })
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -3,10 +3,7 @@ description: 'Runs a Neon python test set, performing all the required preparati

 inputs:
  build_type:
-    description: 'Type of Rust (neon) and C (postgres) builds. Must be "release" or "debug".'
-    required: true
-  rust_toolchain:
-    description: 'Rust toolchain version to fetch the caches'
+    description: 'Type of Rust (neon) and C (postgres) builds. Must be "release" or "debug", or "remote" for the remote cluster'
    required: true
  test_selection:
    description: 'A python test suite to run'
@@ -24,7 +21,7 @@ inputs:
    required: false
    default: 'true'
  save_perf_report:
-    description: 'Whether to upload the performance report'
+    description: 'Whether to upload the performance report, if true PERF_TEST_RESULT_CONNSTR env variable should be set'
    required: false
    default: 'false'
  run_with_real_s3:
@@ -52,9 +49,10 @@ runs:
  using: "composite"
  steps:
    - name: Get Neon artifact
+      if: inputs.build_type != 'remote'
      uses: ./.github/actions/download
      with:
-        name: neon-${{ runner.os }}-${{ inputs.build_type }}-${{ inputs.rust_toolchain }}-artifact
+        name: neon-${{ runner.os }}-${{ inputs.build_type }}-artifact
        path: /tmp/neon

    - name: Checkout
@@ -78,16 +76,21 @@ runs:
    - name: Run pytest
      env:
        NEON_BIN: /tmp/neon/bin
-        POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
        TEST_OUTPUT: /tmp/test_output
-        # this variable will be embedded in perf test report
-        # and is needed to distinguish different environments
-        PLATFORM: github-actions-selfhosted
        BUILD_TYPE: ${{ inputs.build_type }}
        AWS_ACCESS_KEY_ID: ${{ inputs.real_s3_access_key_id }}
        AWS_SECRET_ACCESS_KEY: ${{ inputs.real_s3_secret_access_key }}
      shell: bash -euxo pipefail {0}
      run: |
+        # PLATFORM will be embedded in the perf test report
+        # and it is needed to distinguish different environments
+        export PLATFORM=${PLATFORM:-github-actions-selfhosted}
+        export POSTGRES_DISTRIB_DIR=${POSTGRES_DISTRIB_DIR:-/tmp/neon/pg_install/v14}
+
+        if [ "${BUILD_TYPE}" = "remote" ]; then
+          export REMOTE_ENV=1
+        fi
+
        PERF_REPORT_DIR="$(realpath test_runner/perf-report-local)"
        rm -rf $PERF_REPORT_DIR

@@ -119,6 +122,13 @@ runs:
          cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
        elif [[ "${{ inputs.build_type }}" == "release" ]]; then
          cov_prefix=()
+        else
+          cov_prefix=()
+        fi
+
+        # Wake up the cluster if we use remote neon instance
+        if [ "${{ inputs.build_type }}" = "remote" ] && [ -n "${BENCHMARK_CONNSTR}" ]; then
+          ${POSTGRES_DISTRIB_DIR}/bin/psql ${BENCHMARK_CONNSTR} -c "SELECT version();"
        fi

        # Run the tests.
@@ -131,32 +141,26 @@ runs:
        # -n4 uses four processes to run tests via pytest-xdist
        # -s is not used to prevent pytest from capturing output, because tests are running
        # in parallel and logs are mixed between different tests
+        mkdir -p $TEST_OUTPUT/allure/results
        "${cov_prefix[@]}" ./scripts/pytest \
          --junitxml=$TEST_OUTPUT/junit.xml \
+          --alluredir=$TEST_OUTPUT/allure/results \
          --tb=short \
          --verbose \
-          -m "not remote_cluster" \
          -rA $TEST_SELECTION $EXTRA_PARAMS

        if [[ "${{ inputs.save_perf_report }}" == "true" ]]; then
          if [[ "$GITHUB_REF" == "refs/heads/main" ]]; then
            export REPORT_FROM="$PERF_REPORT_DIR"
-            export REPORT_TO=local
+            export REPORT_TO="$PLATFORM"
            scripts/generate_and_push_perf_report.sh
          fi
        fi

-    - name: Delete all data but logs
-      shell: bash -euxo pipefail {0}
+    - name: Create Allure report
      if: always()
-      run: |
-        du -sh /tmp/test_output/*
-        find /tmp/test_output -type f ! -name "*.log" ! -name "regression.diffs" ! -name "junit.xml" ! -name "*.filediff" ! -name "*.stdout" ! -name "*.stderr" ! -name "flamegraph.svg" ! -name "*.metrics" -delete
-        du -sh /tmp/test_output/*
-
-    - name: Upload python test logs
-      if: always()
-      uses: ./.github/actions/upload
+      uses: ./.github/actions/allure-report
      with:
-        name: python-test-${{ inputs.test_selection }}-${{ runner.os }}-${{ inputs.build_type }}-${{ inputs.rust_toolchain }}-logs
-        path: /tmp/test_output/
+        action: store
+        build_type: ${{ inputs.build_type }}
+        test_selection: ${{ inputs.test_selection }}
--- a/.github/actions/upload/action.yml
+++ b/.github/actions/upload/action.yml
@@ -29,8 +29,12 @@ runs:
          time tar -C ${SOURCE} -cf ${ARCHIVE} --zstd .
        elif [ -f ${SOURCE} ]; then
          time tar -cf ${ARCHIVE} --zstd ${SOURCE}
+        elif ! ls ${SOURCE} > /dev/null 2>&1; then
+          echo 2>&1 "${SOURCE} does not exist"
+          exit 2
        else
-          echo 2>&1 "${SOURCE} neither directory nor file, don't know how to handle it"
+          echo 2>&1 "${SOURCE} is neither a directory nor a file, do not know how to handle it"
+          exit 3
        fi

    - name: Upload artifact
--- a/.github/ansible/get_binaries.sh
+++ b/.github/ansible/get_binaries.sh
@@ -2,30 +2,14 @@

 set -e

-RELEASE=${RELEASE:-false}
-
-# look at docker hub for latest tag for neon docker image
-if [ "${RELEASE}" = "true" ]; then
-    echo "search latest release tag"
-    VERSION=$(curl -s https://registry.hub.docker.com/v1/repositories/neondatabase/neon/tags |jq -r -S '.[].name' | grep release | sed 's/release-//g' | grep -E '^[0-9]+$' | sort -n | tail -1)
-    if [ -z "${VERSION}" ]; then
-        echo "no any docker tags found, exiting..."
-        exit 1
-    else
-        TAG="release-${VERSION}"
-    fi
+if [ -n "${DOCKER_TAG}" ]; then
+  # Verson is DOCKER_TAG but without prefix
+  VERSION=$(echo $DOCKER_TAG | sed 's/^.*-//g')
 else
-    echo "search latest dev tag"
-    VERSION=$(curl -s https://registry.hub.docker.com/v1/repositories/neondatabase/neon/tags |jq -r -S '.[].name' | grep -E '^[0-9]+$' | sort -n | tail -1)
-    if [ -z "${VERSION}" ]; then
-        echo "no any docker tags found, exiting..."
-        exit 1
-    else
-        TAG="${VERSION}"
-    fi
+  echo "Please set DOCKER_TAG environment variable"
+  exit 1
 fi

-echo "found ${VERSION}"

 # do initial cleanup
 rm -rf neon_install postgres_install.tar.gz neon_install.tar.gz .neon_current_version
@@ -33,8 +17,8 @@ mkdir neon_install

 # retrieve binaries from docker image
 echo "getting binaries from docker image"
-docker pull --quiet neondatabase/neon:${TAG}
-ID=$(docker create neondatabase/neon:${TAG})
+docker pull --quiet neondatabase/neon:${DOCKER_TAG}
+ID=$(docker create neondatabase/neon:${DOCKER_TAG})
 docker cp ${ID}:/data/postgres_install.tar.gz .
 tar -xzf postgres_install.tar.gz -C neon_install
 docker cp ${ID}:/usr/local/bin/pageserver neon_install/bin/
--- a/.github/ansible/scripts/init_safekeeper.sh
+++ b/.github/ansible/scripts/init_safekeeper.sh
@@ -1,7 +1,8 @@
 #!/bin/sh

-# get instance id from meta-data service
+# fetch params from meta-data service
 INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id)
+AZ_ID=$(curl -s http://169.254.169.254/latest/meta-data/placement/availability-zone)

 # store fqdn hostname in var
 HOST=$(hostname -f)
@@ -14,7 +15,8 @@ cat <<EOF | tee /tmp/payload
  "port": 6500,
  "http_port": 7676,
  "region_id": {{ console_region_id }},
-  "instance_id": "${INSTANCE_ID}"
+  "instance_id": "${INSTANCE_ID}",
+  "availability_zone_id": "${AZ_ID}"
 }
 EOF

--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -1,4 +1,4 @@
-name: benchmarking
+name: Benchmarking

 on:
  # uncomment to run on push for debugging your PR
@@ -15,6 +15,15 @@ on:

  workflow_dispatch: # adds ability to run this manually

+defaults:
+  run:
+    shell: bash -euxo pipefail {0}
+
+concurrency:
+  # Allow only one workflow per any non-`main` branch.
+  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.ref == 'refs/heads/main' && github.sha || 'anysha' }}
+  cancel-in-progress: true
+
 jobs:
  bench:
    # this workflow runs on self hosteed runner
@@ -60,7 +69,6 @@ jobs:
    - name: Setup cluster
      env:
        BENCHMARK_CONNSTR: "${{ secrets.BENCHMARK_STAGING_CONNSTR }}"
-      shell: bash -euxo pipefail {0}
      run: |
        set -e

@@ -96,7 +104,9 @@ jobs:
        # since it might generate duplicates when calling ingest_perf_test_result.py
        rm -rf perf-report-staging
        mkdir -p perf-report-staging
-        ./scripts/pytest test_runner/performance/ -v -m "remote_cluster" --skip-interfering-proc-check --out-dir perf-report-staging --timeout 3600
+        # Set --sparse-ordering option of pytest-order plugin to ensure tests are running in order of appears in the file,
+        # it's important for test_perf_pgbench.py::test_pgbench_remote_* tests
+        ./scripts/pytest test_runner/performance/ -v -m "remote_cluster" --sparse-ordering --out-dir perf-report-staging --timeout 5400

    - name: Submit result
      env:
@@ -113,3 +123,104 @@ jobs:
        slack-message: "Periodic perf testing: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
      env:
        SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
+
+  pgbench-compare:
+    env:
+      TEST_PG_BENCH_DURATIONS_MATRIX: "60m"
+      TEST_PG_BENCH_SCALES_MATRIX: "10gb"
+      POSTGRES_DISTRIB_DIR: /usr
+      TEST_OUTPUT: /tmp/test_output
+      BUILD_TYPE: remote
+
+    strategy:
+      fail-fast: false
+      matrix:
+        connstr: [ BENCHMARK_CAPTEST_CONNSTR, BENCHMARK_RDS_CONNSTR ]
+
+    runs-on: dev
+    container:
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:pinned
+      options: --init
+
+    timeout-minutes: 360 # 6h
+
+    steps:
+    - uses: actions/checkout@v3
+
+    - name: Calculate platform
+      id: calculate-platform
+      env:
+        CONNSTR: ${{ matrix.connstr }}
+      run: |
+        if [ "${CONNSTR}" = "BENCHMARK_CAPTEST_CONNSTR" ]; then
+          PLATFORM=neon-captest
+        elif [ "${CONNSTR}" = "BENCHMARK_RDS_CONNSTR" ]; then
+          PLATFORM=rds-aurora
+        else
+          echo 2>&1 "Unknown CONNSTR=${CONNSTR}. Allowed are BENCHMARK_CAPTEST_CONNSTR, and BENCHMARK_RDS_CONNSTR only"
+          exit 1
+        fi
+
+        echo "::set-output name=PLATFORM::${PLATFORM}"
+
+    - name: Install Deps
+      run: |
+        sudo apt -y update
+        sudo apt install -y postgresql-14
+
+    - name: Benchmark init
+      uses: ./.github/actions/run-python-test-set
+      with:
+        build_type: ${{ env.BUILD_TYPE }}
+        test_selection: performance
+        run_in_parallel: false
+        save_perf_report: true
+        extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_init
+      env:
+        PLATFORM: ${{ steps.calculate-platform.outputs.PLATFORM }}
+        BENCHMARK_CONNSTR: ${{ secrets[matrix.connstr] }}
+        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
+        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
+
+    - name: Benchmark simple-update
+      uses: ./.github/actions/run-python-test-set
+      with:
+        build_type: ${{ env.BUILD_TYPE }}
+        test_selection: performance
+        run_in_parallel: false
+        save_perf_report: true
+        extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_simple_update
+      env:
+        PLATFORM: ${{ steps.calculate-platform.outputs.PLATFORM }}
+        BENCHMARK_CONNSTR: ${{ secrets[matrix.connstr] }}
+        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
+        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
+
+    - name: Benchmark simple-update
+      uses: ./.github/actions/run-python-test-set
+      with:
+        build_type: ${{ env.BUILD_TYPE }}
+        test_selection: performance
+        run_in_parallel: false
+        save_perf_report: true
+        extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_select_only
+      env:
+        PLATFORM: ${{ steps.calculate-platform.outputs.PLATFORM }}
+        BENCHMARK_CONNSTR: ${{ secrets[matrix.connstr] }}
+        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
+        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
+
+    - name: Create Allure report
+      uses: ./.github/actions/allure-report
+      with:
+        action: generate
+        build_type: ${{ env.BUILD_TYPE }}
+
+    - name: Post to a Slack channel
+      if: ${{ github.event.schedule && failure() }}
+      uses: slackapi/slack-github-action@v1
+      with:
+        channel-id: "C033QLM5P7D" # dev-staging-stream
+        slack-message: "Periodic perf testing: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
+      env:
+        SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -7,10 +7,6 @@ on:
      - release
  pull_request:

-defaults:
-  run:
-    shell: bash -euxo pipefail {0}
-
 concurrency:
  # Allow only one workflow per any non-`main` branch.
  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.ref == 'refs/heads/main' && github.sha || 'anysha' }}
@@ -21,21 +17,50 @@ env:
  COPT: '-Werror'

 jobs:
+  tag:
+    runs-on: dev
+    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:latest
+    outputs:
+      build-tag: ${{steps.build-tag.outputs.tag}}
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+
+      - name: Get build tag
+        run: |
+          echo run:$GITHUB_RUN_ID
+          echo ref:$GITHUB_REF_NAME
+          echo rev:$(git rev-list --count HEAD)
+          if [[ "$GITHUB_REF_NAME" == "main" ]]; then
+            echo "::set-output name=tag::$(git rev-list --count HEAD)"
+          elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
+            echo "::set-output name=tag::release-$(git rev-list --count HEAD)"
+          else
+            echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
+            echo "::set-output name=tag::$GITHUB_RUN_ID"
+          fi
+        shell: bash
+        id: build-tag
+
  build-neon:
    runs-on: dev
-    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:2746987948
+    container:
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
+      options: --init
    strategy:
      fail-fast: false
      matrix:
        build_type: [ debug, release ]
-        rust_toolchain: [ 1.58 ]

    env:
      BUILD_TYPE: ${{ matrix.build_type }}
      GIT_VERSION: ${{ github.sha }}

    steps:
-      - name: Fix git ownerwhip
+      - name: Fix git ownership
        run: |
          # Workaround for `fatal: detected dubious ownership in repository at ...`
          #
@@ -51,9 +76,15 @@ jobs:
          submodules: true
          fetch-depth: 1

-      - name: Set pg revision for caching
-        id: pg_ver
-        run: echo ::set-output name=pg_rev::$(git rev-parse HEAD:vendor/postgres)
+      - name: Set pg 14 revision for caching
+        id: pg_v14_rev
+        run: echo ::set-output name=pg_rev::$(git rev-parse HEAD:vendor/postgres-v14)
+        shell: bash -euxo pipefail {0}
+
+      - name: Set pg 15 revision for caching
+        id: pg_v15_rev
+        run: echo ::set-output name=pg_rev::$(git rev-parse HEAD:vendor/postgres-v15)
+        shell: bash -euxo pipefail {0}

      # Set some environment variables used by all the steps.
      #
@@ -68,15 +99,16 @@ jobs:
          if [[ $BUILD_TYPE == "debug" ]]; then
            cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run"
            CARGO_FEATURES=""
-            CARGO_FLAGS=""
+            CARGO_FLAGS="--locked --timings"
          elif [[ $BUILD_TYPE == "release" ]]; then
            cov_prefix=""
            CARGO_FEATURES="--features profiling"
-            CARGO_FLAGS="--release $CARGO_FEATURES"
+            CARGO_FLAGS="--locked --timings --release $CARGO_FEATURES"
          fi
          echo "cov_prefix=${cov_prefix}" >> $GITHUB_ENV
          echo "CARGO_FEATURES=${CARGO_FEATURES}" >> $GITHUB_ENV
          echo "CARGO_FLAGS=${CARGO_FLAGS}" >> $GITHUB_ENV
+        shell: bash -euxo pipefail {0}

      # Don't include the ~/.cargo/registry/src directory. It contains just
      # uncompressed versions of the crates in ~/.cargo/registry/cache
@@ -93,27 +125,46 @@ jobs:
            target/
          # Fall back to older versions of the key, if no cache for current Cargo.lock was found
          key: |
-            v3-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }}
-            v3-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-
+            v8-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }}
+            v8-${{ runner.os }}-${{ matrix.build_type }}-cargo-

-      - name: Cache postgres build
-        id: cache_pg
+      - name: Cache postgres v14 build
+        id: cache_pg_14
        uses: actions/cache@v3
        with:
-          path: tmp_install/
-          key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_ver.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
+          path: pg_install/v14
+          key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}

-      - name: Build postgres
-        if: steps.cache_pg.outputs.cache-hit != 'true'
-        run: mold -run make postgres -j$(nproc)
+      - name: Cache postgres v15 build
+        id: cache_pg_15
+        uses: actions/cache@v3
+        with:
+          path: pg_install/v15
+          key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
+
+      - name: Build postgres v14
+        if: steps.cache_pg_14.outputs.cache-hit != 'true'
+        run: mold -run make postgres-v14 -j$(nproc)
+        shell: bash -euxo pipefail {0}
+
+      - name: Build postgres v15
+        if: steps.cache_pg_15.outputs.cache-hit != 'true'
+        run: mold -run make postgres-v15 -j$(nproc)
+        shell: bash -euxo pipefail {0}
+
+      - name: Build neon extensions
+        run: mold -run make neon-pg-ext -j$(nproc)
+        shell: bash -euxo pipefail {0}

      - name: Run cargo build
        run: |
          ${cov_prefix} mold -run cargo build $CARGO_FLAGS --features failpoints --bins --tests
+        shell: bash -euxo pipefail {0}

      - name: Run cargo test
        run: |
          ${cov_prefix} cargo test $CARGO_FLAGS
+        shell: bash -euxo pipefail {0}

      - name: Install rust binaries
        run: |
@@ -154,30 +205,44 @@ jobs:
              echo "/tmp/neon/bin/$bin" >> /tmp/coverage/binaries.list
            done
          fi
+        shell: bash -euxo pipefail {0}

      - name: Install postgres binaries
-        run: cp -a tmp_install /tmp/neon/pg_install
+        run: cp -a pg_install /tmp/neon/pg_install
+        shell: bash -euxo pipefail {0}

      - name: Upload Neon artifact
        uses: ./.github/actions/upload
        with:
-          name: neon-${{ runner.os }}-${{ matrix.build_type }}-${{ matrix.rust_toolchain }}-artifact
+          name: neon-${{ runner.os }}-${{ matrix.build_type }}-artifact
          path: /tmp/neon

+      - name: Prepare cargo build timing stats for storing
+        run: |
+          mkdir -p "/tmp/neon/cargo-timings/$BUILD_TYPE/"
+          cp -r ./target/cargo-timings/* "/tmp/neon/cargo-timings/$BUILD_TYPE/"
+        shell: bash -euxo pipefail {0}
+      - name: Upload cargo build stats
+        uses: ./.github/actions/upload
+        with:
+          name: neon-${{ runner.os }}-${{ matrix.build_type }}-build-stats
+          path: /tmp/neon/cargo-timings/
+
      # XXX: keep this after the binaries.list is formed, so the coverage can properly work later
      - name: Merge and upload coverage data
        if: matrix.build_type == 'debug'
        uses: ./.github/actions/save-coverage-data

-  pg_regress-tests:
+  regress-tests:
    runs-on: dev
-    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:2746987948
+    container:
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
+      options: --init
    needs: [ build-neon ]
    strategy:
      fail-fast: false
      matrix:
        build_type: [ debug, release ]
-        rust_toolchain: [ 1.58 ]
    steps:
      - name: Checkout
        uses: actions/checkout@v3
@@ -185,59 +250,33 @@ jobs:
          submodules: true
          fetch-depth: 2

-      - name: Pytest regress tests
+      - name: Pytest regression tests
        uses: ./.github/actions/run-python-test-set
        with:
          build_type: ${{ matrix.build_type }}
-          rust_toolchain: ${{ matrix.rust_toolchain }}
-          test_selection: batch_pg_regress
+          test_selection: regress
          needs_postgres_source: true
-
-      - name: Merge and upload coverage data
-        if: matrix.build_type == 'debug'
-        uses: ./.github/actions/save-coverage-data
-
-  other-tests:
-    runs-on: dev
-    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:2746987948
-    needs: [ build-neon ]
-    strategy:
-      fail-fast: false
-      matrix:
-        build_type: [ debug, release ]
-        rust_toolchain: [ 1.58 ]
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v3
-        with:
-          submodules: true
-          fetch-depth: 2
-
-      - name: Pytest other tests
-        uses: ./.github/actions/run-python-test-set
-        with:
-          build_type: ${{ matrix.build_type }}
-          rust_toolchain: ${{ matrix.rust_toolchain }}
-          test_selection: batch_others
          run_with_real_s3: true
          real_s3_bucket: ci-tests-s3
          real_s3_region: us-west-2
          real_s3_access_key_id: "${{ secrets.AWS_ACCESS_KEY_ID_CI_TESTS_S3 }}"
          real_s3_secret_access_key: "${{ secrets.AWS_SECRET_ACCESS_KEY_CI_TESTS_S3 }}"
+
      - name: Merge and upload coverage data
        if: matrix.build_type == 'debug'
        uses: ./.github/actions/save-coverage-data

  benchmarks:
    runs-on: dev
-    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:2746987948
+    container:
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
+      options: --init
    needs: [ build-neon ]
    if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
    strategy:
      fail-fast: false
      matrix:
        build_type: [ release ]
-        rust_toolchain: [ 1.58 ]
    steps:
      - name: Checkout
        uses: actions/checkout@v3
@@ -249,7 +288,6 @@ jobs:
        uses: ./.github/actions/run-python-test-set
        with:
          build_type: ${{ matrix.build_type }}
-          rust_toolchain: ${{ matrix.rust_toolchain }}
          test_selection: performance
          run_in_parallel: false
          save_perf_report: true
@@ -259,15 +297,56 @@ jobs:
      # XXX: no coverage data handling here, since benchmarks are run on release builds,
      # while coverage is currently collected for the debug ones

+  merge-allure-report:
+    runs-on: dev
+    container:
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
+      options: --init
+    needs: [ regress-tests, benchmarks ]
+    if: always()
+    strategy:
+      fail-fast: false
+      matrix:
+        build_type: [ debug, release ]
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+        with:
+          submodules: false
+
+      - name: Create Allure report
+        id: create-allure-report
+        uses: ./.github/actions/allure-report
+        with:
+          action: generate
+          build_type: ${{ matrix.build_type }}
+
+      - name: Store Allure test stat in the DB
+        env:
+          BUILD_TYPE: ${{ matrix.build_type }}
+          SHA: ${{ github.event.pull_request.head.sha || github.sha }}
+          REPORT_URL: ${{ steps.create-allure-report.outputs.report-url }}
+          TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}
+        shell: bash -euxo pipefail {0}
+        run: |
+          curl --fail --output suites.json ${REPORT_URL%/index.html}/data/suites.json
+          ./scripts/pysync
+
+          # Workaround for https://github.com/neondatabase/cloud/issues/2188
+          psql "$TEST_RESULT_CONNSTR" -c "SELECT 1;" || sleep 10
+
+          DATABASE_URL="$TEST_RESULT_CONNSTR" poetry run python3 scripts/ingest_regress_test_result.py --revision ${SHA} --reference ${GITHUB_REF} --build-type ${BUILD_TYPE} --ingest suites.json
+
  coverage-report:
    runs-on: dev
-    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:2746987948
-    needs: [ other-tests, pg_regress-tests ]
+    container:
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
+      options: --init
+    needs: [ regress-tests ]
    strategy:
      fail-fast: false
      matrix:
        build_type: [ debug ]
-        rust_toolchain: [ 1.58 ]
    steps:
      - name: Checkout
        uses: actions/checkout@v3
@@ -284,12 +363,12 @@ jobs:
            !~/.cargo/registry/src
            ~/.cargo/git/
            target/
-          key: v3-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }}
+          key: v8-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }}

      - name: Get Neon artifact
        uses: ./.github/actions/download
        with:
-          name: neon-${{ runner.os }}-${{ matrix.build_type }}-${{ matrix.rust_toolchain }}-artifact
+          name: neon-${{ runner.os }}-${{ matrix.build_type }}-artifact
          path: /tmp/neon

      - name: Get coverage artifact
@@ -300,6 +379,7 @@ jobs:

      - name: Merge coverage data
        run: scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage merge
+        shell: bash -euxo pipefail {0}

      - name: Build and upload coverage report
        run: |
@@ -332,9 +412,13 @@ jobs:
              \"description\": \"Coverage report is ready\",
              \"target_url\": \"$REPORT_URL\"
            }"
+        shell: bash -euxo pipefail {0}

  trigger-e2e-tests:
-    runs-on: [ self-hosted, Linux, k8s-runner ]
+    runs-on: dev
+    container:
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
+      options: --init
    needs: [ build-neon ]
    steps:
      - name: Set PR's status to pending and request a remote CI test
@@ -369,150 +453,168 @@ jobs:
              }
            }"

-  docker-image:
-    runs-on: [ self-hosted, Linux, k8s-runner ]
-    needs: [ pg_regress-tests, other-tests ]
-    if: |
-      (github.ref_name == 'main' || github.ref_name == 'release') &&
-      github.event_name != 'workflow_dispatch'
-    outputs:
-      build-tag: ${{steps.build-tag.outputs.tag}}
+  neon-image:
+    runs-on: dev
+    container: gcr.io/kaniko-project/executor:v1.9.0-debug
+
    steps:
      - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v1 # v3 won't work with kaniko
        with:
          submodules: true
          fetch-depth: 0

-      - name: Login to DockerHub
-        uses: docker/login-action@v1
-        with:
-          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
-          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
+      - name: Configure ECR login
+        run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json

-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v1
-        with:
-          driver: docker
+      - name: Kaniko build neon
+        run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:$GITHUB_RUN_ID

-      - name: Get build tag
-        run: |
-          if [[ "$GITHUB_REF_NAME" == "main" ]]; then
-            echo "::set-output name=tag::$(git rev-list --count HEAD)"
-          elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
-            echo "::set-output name=tag::release-$(git rev-list --count HEAD)"
-          else
-            echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
-            exit 1
-          fi
-        id: build-tag
+  compute-tools-image:
+    runs-on: dev
+    container: gcr.io/kaniko-project/executor:v1.9.0-debug

-      - name: Get legacy build tag
-        run: |
-          if [[ "$GITHUB_REF_NAME" == "main" ]]; then
-            echo "::set-output name=tag::latest"
-          elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
-            echo "::set-output name=tag::release"
-          else
-            echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
-            exit 1
-          fi
-        id: legacy-build-tag
-
-      - name: Build neon Docker image
-        uses: docker/build-push-action@v2
-        with:
-          context: .
-          build-args: |
-            GIT_VERSION="${{github.sha}}"
-            AWS_ACCESS_KEY_ID="${{secrets.CACHEPOT_AWS_ACCESS_KEY_ID}}"
-            AWS_SECRET_ACCESS_KEY="${{secrets.CACHEPOT_AWS_SECRET_ACCESS_KEY}}"
-          pull: true
-          push: true
-          tags: neondatabase/neon:${{steps.legacy-build-tag.outputs.tag}}, neondatabase/neon:${{steps.build-tag.outputs.tag}}
-
-  docker-image-compute:
-    runs-on: [ self-hosted, Linux, k8s-runner ]
-    needs: [ pg_regress-tests, other-tests ]
-    if: |
-      (github.ref_name == 'main' || github.ref_name == 'release') &&
-      github.event_name != 'workflow_dispatch'
-    outputs:
-      build-tag: ${{steps.build-tag.outputs.tag}}
    steps:
      - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v1 # v3 won't work with kaniko
+
+      - name: Configure ECR login
+        run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
+
+      - name: Kaniko build compute tools
+        run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:$GITHUB_RUN_ID
+
+  compute-node-image:
+    runs-on: dev
+    container: gcr.io/kaniko-project/executor:v1.9.0-debug
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v1 # v3 won't work with kaniko
        with:
          submodules: true
          fetch-depth: 0

-      - name: Login to DockerHub
-        uses: docker/login-action@v1
-        with:
-          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
-          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
+      - name: Configure ECR login
+        run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json

-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v1
-        with:
-          driver: docker
+        # compute-node uses postgres 14, which is default now
+        # cloud repo depends on this image name, thus duplicating it
+        # remove compute-node when cloud repo is updated
+      - name: Kaniko build compute node with extensions v14 (compatibility)
+        run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --dockerfile Dockerfile.compute-node-v14 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:$GITHUB_RUN_ID

-      - name: Get build tag
+  compute-node-image-v14:
+    runs-on: dev
+    container: gcr.io/kaniko-project/executor:v1.9.0-debug
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v1 # v3 won't work with kaniko
+        with:
+          submodules: true
+          fetch-depth: 0
+
+      - name: Configure ECR login
+        run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
+
+      - name: Kaniko build compute node with extensions v14
+        run: /kaniko/executor --skip-unused-stages  --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache  --context . --dockerfile Dockerfile.compute-node-v14 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:$GITHUB_RUN_ID
+
+
+  compute-node-image-v15:
+    runs-on: dev
+    container: gcr.io/kaniko-project/executor:v1.9.0-debug
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v1 # v3 won't work with kaniko
+        with:
+          submodules: true
+          fetch-depth: 0
+
+      - name: Configure ECR login
+        run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
+
+      - name: Kaniko build compute node with extensions v15
+        run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --dockerfile Dockerfile.compute-node-v15 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:$GITHUB_RUN_ID
+
+  promote-images:
+    runs-on: dev
+    needs: [ neon-image, compute-node-image, compute-node-image-v14, compute-tools-image ]
+    if: github.event_name != 'workflow_dispatch'
+    container: amazon/aws-cli
+    strategy:
+      fail-fast: false
+      matrix:
+        # compute-node uses postgres 14, which is default now
+        # cloud repo depends on this image name, thus duplicating it
+        # remove compute-node when cloud repo is updated
+        name: [ neon, compute-node, compute-node-v14, compute-tools ]
+
+    steps:
+      - name: Promote image to latest
+        run:
+          MANIFEST=$(aws ecr batch-get-image --repository-name ${{ matrix.name }} --image-ids imageTag=$GITHUB_RUN_ID --query 'images[].imageManifest' --output text) && aws ecr put-image --repository-name ${{ matrix.name }} --image-tag latest --image-manifest "$MANIFEST"
+
+  push-docker-hub:
+    runs-on: dev
+    needs: [ promote-images, tag ]
+    container: golang:1.19-bullseye
+
+    steps:
+      - name: Install Crane & ECR helper
        run: |
-          if [[ "$GITHUB_REF_NAME" == "main" ]]; then
-            echo "::set-output name=tag::$(git rev-list --count HEAD)"
-          elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
-            echo "::set-output name=tag::release-$(git rev-list --count HEAD)"
-          else
-            echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
-            exit 1
-          fi
-        id: build-tag
+          go install github.com/google/go-containerregistry/cmd/crane@31786c6cbb82d6ec4fb8eb79cd9387905130534e # v0.11.0
+          go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@69c85dc22db6511932bbf119e1a0cc5c90c69a7f # v0.6.0

-      - name: Get legacy build tag
+      - name: Configure ECR login
        run: |
-          if [[ "$GITHUB_REF_NAME" == "main" ]]; then
-            echo "::set-output name=tag::latest"
-          elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
-            echo "::set-output name=tag::release"
-          else
-            echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
-            exit 1
-          fi
-        id: legacy-build-tag
+          mkdir /github/home/.docker/
+          echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json

-      - name: Build compute-tools Docker image
-        uses: docker/build-push-action@v2
-        with:
-          context: .
-          build-args: |
-            GIT_VERSION="${{github.sha}}"
-            AWS_ACCESS_KEY_ID="${{secrets.CACHEPOT_AWS_ACCESS_KEY_ID}}"
-            AWS_SECRET_ACCESS_KEY="${{secrets.CACHEPOT_AWS_SECRET_ACCESS_KEY}}"
-          push: false
-          file: Dockerfile.compute-tools
-          tags: neondatabase/compute-tools:local
+      - name: Pull neon image from ECR
+        run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:latest neon

-      - name: Push compute-tools Docker image
-        uses: docker/build-push-action@v2
-        with:
-          context: .
-          build-args: |
-            GIT_VERSION="${{github.sha}}"
-            AWS_ACCESS_KEY_ID="${{secrets.CACHEPOT_AWS_ACCESS_KEY_ID}}"
-            AWS_SECRET_ACCESS_KEY="${{secrets.CACHEPOT_AWS_SECRET_ACCESS_KEY}}"
-          push: true
-          file: Dockerfile.compute-tools
-          tags: neondatabase/compute-tools:${{steps.legacy-build-tag.outputs.tag}}
+      - name: Pull compute tools image from ECR
+        run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest compute-tools

-      - name: Build compute-node Docker image
-        uses: docker/build-push-action@v2
-        with:
-          context: ./vendor/postgres/
-          build-args:
-            COMPUTE_TOOLS_TAG=local
-          push: true
-          tags: neondatabase/compute-node:${{steps.legacy-build-tag.outputs.tag}}, neondatabase/compute-node:${{steps.build-tag.outputs.tag}}
+      - name: Pull compute node image from ECR
+        run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:latest compute-node
+
+      - name: Pull compute node v14 image from ECR
+        run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:latest compute-node-v14
+
+      - name: Pull rust image from ECR
+        run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned rust
+
+      - name: Configure docker login
+        run: |
+          # ECR Credential Helper & Docker Hub don't work together in config, hence reset
+          echo "" > /github/home/.docker/config.json
+          crane auth login -u ${{ secrets.NEON_DOCKERHUB_USERNAME }} -p ${{ secrets.NEON_DOCKERHUB_PASSWORD }} index.docker.io
+
+      - name: Push neon image to Docker Hub
+        run: crane push neon neondatabase/neon:${{needs.tag.outputs.build-tag}}
+
+      - name: Push compute tools image to Docker Hub
+        run: crane push compute-tools neondatabase/compute-tools:${{needs.tag.outputs.build-tag}}
+
+      - name: Push compute node image to Docker Hub
+        run: crane push compute-node neondatabase/compute-node:${{needs.tag.outputs.build-tag}}
+
+      - name: Push compute node v14 image to Docker Hub
+        run: crane push compute-node-v14 neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}}
+
+      - name: Push rust image to Docker Hub
+        run: crane push rust neondatabase/rust:pinned
+
+      - name: Add latest tag to images
+        if: |
+          (github.ref_name == 'main' || github.ref_name == 'release') &&
+          github.event_name != 'workflow_dispatch'
+        run: |
+          crane tag neondatabase/neon:${{needs.tag.outputs.build-tag}} latest
+          crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest
+          crane tag neondatabase/compute-node:${{needs.tag.outputs.build-tag}} latest
+          crane tag neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} latest

  calculate-deploy-targets:
    runs-on: [ self-hosted, Linux, k8s-runner ]
@@ -538,14 +640,16 @@ jobs:

  deploy:
    runs-on: [ self-hosted, Linux, k8s-runner ]
-    # We need both storage **and** compute images for deploy, because control plane
-    # picks the compute version based on the storage version. If it notices a fresh
-    # storage it may bump the compute version. And if compute image failed to build
-    # it may break things badly.
-    needs: [ docker-image, docker-image-compute, calculate-deploy-targets ]
+    #container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:latest
+    # We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version.
+    # If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly
+    needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
    if: |
      (github.ref_name == 'main' || github.ref_name == 'release') &&
      github.event_name != 'workflow_dispatch'
+    defaults:
+      run:
+        shell: bash
    strategy:
      matrix:
        include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}}
@@ -556,12 +660,19 @@ jobs:
          submodules: true
          fetch-depth: 0

+      - name: Setup python
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.10'
+
      - name: Setup ansible
        run: |
+          export PATH="/root/.local/bin:$PATH"
          pip install --progress-bar off --user ansible boto3

      - name: Redeploy
        run: |
+          export DOCKER_TAG=${{needs.tag.outputs.build-tag}}
          cd "$(pwd)/.github/ansible"

          if [[ "$GITHUB_REF_NAME" == "main" ]]; then
@@ -584,13 +695,16 @@ jobs:
          rm -f neon_install.tar.gz .neon_current_version

  deploy-proxy:
-    runs-on: [ self-hosted, Linux, k8s-runner ]
-    # Compute image isn't strictly required for proxy deploy, but let's still wait for it
-    # to run all deploy jobs consistently.
-    needs: [ docker-image, docker-image-compute, calculate-deploy-targets ]
+    runs-on: dev
+    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:latest
+    # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
+    needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
    if: |
      (github.ref_name == 'main' || github.ref_name == 'release') &&
      github.event_name != 'workflow_dispatch'
+    defaults:
+      run:
+        shell: bash
    strategy:
      matrix:
        include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}}
@@ -603,6 +717,9 @@ jobs:
          submodules: true
          fetch-depth: 0

+      - name: Add curl
+        run: apt update && apt install curl -y
+
      - name: Store kubeconfig file
        run: |
          echo "${{ secrets[matrix.kubeconfig_secret] }}" | base64 --decode > ${KUBECONFIG}
@@ -615,6 +732,6 @@ jobs:

      - name: Re-deploy proxy
        run: |
-          DOCKER_TAG=${{needs.docker-image.outputs.build-tag}}
+          DOCKER_TAG=${{needs.tag.outputs.build-tag}}
          helm upgrade ${{ matrix.proxy_job }}       neondatabase/neon-proxy --namespace default --install -f .github/helm-values/${{ matrix.proxy_config }}.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
          helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace default --install -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
--- a/.github/workflows/codestyle.yml
+++ b/.github/workflows/codestyle.yml
@@ -17,18 +17,23 @@ concurrency:

 env:
  RUST_BACKTRACE: 1
+  COPT: '-Werror'

 jobs:
  check-codestyle-rust:
    strategy:
      fail-fast: false
      matrix:
-        # If we want to duplicate this job for different
-        # Rust toolchains (e.g. nightly or 1.37.0), add them here.
-        rust_toolchain: [1.58]
+        # XXX: both OSes have rustup
+        #   * https://github.com/actions/runner-images/blob/main/images/macos/macos-12-Readme.md#rust-tools
+        #   * https://github.com/actions/runner-images/blob/main/images/linux/Ubuntu2204-Readme.md#rust-tools
+        # this is all we need to install our toolchain later via rust-toolchain.toml
+        # so don't install any toolchain explicitly.
        os: [ubuntu-latest, macos-latest]
+        # To support several Postgres versions, add them here.
+        postgres_version: [v14, v15]
    timeout-minutes: 60
-    name: run regression test suite
+    name: check codestyle rust and postgres
    runs-on: ${{ matrix.os }}

    steps:
@@ -38,14 +43,6 @@ jobs:
          submodules: true
          fetch-depth: 2

-      - name: Install rust toolchain ${{ matrix.rust_toolchain }}
-        uses: actions-rs/toolchain@v1
-        with:
-          profile: minimal
-          toolchain: ${{ matrix.rust_toolchain }}
-          components: rustfmt, clippy
-          override: true
-
      - name: Check formatting
        run: cargo fmt --all -- --check

@@ -61,14 +58,14 @@ jobs:

      - name: Set pg revision for caching
        id: pg_ver
-        run: echo ::set-output name=pg_rev::$(git rev-parse HEAD:vendor/postgres)
+        run: echo ::set-output name=pg_rev::$(git rev-parse HEAD:vendor/postgres-${{matrix.postgres_version}})

-      - name: Cache postgres build
+      - name: Cache postgres ${{matrix.postgres_version}} build
        id: cache_pg
-        uses: actions/cache@v2
+        uses: actions/cache@v3
        with:
          path: |
-            tmp_install/
+            pg_install/${{matrix.postgres_version}}
          key: ${{ runner.os }}-pg-${{ steps.pg_ver.outputs.pg_rev }}

      - name: Set extra env for macOS
@@ -81,33 +78,36 @@ jobs:
        if: steps.cache_pg.outputs.cache-hit != 'true'
        run: make postgres

+      - name: Build neon extensions
+        run: make neon-pg-ext
+
      # Plain configure output can contain weird errors like 'error: C compiler cannot create executables'
      # and the real cause will be inside config.log
      - name: Print configure logs in case of failure
        if: failure()
        continue-on-error: true
        run: |
-          echo '' && echo '=== config.log ===' && echo ''
-          cat tmp_install/build/config.log
-          echo '' && echo '=== configure.log ===' && echo ''
-          cat tmp_install/build/configure.log
+          echo '' && echo '=== Postgres ${{matrix.postgres_version}} config.log ===' && echo ''
+          cat pg_install/build/${{matrix.postgres_version}}/config.log
+          echo '' && echo '=== Postgres ${{matrix.postgres_version}} configure.log ===' && echo ''
+          cat pg_install/build/${{matrix.postgres_version}}/configure.log

      - name: Cache cargo deps
        id: cache_cargo
-        uses: actions/cache@v2
+        uses: actions/cache@v3
        with:
          path: |
            ~/.cargo/registry
            !~/.cargo/registry/src
            ~/.cargo/git
            target
-          key: v1-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust-${{ matrix.rust_toolchain }}
+          key: v4-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust

      - name: Run cargo clippy
        run: ./run_clippy.sh

      - name: Ensure all project builds
-        run: cargo build --all --all-targets
+        run: cargo build --locked --all --all-targets

  check-codestyle-python:
    runs-on: [ self-hosted, Linux, k8s-runner ]
@@ -128,8 +128,14 @@ jobs:
      - name: Install Python deps
        run: ./scripts/pysync

-      - name: Run yapf to ensure code format
-        run: poetry run yapf --recursive --diff .
+      - name: Run isort to ensure code format
+        run: poetry run isort --diff --check .
+
+      - name: Run black to ensure code format
+        run: poetry run black --diff --check .
+
+      - name: Run flake8 to ensure code format
+        run: poetry run flake8 .

      - name: Run mypy to check types
        run: poetry run mypy .
--- a/.github/workflows/notifications.yml
+++ b/.github/workflows/notifications.yml
@@ -1,45 +0,0 @@
-name: Send Notifications
-
-on:
-  push:
-    branches: [ main ]
-
-jobs:
-  send-notifications:
-    timeout-minutes: 30
-    name: send commit notifications
-    runs-on: ubuntu-latest
-
-    steps:
-
-      - name: Checkout
-        uses: actions/checkout@v2
-        with:
-          submodules: true
-          fetch-depth: 2
-
-      - name: Form variables for notification message
-        id: git_info_grab
-        run: |
-          git_stat=$(git show --stat=50)
-          git_stat="${git_stat//'%'/'%25'}"
-          git_stat="${git_stat//$'\n'/'%0A'}"
-          git_stat="${git_stat//$'\r'/'%0D'}"
-          git_stat="${git_stat// / }" # space -> 'Space En', as github tends to eat ordinary spaces
-          echo "::set-output name=git_stat::$git_stat"
-          echo "::set-output name=sha_short::$(git rev-parse --short HEAD)"
-          echo "##[set-output name=git_branch;]$(echo ${GITHUB_REF#refs/heads/})"
-
-      - name: Send notification
-        uses: appleboy/telegram-action@master
-        with:
-          to: ${{ secrets.TELEGRAM_TO }}
-          token: ${{ secrets.TELEGRAM_TOKEN }}
-          format: markdown
-          args: |
-            *@${{ github.actor }} pushed to* [${{ github.repository }}:${{steps.git_info_grab.outputs.git_branch}}](github.com/${{ github.repository }}/commit/${{steps.git_info_grab.outputs.sha_short }})
-
-            ```
-            ${{ steps.git_info_grab.outputs.git_stat }}
-            ```
-
--- a/.github/workflows/pg_clients.yml
+++ b/.github/workflows/pg_clients.yml
@@ -19,8 +19,12 @@ concurrency:

 jobs:
  test-postgres-client-libs:
+    # TODO: switch to gen2 runner, requires docker
    runs-on: [ ubuntu-latest ]

+    env:
+      TEST_OUTPUT: /tmp/test_output
+
    steps:
    - name: Checkout
      uses: actions/checkout@v3
@@ -47,8 +51,8 @@ jobs:
      env:
        REMOTE_ENV: 1
        BENCHMARK_CONNSTR: "${{ secrets.BENCHMARK_STAGING_CONNSTR }}"
-        TEST_OUTPUT: /tmp/test_output
-        POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
+
+        POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install/v14
      shell: bash -euxo pipefail {0}
      run: |
        # Test framework expects we have psql binary;
@@ -61,9 +65,18 @@ jobs:
          -m "remote_cluster" \
          -rA "test_runner/pg_clients"

+    # We use GitHub's action upload-artifact because `ubuntu-latest` doesn't have configured AWS CLI.
+    # It will be fixed after switching to gen2 runner
+    - name: Upload python test logs
+      if: always()
+      uses: actions/upload-artifact@v3
+      with:
+        retention-days: 7
+        name: python-test-pg_clients-${{ runner.os }}-stage-logs
+        path: ${{ env.TEST_OUTPUT }}
+
    - name: Post to a Slack channel
-      if: failure()
-      id: slack
+      if: ${{ github.event.schedule && failure() }}
      uses: slackapi/slack-github-action@v1
      with:
        channel-id: "C033QLM5P7D" # dev-staging-stream
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,6 @@
+/pg_install
 /target
 /tmp_check
-/tmp_install
 /tmp_check_cli
 __pycache__/
 test_output/
@@ -15,3 +15,6 @@ test_output/

 *.key
 *.crt
+*.o
+*.so
+*.Po
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,4 +1,8 @@
-[submodule "vendor/postgres"]
-	path = vendor/postgres
-	url = https://github.com/zenithdb/postgres
+[submodule "vendor/postgres-v14"]
+	path = vendor/postgres-v14
+	url = https://github.com/neondatabase/postgres.git
 	branch = main
+[submodule "vendor/postgres-v15"]
+	path = vendor/postgres-v15
+	url = https://github.com/neondatabase/postgres.git
+	branch = REL_15_STABLE_neon
--- a/.yapfignore
+++ b/.yapfignore
@@ -1,10 +0,0 @@
-# This file is only read when `yapf` is run from this directory.
-# Hence we only top-level directories here to avoid confusion.
-# See source code for the exact file format: https://github.com/google/yapf/blob/c6077954245bc3add82dafd853a1c7305a6ebd20/yapf/yapflib/file_resources.py#L40-L43
-vendor/
-target/
-tmp_install/
-__pycache__/
-test_output/
-.neon/
-.git/
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -48,9 +48,9 @@ dependencies = [

 [[package]]
 name = "anyhow"
-version = "1.0.58"
+version = "1.0.59"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bb07d2053ccdbe10e2af2995a2f116c1330396493dc1269f6a91d0ae82e19704"
+checksum = "c91f1f46651137be86f3a2b9a8359f9ab421d04d941c62b5982e1ca21113adf9"
 dependencies = [
 "backtrace",
 ]
@@ -77,7 +77,7 @@ dependencies = [
 "num-traits",
 "rusticata-macros",
 "thiserror",
- "time 0.3.11",
+ "time 0.3.12",
 ]

 [[package]]
@@ -126,9 +126,9 @@ dependencies = [

 [[package]]
 name = "async-trait"
-version = "0.1.56"
+version = "0.1.57"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "96cf8829f67d2eab0b2dfa42c5d0ef737e0724e4a82b01b3e292456202b19716"
+checksum = "76464446b8bc32758d7e88ee1a804d9914cd9b1cb264c029899680b0be29826f"
 dependencies = [
 "proc-macro2",
 "quote",
@@ -166,7 +166,7 @@ dependencies = [
 "http",
 "http-body",
 "hyper",
- "itoa 1.0.2",
+ "itoa 1.0.3",
 "matchit",
 "memchr",
 "mime",
@@ -298,9 +298,9 @@ checksum = "37ccbd214614c6783386c1af30caf03192f17891059cecc394b4fb119e363de3"

 [[package]]
 name = "bytemuck"
-version = "1.10.0"
+version = "1.11.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c53dfa917ec274df8ed3c572698f381a24eef2efba9492d797301b72b6db408a"
+checksum = "a5377c8865e74a160d21f29c2d40669f53286db6eab59b88540cbb12ffc8b835"

 [[package]]
 name = "byteorder"
@@ -310,9 +310,9 @@ checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610"

 [[package]]
 name = "bytes"
-version = "1.1.0"
+version = "1.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c4872d67bab6358e59559027aa3b9157c53d9358c51423c17554809a8858e0f8"
+checksum = "ec8a7b6a70fde80372154c65702f00a0f56f3e1c36abbc6c440484be248856db"
 dependencies = [
 "serde",
 ]
@@ -386,9 +386,9 @@ dependencies = [

 [[package]]
 name = "clap"
-version = "3.2.12"
+version = "3.2.16"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ab8b79fe3946ceb4a0b1c080b4018992b8d27e9ff363644c1c9b6387c854614d"
+checksum = "a3dbbb6653e7c55cc8595ad3e1f7be8f32aba4eb7ff7f0fd1163d4f3d137c0a9"
 dependencies = [
 "atty",
 "bitflags",
@@ -455,7 +455,7 @@ version = "0.1.0"
 dependencies = [
 "anyhow",
 "chrono",
- "clap 3.2.12",
+ "clap 3.2.16",
 "env_logger",
 "hyper",
 "log",
@@ -495,6 +495,9 @@ name = "control_plane"
 version = "0.1.0"
 dependencies = [
 "anyhow",
+ "clap 3.2.16",
+ "comfy-table",
+ "git-version",
 "nix",
 "once_cell",
 "pageserver",
@@ -601,9 +604,9 @@ dependencies = [

 [[package]]
 name = "crossbeam-channel"
-version = "0.5.5"
+version = "0.5.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4c02a4d71819009c192cf4872265391563fd6a84c81ff2c0f2a7026ca4c1d85c"
+checksum = "c2dd04ddaf88237dc3b8d8f9a3c1004b506b54b3313403944054d23c0870c521"
 dependencies = [
 "cfg-if",
 "crossbeam-utils",
@@ -611,9 +614,9 @@ dependencies = [

 [[package]]
 name = "crossbeam-deque"
-version = "0.8.1"
+version = "0.8.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6455c0ca19f0d2fbf751b908d5c55c1f5cbc65e03c4225427254b46890bdde1e"
+checksum = "715e8152b692bba2d374b53d4875445368fdf21a94751410af607a5ac677d1fc"
 dependencies = [
 "cfg-if",
 "crossbeam-epoch",
@@ -622,9 +625,9 @@ dependencies = [

 [[package]]
 name = "crossbeam-epoch"
-version = "0.9.9"
+version = "0.9.10"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "07db9d94cbd326813772c968ccd25999e5f8ae22f4f8d1b11effa37ef6ce281d"
+checksum = "045ebe27666471bb549370b4b0b3e51b07f56325befa4284db65fc89c02511b1"
 dependencies = [
 "autocfg",
 "cfg-if",
@@ -636,9 +639,9 @@ dependencies = [

 [[package]]
 name = "crossbeam-utils"
-version = "0.8.10"
+version = "0.8.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7d82ee10ce34d7bc12c2122495e7593a9c41347ecdd64185af4ecf72cb1a7f83"
+checksum = "51887d4adc7b564537b15adcfb307936f8075dfcd5f00dde9a9f1d29383682bc"
 dependencies = [
 "cfg-if",
 "once_cell",
@@ -917,9 +920,9 @@ checksum = "4443176a9f2c162692bd3d352d745ef9413eec5782a80d8fd6f8a1ac692a07f7"

 [[package]]
 name = "fastrand"
-version = "1.7.0"
+version = "1.8.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c3fcf0cee53519c866c09b5de1f6c56ff9d647101f81c1964fa632e148896cdf"
+checksum = "a7a407cfaa3385c4ae6b23e84623d48c2798d06e3e6a1878f7f59f17b3f86499"
 dependencies = [
 "instant",
 ]
@@ -1086,9 +1089,9 @@ dependencies = [

 [[package]]
 name = "generic-array"
-version = "0.14.5"
+version = "0.14.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fd48d33ec7f05fbfa152300fdad764757cbded343c1aa1cff2fbaf4134851803"
+checksum = "bff49e947297f3312447abdca79f45f4738097cc82b06e72054d2223f601f1b9"
 dependencies = [
 "typenum",
 "version_check",
@@ -1164,20 +1167,14 @@ version = "1.8.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7"

-[[package]]
-name = "hashbrown"
-version = "0.11.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ab5ef0d4909ef3724cc8cce6ccc8572c5c817592e9285f5464f8e86f8bd3726e"
-dependencies = [
- "ahash",
-]
-
 [[package]]
 name = "hashbrown"
 version = "0.12.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888"
+dependencies = [
+ "ahash",
+]

 [[package]]
 name = "heck"
@@ -1245,7 +1242,7 @@ checksum = "75f43d41e26995c17e71ee126451dd3941010b0514a81a9d11f3b341debc2399"
 dependencies = [
 "bytes",
 "fnv",
- "itoa 1.0.2",
+ "itoa 1.0.3",
 ]

 [[package]]
@@ -1308,7 +1305,7 @@ dependencies = [
 "http-body",
 "httparse",
 "httpdate",
- "itoa 1.0.2",
+ "itoa 1.0.3",
 "pin-project-lite",
 "socket2",
 "tokio",
@@ -1379,7 +1376,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "10a35a97730320ffe8e2d410b5d3b69279b98d2c14bdb8b70ea89ecf7888d41e"
 dependencies = [
 "autocfg",
- "hashbrown 0.12.3",
+ "hashbrown",
 ]

 [[package]]
@@ -1391,7 +1388,7 @@ dependencies = [
 "ahash",
 "atty",
 "indexmap",
- "itoa 1.0.2",
+ "itoa 1.0.3",
 "lazy_static",
 "log",
 "num-format",
@@ -1432,15 +1429,15 @@ checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4"

 [[package]]
 name = "itoa"
-version = "1.0.2"
+version = "1.0.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "112c678d4050afce233f4f2852bb2eb519230b3cf12f33585275537d7e41578d"
+checksum = "6c8af84674fe1f223a982c933a0ee1086ac4d4052aa0fb8060c12c6ad838e754"

 [[package]]
 name = "js-sys"
-version = "0.3.58"
+version = "0.3.59"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c3fac17f7123a73ca62df411b1bf727ccc805daa070338fda671c86dac1bdc27"
+checksum = "258451ab10b34f8af53416d1fdab72c22e805f0c92a1136d59470ec0b11138b2"
 dependencies = [
 "wasm-bindgen",
 ]
@@ -1482,9 +1479,9 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"

 [[package]]
 name = "libc"
-version = "0.2.126"
+version = "0.2.127"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "349d5a591cd28b49e1d1037471617a32ddcda5731b99419008085f72d5a53836"
+checksum = "505e71a4706fa491e9b1b55f51b95d4037d0821ee40131190475f692b35b009b"

 [[package]]
 name = "libloading"
@@ -1654,23 +1651,6 @@ dependencies = [
 "tempfile",
 ]

-[[package]]
-name = "neon_local"
-version = "0.1.0"
-dependencies = [
- "anyhow",
- "clap 3.2.12",
- "comfy-table",
- "control_plane",
- "git-version",
- "pageserver",
- "postgres",
- "safekeeper",
- "serde_json",
- "utils",
- "workspace_hack",
-]
-
 [[package]]
 name = "nix"
 version = "0.23.1"
@@ -1851,10 +1831,12 @@ name = "pageserver"
 version = "0.1.0"
 dependencies = [
 "anyhow",
+ "async-stream",
+ "async-trait",
 "byteorder",
 "bytes",
 "chrono",
- "clap 3.2.12",
+ "clap 3.2.16",
 "close_fds",
 "const_format",
 "crc32c",
@@ -1891,6 +1873,7 @@ dependencies = [
 "thiserror",
 "tokio",
 "tokio-postgres",
+ "tokio-util",
 "toml_edit",
 "tracing",
 "url",
@@ -2111,7 +2094,6 @@ dependencies = [
 "bindgen",
 "byteorder",
 "bytes",
- "chrono",
 "crc32c",
 "env_logger",
 "hex",
@@ -2155,9 +2137,9 @@ checksum = "eb9f9e6e233e5c4a35559a617bf40a4ec447db2e84c20b55a6f83167b7e57872"

 [[package]]
 name = "prettyplease"
-version = "0.1.16"
+version = "0.1.18"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "da6ffbe862780245013cb1c0a48c4e44b7d665548088f91f6b90876d0625e4c2"
+checksum = "697ae720ee02011f439e0701db107ffe2916d83f718342d65d7f8bf7b8a5fee9"
 dependencies = [
 "proc-macro2",
 "syn",
@@ -2171,9 +2153,9 @@ checksum = "dbf0c48bc1d91375ae5c3cd81e3722dff1abcf81a30960240640d223f59fe0e5"

 [[package]]
 name = "proc-macro2"
-version = "1.0.40"
+version = "1.0.43"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dd96a1e8ed2596c337f8eae5f24924ec83f5ad5ab21ea8e455d3566c69fbcaf7"
+checksum = "0a2ca2c61bc9f3d74d2886294ab7b9853abd9c1ad903a3ac7815c58989bb7bab"
 dependencies = [
 "unicode-ident",
 ]
@@ -2269,14 +2251,16 @@ dependencies = [
 "anyhow",
 "async-trait",
 "base64",
+ "bstr",
 "bytes",
- "clap 3.2.12",
+ "clap 3.2.16",
 "futures",
 "git-version",
- "hashbrown 0.11.2",
+ "hashbrown",
 "hex",
 "hmac 0.12.1",
 "hyper",
+ "itertools",
 "md5",
 "metrics",
 "once_cell",
@@ -2288,7 +2272,7 @@ dependencies = [
 "routerify",
 "rstest",
 "rustls",
- "rustls-pemfile 0.2.1",
+ "rustls-pemfile",
 "scopeguard",
 "serde",
 "serde_json",
@@ -2314,20 +2298,11 @@ dependencies = [
 "memchr",
 ]

-[[package]]
-name = "quickcheck"
-version = "1.0.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "588f6378e4dd99458b60ec275b4477add41ce4fa9f64dcba6f15adccb19b50d6"
-dependencies = [
- "rand",
-]
-
 [[package]]
 name = "quote"
-version = "1.0.20"
+version = "1.0.21"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3bcdf212e9776fbcb2d23ab029360416bb1706b1aea2d1a5ba002727cbcab804"
+checksum = "bbe448f377a7d6961e30f5955f9b8d106c3f5e449d493ee1b125c1d43c2b5179"
 dependencies = [
 "proc-macro2",
 ]
@@ -2410,9 +2385,9 @@ dependencies = [

 [[package]]
 name = "redox_syscall"
-version = "0.2.13"
+version = "0.2.16"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "62f25bc4c7e55e0b0b7a1d43fb893f4fa1361d0abe38b9ce4f323c2adfe6ef42"
+checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a"
 dependencies = [
 "bitflags",
 ]
@@ -2507,7 +2482,7 @@ dependencies = [
 "percent-encoding",
 "pin-project-lite",
 "rustls",
- "rustls-pemfile 1.0.0",
+ "rustls-pemfile",
 "serde",
 "serde_json",
 "serde_urlencoded",
@@ -2698,18 +2673,9 @@ dependencies = [

 [[package]]
 name = "rustls-pemfile"
-version = "0.2.1"
+version = "1.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5eebeaeb360c87bfb72e84abdb3447159c0eaececf1bef2aecd65a8be949d1c9"
-dependencies = [
- "base64",
-]
-
-[[package]]
-name = "rustls-pemfile"
-version = "1.0.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e7522c9de787ff061458fe9a829dc790a3f5b22dc571694fc5883f448b94d9a9"
+checksum = "0864aeff53f8c05aa08d86e5ef839d3dfcf07aeba2db32f12db0ef716e87bd55"
 dependencies = [
 "base64",
 ]
@@ -2725,15 +2691,15 @@ dependencies = [

 [[package]]
 name = "rustversion"
-version = "1.0.8"
+version = "1.0.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "24c8ad4f0c00e1eb5bc7614d236a7f1300e3dbd76b68cac8e06fb00b015ad8d8"
+checksum = "97477e48b4cf8603ad5f7aaf897467cf42ab4218a38ef76fb14c2d6773a6d6a8"

 [[package]]
 name = "ryu"
-version = "1.0.10"
+version = "1.0.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f3f6f92acf49d1b98f7a81226834412ada05458b7364277387724a237f062695"
+checksum = "4501abdff3ae82a1c1b477a17252eb69cee9e66eb915c1abaa4f44d873df9f09"

 [[package]]
 name = "safekeeper"
@@ -2743,7 +2709,7 @@ dependencies = [
 "async-trait",
 "byteorder",
 "bytes",
- "clap 3.2.12",
+ "clap 3.2.16",
 "const_format",
 "crc32c",
 "daemonize",
@@ -2834,15 +2800,15 @@ dependencies = [

 [[package]]
 name = "semver"
-version = "1.0.12"
+version = "1.0.13"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a2333e6df6d6598f2b1974829f853c2b4c5f4a6e503c10af918081aa6f8564e1"
+checksum = "93f6841e709003d68bb2deee8c343572bf446003ec20a583e76f7b15cebf3711"

 [[package]]
 name = "serde"
-version = "1.0.139"
+version = "1.0.142"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0171ebb889e45aa68b44aee0859b3eede84c6f5f5c228e6f140c0b2a0a46cad6"
+checksum = "e590c437916fb6b221e1d00df6e3294f3fccd70ca7e92541c475d6ed6ef5fee2"
 dependencies = [
 "serde_derive",
 ]
@@ -2859,9 +2825,9 @@ dependencies = [

 [[package]]
 name = "serde_derive"
-version = "1.0.139"
+version = "1.0.142"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dc1d3230c1de7932af58ad8ffbe1d784bd55efd5a9d84ac24f69c72d83543dfb"
+checksum = "34b5b8d809babe02f538c2cfec6f2c1ed10804c0e5a6a041a049a4f5588ccc2e"
 dependencies = [
 "proc-macro2",
 "quote",
@@ -2870,11 +2836,11 @@ dependencies = [

 [[package]]
 name = "serde_json"
-version = "1.0.82"
+version = "1.0.83"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "82c2c1fdcd807d1098552c5b9a36e425e42e9fbd7c6a37a8425f390f781f7fa7"
+checksum = "38dd04e3c8279e75b31ef29dbdceebfe5ad89f4d0937213c53f7d49d01b3d5a7"
 dependencies = [
- "itoa 1.0.2",
+ "itoa 1.0.3",
 "ryu",
 "serde",
 ]
@@ -2886,7 +2852,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd"
 dependencies = [
 "form_urlencoded",
- "itoa 1.0.2",
+ "itoa 1.0.3",
 "ryu",
 "serde",
 ]
@@ -2991,7 +2957,7 @@ dependencies = [
 "num-bigint",
 "num-traits",
 "thiserror",
- "time 0.3.11",
+ "time 0.3.12",
 ]

 [[package]]
@@ -3002,9 +2968,12 @@ checksum = "7bd3e3206899af3f8b12af284fafc038cc1dc2b41d1b89dd17297221c5d225de"

 [[package]]
 name = "slab"
-version = "0.4.6"
+version = "0.4.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "eb703cfe953bccee95685111adeedb76fabe4e97549a58d16f03ea7b9367bb32"
+checksum = "4614a76b2a8be0058caa9dbbaf66d988527d86d003c11a94fbd335d7661edcef"
+dependencies = [
+ "autocfg",
+]

 [[package]]
 name = "smallvec"
@@ -3112,9 +3081,9 @@ dependencies = [

 [[package]]
 name = "syn"
-version = "1.0.98"
+version = "1.0.99"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c50aef8a904de4c23c788f104b7dddc7d6f79c647c7c8ce4cc8f73eb0ca773dd"
+checksum = "58dbef6ec655055e20b86b15a8cc6d439cca19b667537ac6a1369572d151ab13"
 dependencies = [
 "proc-macro2",
 "quote",
@@ -3190,18 +3159,18 @@ checksum = "b1141d4d61095b28419e22cb0bbf02755f5e54e0526f97f1e3d1d160e60885fb"

 [[package]]
 name = "thiserror"
-version = "1.0.31"
+version = "1.0.32"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bd829fe32373d27f76265620b5309d0340cb8550f523c1dda251d6298069069a"
+checksum = "f5f6586b7f764adc0231f4c79be7b920e766bb2f3e51b3661cdb263828f19994"
 dependencies = [
 "thiserror-impl",
 ]

 [[package]]
 name = "thiserror-impl"
-version = "1.0.31"
+version = "1.0.32"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0396bc89e626244658bef819e22d0cc459e795a5ebe878e6ec336d1674a8d79a"
+checksum = "12bafc5b54507e0149cdf1b145a5d80ab80a90bcd9275df43d4fff68460f6c21"
 dependencies = [
 "proc-macro2",
 "quote",
@@ -3230,14 +3199,14 @@ dependencies = [

 [[package]]
 name = "time"
-version = "0.3.11"
+version = "0.3.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "72c91f41dcb2f096c05f0873d667dceec1087ce5bcf984ec8ffb19acddbb3217"
+checksum = "74b7cc93fc23ba97fde84f7eea56c55d1ba183f495c6715defdfc7b9cb8c870f"
 dependencies = [
- "itoa 1.0.2",
+ "itoa 1.0.3",
+ "js-sys",
 "libc",
 "num_threads",
- "quickcheck",
 "time-macros",
 ]

@@ -3274,9 +3243,9 @@ checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c"

 [[package]]
 name = "tokio"
-version = "1.20.0"
+version = "1.20.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "57aec3cfa4c296db7255446efb4928a6be304b431a806216105542a67b6ca82e"
+checksum = "7a8325f63a7d4774dd041e363b2409ed1c5cbbd0f867795e661df066b2b0a581"
 dependencies = [
 "autocfg",
 "bytes",
@@ -3515,9 +3484,9 @@ checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52"

 [[package]]
 name = "tracing"
-version = "0.1.34"
+version = "0.1.36"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5d0ecdcb44a79f0fe9844f0c4f33a342cbcbb5117de8001e6ba0dc2351327d09"
+checksum = "2fce9567bd60a67d08a16488756721ba392f24f29006402881e43b19aac64307"
 dependencies = [
 "cfg-if",
 "log",
@@ -3539,11 +3508,11 @@ dependencies = [

 [[package]]
 name = "tracing-core"
-version = "0.1.26"
+version = "0.1.29"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f54c8ca710e81886d498c2fd3331b56c93aa248d49de2222ad2742247c60072f"
+checksum = "5aeea4303076558a00714b823f9ad67d58a3bbda1df83d8827d21193156e22f7"
 dependencies = [
- "lazy_static",
+ "once_cell",
 "valuable",
 ]

@@ -3606,9 +3575,9 @@ checksum = "099b7128301d285f79ddd55b9a83d5e6b9e97c92e0ea0daebee7263e932de992"

 [[package]]
 name = "unicode-ident"
-version = "1.0.2"
+version = "1.0.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "15c61ba63f9235225a22310255a29b806b907c9b8c964bcbd0a2c70f3f2deea7"
+checksum = "c4f5b37a154999a8f3f98cc23a628d850e154479cd94decf3414696e12e31aaf"

 [[package]]
 name = "unicode-normalization"
@@ -3660,6 +3629,7 @@ name = "utils"
 version = "0.1.0"
 dependencies = [
 "anyhow",
+ "async-trait",
 "bincode",
 "byteorder",
 "bytes",
@@ -3678,7 +3648,7 @@ dependencies = [
 "rand",
 "routerify",
 "rustls",
- "rustls-pemfile 0.2.1",
+ "rustls-pemfile",
 "rustls-split",
 "serde",
 "serde_json",
@@ -3687,6 +3657,7 @@ dependencies = [
 "tempfile",
 "thiserror",
 "tokio",
+ "tokio-rustls",
 "tracing",
 "tracing-subscriber",
 "workspace_hack",
@@ -3727,7 +3698,7 @@ name = "wal_craft"
 version = "0.1.0"
 dependencies = [
 "anyhow",
- "clap 3.2.12",
+ "clap 3.2.16",
 "env_logger",
 "log",
 "once_cell",
@@ -3771,9 +3742,9 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"

 [[package]]
 name = "wasm-bindgen"
-version = "0.2.81"
+version = "0.2.82"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7c53b543413a17a202f4be280a7e5c62a1c69345f5de525ee64f8cfdbc954994"
+checksum = "fc7652e3f6c4706c8d9cd54832c4a4ccb9b5336e2c3bd154d5cccfbf1c1f5f7d"
 dependencies = [
 "cfg-if",
 "wasm-bindgen-macro",
@@ -3781,13 +3752,13 @@ dependencies = [

 [[package]]
 name = "wasm-bindgen-backend"
-version = "0.2.81"
+version = "0.2.82"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5491a68ab4500fa6b4d726bd67408630c3dbe9c4fe7bda16d5c82a1fd8c7340a"
+checksum = "662cd44805586bd52971b9586b1df85cdbbd9112e4ef4d8f41559c334dc6ac3f"
 dependencies = [
 "bumpalo",
- "lazy_static",
 "log",
+ "once_cell",
 "proc-macro2",
 "quote",
 "syn",
@@ -3796,9 +3767,9 @@ dependencies = [

 [[package]]
 name = "wasm-bindgen-futures"
-version = "0.4.31"
+version = "0.4.32"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "de9a9cec1733468a8c657e57fa2413d2ae2c0129b95e87c5b72b8ace4d13f31f"
+checksum = "fa76fb221a1f8acddf5b54ace85912606980ad661ac7a503b4570ffd3a624dad"
 dependencies = [
 "cfg-if",
 "js-sys",
@@ -3808,9 +3779,9 @@ dependencies = [

 [[package]]
 name = "wasm-bindgen-macro"
-version = "0.2.81"
+version = "0.2.82"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c441e177922bc58f1e12c022624b6216378e5febc2f0533e41ba443d505b80aa"
+checksum = "b260f13d3012071dfb1512849c033b1925038373aea48ced3012c09df952c602"
 dependencies = [
 "quote",
 "wasm-bindgen-macro-support",
@@ -3818,9 +3789,9 @@ dependencies = [

 [[package]]
 name = "wasm-bindgen-macro-support"
-version = "0.2.81"
+version = "0.2.82"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7d94ac45fcf608c1f45ef53e748d35660f168490c10b23704c7779ab8f5c3048"
+checksum = "5be8e654bdd9b79216c2929ab90721aa82faf65c48cdf08bdc4e7f51357b80da"
 dependencies = [
 "proc-macro2",
 "quote",
@@ -3831,15 +3802,15 @@ dependencies = [

 [[package]]
 name = "wasm-bindgen-shared"
-version = "0.2.81"
+version = "0.2.82"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6a89911bd99e5f3659ec4acf9c4d93b0a90fe4a2a11f15328472058edc5261be"
+checksum = "6598dd0bd3c7d51095ff6531a5b23e02acdc81804e30d8f07afb77b7215a140a"

 [[package]]
 name = "web-sys"
-version = "0.3.58"
+version = "0.3.59"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2fed94beee57daf8dd7d51f2b15dc2bcde92d7a72304cdf662a4371008b71b90"
+checksum = "ed055ab27f941423197eb86b2035720b1a3ce40504df082cac2ecc6ed73335a1"
 dependencies = [
 "js-sys",
 "wasm-bindgen",
@@ -3964,6 +3935,7 @@ version = "0.1.0"
 dependencies = [
 "ahash",
 "anyhow",
+ "bstr",
 "bytes",
 "chrono",
 "clap 2.34.0",
@@ -3973,7 +3945,7 @@ dependencies = [
 "futures-task",
 "futures-util",
 "generic-array",
- "hashbrown 0.11.2",
+ "hashbrown",
 "hex",
 "hyper",
 "indexmap",
@@ -3988,11 +3960,12 @@ dependencies = [
 "prost",
 "rand",
 "regex",
+ "regex-automata",
 "regex-syntax",
 "scopeguard",
 "serde",
 "syn",
- "time 0.3.11",
+ "time 0.3.12",
 "tokio",
 "tokio-util",
 "tracing",
@@ -4014,7 +3987,7 @@ dependencies = [
 "oid-registry",
 "rusticata-macros",
 "thiserror",
- "time 0.3.11",
+ "time 0.3.12",
 ]

 [[package]]
@@ -4043,6 +4016,6 @@ dependencies = [

 [[package]]
 name = "zeroize"
-version = "1.5.6"
+version = "1.5.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "20b578acffd8516a6c3f2a1bdefc1ec37e547bb4e0fb8b6b01a4cafc886b4442"
+checksum = "c394b5bd0c6f669e7275d9c20aa90ae064cb22e75a1cad54e1b34088034b149f"
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -6,7 +6,6 @@ members = [
    "proxy",
    "safekeeper",
    "workspace_hack",
-    "neon_local",
    "libs/*",
 ]

@@ -15,6 +14,59 @@ members = [
 # Besides, debug info should not affect the performance.
 debug = true

+[profile.release-line-debug]
+inherits = "release"
+debug = 1 # true = 2 = all symbols, 1 = line only
+[profile.release-line-debug-lto]
+inherits = "release"
+debug = 1 # true = 2 = all symbols, 1 = line only
+lto = true
+
+[profile.release-line-debug-size]
+inherits = "release"
+debug = 1 # true = 2 = all symbols, 1 = line only
+opt-level = "s"
+[profile.release-line-debug-zize]
+inherits = "release"
+debug = 1 # true = 2 = all symbols, 1 = line only
+opt-level = "z"
+[profile.release-line-debug-size-lto]
+inherits = "release"
+debug = 1 # true = 2 = all symbols, 1 = line only
+opt-level = "s"
+lto = true
+[profile.release-line-debug-zize-lto]
+inherits = "release"
+debug = 1 # true = 2 = all symbols, 1 = line only
+opt-level = "z"
+lto = true
+
+[profile.release-no-debug]
+inherits = "release"
+debug = false # true = 2 = all symbols, 1 = line only
+
+[profile.release-no-debug-size]
+inherits = "release"
+debug = false # true = 2 = all symbols, 1 = line only
+opt-level = "s"
+[profile.release-no-debug-zize]
+inherits = "release"
+debug = false # true = 2 = all symbols, 1 = line only
+opt-level = "z"
+
+[profile.release-no-debug-size-lto]
+inherits = "release"
+debug = false # true = 2 = all symbols, 1 = line only
+opt-level = "s"
+lto = true
+
+[profile.release-no-debug-zize-lto]
+inherits = "release"
+debug = false # true = 2 = all symbols, 1 = line only
+opt-level = "z"
+lto = true
+
+
 # This is only needed for proxy's tests.
 # TODO: we should probably fork `tokio-postgres-rustls` instead.
 [patch.crates-io]
--- a/74
+++ b/74
@@ -1,37 +1,50 @@
+### Creates a storage Docker image with postgres, pageserver, safekeeper and proxy binaries.
+### The image itself is mainly used as a container for the binaries and for starting e2e tests with custom parameters.
+### By default, the binaries inside the image have some mock parameters and can start, but are not intended to be used
+### inside this image in the real deployments.
+ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
+ARG IMAGE=rust
+ARG TAG=pinned
+
 # Build Postgres
-FROM neondatabase/rust:1.58 AS pg-build
-WORKDIR /pg
+FROM $REPOSITORY/$IMAGE:$TAG AS pg-build
+WORKDIR /home/nonroot

-USER root
-
-COPY vendor/postgres vendor/postgres
-COPY Makefile Makefile
+COPY --chown=nonroot vendor/postgres-v14 vendor/postgres-v14
+COPY --chown=nonroot vendor/postgres-v15 vendor/postgres-v15
+COPY --chown=nonroot pgxn pgxn
+COPY --chown=nonroot Makefile Makefile

 ENV BUILD_TYPE release
 RUN set -e \
-    && mold -run make -j $(nproc) -s postgres \
-    && rm -rf tmp_install/build \
-    && tar -C tmp_install -czf /postgres_install.tar.gz .
+    && mold -run make -j $(nproc) -s neon-pg-ext \
+    && rm -rf pg_install/v14/build \
+    && rm -rf pg_install/v15/build \
+    && tar -C pg_install/v14 -czf /home/nonroot/postgres_install.tar.gz .

 # Build zenith binaries
-FROM neondatabase/rust:1.58 AS build
+FROM $REPOSITORY/$IMAGE:$TAG AS build
+WORKDIR /home/nonroot
 ARG GIT_VERSION=local

 # Enable https://github.com/paritytech/cachepot to cache Rust crates' compilation results in Docker builds.
 # Set up cachepot to use an AWS S3 bucket for cache results, to reuse it between `docker build` invocations.
-# cachepot falls back to local filesystem if S3 is misconfigured, not failing the build.
+# cachepot falls back to local filesystem if S3 is misconfigured, not failing the build
 ARG RUSTC_WRAPPER=cachepot
-ARG CACHEPOT_BUCKET=zenith-rust-cachepot
-ARG AWS_ACCESS_KEY_ID
-ARG AWS_SECRET_ACCESS_KEY
+ENV AWS_REGION=eu-central-1
+ENV CACHEPOT_S3_KEY_PREFIX=cachepot
+ARG CACHEPOT_BUCKET=neon-github-dev
+#ARG AWS_ACCESS_KEY_ID
+#ARG AWS_SECRET_ACCESS_KEY

-COPY --from=pg-build /pg/tmp_install/include/postgresql/server tmp_install/include/postgresql/server
+COPY --from=pg-build /home/nonroot/pg_install/v14/include/postgresql/server pg_install/v14/include/postgresql/server
+COPY --from=pg-build /home/nonroot/pg_install/v15/include/postgresql/server pg_install/v15/include/postgresql/server
 COPY . .

 # Show build caching stats to check if it was used in the end.
 # Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats.
 RUN set -e \
-    && sudo -E "PATH=$PATH" mold -run cargo build --release \
+&& mold -run cargo build --locked --release \
    && cachepot -s

 # Build final image
@@ -40,8 +53,8 @@ FROM debian:bullseye-slim
 WORKDIR /data

 RUN set -e \
-    && apt-get update \
-    && apt-get install -y \
+    && apt update \
+    && apt install -y \
        libreadline-dev \
        libseccomp-dev \
        openssl \
@@ -50,17 +63,26 @@ RUN set -e \
    && useradd -d /data zenith \
    && chown -R zenith:zenith /data

-COPY --from=build --chown=zenith:zenith /home/runner/target/release/pageserver /usr/local/bin
-COPY --from=build --chown=zenith:zenith /home/runner/target/release/safekeeper /usr/local/bin
-COPY --from=build --chown=zenith:zenith /home/runner/target/release/proxy      /usr/local/bin
+COPY --from=build --chown=zenith:zenith /home/nonroot/target/release/pageserver /usr/local/bin
+COPY --from=build --chown=zenith:zenith /home/nonroot/target/release/safekeeper /usr/local/bin
+COPY --from=build --chown=zenith:zenith /home/nonroot/target/release/proxy      /usr/local/bin

-COPY --from=pg-build /pg/tmp_install/         /usr/local/
-COPY --from=pg-build /postgres_install.tar.gz /data/
+# v14 is default for now
+COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/
+COPY --from=pg-build /home/nonroot/postgres_install.tar.gz /data/

-COPY docker-entrypoint.sh /docker-entrypoint.sh
+# By default, pageserver uses `.neon/` working directory in WORKDIR, so create one and fill it with the dummy config.
+# Now, when `docker run ... pageserver` is run, it can start without errors, yet will have some default dummy values.
+RUN mkdir -p /data/.neon/ && chown -R zenith:zenith /data/.neon/ \
+    && /usr/local/bin/pageserver -D /data/.neon/ --init \
+       -c "id=1234" \
+       -c "broker_endpoints=['http://etcd:2379']" \
+       -c "pg_distrib_dir='/usr/local'" \
+       -c "listen_pg_addr='0.0.0.0:6400'" \
+       -c "listen_http_addr='0.0.0.0:9898'"

 VOLUME ["/data"]
 USER zenith
 EXPOSE 6400
-ENTRYPOINT ["/docker-entrypoint.sh"]
-CMD ["pageserver"]
+EXPOSE 9898
+CMD ["/bin/bash"]
--- a/Dockerfile.compute-node-v14
+++ b/Dockerfile.compute-node-v14
@@ -0,0 +1,167 @@
+ARG TAG=pinned
+# apparently, ARGs don't get replaced in RUN commands in kaniko
+# ARG POSTGIS_VERSION=3.3.0
+# ARG PLV8_VERSION=3.1.4
+# ARG PG_VERSION=v14
+
+#
+# Layer "build-deps"
+#
+FROM debian:bullseye-slim AS build-deps
+RUN apt update &&  \
+    apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \
+    libcurl4-openssl-dev libossp-uuid-dev
+
+#
+# Layer "pg-build"
+# Build Postgres from the neon postgres repository.
+#
+FROM build-deps AS pg-build
+COPY vendor/postgres-v14 postgres
+RUN cd postgres && \
+    ./configure CFLAGS='-O2 -g3' --enable-debug --with-uuid=ossp && \
+    make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \
+    make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \
+    # Install headers
+    make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install && \
+    make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install
+
+#
+# Layer "postgis-build"
+# Build PostGIS from the upstream PostGIS mirror.
+#
+# PostGIS compiles against neon postgres sources without changes. Perhaps we
+# could even use the upstream binaries, compiled against vanilla Postgres, but
+# it would require some investigation to check that it works, and also keeps
+# working in the future. So for now, we compile our own binaries.
+FROM build-deps AS postgis-build
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
+RUN apt update && \
+    apt install -y gdal-bin libgdal-dev libprotobuf-c-dev protobuf-c-compiler xsltproc wget
+
+RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.0.tar.gz && \
+    tar xvzf postgis-3.3.0.tar.gz && \
+    cd postgis-3.3.0 && \
+    ./autogen.sh && \
+    export PATH="/usr/local/pgsql/bin:$PATH" && \
+    ./configure && \
+    make -j $(getconf _NPROCESSORS_ONLN) install && \
+    cd extensions/postgis && \
+    make clean && \
+    make -j $(getconf _NPROCESSORS_ONLN) install && \
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis.control && \
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_raster.control && \
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control && \
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_topology.control
+
+#
+# Layer "plv8-build"
+# Build plv8
+#
+FROM build-deps AS plv8-build
+COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/
+RUN apt update && \
+    apt install -y git curl wget make ninja-build build-essential libncurses5 python3-dev pkg-config libc++-dev libc++abi-dev libglib2.0-dev
+
+# https://github.com/plv8/plv8/issues/475
+# Debian bullseye provides binutils 2.35 when >= 2.38 is necessary
+RUN echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \
+    echo "APT::Default-Release \"stable\";" > /etc/apt/apt.conf.d/default-release && \
+    apt update && \
+    apt install -y --no-install-recommends -t testing binutils
+
+RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \
+    tar xvzf v3.1.4.tar.gz && \
+    cd plv8-3.1.4 && \
+    export PATH="/usr/local/pgsql/bin:$PATH" && \
+    make -j $(getconf _NPROCESSORS_ONLN) && \
+    make -j $(getconf _NPROCESSORS_ONLN) install && \
+    rm -rf /plv8-* && \
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control
+
+#
+# Layer "neon-pg-ext-build"
+# compile neon extensions
+#
+FROM build-deps AS neon-pg-ext-build
+COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/
+COPY pgxn/ pgxn/
+
+RUN make -j $(getconf _NPROCESSORS_ONLN) \
+        PG_CONFIG=/usr/local/pgsql/bin/pg_config \
+        -C pgxn/neon \
+        -s install
+
+# Compile and run the Neon-specific `compute_ctl` binary
+FROM 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:$TAG AS compute-tools
+USER nonroot
+# Copy entire project to get Cargo.* files with proper dependencies for the whole project
+COPY --chown=nonroot . .
+RUN cd compute_tools && cargo build --locked --profile release-line-debug-size-lto
+
+#
+# Clean up postgres folder before inclusion
+#
+FROM neon-pg-ext-build AS postgres-cleanup-layer
+COPY --from=neon-pg-ext-build /usr/local/pgsql /usr/local/pgsql
+
+# Remove binaries from /bin/ that we won't use (or would manually copy & install otherwise)
+RUN cd /usr/local/pgsql/bin && rm ecpg raster2pgsql shp2pgsql pgtopo_export pgtopo_import pgsql2shp
+
+# Remove headers that we won't need anymore - we've completed installation of all extensions
+RUN rm -r /usr/local/pgsql/include
+
+# Remove now-useless PGXS src infrastructure
+RUN rm -r /usr/local/pgsql/lib/pgxs/src
+
+# Remove static postgresql libraries - all compilation is finished, so we
+# can now remove these files - they must be included in other binaries by now
+# if they were to be used by other libraries.
+RUN rm /usr/local/pgsql/lib/lib*.a
+
+#
+# Final layer
+# Put it all together into the final image
+#
+FROM debian:bullseye-slim
+# Add user postgres
+RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
+    echo "postgres:test_console_pass" | chpasswd && \
+    mkdir /var/db/postgres/compute && mkdir /var/db/postgres/specs && \
+    chown -R postgres:postgres /var/db/postgres && \
+    chmod 0750 /var/db/postgres/compute && \
+    echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig
+
+# TODO: Check if we can make the extension setup more modular versus a linear build
+# currently plv8-build copies the output /usr/local/pgsql from postgis-build, etc#
+COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local
+COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl
+
+# Install:
+# libreadline8 for psql
+# libossp-uuid16 for extension ossp-uuid
+# libgeos, libgdal, libproj and libprotobuf-c1 for PostGIS
+# GLIBC 2.34 for plv8.
+#     Debian bullseye provides GLIBC 2.31, so we install the library from testing
+#
+# Lastly, link compute_ctl into zenith_ctl while we're at it,
+# so that we don't need to put this in another layer.
+RUN apt update &&  \
+    apt install --no-install-recommends -y \
+        libreadline8 \
+        libossp-uuid16 \
+        libgeos-c1v5 \
+        libgdal28 \
+        libproj19 \
+        libprotobuf-c1 && \
+    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
+    echo "Installing GLIBC 2.34" && \
+    echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \
+    echo "APT::Default-Release \"stable\";" > /etc/apt/apt.conf.d/default-release && \
+    apt update && \
+    apt install -y --no-install-recommends -t testing libc6 && \
+    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
+    ln /usr/local/bin/compute_ctl /usr/local/bin/zenith_ctl
+
+USER postgres
+ENTRYPOINT ["/usr/local/bin/compute_ctl"]
--- a/Dockerfile.compute-node-v15
+++ b/Dockerfile.compute-node-v15
@@ -0,0 +1,172 @@
+#
+# This file is identical to the Dockerfile.compute-node-v14 file
+# except for the version of Postgres that is built.
+#
+
+ARG TAG=pinned
+# apparently, ARGs don't get replaced in RUN commands in kaniko
+# ARG POSTGIS_VERSION=3.3.0
+# ARG PLV8_VERSION=3.1.4
+# ARG PG_VERSION=v15
+
+#
+# Layer "build-deps"
+#
+FROM debian:bullseye-slim AS build-deps
+RUN apt update &&  \
+    apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \
+    libcurl4-openssl-dev libossp-uuid-dev
+
+#
+# Layer "pg-build"
+# Build Postgres from the neon postgres repository.
+#
+FROM build-deps AS pg-build
+COPY vendor/postgres-v15 postgres
+RUN cd postgres && \
+    ./configure CFLAGS='-O2 -g3' --enable-debug --with-uuid=ossp && \
+    make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \
+    make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \
+    # Install headers
+    make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install && \
+    make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install
+
+#
+# Layer "postgis-build"
+# Build PostGIS from the upstream PostGIS mirror.
+#
+# PostGIS compiles against neon postgres sources without changes. Perhaps we
+# could even use the upstream binaries, compiled against vanilla Postgres, but
+# it would require some investigation to check that it works, and also keeps
+# working in the future. So for now, we compile our own binaries.
+FROM build-deps AS postgis-build
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
+RUN apt update && \
+    apt install -y gdal-bin libgdal-dev libprotobuf-c-dev protobuf-c-compiler xsltproc wget
+
+RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.0.tar.gz && \
+    tar xvzf postgis-3.3.0.tar.gz && \
+    cd postgis-3.3.0 && \
+    ./autogen.sh && \
+    export PATH="/usr/local/pgsql/bin:$PATH" && \
+    ./configure && \
+    make -j $(getconf _NPROCESSORS_ONLN) install && \
+    cd extensions/postgis && \
+    make clean && \
+    make -j $(getconf _NPROCESSORS_ONLN) install && \
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis.control && \
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_raster.control && \
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control && \
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_topology.control
+
+#
+# Layer "plv8-build"
+# Build plv8
+#
+FROM build-deps AS plv8-build
+COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/
+RUN apt update && \
+    apt install -y git curl wget make ninja-build build-essential libncurses5 python3-dev pkg-config libc++-dev libc++abi-dev libglib2.0-dev
+
+# https://github.com/plv8/plv8/issues/475
+# Debian bullseye provides binutils 2.35 when >= 2.38 is necessary
+RUN echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \
+    echo "APT::Default-Release \"stable\";" > /etc/apt/apt.conf.d/default-release && \
+    apt update && \
+    apt install -y --no-install-recommends -t testing binutils
+
+RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \
+    tar xvzf v3.1.4.tar.gz && \
+    cd plv8-3.1.4 && \
+    export PATH="/usr/local/pgsql/bin:$PATH" && \
+    make -j $(getconf _NPROCESSORS_ONLN) && \
+    make -j $(getconf _NPROCESSORS_ONLN) install && \
+    rm -rf /plv8-* && \
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control
+
+#
+# Layer "neon-pg-ext-build"
+# compile neon extensions
+#
+FROM build-deps AS neon-pg-ext-build
+COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/
+COPY pgxn/ pgxn/
+
+RUN make -j $(getconf _NPROCESSORS_ONLN) \
+        PG_CONFIG=/usr/local/pgsql/bin/pg_config \
+        -C pgxn/neon \
+        -s install
+
+# Compile and run the Neon-specific `compute_ctl` binary
+FROM 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:$TAG AS compute-tools
+USER nonroot
+# Copy entire project to get Cargo.* files with proper dependencies for the whole project
+COPY --chown=nonroot . .
+RUN cd compute_tools && cargo build --locked --profile release-line-debug-size-lto
+
+#
+# Clean up postgres folder before inclusion
+#
+FROM neon-pg-ext-build AS postgres-cleanup-layer
+COPY --from=neon-pg-ext-build /usr/local/pgsql /usr/local/pgsql
+
+# Remove binaries from /bin/ that we won't use (or would manually copy & install otherwise)
+RUN cd /usr/local/pgsql/bin && rm ecpg raster2pgsql shp2pgsql pgtopo_export pgtopo_import pgsql2shp
+
+# Remove headers that we won't need anymore - we've completed installation of all extensions
+RUN rm -r /usr/local/pgsql/include
+
+# Remove now-useless PGXS src infrastructure
+RUN rm -r /usr/local/pgsql/lib/pgxs/src
+
+# Remove static postgresql libraries - all compilation is finished, so we
+# can now remove these files - they must be included in other binaries by now
+# if they were to be used by other libraries.
+RUN rm /usr/local/pgsql/lib/lib*.a
+
+#
+# Final layer
+# Put it all together into the final image
+#
+FROM debian:bullseye-slim
+# Add user postgres
+RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
+    echo "postgres:test_console_pass" | chpasswd && \
+    mkdir /var/db/postgres/compute && mkdir /var/db/postgres/specs && \
+    chown -R postgres:postgres /var/db/postgres && \
+    chmod 0750 /var/db/postgres/compute && \
+    echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig
+
+# TODO: Check if we can make the extension setup more modular versus a linear build
+# currently plv8-build copies the output /usr/local/pgsql from postgis-build, etc#
+COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local
+COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl
+
+# Install:
+# libreadline8 for psql
+# libossp-uuid16 for extension ossp-uuid
+# libgeos, libgdal, libproj and libprotobuf-c1 for PostGIS
+# GLIBC 2.34 for plv8.
+#     Debian bullseye provides GLIBC 2.31, so we install the library from testing
+#
+# Lastly, link compute_ctl into zenith_ctl while we're at it,
+# so that we don't need to put this in another layer.
+RUN apt update &&  \
+    apt install --no-install-recommends -y \
+        libreadline8 \
+        libossp-uuid16 \
+        libgeos-c1v5 \
+        libgdal28 \
+        libproj19 \
+        libprotobuf-c1 && \
+    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
+    echo "Installing GLIBC 2.34" && \
+    echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \
+    echo "APT::Default-Release \"stable\";" > /etc/apt/apt.conf.d/default-release && \
+    apt update && \
+    apt install -y --no-install-recommends -t testing libc6 && \
+    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
+    ln /usr/local/bin/compute_ctl /usr/local/bin/zenith_ctl
+
+USER postgres
+ENTRYPOINT ["/usr/local/bin/compute_ctl"]
--- a/Dockerfile.compute-node.legacy
+++ b/Dockerfile.compute-node.legacy
@@ -0,0 +1,88 @@
+#
+# Legacy version of the Dockerfile for the compute node.
+# Used by e2e CI. Building Dockerfile.compute-node will take
+# unreasonable ammount of time without v2 runners.
+#
+# TODO: remove once cloud repo CI is moved to v2 runners.
+#
+
+
+# Allow specifiyng different compute-tools tag and image repo, so we are
+# able to use different images
+ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
+ARG IMAGE=compute-tools
+ARG TAG=latest
+
+#
+# Image with pre-built tools
+#
+FROM $REPOSITORY/$IMAGE:$TAG AS compute-deps
+# Only to get ready compute_ctl binary as deppendency
+
+#
+# Image with Postgres build deps
+#
+FROM debian:bullseye-slim AS build-deps
+
+RUN apt-get update && apt-get -yq install automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \
+                                          libcurl4-openssl-dev libossp-uuid-dev
+
+#
+# Image with built Postgres
+#
+FROM build-deps AS pg-build
+
+# Add user postgres
+RUN adduser postgres
+RUN mkdir /pg && chown postgres:postgres /pg
+
+# Copy source files
+# version 14 is default for now
+COPY ./vendor/postgres-v14 /pg/
+COPY ./pgxn /pg/
+
+# Build and install Postgres locally
+RUN mkdir /pg/compute_build && cd /pg/compute_build && \
+    ../configure CFLAGS='-O2 -g3' --prefix=$(pwd)/postgres_bin --enable-debug --with-uuid=ossp && \
+    # Install main binaries and contribs
+    make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \
+    make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \
+    # Install headers
+    make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install
+
+# Install neon contrib
+RUN make MAKELEVEL=0 PG_CONFIG=/pg/compute_build/postgres_bin/bin/pg_config -j $(getconf _NPROCESSORS_ONLN) -C /pg/neon install
+
+USER postgres
+WORKDIR /pg
+
+#
+# Final compute node image to be exported
+#
+FROM debian:bullseye-slim
+
+# libreadline-dev is required to run psql
+RUN apt-get update && apt-get -yq install libreadline-dev libossp-uuid-dev
+
+# Add user postgres
+RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
+    echo "postgres:test_console_pass" | chpasswd && \
+    mkdir /var/db/postgres/compute && mkdir /var/db/postgres/specs && \
+    chown -R postgres:postgres /var/db/postgres && \
+    chmod 0750 /var/db/postgres/compute
+
+# Copy ready Postgres binaries
+COPY --from=pg-build /pg/compute_build/postgres_bin /usr/local
+
+# Copy binaries from compute-tools
+COPY --from=compute-deps /usr/local/bin/compute_ctl /usr/local/bin/compute_ctl
+
+# XXX: temporary symlink for compatibility with old control-plane
+RUN ln -s /usr/local/bin/compute_ctl /usr/local/bin/zenith_ctl
+
+# Add postgres shared objects to the search path
+RUN echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig
+
+USER postgres
+
+ENTRYPOINT ["/usr/local/bin/compute_ctl"]
--- a/Dockerfile.compute-tools
+++ b/Dockerfile.compute-tools
@@ -1,22 +1,29 @@
 # First transient image to build compute_tools binaries
 # NB: keep in sync with rust image version in .github/workflows/build_and_test.yml
-FROM neondatabase/rust:1.58 AS rust-build
+ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
+ARG IMAGE=rust
+ARG TAG=pinned
+
+FROM $REPOSITORY/$IMAGE:$TAG AS rust-build
+WORKDIR /home/nonroot

 # Enable https://github.com/paritytech/cachepot to cache Rust crates' compilation results in Docker builds.
 # Set up cachepot to use an AWS S3 bucket for cache results, to reuse it between `docker build` invocations.
 # cachepot falls back to local filesystem if S3 is misconfigured, not failing the build.
 ARG RUSTC_WRAPPER=cachepot
-ARG CACHEPOT_BUCKET=zenith-rust-cachepot
-ARG AWS_ACCESS_KEY_ID
-ARG AWS_SECRET_ACCESS_KEY
+ENV AWS_REGION=eu-central-1
+ENV CACHEPOT_S3_KEY_PREFIX=cachepot
+ARG CACHEPOT_BUCKET=neon-github-dev
+#ARG AWS_ACCESS_KEY_ID
+#ARG AWS_SECRET_ACCESS_KEY

 COPY . .

 RUN set -e \
-    && sudo -E "PATH=$PATH" mold -run cargo build -p compute_tools --release \
+    && mold -run cargo build -p compute_tools --locked --release \
    && cachepot -s

 # Final image that only has one binary
-FROM debian:buster-slim
+FROM debian:bullseye-slim

-COPY --from=rust-build /home/runner/target/release/compute_ctl /usr/local/bin/compute_ctl
+COPY --from=rust-build /home/nonroot/target/release/compute_ctl /usr/local/bin/compute_ctl
--- a/171
+++ b/171
@@ -1,15 +1,7 @@
 ROOT_PROJECT_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))

-# Where to install Postgres, default is ./tmp_install, maybe useful for package managers
-POSTGRES_INSTALL_DIR ?= $(ROOT_PROJECT_DIR)/tmp_install
-
-# Seccomp BPF is only available for Linux
-UNAME_S := $(shell uname -s)
-ifeq ($(UNAME_S),Linux)
-	SECCOMP = --with-libseccomp
-else
-	SECCOMP =
-endif
+# Where to install Postgres, default is ./pg_install, maybe useful for package managers
+POSTGRES_INSTALL_DIR ?= $(ROOT_PROJECT_DIR)/pg_install/

 #
 # We differentiate between release / debug build types using the BUILD_TYPE
@@ -28,6 +20,12 @@ else
 	$(error Bad build type '$(BUILD_TYPE)', see Makefile for options)
 endif

+# Seccomp BPF is only available for Linux
+UNAME_S := $(shell uname -s)
+ifeq ($(UNAME_S),Linux)
+	PG_CONFIGURE_OPTS += --with-libseccomp
+endif
+
 # macOS with brew-installed openssl requires explicit paths
 # It can be configured with OPENSSL_PREFIX variable
 UNAME_S := $(shell uname -s)
@@ -48,64 +46,139 @@ CARGO_CMD_PREFIX += $(if $(filter n,$(MAKEFLAGS)),,+)
 CARGO_CMD_PREFIX += CARGO_TERM_PROGRESS_WHEN=never CI=1

 #
-# Top level Makefile to build Zenith and PostgreSQL
+# Top level Makefile to build Neon and PostgreSQL
 #
 .PHONY: all
-all: zenith postgres
+all: neon postgres neon-pg-ext

-### Zenith Rust bits
+### Neon Rust bits
 #
 # The 'postgres_ffi' depends on the Postgres headers.
-.PHONY: zenith
-zenith: postgres-headers
-	+@echo "Compiling Zenith"
+.PHONY: neon
+neon: postgres-v14-headers postgres-v15-headers
+	+@echo "Compiling Neon"
 	$(CARGO_CMD_PREFIX) cargo build $(CARGO_BUILD_FLAGS)

 ### PostgreSQL parts
-$(POSTGRES_INSTALL_DIR)/build/config.status:
-	+@echo "Configuring postgres build"
-	mkdir -p $(POSTGRES_INSTALL_DIR)/build
-	(cd $(POSTGRES_INSTALL_DIR)/build && \
-	$(ROOT_PROJECT_DIR)/vendor/postgres/configure CFLAGS='$(PG_CFLAGS)' \
+# The rules are duplicated for Postgres v14 and 15. We may want to refactor
+# to avoid the duplication in the future, but it's tolerable for now.
+#
+$(POSTGRES_INSTALL_DIR)/build/v14/config.status:
+	+@echo "Configuring Postgres v14 build"
+	mkdir -p $(POSTGRES_INSTALL_DIR)/build/v14
+	(cd $(POSTGRES_INSTALL_DIR)/build/v14 && \
+	$(ROOT_PROJECT_DIR)/vendor/postgres-v14/configure CFLAGS='$(PG_CFLAGS)' \
 		$(PG_CONFIGURE_OPTS) \
-		$(SECCOMP) \
-		--prefix=$(abspath $(POSTGRES_INSTALL_DIR)) > configure.log)
+		--prefix=$(abspath $(POSTGRES_INSTALL_DIR))/v14 > configure.log)

-# nicer alias for running 'configure'
-.PHONY: postgres-configure
-postgres-configure: $(POSTGRES_INSTALL_DIR)/build/config.status
+$(POSTGRES_INSTALL_DIR)/build/v15/config.status:
+	+@echo "Configuring Postgres v15 build"
+	mkdir -p $(POSTGRES_INSTALL_DIR)/build/v15
+	(cd $(POSTGRES_INSTALL_DIR)/build/v15 && \
+	$(ROOT_PROJECT_DIR)/vendor/postgres-v15/configure CFLAGS='$(PG_CFLAGS)' \
+		$(PG_CONFIGURE_OPTS) \
+		--prefix=$(abspath $(POSTGRES_INSTALL_DIR))/v15 > configure.log)

-# Install the PostgreSQL header files into $(POSTGRES_INSTALL_DIR)/include
-.PHONY: postgres-headers
-postgres-headers: postgres-configure
-	+@echo "Installing PostgreSQL headers"
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/src/include MAKELEVEL=0 install
+# nicer alias to run 'configure'
+.PHONY: postgres-v14-configure
+postgres-v14-configure: $(POSTGRES_INSTALL_DIR)/build/v14/config.status

-# Compile and install PostgreSQL and contrib/neon
-.PHONY: postgres
-postgres: postgres-configure \
-		  postgres-headers # to prevent `make install` conflicts with zenith's `postgres-headers`
-	+@echo "Compiling PostgreSQL"
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build MAKELEVEL=0 install
-	+@echo "Compiling contrib/neon"
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/neon install
-	+@echo "Compiling contrib/neon_test_utils"
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/neon_test_utils install
-	+@echo "Compiling pg_buffercache"
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/pg_buffercache install
-	+@echo "Compiling pageinspect"
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/pageinspect install
+.PHONY: postgres-v15-configure
+postgres-v15-configure: $(POSTGRES_INSTALL_DIR)/build/v15/config.status

+# Install the PostgreSQL header files into $(POSTGRES_INSTALL_DIR)/<version>/include
+.PHONY: postgres-v14-headers
+postgres-v14-headers: postgres-v14-configure
+	+@echo "Installing PostgreSQL v14 headers"
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/src/include MAKELEVEL=0 install

-.PHONY: postgres-clean
-postgres-clean:
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build MAKELEVEL=0 clean
+.PHONY: postgres-v15-headers
+postgres-v15-headers: postgres-v15-configure
+	+@echo "Installing PostgreSQL v15 headers"
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/src/include MAKELEVEL=0 install
+
+# Compile and install PostgreSQL
+.PHONY: postgres-v14
+postgres-v14: postgres-v14-configure \
+		  postgres-v14-headers # to prevent `make install` conflicts with neon's `postgres-headers`
+	+@echo "Compiling PostgreSQL v14"
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14 MAKELEVEL=0 install
+	+@echo "Compiling libpq v14"
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/src/interfaces/libpq install
+	+@echo "Compiling pg_buffercache v14"
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/contrib/pg_buffercache install
+	+@echo "Compiling pageinspect v14"
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/contrib/pageinspect install
+
+.PHONY: postgres-v15
+postgres-v15: postgres-v15-configure \
+		  postgres-v15-headers # to prevent `make install` conflicts with neon's `postgres-headers`
+	+@echo "Compiling PostgreSQL v15"
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15 MAKELEVEL=0 install
+	+@echo "Compiling libpq v15"
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/src/interfaces/libpq install
+	+@echo "Compiling pg_buffercache v15"
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/contrib/pg_buffercache install
+	+@echo "Compiling pageinspect v15"
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/contrib/pageinspect install
+
+# shorthand to build all Postgres versions
+postgres: postgres-v14 postgres-v15
+
+.PHONY: postgres-v14-clean
+postgres-v14-clean:
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14 MAKELEVEL=0 clean
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/contrib/pg_buffercache clean
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/contrib/pageinspect clean
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/src/interfaces/libpq clean
+
+.PHONY: postgres-v15-clean
+postgres-v15-clean:
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15 MAKELEVEL=0 clean
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/contrib/pg_buffercache clean
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/contrib/pageinspect clean
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/src/interfaces/libpq clean
+
+neon-pg-ext-v14: postgres-v14
+	+@echo "Compiling neon v14"
+	mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-v14
+	(cd $(POSTGRES_INSTALL_DIR)/build/neon-v14 && \
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v14/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
+		-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install)
+	+@echo "Compiling neon_test_utils" v14
+	mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v14
+	(cd $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v14 && \
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v14/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
+		-f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile install)
+
+neon-pg-ext-v15: postgres-v15
+	+@echo "Compiling neon v15"
+	mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-v15
+	(cd $(POSTGRES_INSTALL_DIR)/build/neon-v15 && \
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v15/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
+		-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install)
+	+@echo "Compiling neon_test_utils" v15
+	mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v15
+	(cd $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v15 && \
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v15/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
+		-f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile install)
+
+.PHONY: neon-pg-ext-clean
+	$(MAKE) -C $(ROOT_PROJECT_DIR)/pgxn/neon clean
+	$(MAKE) -C $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils clean
+
+neon-pg-ext: neon-pg-ext-v14 neon-pg-ext-v15
+postgres-headers: postgres-v14-headers postgres-v15-headers
+postgres-clean: postgres-v14-clean postgres-v15-clean

 # This doesn't remove the effects of 'configure'.
 .PHONY: clean
 clean:
-	cd $(POSTGRES_INSTALL_DIR)/build && $(MAKE) clean
+	cd $(POSTGRES_INSTALL_DIR)/build/v14 && $(MAKE) clean
+	cd $(POSTGRES_INSTALL_DIR)/build/v15 && $(MAKE) clean
 	$(CARGO_CMD_PREFIX) cargo clean
+	cd pgxn/neon && $(MAKE) clean
+	cd pgxn/neon_test_utils && $(MAKE) clean

 # This removes everything
 .PHONY: distclean
--- a/4
+++ b/4
@@ -1,5 +1,5 @@
 Neon
 Copyright 2022 Neon Inc.

-The PostgreSQL submodule in vendor/postgres is licensed under the
-PostgreSQL license. See vendor/postgres/COPYRIGHT.
+The PostgreSQL submodules in vendor/postgres-v14 and vendor/postgres-v15 are licensed under the
+PostgreSQL license. See vendor/postgres-v14/COPYRIGHT and vendor/postgres-v15/COPYRIGHT.
--- a/README.md
+++ b/README.md
@@ -25,6 +25,7 @@ Pageserver consists of:
 - WAL receiver - service that receives WAL from WAL service and stores it in the repository.
 - Page service - service that communicates with compute nodes and responds with pages from the repository.
 - WAL redo - service that builds pages from base images and WAL records on Page service request
+
 ## Running local installation


@@ -68,6 +69,17 @@ brew install libpq
 brew link --force libpq
 ```

+#### Rustc version
+
+The project uses [rust toolchain file](./rust-toolchain.toml) to define the version it's built with in CI for testing and local builds.
+
+This file is automatically picked up by [`rustup`](https://rust-lang.github.io/rustup/overrides.html#the-toolchain-file) that installs (if absent) and uses the toolchain version pinned in the file.
+
+rustup users who want to build with another toolchain can use [`rustup override`](https://rust-lang.github.io/rustup/overrides.html#directory-overrides) command to set a specific toolchain for the project's directory.
+
+non-rustup users most probably are not getting the same toolchain automatically from the file, so are responsible to manually verify their toolchain matches the version in the file.
+Newer rustc versions most probably will work fine, yet older ones might not be supported due to some new features used by the project or the crates.
+
 #### Building on Linux

 1. Build neon and patched postgres
@@ -77,9 +89,9 @@ brew link --force libpq
 git clone --recursive https://github.com/neondatabase/neon.git
 cd neon

-# The preferred and default is to make a debug build. This will create a 
-# demonstrably slower build than a release build. If you want to use a release
-# build, utilize "BUILD_TYPE=release make -j`nproc`" 
+# The preferred and default is to make a debug build. This will create a
+# demonstrably slower build than a release build. For a release build,
+# use "BUILD_TYPE=release make -j`nproc`"

 make -j`nproc`
 ```
@@ -93,15 +105,15 @@ make -j`nproc`
 git clone --recursive https://github.com/neondatabase/neon.git
 cd neon

-# The preferred and default is to make a debug build. This will create a 
-# demonstrably slower build than a release build. If you want to use a release
-# build, utilize "BUILD_TYPE=release make -j`sysctl -n hw.logicalcpu`" 
+# The preferred and default is to make a debug build. This will create a
+# demonstrably slower build than a release build. For a release build,
+# use "BUILD_TYPE=release make -j`sysctl -n hw.logicalcpu`"

 make -j`sysctl -n hw.logicalcpu`
 ```

 #### Dependency installation notes
-To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `tmp_install/bin` and `tmp_install/lib`, respectively.
+To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `pg_install/bin` and `pg_install/lib`, respectively.

 To run the integration tests or Python scripts (not required to use the code), install
 Python (3.9 or higher), and install python3 packages using `./scripts/pysync` (requires [poetry](https://python-poetry.org/)) in the project directory.
@@ -208,7 +220,7 @@ Ensure your dependencies are installed as described [here](https://github.com/ne

 ```sh
 git clone --recursive https://github.com/neondatabase/neon.git
-make # builds also postgres and installs it to ./tmp_install
+make # builds also postgres and installs it to ./pg_install
 ./scripts/pytest
 ```

--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -178,6 +178,7 @@ impl ComputeNode {
            .args(&["--sync-safekeepers"])
            .env("PGDATA", &self.pgdata) // we cannot use -D in this mode
            .stdout(Stdio::piped())
+            .stderr(Stdio::piped())
            .spawn()
            .expect("postgres --sync-safekeepers failed to start");

@@ -187,10 +188,13 @@ impl ComputeNode {
        let sync_output = sync_handle
            .wait_with_output()
            .expect("postgres --sync-safekeepers failed");
+
        if !sync_output.status.success() {
            anyhow::bail!(
-                "postgres --sync-safekeepers exited with non-zero status: {}",
+                "postgres --sync-safekeepers exited with non-zero status: {}. stdout: {}, stderr: {}",
                sync_output.status,
+                String::from_utf8(sync_output.stdout).expect("postgres --sync-safekeepers exited, and stdout is not utf-8"),
+                String::from_utf8(sync_output.stderr).expect("postgres --sync-safekeepers exited, and stderr is not utf-8"),
            );
        }

--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -62,9 +62,16 @@ impl GenericOption {
    /// Represent `GenericOption` as configuration option.
    pub fn to_pg_setting(&self) -> String {
        if let Some(val) = &self.value {
+            let name = match self.name.as_str() {
+                "safekeepers" => "neon.safekeepers",
+                "wal_acceptor_reconnect" => "neon.safekeeper_reconnect_timeout",
+                "wal_acceptor_connect_timeout" => "neon.safekeeper_connect_timeout",
+                it => it,
+            };
+
            match self.vartype.as_ref() {
-                "string" => format!("{} = '{}'", self.name, val),
-                _ => format!("{} = {}", self.name, val),
+                "string" => format!("{} = '{}'", name, val),
+                _ => format!("{} = {}", name, val),
            }
        } else {
            self.name.to_owned()
--- a/compute_tools/tests/cluster_spec.json
+++ b/compute_tools/tests/cluster_spec.json
@@ -85,7 +85,7 @@
                "vartype": "bool"
            },
            {
-                "name": "safekeepers",
+                "name": "neon.safekeepers",
                "value": "127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501",
                "vartype": "string"
            },
@@ -181,7 +181,6 @@
            }
        ]
    },
-
    "delta_operations": [
        {
            "action": "delete_db",
--- a/compute_tools/tests/pg_helpers_tests.rs
+++ b/compute_tools/tests/pg_helpers_tests.rs
@@ -28,7 +28,7 @@ mod pg_helpers_tests {

        assert_eq!(
            spec.cluster.settings.as_pg_settings(),
-            "fsync = off\nwal_level = replica\nhot_standby = on\nsafekeepers = '127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501'\nwal_log_hints = on\nlog_connections = on\nshared_buffers = 32768\nport = 55432\nmax_connections = 100\nmax_wal_senders = 10\nlisten_addresses = '0.0.0.0'\nwal_sender_timeout = 0\npassword_encryption = md5\nmaintenance_work_mem = 65536\nmax_parallel_workers = 8\nmax_worker_processes = 8\nneon.tenant_id = 'b0554b632bd4d547a63b86c3630317e8'\nmax_replication_slots = 10\nneon.timeline_id = '2414a61ffc94e428f14b5758fe308e13'\nshared_preload_libraries = 'neon'\nsynchronous_standby_names = 'walproposer'\nneon.pageserver_connstring = 'host=127.0.0.1 port=6400'"
+            "fsync = off\nwal_level = replica\nhot_standby = on\nneon.safekeepers = '127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501'\nwal_log_hints = on\nlog_connections = on\nshared_buffers = 32768\nport = 55432\nmax_connections = 100\nmax_wal_senders = 10\nlisten_addresses = '0.0.0.0'\nwal_sender_timeout = 0\npassword_encryption = md5\nmaintenance_work_mem = 65536\nmax_parallel_workers = 8\nmax_worker_processes = 8\nneon.tenant_id = 'b0554b632bd4d547a63b86c3630317e8'\nmax_replication_slots = 10\nneon.timeline_id = '2414a61ffc94e428f14b5758fe308e13'\nshared_preload_libraries = 'neon'\nsynchronous_standby_names = 'walproposer'\nneon.pageserver_connstring = 'host=127.0.0.1 port=6400'"
        );
    }

--- a/control_plane/Cargo.toml
+++ b/control_plane/Cargo.toml
@@ -4,6 +4,9 @@ version = "0.1.0"
 edition = "2021"

 [dependencies]
+clap = "3.0"
+comfy-table = "5.0.1"
+git-version = "0.3.5"
 tar = "0.4.38"
 postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
 serde = { version = "1.0", features = ["derive"] }
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -1,3 +1,10 @@
+//!
+//! `neon_local` is an executable that can be used to create a local
+//! Neon environment, for testing purposes. The local environment is
+//! quite different from the cloud environment with Kubernetes, but it
+//! easier to work with locally. The python tests in `test_runner`
+//! rely on `neon_local` to set up the environment for each test.
+//!
 use anyhow::{anyhow, bail, Context, Result};
 use clap::{App, AppSettings, Arg, ArgMatches};
 use control_plane::compute::ComputeControlPlane;
@@ -501,10 +508,10 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
    // default_tenantid was generated by the `env.init()` call above
    let initial_tenant_id = env.default_tenant_id.unwrap();

-    // Call 'pageserver init'.
+    // Initialize pageserver, create initial tenant and timeline.
    let pageserver = PageServerNode::from_env(&env);
    let initial_timeline_id = pageserver
-        .init(
+        .initialize(
            Some(initial_tenant_id),
            initial_timeline_id_arg,
            &pageserver_config_overrides(init_match),
@@ -551,25 +558,15 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
                .values_of("config")
                .map(|vals| vals.flat_map(|c| c.split_once(':')).collect())
                .unwrap_or_default();
-            let new_tenant_id = pageserver
-                .tenant_create(initial_tenant_id, tenant_conf)?
-                .ok_or_else(|| {
-                    anyhow!("Tenant with id {:?} was already created", initial_tenant_id)
-                })?;
-            println!(
-                "tenant {} successfully created on the pageserver",
-                new_tenant_id
-            );
+            let new_tenant_id = pageserver.tenant_create(initial_tenant_id, tenant_conf)?;
+            println!("tenant {new_tenant_id} successfully created on the pageserver");

            // Create an initial timeline for the new tenant
            let new_timeline_id = parse_timeline_id(create_match)?;
-            let timeline = pageserver
-                .timeline_create(new_tenant_id, new_timeline_id, None, None)?
-                .context(format!(
-                    "Failed to create initial timeline for tenant {new_tenant_id}"
-                ))?;
-            let new_timeline_id = timeline.timeline_id;
-            let last_record_lsn = timeline
+            let timeline_info =
+                pageserver.timeline_create(new_tenant_id, new_timeline_id, None, None)?;
+            let new_timeline_id = timeline_info.timeline_id;
+            let last_record_lsn = timeline_info
                .local
                .context(format!("Failed to get last record LSN: no local timeline info for timeline {new_timeline_id}"))?
                .last_record_lsn;
@@ -616,20 +613,18 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
            let new_branch_name = create_match
                .value_of("branch-name")
                .ok_or_else(|| anyhow!("No branch name provided"))?;
-            let timeline = pageserver
-                .timeline_create(tenant_id, None, None, None)?
-                .ok_or_else(|| anyhow!("Failed to create new timeline for tenant {}", tenant_id))?;
-            let new_timeline_id = timeline.timeline_id;
+            let timeline_info = pageserver.timeline_create(tenant_id, None, None, None)?;
+            let new_timeline_id = timeline_info.timeline_id;

-            let last_record_lsn = timeline
+            let last_record_lsn = timeline_info
                .local
                .expect("no local timeline info")
                .last_record_lsn;
            env.register_branch_mapping(new_branch_name.to_string(), tenant_id, new_timeline_id)?;

            println!(
-                "Created timeline '{}' at Lsn {} for tenant: {}",
-                timeline.timeline_id, last_record_lsn, tenant_id,
+                "Created timeline '{}' at Lsn {last_record_lsn} for tenant: {tenant_id}",
+                timeline_info.timeline_id
            );
        }
        Some(("import", import_match)) => {
@@ -680,10 +675,7 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
            let ancestor_timeline_id = env
                .get_branch_timeline_id(ancestor_branch_name, tenant_id)
                .ok_or_else(|| {
-                    anyhow!(
-                        "Found no timeline id for branch name '{}'",
-                        ancestor_branch_name
-                    )
+                    anyhow!("Found no timeline id for branch name '{ancestor_branch_name}'")
                })?;

            let start_lsn = branch_match
@@ -691,12 +683,15 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
                .map(Lsn::from_str)
                .transpose()
                .context("Failed to parse ancestor start Lsn from the request")?;
-            let timeline = pageserver
-                .timeline_create(tenant_id, None, start_lsn, Some(ancestor_timeline_id))?
-                .ok_or_else(|| anyhow!("Failed to create new timeline for tenant {}", tenant_id))?;
-            let new_timeline_id = timeline.timeline_id;
+            let timeline_info = pageserver.timeline_create(
+                tenant_id,
+                None,
+                start_lsn,
+                Some(ancestor_timeline_id),
+            )?;
+            let new_timeline_id = timeline_info.timeline_id;

-            let last_record_lsn = timeline
+            let last_record_lsn = timeline_info
                .local
                .expect("no local timeline info")
                .last_record_lsn;
@@ -704,11 +699,11 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
            env.register_branch_mapping(new_branch_name.to_string(), tenant_id, new_timeline_id)?;

            println!(
-                "Created timeline '{}' at Lsn {} for tenant: {}. Ancestor timeline: '{}'",
-                timeline.timeline_id, last_record_lsn, tenant_id, ancestor_branch_name,
+                "Created timeline '{}' at Lsn {last_record_lsn} for tenant: {tenant_id}. Ancestor timeline: '{ancestor_branch_name}'",
+                timeline_info.timeline_id
            );
        }
-        Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name),
+        Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{sub_name}'"),
        None => bail!("no tenant subcommand provided"),
    }

--- a/control_plane/src/compute.rs
+++ b/control_plane/src/compute.rs
@@ -150,7 +150,7 @@ impl PostgresNode {
        let port: u16 = conf.parse_field("port", &context)?;
        let timeline_id: ZTimelineId = conf.parse_field("neon.timeline_id", &context)?;
        let tenant_id: ZTenantId = conf.parse_field("neon.tenant_id", &context)?;
-        let uses_wal_proposer = conf.get("safekeepers").is_some();
+        let uses_wal_proposer = conf.get("neon.safekeepers").is_some();

        // parse recovery_target_lsn, if any
        let recovery_target_lsn: Option<Lsn> =
@@ -341,7 +341,7 @@ impl PostgresNode {
                .map(|sk| format!("localhost:{}", sk.pg_port))
                .collect::<Vec<String>>()
                .join(",");
-            conf.append("safekeepers", &safekeepers);
+            conf.append("neon.safekeepers", &safekeepers);
        } else {
            // We only use setup without safekeepers for tests,
            // and don't care about data durability on pageserver,
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -24,7 +24,7 @@ use crate::safekeeper::SafekeeperNode;
 // This data structures represents neon_local CLI config
 //
 // It is deserialized from the .neon/config file, or the config file passed
-// to 'zenith init --config=<path>' option. See control_plane/simple.conf for
+// to 'neon_local init --config=<path>' option. See control_plane/simple.conf for
 // an example.
 //
 #[serde_as]
@@ -289,13 +289,13 @@ impl LocalEnv {
        let mut env: LocalEnv = toml::from_str(toml)?;

        // Find postgres binaries.
-        // Follow POSTGRES_DISTRIB_DIR if set, otherwise look in "tmp_install".
+        // Follow POSTGRES_DISTRIB_DIR if set, otherwise look in "pg_install/v14".
        if env.pg_distrib_dir == Path::new("") {
            if let Some(postgres_bin) = env::var_os("POSTGRES_DISTRIB_DIR") {
                env.pg_distrib_dir = postgres_bin.into();
            } else {
                let cwd = env::current_dir()?;
-                env.pg_distrib_dir = cwd.join("tmp_install")
+                env.pg_distrib_dir = cwd.join("pg_install/v14")
            }
        }

@@ -320,7 +320,7 @@ impl LocalEnv {

        if !repopath.exists() {
            bail!(
-                "Zenith config is not found in {}. You need to run 'zenith init' first",
+                "Zenith config is not found in {}. You need to run 'neon_local init' first",
                repopath.to_str().unwrap()
            );
        }
@@ -337,12 +337,12 @@ impl LocalEnv {
    }

    pub fn persist_config(&self, base_path: &Path) -> anyhow::Result<()> {
-        // Currently, the user first passes a config file with 'zenith init --config=<path>'
+        // Currently, the user first passes a config file with 'neon_local init --config=<path>'
        // We read that in, in `create_config`, and fill any missing defaults. Then it's saved
        // to .neon/config. TODO: We lose any formatting and comments along the way, which is
        // a bit sad.
        let mut conf_content = r#"# This file describes a locale deployment of the page server
-# and safekeeeper node. It is read by the 'zenith' command-line
+# and safekeeeper node. It is read by the 'neon_local' command-line
 # utility.
 "#
        .to_string();
@@ -382,7 +382,7 @@ impl LocalEnv {
    }

    //
-    // Initialize a new Zenith repository
+    // Initialize a new Neon repository
    //
    pub fn init(&mut self) -> anyhow::Result<()> {
        // check if config already exists
--- a/control_plane/src/safekeeper.rs
+++ b/control_plane/src/safekeeper.rs
@@ -1,5 +1,4 @@
 use std::io::Write;
-use std::net::TcpStream;
 use std::path::PathBuf;
 use std::process::Command;
 use std::sync::Arc;
@@ -47,12 +46,12 @@ impl ResponseErrorMessageExt for Response {
            return Ok(self);
        }

-        // reqwest do not export it's error construction utility functions, so lets craft the message ourselves
+        // reqwest does not export its error construction utility functions, so let's craft the message ourselves
        let url = self.url().to_owned();
        Err(SafekeeperHttpError::Response(
            match self.json::<HttpErrorBody>() {
                Ok(err_body) => format!("Error: {}", err_body.msg),
-                Err(_) => format!("Http error ({}) at {url}.", status.as_u16()),
+                Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url),
            },
        ))
    }
@@ -241,37 +240,23 @@ impl SafekeeperNode {
            ),
        }

-        let address = connection_address(&self.pg_connection_config);
-
-        // TODO Remove this "timeout" and handle it on caller side instead.
-        // Shutting down may take a long time,
-        // if safekeeper flushes a lot of data
-        let mut tcp_stopped = false;
+        // Wait until process is gone
        for i in 0..600 {
-            if !tcp_stopped {
-                if let Err(err) = TcpStream::connect(&address) {
-                    tcp_stopped = true;
-                    if err.kind() != io::ErrorKind::ConnectionRefused {
-                        eprintln!("\nSafekeeper connection failed with error: {err}");
-                    }
+            let signal = None; // Send no signal, just get the error code
+            match kill(pid, signal) {
+                Ok(_) => (), // Process exists, keep waiting
+                Err(Errno::ESRCH) => {
+                    // Process not found, we're done
+                    println!("done!");
+                    return Ok(());
                }
-            }
-            if tcp_stopped {
-                // Also check status on the HTTP port
-                match self.check_status() {
-                    Err(SafekeeperHttpError::Transport(err)) if err.is_connect() => {
-                        println!("done!");
-                        return Ok(());
-                    }
-                    Err(err) => {
-                        eprintln!("\nSafekeeper status check failed with error: {err}");
-                        return Ok(());
-                    }
-                    Ok(()) => {
-                        // keep waiting
-                    }
-                }
-            }
+                Err(err) => bail!(
+                    "Failed to send signal to pageserver with pid {}: {}",
+                    pid,
+                    err.desc()
+                ),
+            };
+
            if i % 10 == 0 {
                print!(".");
                io::stdout().flush().unwrap();
--- a/control_plane/src/storage.rs
+++ b/control_plane/src/storage.rs
@@ -1,9 +1,8 @@
 use std::collections::HashMap;
 use std::fs::File;
 use std::io::{BufReader, Write};
-use std::net::TcpStream;
 use std::num::NonZeroU64;
-use std::path::PathBuf;
+use std::path::{Path, PathBuf};
 use std::process::Command;
 use std::time::Duration;
 use std::{io, result, thread};
@@ -58,7 +57,7 @@ impl ResponseErrorMessageExt for Response {
            return Ok(self);
        }

-        // reqwest do not export it's error construction utility functions, so lets craft the message ourselves
+        // reqwest does not export its error construction utility functions, so let's craft the message ourselves
        let url = self.url().to_owned();
        Err(PageserverHttpError::Response(
            match self.json::<HttpErrorBody>() {
@@ -103,23 +102,19 @@ impl PageServerNode {

    /// Construct libpq connection string for connecting to the pageserver.
    fn pageserver_connection_config(password: &str, listen_addr: &str) -> Config {
-        format!("postgresql://no_user:{}@{}/no_db", password, listen_addr)
+        format!("postgresql://no_user:{password}@{listen_addr}/no_db")
            .parse()
            .unwrap()
    }

-    pub fn init(
+    pub fn initialize(
        &self,
        create_tenant: Option<ZTenantId>,
        initial_timeline_id: Option<ZTimelineId>,
        config_overrides: &[&str],
    ) -> anyhow::Result<ZTimelineId> {
-        let mut cmd = Command::new(self.env.pageserver_bin()?);
-
        let id = format!("id={}", self.env.pageserver.id);
-
        // FIXME: the paths should be shell-escaped to handle paths with spaces, quotas etc.
-        let base_data_dir_param = self.env.base_data_dir.display().to_string();
        let pg_distrib_dir_param =
            format!("pg_distrib_dir='{}'", self.env.pg_distrib_dir.display());
        let authg_type_param = format!("auth_type='{}'", self.env.pageserver.auth_type);
@@ -139,67 +134,52 @@ impl PageServerNode {
                .collect::<Vec<_>>()
                .join(",")
        );
-        let mut args = Vec::with_capacity(20);
-
-        args.push("--init");
-        args.extend(["-D", &base_data_dir_param]);
-        args.extend(["-c", &pg_distrib_dir_param]);
-        args.extend(["-c", &authg_type_param]);
-        args.extend(["-c", &listen_http_addr_param]);
-        args.extend(["-c", &listen_pg_addr_param]);
-        args.extend(["-c", &broker_endpoints_param]);
-        args.extend(["-c", &id]);
-
        let broker_etcd_prefix_param = self
            .env
            .etcd_broker
            .broker_etcd_prefix
            .as_ref()
            .map(|prefix| format!("broker_etcd_prefix='{prefix}'"));
-        if let Some(broker_etcd_prefix_param) = broker_etcd_prefix_param.as_deref() {
-            args.extend(["-c", broker_etcd_prefix_param]);
-        }

-        for config_override in config_overrides {
-            args.extend(["-c", config_override]);
+        let mut init_config_overrides = config_overrides.to_vec();
+        init_config_overrides.push(&id);
+        init_config_overrides.push(&pg_distrib_dir_param);
+        init_config_overrides.push(&authg_type_param);
+        init_config_overrides.push(&listen_http_addr_param);
+        init_config_overrides.push(&listen_pg_addr_param);
+        init_config_overrides.push(&broker_endpoints_param);
+
+        if let Some(broker_etcd_prefix_param) = broker_etcd_prefix_param.as_deref() {
+            init_config_overrides.push(broker_etcd_prefix_param);
        }

        if self.env.pageserver.auth_type != AuthType::Trust {
-            args.extend([
-                "-c",
-                "auth_validation_public_key_path='auth_public_key.pem'",
-            ]);
+            init_config_overrides.push("auth_validation_public_key_path='auth_public_key.pem'");
        }

-        let create_tenant = create_tenant.map(|id| id.to_string());
-        if let Some(tenant_id) = create_tenant.as_deref() {
-            args.extend(["--create-tenant", tenant_id])
+        self.start_node(&init_config_overrides, &self.env.base_data_dir, true)?;
+        let init_result = self
+            .try_init_timeline(create_tenant, initial_timeline_id)
+            .context("Failed to create initial tenant and timeline for pageserver");
+        match &init_result {
+            Ok(initial_timeline_id) => {
+                println!("Successfully initialized timeline {initial_timeline_id}")
+            }
+            Err(e) => eprintln!("{e:#}"),
        }
+        self.stop(false)?;
+        init_result
+    }

-        let initial_timeline_id = initial_timeline_id.unwrap_or_else(ZTimelineId::generate);
-        let initial_timeline_id_string = initial_timeline_id.to_string();
-        args.extend(["--initial-timeline-id", &initial_timeline_id_string]);
-
-        let cmd_with_args = cmd.args(args);
-        let init_output = fill_rust_env_vars(cmd_with_args)
-            .output()
-            .with_context(|| {
-                format!("failed to init pageserver with command {:?}", cmd_with_args)
-            })?;
-
-        if !init_output.status.success() {
-            bail!(
-                "init invocation failed, {}\nStdout: {}\nStderr: {}",
-                init_output.status,
-                String::from_utf8_lossy(&init_output.stdout),
-                String::from_utf8_lossy(&init_output.stderr)
-            );
-        }
-
-        // echo the captured output of the init command
-        println!("{}", String::from_utf8_lossy(&init_output.stdout));
-
-        Ok(initial_timeline_id)
+    fn try_init_timeline(
+        &self,
+        new_tenant_id: Option<ZTenantId>,
+        new_timeline_id: Option<ZTimelineId>,
+    ) -> anyhow::Result<ZTimelineId> {
+        let initial_tenant_id = self.tenant_create(new_tenant_id, HashMap::new())?;
+        let initial_timeline_info =
+            self.timeline_create(initial_tenant_id, new_timeline_id, None, None)?;
+        Ok(initial_timeline_info.timeline_id)
    }

    pub fn repo_path(&self) -> PathBuf {
@@ -211,15 +191,35 @@ impl PageServerNode {
    }

    pub fn start(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
-        print!(
+        self.start_node(config_overrides, &self.repo_path(), false)
+    }
+
+    fn start_node(
+        &self,
+        config_overrides: &[&str],
+        datadir: &Path,
+        update_config: bool,
+    ) -> anyhow::Result<()> {
+        println!(
            "Starting pageserver at '{}' in '{}'",
            connection_address(&self.pg_connection_config),
-            self.repo_path().display()
+            datadir.display()
        );
-        io::stdout().flush().unwrap();
+        io::stdout().flush()?;

-        let repo_path = self.repo_path();
-        let mut args = vec!["-D", repo_path.to_str().unwrap()];
+        let mut args = vec![
+            "-D",
+            datadir.to_str().with_context(|| {
+                format!(
+                    "Datadir path '{}' cannot be represented as a unicode string",
+                    datadir.display()
+                )
+            })?,
+        ];
+
+        if update_config {
+            args.push("--update-config");
+        }

        for config_override in config_overrides {
            args.extend(["-c", config_override]);
@@ -231,8 +231,8 @@ impl PageServerNode {

        if !filled_cmd.status()?.success() {
            bail!(
-                "Pageserver failed to start. See '{}' for details.",
-                self.repo_path().join("pageserver.log").display()
+                "Pageserver failed to start. See console output and '{}' for details.",
+                datadir.join("pageserver.log").display()
            );
        }

@@ -241,7 +241,7 @@ impl PageServerNode {
        const RETRIES: i8 = 15;
        for retries in 1..RETRIES {
            match self.check_status() {
-                Ok(_) => {
+                Ok(()) => {
                    println!("\nPageserver started");
                    return Ok(());
                }
@@ -255,21 +255,18 @@ impl PageServerNode {
                                if retries == 5 {
                                    println!() // put a line break after dots for second message
                                }
-                                println!(
-                                    "Pageserver not responding yet, err {} retrying ({})...",
-                                    err, retries
-                                );
+                                println!("Pageserver not responding yet, err {err} retrying ({retries})...");
                            }
                        }
                        PageserverHttpError::Response(msg) => {
-                            bail!("pageserver failed to start: {} ", msg)
+                            bail!("pageserver failed to start: {msg} ")
                        }
                    }
                    thread::sleep(Duration::from_secs(1));
                }
            }
        }
-        bail!("pageserver failed to start in {} seconds", RETRIES);
+        bail!("pageserver failed to start in {RETRIES} seconds");
    }

    ///
@@ -299,51 +296,32 @@ impl PageServerNode {
        match kill(pid, sig) {
            Ok(_) => (),
            Err(Errno::ESRCH) => {
-                println!(
-                    "Pageserver with pid {} does not exist, but a PID file was found",
-                    pid
-                );
+                println!("Pageserver with pid {pid} does not exist, but a PID file was found");
                return Ok(());
            }
            Err(err) => bail!(
-                "Failed to send signal to pageserver with pid {}: {}",
-                pid,
+                "Failed to send signal to pageserver with pid {pid}: {}",
                err.desc()
            ),
        }

-        let address = connection_address(&self.pg_connection_config);
-
-        // TODO Remove this "timeout" and handle it on caller side instead.
-        // Shutting down may take a long time,
-        // if pageserver checkpoints a lot of data
-        let mut tcp_stopped = false;
+        // Wait until process is gone
        for i in 0..600 {
-            if !tcp_stopped {
-                if let Err(err) = TcpStream::connect(&address) {
-                    tcp_stopped = true;
-                    if err.kind() != io::ErrorKind::ConnectionRefused {
-                        eprintln!("\nPageserver connection failed with error: {err}");
-                    }
+            let signal = None; // Send no signal, just get the error code
+            match kill(pid, signal) {
+                Ok(_) => (), // Process exists, keep waiting
+                Err(Errno::ESRCH) => {
+                    // Process not found, we're done
+                    println!("done!");
+                    return Ok(());
                }
-            }
-            if tcp_stopped {
-                // Also check status on the HTTP port
+                Err(err) => bail!(
+                    "Failed to send signal to pageserver with pid {}: {}",
+                    pid,
+                    err.desc()
+                ),
+            };

-                match self.check_status() {
-                    Err(PageserverHttpError::Transport(err)) if err.is_connect() => {
-                        println!("done!");
-                        return Ok(());
-                    }
-                    Err(err) => {
-                        eprintln!("\nPageserver status check failed with error: {err}");
-                        return Ok(());
-                    }
-                    Ok(()) => {
-                        // keep waiting
-                    }
-                }
-            }
            if i % 10 == 0 {
                print!(".");
                io::stdout().flush().unwrap();
@@ -351,13 +329,13 @@ impl PageServerNode {
            thread::sleep(Duration::from_millis(100));
        }

-        bail!("Failed to stop pageserver with pid {}", pid);
+        bail!("Failed to stop pageserver with pid {pid}");
    }

    pub fn page_server_psql(&self, sql: &str) -> Vec<postgres::SimpleQueryMessage> {
        let mut client = self.pg_connection_config.connect(NoTls).unwrap();

-        println!("Pageserver query: '{}'", sql);
+        println!("Pageserver query: '{sql}'");
        client.simple_query(sql).unwrap()
    }

@@ -392,9 +370,8 @@ impl PageServerNode {
        &self,
        new_tenant_id: Option<ZTenantId>,
        settings: HashMap<&str, &str>,
-    ) -> anyhow::Result<Option<ZTenantId>> {
-        let tenant_id_string = self
-            .http_request(Method::POST, format!("{}/tenant", self.http_base_url))
+    ) -> anyhow::Result<ZTenantId> {
+        self.http_request(Method::POST, format!("{}/tenant", self.http_base_url))
            .json(&TenantCreateRequest {
                new_tenant_id,
                checkpoint_distance: settings
@@ -433,18 +410,16 @@ impl PageServerNode {
            })
            .send()?
            .error_from_body()?
-            .json::<Option<String>>()?;
-
-        tenant_id_string
-            .map(|id| {
-                id.parse().with_context(|| {
-                    format!(
-                        "Failed to parse tennat creation response as tenant id: {}",
-                        id
-                    )
+            .json::<Option<String>>()
+            .with_context(|| {
+                format!("Failed to parse tenant creation response for tenant id: {new_tenant_id:?}")
+            })?
+            .context("No tenant id was found in the tenant creation response")
+            .and_then(|tenant_id_string| {
+                tenant_id_string.parse().with_context(|| {
+                    format!("Failed to parse response string as tenant id: '{tenant_id_string}'")
                })
            })
-            .transpose()
    }

    pub fn tenant_config(&self, tenant_id: ZTenantId, settings: HashMap<&str, &str>) -> Result<()> {
@@ -515,22 +490,27 @@ impl PageServerNode {
        new_timeline_id: Option<ZTimelineId>,
        ancestor_start_lsn: Option<Lsn>,
        ancestor_timeline_id: Option<ZTimelineId>,
-    ) -> anyhow::Result<Option<TimelineInfo>> {
-        let timeline_info_response = self
-            .http_request(
-                Method::POST,
-                format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id),
+    ) -> anyhow::Result<TimelineInfo> {
+        self.http_request(
+            Method::POST,
+            format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id),
+        )
+        .json(&TimelineCreateRequest {
+            new_timeline_id,
+            ancestor_start_lsn,
+            ancestor_timeline_id,
+        })
+        .send()?
+        .error_from_body()?
+        .json::<Option<TimelineInfo>>()
+        .with_context(|| {
+            format!("Failed to parse timeline creation response for tenant id: {tenant_id}")
+        })?
+        .with_context(|| {
+            format!(
+                "No timeline id was found in the timeline creation response for tenant {tenant_id}"
            )
-            .json(&TimelineCreateRequest {
-                new_timeline_id,
-                ancestor_start_lsn,
-                ancestor_timeline_id,
-            })
-            .send()?
-            .error_from_body()?
-            .json::<Option<TimelineInfo>>()?;
-
-        Ok(timeline_info_response)
+        })
    }

    /// Import a basebackup prepared using either:
--- a/docker-entrypoint.sh
+++ b/docker-entrypoint.sh
@@ -1,24 +0,0 @@
-#!/bin/sh
-set -eux
-
-pageserver_id_param="${NODE_ID:-10}"
-
-broker_endpoints_param="${BROKER_ENDPOINT:-absent}"
-if [ "$broker_endpoints_param" != "absent" ]; then
-    broker_endpoints_param="-c broker_endpoints=['$broker_endpoints_param']"
-else
-    broker_endpoints_param=''
-fi
-
-remote_storage_param="${REMOTE_STORAGE:-}"
-
-if [ "$1" = 'pageserver' ]; then
-    if [ ! -d "/data/tenants" ]; then
-        echo "Initializing pageserver data directory"
-        pageserver --init -D /data -c "pg_distrib_dir='/usr/local'" -c "id=${pageserver_id_param}" $broker_endpoints_param $remote_storage_param
-    fi
-    echo "Staring pageserver at 0.0.0.0:6400"
-    pageserver -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" $broker_endpoints_param -D /data
-else
-    "$@"
-fi
--- a/docs/glossary.md
+++ b/docs/glossary.md
@@ -92,6 +92,7 @@ The layer map tracks what layers exist in a timeline.
 ### Layered repository

 Neon repository implementation that keeps data in layers.
+
 ### LSN

 The Log Sequence Number (LSN) is a unique identifier of the WAL record[] in the WAL log.
@@ -125,6 +126,26 @@ TODO: use this name consistently in remote storage code. Now `disk_consistent_ls
 * `ancestor_lsn` - LSN of the branch point (the LSN at which this branch was created)

 TODO: add table that describes mapping between PostgreSQL (compute), safekeeper and pageserver LSNs.
+
+### Logical size
+
+The pageserver tracks the "logical size" of a timeline. It is the
+total size of all relations in all Postgres databases on the
+timeline. It includes all user and system tables, including their FSM
+and VM forks. But it does not include SLRUs, twophase files or any
+other such data or metadata that lives outside relations.
+
+The logical size is calculated by the pageserver, and is sent to
+PostgreSQL via feedback messages to the safekeepers. PostgreSQL uses
+the logical size to enforce the size limit in the free tier. The
+logical size is also shown to users in the web console.
+
+The logical size is not affected by branches or the physical layout of
+layer files in the pageserver. If you have a database with 1 GB
+logical size and you create a branch of it, both branches will have 1
+GB logical size, even though the branch is copy-on-write and won't
+consume any extra physical disk space until you make changes to it.
+
 ### Page (block)

 The basic structure used to store relation data. All pages are of the same size.
--- a/docs/pageserver-thread-mgmt.md
+++ b/docs/pageserver-thread-mgmt.md
@@ -1,26 +1,39 @@
 ## Thread management

-Each thread in the system is tracked by the `thread_mgr` module. It
-maintains a registry of threads, and which tenant or timeline they are
-operating on. This is used for safe shutdown of a tenant, or the whole
-system.
+The pageserver uses Tokio for handling concurrency. Everything runs in
+Tokio tasks, although some parts are written in blocking style and use
+spawn_blocking().
+
+Each Tokio task is tracked by the `task_mgr` module. It maintains a
+registry of tasks, and which tenant or timeline they are operating
+on.

 ### Handling shutdown

-When a tenant or timeline is deleted, we need to shut down all threads
-operating on it, before deleting the data on disk. A thread registered
-in the thread registry can check if it has been requested to shut down,
-by calling `is_shutdown_requested()`. For async operations, there's also
-a `shudown_watcher()` async task that can be used to wake up on shutdown.
+When a tenant or timeline is deleted, we need to shut down all tasks
+operating on it, before deleting the data on disk. There's a function,
+`shutdown_tasks`, to request all tasks of a particular tenant or
+timeline to shutdown. It will also wait for them to finish.
+
+A task registered in the task registry can check if it has been
+requested to shut down, by calling `is_shutdown_requested()`. There's
+also a `shudown_watcher()` Future that can be used with `tokio::select!`
+or similar, to wake up on shutdown.
+

 ### Sync vs async

-The primary programming model in the page server is synchronous,
-blocking code. However, there are some places where async code is
-used. Be very careful when mixing sync and async code.
-
-Async is primarily used to wait for incoming data on network
-connections. For example, all WAL receivers have a shared thread pool,
-with one async Task for each connection. Once a piece of WAL has been
-received from the network, the thread calls the blocking functions in
+We use async to wait for incoming data on network connections, and to
+perform other long-running operations. For example, each WAL receiver
+connection is handled by a tokio Task. Once a piece of WAL has been
+received from the network, the task calls the blocking functions in
 the Repository to process the WAL.
+
+The core storage code in `layered_repository/` is synchronous, with
+blocking locks and I/O calls. The current model is that we consider
+disk I/Os to be short enough that we perform them while running in a
+Tokio task. If that becomes a problem, we should use `spawn_blocking`
+before entering the synchronous parts of the code, or switch to using
+tokio I/O functions.
+
+Be very careful when mixing sync and async code!
--- a/docs/settings.md
+++ b/docs/settings.md
@@ -157,7 +157,7 @@ for other files and for sockets for incoming connections.
 A directory with Postgres installation to use during pageserver activities.
 Inside that dir, a `bin/postgres` binary should be present.

-The default distrib dir is `./tmp_install/`.
+The default distrib dir is `./pg_install/`.

 #### workdir (-D)

--- a/docs/sourcetree.md
+++ b/docs/sourcetree.md
@@ -40,15 +40,15 @@ and create new databases and accounts (control plane API in our case).

 Integration tests, written in Python using the `pytest` framework.

-`/vendor/postgres`:
+`/vendor/postgres-v14`:

 PostgreSQL source tree, with the modifications needed for Neon.

-`/vendor/postgres/contrib/neon`:
+`/pgxn/neon`:

 PostgreSQL extension that implements storage manager API and network communications with remote page server.

-`/vendor/postgres/contrib/neon_test_utils`:
+`/pgxn/neon_test_utils`:

 PostgreSQL extension that contains functions needed for testing and debugging.

@@ -112,11 +112,13 @@ Run `poetry shell` to activate the virtual environment.
 Alternatively, use `poetry run` to run a single command in the venv, e.g. `poetry run pytest`.

 ### Obligatory checks
-We force code formatting via `yapf` and type hints via `mypy`.
-Run the following commands in the repository's root (next to `setup.cfg`):
+We force code formatting via `black`, `isort` and type hints via `mypy`.
+Run the following commands in the repository's root (next to `pyproject.toml`):

 ```bash
-poetry run yapf -ri .  # All code is reformatted
+poetry run isort .  # Imports are reformatted
+poetry run black .  # All code is reformatted
+poetry run flake8 .  # Python linter
 poetry run mypy .  # Ensure there are no typing errors
 ```

@@ -125,7 +127,7 @@ Otherwise it will not find its configuration.

 Also consider:

-* Running `flake8` (or a linter of your choice, e.g. `pycodestyle`) and fixing possible defects, if any.
+* Running `pycodestyle` (or a linter of your choice) and fixing possible defects, if any.
 * Adding more type hints to your code to avoid `Any`.

 ### Changing dependencies
--- a/libs/postgres_ffi/Cargo.toml
+++ b/libs/postgres_ffi/Cargo.toml
@@ -4,7 +4,6 @@ version = "0.1.0"
 edition = "2021"

 [dependencies]
-chrono = "0.4.19"
 rand = "0.8.3"
 regex = "1.4.5"
 bytes = "1.0.1"
--- a/libs/postgres_ffi/README.md
+++ b/libs/postgres_ffi/README.md
@@ -9,9 +9,11 @@ should be auto-generated too, but that's a TODO.

 The PostgreSQL on-disk file format is not portable across different
 CPU architectures and operating systems. It is also subject to change
-in each major PostgreSQL version. Currently, this module is based on
-PostgreSQL v14, but in the future we will probably need a separate
-copy for each PostgreSQL version.
+in each major PostgreSQL version. Currently, this module supports
+PostgreSQL v14 and v15: bindings and code that depends on them are version-specific.
+This code is organized in modules: `postgres_ffi::v14` and `postgres_ffi::v15`
+Version independend code is explicitly exported into shared `postgres_ffi`.
+

 TODO: Currently, there is also some code that deals with WAL records
 in pageserver/src/waldecoder.rs.  That should be moved into this
--- a/libs/postgres_ffi/pg_control_ffi.h
+++ b/libs/postgres_ffi/pg_control_ffi.h
--- a/libs/postgres_ffi/build.rs
+++ b/libs/postgres_ffi/build.rs
@@ -44,91 +44,102 @@ impl ParseCallbacks for PostgresFfiCallbacks {

 fn main() {
    // Tell cargo to invalidate the built crate whenever the wrapper changes
-    println!("cargo:rerun-if-changed=pg_control_ffi.h");
+    println!("cargo:rerun-if-changed=bindgen_deps.h");

    // Finding the location of C headers for the Postgres server:
-    // - if POSTGRES_INSTALL_DIR is set look into it, otherwise look into `<project_root>/tmp_install`
-    // - if there's a `bin/pg_config` file use it for getting include server, otherwise use `<project_root>/tmp_install/include/postgresql/server`
-    let mut pg_install_dir = if let Some(postgres_install_dir) = env::var_os("POSTGRES_INSTALL_DIR")
-    {
+    // - if POSTGRES_INSTALL_DIR is set look into it, otherwise look into `<project_root>/pg_install`
+    // - if there's a `bin/pg_config` file use it for getting include server, otherwise use `<project_root>/pg_install/{PG_MAJORVERSION}/include/postgresql/server`
+    let pg_install_dir = if let Some(postgres_install_dir) = env::var_os("POSTGRES_INSTALL_DIR") {
        postgres_install_dir.into()
    } else {
-        PathBuf::from("tmp_install")
+        PathBuf::from("pg_install")
    };

-    if pg_install_dir.is_relative() {
-        let cwd = env::current_dir().unwrap();
-        pg_install_dir = cwd.join("..").join("..").join(pg_install_dir);
-    }
-
-    let pg_config_bin = pg_install_dir.join("bin").join("pg_config");
-    let inc_server_path: String = if pg_config_bin.exists() {
-        let output = Command::new(pg_config_bin)
-            .arg("--includedir-server")
-            .output()
-            .expect("failed to execute `pg_config --includedir-server`");
-
-        if !output.status.success() {
-            panic!("`pg_config --includedir-server` failed")
+    for pg_version in &["v14", "v15"] {
+        let mut pg_install_dir_versioned = pg_install_dir.join(pg_version);
+        if pg_install_dir_versioned.is_relative() {
+            let cwd = env::current_dir().unwrap();
+            pg_install_dir_versioned = cwd.join("..").join("..").join(pg_install_dir_versioned);
        }

-        String::from_utf8(output.stdout).unwrap().trim_end().into()
-    } else {
-        pg_install_dir
-            .join("include")
-            .join("postgresql")
-            .join("server")
-            .into_os_string()
-            .into_string()
-            .unwrap()
-    };
+        let pg_config_bin = pg_install_dir_versioned
+            .join(pg_version)
+            .join("bin")
+            .join("pg_config");
+        let inc_server_path: String = if pg_config_bin.exists() {
+            let output = Command::new(pg_config_bin)
+                .arg("--includedir-server")
+                .output()
+                .expect("failed to execute `pg_config --includedir-server`");

-    // The bindgen::Builder is the main entry point
-    // to bindgen, and lets you build up options for
-    // the resulting bindings.
-    let bindings = bindgen::Builder::default()
-        //
-        // All the needed PostgreSQL headers are included from 'pg_control_ffi.h'
-        //
-        .header("pg_control_ffi.h")
-        //
-        // Tell cargo to invalidate the built crate whenever any of the
-        // included header files changed.
-        //
-        .parse_callbacks(Box::new(PostgresFfiCallbacks))
-        //
-        // These are the types and constants that we want to generate bindings for
-        //
-        .allowlist_type("BlockNumber")
-        .allowlist_type("OffsetNumber")
-        .allowlist_type("MultiXactId")
-        .allowlist_type("MultiXactOffset")
-        .allowlist_type("MultiXactStatus")
-        .allowlist_type("ControlFileData")
-        .allowlist_type("CheckPoint")
-        .allowlist_type("FullTransactionId")
-        .allowlist_type("XLogRecord")
-        .allowlist_type("XLogPageHeaderData")
-        .allowlist_type("XLogLongPageHeaderData")
-        .allowlist_var("XLOG_PAGE_MAGIC")
-        .allowlist_var("PG_CONTROL_FILE_SIZE")
-        .allowlist_var("PG_CONTROLFILEDATA_OFFSETOF_CRC")
-        .allowlist_type("PageHeaderData")
-        .allowlist_type("DBState")
-        // Because structs are used for serialization, tell bindgen to emit
-        // explicit padding fields.
-        .explicit_padding(true)
-        //
-        .clang_arg(format!("-I{inc_server_path}"))
-        //
-        // Finish the builder and generate the bindings.
-        //
-        .generate()
-        .expect("Unable to generate bindings");
+            if !output.status.success() {
+                panic!("`pg_config --includedir-server` failed")
+            }

-    // Write the bindings to the $OUT_DIR/bindings.rs file.
-    let out_path = PathBuf::from(env::var("OUT_DIR").unwrap());
-    bindings
-        .write_to_file(out_path.join("bindings.rs"))
-        .expect("Couldn't write bindings!");
+            String::from_utf8(output.stdout).unwrap().trim_end().into()
+        } else {
+            pg_install_dir_versioned
+                .join("include")
+                .join("postgresql")
+                .join("server")
+                .into_os_string()
+                .into_string()
+                .unwrap()
+        };
+
+        // The bindgen::Builder is the main entry point
+        // to bindgen, and lets you build up options for
+        // the resulting bindings.
+        let bindings = bindgen::Builder::default()
+            //
+            // All the needed PostgreSQL headers are included from 'bindgen_deps.h'
+            //
+            .header("bindgen_deps.h")
+            //
+            // Tell cargo to invalidate the built crate whenever any of the
+            // included header files changed.
+            //
+            .parse_callbacks(Box::new(PostgresFfiCallbacks))
+            //
+            // These are the types and constants that we want to generate bindings for
+            //
+            .allowlist_type("BlockNumber")
+            .allowlist_type("OffsetNumber")
+            .allowlist_type("XLogRecPtr")
+            .allowlist_type("XLogSegNo")
+            .allowlist_type("TimeLineID")
+            .allowlist_type("TimestampTz")
+            .allowlist_type("MultiXactId")
+            .allowlist_type("MultiXactOffset")
+            .allowlist_type("MultiXactStatus")
+            .allowlist_type("ControlFileData")
+            .allowlist_type("CheckPoint")
+            .allowlist_type("FullTransactionId")
+            .allowlist_type("XLogRecord")
+            .allowlist_type("XLogPageHeaderData")
+            .allowlist_type("XLogLongPageHeaderData")
+            .allowlist_var("XLOG_PAGE_MAGIC")
+            .allowlist_var("PG_CONTROL_FILE_SIZE")
+            .allowlist_var("PG_CONTROLFILEDATA_OFFSETOF_CRC")
+            .allowlist_type("PageHeaderData")
+            .allowlist_type("DBState")
+            // Because structs are used for serialization, tell bindgen to emit
+            // explicit padding fields.
+            .explicit_padding(true)
+            //
+            .clang_arg(format!("-I{inc_server_path}"))
+            //
+            // Finish the builder and generate the bindings.
+            //
+            .generate()
+            .expect("Unable to generate bindings");
+
+        // Write the bindings to the $OUT_DIR/bindings_$pg_version.rs file.
+        let out_path = PathBuf::from(env::var("OUT_DIR").unwrap());
+        let filename = format!("bindings_{pg_version}.rs");
+
+        bindings
+            .write_to_file(out_path.join(filename))
+            .expect("Couldn't write bindings!");
+    }
 }
--- a/libs/postgres_ffi/src/controlfile_utils.rs
+++ b/libs/postgres_ffi/src/controlfile_utils.rs
@@ -23,7 +23,7 @@
 //! information. You can use PostgreSQL's pg_controldata utility to view its
 //! contents.
 //!
-use crate::{ControlFileData, PG_CONTROL_FILE_SIZE};
+use super::bindings::{ControlFileData, PG_CONTROL_FILE_SIZE};

 use anyhow::{bail, Result};
 use bytes::{Bytes, BytesMut};
--- a/libs/postgres_ffi/src/lib.rs
+++ b/libs/postgres_ffi/src/lib.rs
@@ -7,21 +7,74 @@
 // https://github.com/rust-lang/rust-bindgen/issues/1651
 #![allow(deref_nullptr)]

-use serde::{Deserialize, Serialize};
 use utils::lsn::Lsn;

-include!(concat!(env!("OUT_DIR"), "/bindings.rs"));
+macro_rules! postgres_ffi {
+    ($version:ident) => {
+        #[path = "."]
+        pub mod $version {
+            pub mod bindings {
+                // bindgen generates bindings for a lot of stuff we don't need
+                #![allow(dead_code)]

-pub mod controlfile_utils;
-pub mod nonrelfile_utils;
-pub mod pg_constants;
-pub mod relfile_utils;
-pub mod waldecoder;
-pub mod xlog_utils;
+                use serde::{Deserialize, Serialize};
+                include!(concat!(
+                    env!("OUT_DIR"),
+                    "/bindings_",
+                    stringify!($version),
+                    ".rs"
+                ));
+            }
+            pub mod controlfile_utils;
+            pub mod nonrelfile_utils;
+            pub mod pg_constants;
+            pub mod relfile_utils;
+            pub mod waldecoder;
+            pub mod xlog_utils;
+
+            pub const PG_MAJORVERSION: &str = stringify!($version);
+
+            // Re-export some symbols from bindings
+            pub use bindings::DBState_DB_SHUTDOWNED;
+            pub use bindings::{CheckPoint, ControlFileData, XLogRecord};
+        }
+    };
+}
+
+postgres_ffi!(v14);
+postgres_ffi!(v15);
+
+// Export some widely used datatypes that are unlikely to change across Postgres versions
+pub use v14::bindings::{uint32, uint64, Oid};
+pub use v14::bindings::{BlockNumber, OffsetNumber};
+pub use v14::bindings::{MultiXactId, TransactionId};
+pub use v14::bindings::{TimeLineID, TimestampTz, XLogRecPtr, XLogSegNo};
+
+// Likewise for these, although the assumption that these don't change is a little more iffy.
+pub use v14::bindings::{MultiXactOffset, MultiXactStatus};
+pub use v14::xlog_utils::{XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD};
+
+// from pg_config.h. These can be changed with configure options --with-blocksize=BLOCKSIZE and
+// --with-segsize=SEGSIZE, but assume the defaults for now.
+pub const BLCKSZ: u16 = 8192;
+pub const RELSEG_SIZE: u32 = 1024 * 1024 * 1024 / (BLCKSZ as u32);
+pub const XLOG_BLCKSZ: usize = 8192;
+pub const WAL_SEGMENT_SIZE: usize = 16 * 1024 * 1024;
+
+pub const MAX_SEND_SIZE: usize = XLOG_BLCKSZ * 16;
+
+// PG timeline is always 1, changing it doesn't have any useful meaning in Neon.
+//
+// NOTE: this is not to be confused with Neon timelines; different concept!
+//
+// It's a shaky assumption, that it's always 1. We might import a
+// PostgreSQL data directory that has gone through timeline bumps,
+// for example. FIXME later.
+pub const PG_TLI: u32 = 1;

 //  See TransactionIdIsNormal in transam.h
 pub const fn transaction_id_is_normal(id: TransactionId) -> bool {
-    id > pg_constants::FIRST_NORMAL_TRANSACTION_ID
+    id > v14::pg_constants::FIRST_NORMAL_TRANSACTION_ID
 }

 // See TransactionIdPrecedes in transam.c
--- a/libs/postgres_ffi/src/nonrelfile_utils.rs
+++ b/libs/postgres_ffi/src/nonrelfile_utils.rs
@@ -1,11 +1,12 @@
 //!
 //! Common utilities for dealing with PostgreSQL non-relation files.
 //!
-use crate::{pg_constants, transaction_id_precedes};
+use super::pg_constants;
+use crate::transaction_id_precedes;
 use bytes::BytesMut;
 use log::*;

-use crate::MultiXactId;
+use super::bindings::MultiXactId;

 pub fn transaction_id_set_status(xid: u32, status: u8, page: &mut BytesMut) {
    trace!(
--- a/libs/postgres_ffi/src/pg_constants.rs
+++ b/libs/postgres_ffi/src/pg_constants.rs
@@ -7,7 +7,8 @@
 //! comments on them.
 //!

-use crate::PageHeaderData;
+use super::bindings::{PageHeaderData, XLogRecord};
+use crate::BLCKSZ;

 //
 // From pg_tablespace_d.h
@@ -31,11 +32,6 @@ pub const SMGR_TRUNCATE_HEAP: u32 = 0x0001;
 pub const SMGR_TRUNCATE_VM: u32 = 0x0002;
 pub const SMGR_TRUNCATE_FSM: u32 = 0x0004;

-// from pg_config.h. These can be changed with configure options --with-blocksize=BLOCKSIZE and
-// --with-segsize=SEGSIZE, but assume the defaults for now.
-pub const BLCKSZ: u16 = 8192;
-pub const RELSEG_SIZE: u32 = 1024 * 1024 * 1024 / (BLCKSZ as u32);
-
 //
 // From bufpage.h
 //
@@ -180,7 +176,7 @@ pub const XLOG_DBASE_DROP: u8 = 0x10;
 pub const XLOG_TBLSPC_CREATE: u8 = 0x00;
 pub const XLOG_TBLSPC_DROP: u8 = 0x10;

-pub const SIZEOF_XLOGRECORD: u32 = 24;
+pub const SIZEOF_XLOGRECORD: u32 = std::mem::size_of::<XLogRecord>() as u32;

 //
 // from xlogrecord.h
@@ -210,16 +206,10 @@ pub const INVALID_TRANSACTION_ID: u32 = 0;
 pub const FIRST_BOOTSTRAP_OBJECT_ID: u32 = 12000;
 pub const FIRST_NORMAL_OBJECT_ID: u32 = 16384;

-/* FIXME: pageserver should request wal_seg_size from compute node */
-pub const WAL_SEGMENT_SIZE: usize = 16 * 1024 * 1024;
-
-pub const XLOG_BLCKSZ: usize = 8192;
 pub const XLOG_CHECKPOINT_SHUTDOWN: u8 = 0x00;
 pub const XLOG_CHECKPOINT_ONLINE: u8 = 0x10;
 pub const XLP_LONG_HEADER: u16 = 0x0002;

-pub const PG_MAJORVERSION: &str = "14";
-
 // List of subdirectories inside pgdata.
 // Copied from src/bin/initdb/initdb.c
 pub const PGDATA_SUBDIRS: [&str; 22] = [
--- a/libs/postgres_ffi/src/relfile_utils.rs
+++ b/libs/postgres_ffi/src/relfile_utils.rs
@@ -1,11 +1,11 @@
 //!
 //! Common utilities for dealing with PostgreSQL relation files.
 //!
-use crate::pg_constants;
+use super::pg_constants;
 use once_cell::sync::OnceCell;
 use regex::Regex;

-#[derive(Debug, Clone, thiserror::Error, PartialEq)]
+#[derive(Debug, Clone, thiserror::Error, PartialEq, Eq)]
 pub enum FilePathError {
    #[error("invalid relation fork name")]
    InvalidForkName,
--- a/libs/postgres_ffi/src/waldecoder.rs
+++ b/libs/postgres_ffi/src/waldecoder.rs
@@ -8,12 +8,9 @@
 //! to look deeper into the WAL records to also understand which blocks they modify, the code
 //! for that is in pageserver/src/walrecord.rs
 //!
-use super::pg_constants;
+use super::bindings::{XLogLongPageHeaderData, XLogPageHeaderData, XLogRecord, XLOG_PAGE_MAGIC};
 use super::xlog_utils::*;
-use super::XLogLongPageHeaderData;
-use super::XLogPageHeaderData;
-use super::XLogRecord;
-use super::XLOG_PAGE_MAGIC;
+use crate::WAL_SEGMENT_SIZE;
 use bytes::{Buf, BufMut, Bytes, BytesMut};
 use crc32c::*;
 use log::*;
@@ -136,7 +133,7 @@ impl WalStreamDecoder {
            // However, we may have to skip some page headers if we're processing the XLOG_SWITCH record or skipping padding for whatever reason.
            match self.state {
                State::WaitingForRecord | State::ReassemblingRecord { .. } => {
-                    if self.lsn.segment_offset(pg_constants::WAL_SEGMENT_SIZE) == 0 {
+                    if self.lsn.segment_offset(WAL_SEGMENT_SIZE) == 0 {
                        // parse long header

                        if self.inputbuf.remaining() < XLOG_SIZE_OF_XLOG_LONG_PHD {
@@ -173,6 +170,7 @@ impl WalStreamDecoder {
                }
                State::SkippingEverything { .. } => {}
            }
+            // now read page contents
            match &mut self.state {
                State::WaitingForRecord => {
                    // need to have at least the xl_tot_len field
@@ -197,8 +195,8 @@ impl WalStreamDecoder {
                        return Ok(Some(self.complete_record(recordbuf)?));
                    } else {
                        // Need to assemble the record from pieces. Remember the size of the
-                        // record, and loop back. On next iteration, we will reach the 'else'
-                        // branch below, and copy the part of the record that was on this page
+                        // record, and loop back. On next iterations, we will reach the branch
+                        // below, and copy the part of the record that was on this or next page(s)
                        // to 'recordbuf'.  Subsequent iterations will skip page headers, and
                        // append the continuations from the next pages to 'recordbuf'.
                        self.state = State::ReassemblingRecord {
@@ -267,7 +265,7 @@ impl WalStreamDecoder {
        // to the next WAL segment.
        let next_lsn = if xlogrec.is_xlog_switch_record() {
            trace!("saw xlog switch record at {}", self.lsn);
-            self.lsn + self.lsn.calc_padding(pg_constants::WAL_SEGMENT_SIZE as u64)
+            self.lsn + self.lsn.calc_padding(WAL_SEGMENT_SIZE as u64)
        } else {
            // Pad to an 8-byte boundary
            self.lsn.align()
--- a/libs/postgres_ffi/src/xlog_utils.rs
+++ b/libs/postgres_ffi/src/xlog_utils.rs
@@ -7,39 +7,39 @@
 // have been named the same as the corresponding PostgreSQL functions instead.
 //

-use crate::pg_constants;
-use crate::CheckPoint;
-use crate::FullTransactionId;
-use crate::XLogLongPageHeaderData;
-use crate::XLogPageHeaderData;
-use crate::XLogRecord;
-use crate::XLOG_PAGE_MAGIC;
+use crc32c::crc32c_append;
+
+use super::bindings::{
+    CheckPoint, FullTransactionId, TimeLineID, TimestampTz, XLogLongPageHeaderData,
+    XLogPageHeaderData, XLogRecPtr, XLogRecord, XLogSegNo, XLOG_PAGE_MAGIC,
+};
+use super::pg_constants;
+use super::waldecoder::WalStreamDecoder;
+use crate::PG_TLI;
+use crate::{uint32, uint64, Oid};
+use crate::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ};

-use crate::pg_constants::WAL_SEGMENT_SIZE;
-use anyhow::{anyhow, bail, ensure};
-use byteorder::{ByteOrder, LittleEndian};
 use bytes::BytesMut;
 use bytes::{Buf, Bytes};
-use crc32c::*;
+
 use log::*;
-use std::cmp::max;
-use std::cmp::min;
-use std::fs::{self, File};
+
+use serde::Serialize;
+use std::fs::File;
 use std::io::prelude::*;
+use std::io::ErrorKind;
 use std::io::SeekFrom;
 use std::path::{Path, PathBuf};
 use std::time::SystemTime;
 use utils::bin_ser::DeserializeError;
 use utils::bin_ser::SerializeError;
-use utils::const_assert;
+
 use utils::lsn::Lsn;

 pub const XLOG_FNAME_LEN: usize = 24;
-pub const XLOG_BLCKSZ: usize = 8192;
 pub const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001;
 pub const XLP_REM_LEN_OFFS: usize = 2 + 2 + 4 + 8;
 pub const XLOG_RECORD_CRC_OFFS: usize = 4 + 4 + 8 + 1 + 1 + 2;
-pub const MAX_SEND_SIZE: usize = XLOG_BLCKSZ * 16;

 pub const XLOG_SIZE_OF_XLOG_SHORT_PHD: usize = std::mem::size_of::<XLogPageHeaderData>();
 pub const XLOG_SIZE_OF_XLOG_LONG_PHD: usize = std::mem::size_of::<XLogLongPageHeaderData>();
@@ -47,14 +47,6 @@ pub const XLOG_SIZE_OF_XLOG_RECORD: usize = std::mem::size_of::<XLogRecord>();
 #[allow(clippy::identity_op)]
 pub const SIZE_OF_XLOG_RECORD_DATA_HEADER_SHORT: usize = 1 * 2;

-// PG timeline is always 1, changing it doesn't have useful meaning in Zenith.
-pub const PG_TLI: u32 = 1;
-
-pub type XLogRecPtr = u64;
-pub type TimeLineID = u32;
-pub type TimestampTz = i64;
-pub type XLogSegNo = u64;
-
 /// Interval of checkpointing metadata file. We should store metadata file to enforce
 /// predicate that checkpoint.nextXid is larger than any XID in WAL.
 /// But flushing checkpoint file for each transaction seems to be too expensive,
@@ -80,12 +72,12 @@ pub fn XLogSegNoOffsetToRecPtr(

 #[allow(non_snake_case)]
 pub fn XLogFileName(tli: TimeLineID, logSegNo: XLogSegNo, wal_segsz_bytes: usize) -> String {
-    return format!(
+    format!(
        "{:>08X}{:>08X}{:>08X}",
        tli,
        logSegNo / XLogSegmentsPerXLogId(wal_segsz_bytes),
        logSegNo % XLogSegmentsPerXLogId(wal_segsz_bytes)
-    );
+    )
 }

 #[allow(non_snake_case)]
@@ -140,338 +132,93 @@ pub fn to_pg_timestamp(time: SystemTime) -> TimestampTz {
    }
 }

-/// Return offset of the last valid record in the segment segno, starting
-/// looking at start_offset. Returns start_offset if no records found.
-fn find_end_of_wal_segment(
-    data_dir: &Path,
-    segno: XLogSegNo,
-    tli: TimeLineID,
-    wal_seg_size: usize,
-    start_offset: usize, // start reading at this point
-) -> anyhow::Result<u32> {
-    // step back to the beginning of the page to read it in...
-    let mut offs: usize = start_offset - start_offset % XLOG_BLCKSZ;
-    let mut skipping_first_contrecord: bool = false;
-    let mut contlen: usize = 0;
-    let mut xl_crc: u32 = 0;
-    let mut crc: u32 = 0;
-    let mut rec_offs: usize = 0;
-    let mut buf = [0u8; XLOG_BLCKSZ];
-    let file_name = XLogFileName(tli, segno, wal_seg_size);
-    let mut last_valid_rec_pos: usize = start_offset; // assume at given start_offset begins new record
-    let mut file = File::open(data_dir.join(file_name.clone() + ".partial"))?;
-    file.seek(SeekFrom::Start(offs as u64))?;
-    // xl_crc is the last field in XLogRecord, will not be read into rec_hdr
-    const_assert!(XLOG_RECORD_CRC_OFFS + 4 == XLOG_SIZE_OF_XLOG_RECORD);
-    let mut rec_hdr = [0u8; XLOG_RECORD_CRC_OFFS];
-
-    trace!("find_end_of_wal_segment(data_dir={}, segno={}, tli={}, wal_seg_size={}, start_offset=0x{:x})", data_dir.display(), segno, tli, wal_seg_size, start_offset);
-    while offs < wal_seg_size {
-        // we are at the beginning of the page; read it in
-        if offs % XLOG_BLCKSZ == 0 {
-            trace!("offs=0x{:x}: new page", offs);
-            let bytes_read = file.read(&mut buf)?;
-            if bytes_read != buf.len() {
-                bail!(
-                    "failed to read {} bytes from {} at {}",
-                    XLOG_BLCKSZ,
-                    file_name,
-                    offs
-                );
-            }
-
-            let xlp_magic = LittleEndian::read_u16(&buf[0..2]);
-            let xlp_info = LittleEndian::read_u16(&buf[2..4]);
-            let xlp_rem_len = LittleEndian::read_u32(&buf[XLP_REM_LEN_OFFS..XLP_REM_LEN_OFFS + 4]);
-            trace!(
-                "  xlp_magic=0x{:x}, xlp_info=0x{:x}, xlp_rem_len={}",
-                xlp_magic,
-                xlp_info,
-                xlp_rem_len
-            );
-            // this is expected in current usage when valid WAL starts after page header
-            if xlp_magic != XLOG_PAGE_MAGIC as u16 {
-                trace!(
-                    "  invalid WAL file {}.partial magic {} at {:?}",
-                    file_name,
-                    xlp_magic,
-                    Lsn(XLogSegNoOffsetToRecPtr(segno, offs as u32, wal_seg_size)),
-                );
-            }
-            if offs == 0 {
-                offs += XLOG_SIZE_OF_XLOG_LONG_PHD;
-                if (xlp_info & XLP_FIRST_IS_CONTRECORD) != 0 {
-                    trace!("  first record is contrecord");
-                    skipping_first_contrecord = true;
-                    contlen = xlp_rem_len as usize;
-                    if offs < start_offset {
-                        // Pre-condition failed: the beginning of the segment is unexpectedly corrupted.
-                        ensure!(start_offset - offs >= contlen,
-                            "start_offset is in the middle of the first record (which happens to be a contrecord), \
-                             expected to be on a record boundary. Is beginning of the segment corrupted?");
-                        contlen = 0;
-                        // keep skipping_first_contrecord to avoid counting the contrecord as valid, we did not check it.
-                    }
-                } else {
-                    trace!("  first record is not contrecord");
-                }
-            } else {
-                offs += XLOG_SIZE_OF_XLOG_SHORT_PHD;
-            }
-            // ... and step forward again if asked
-            trace!("  skipped header to 0x{:x}", offs);
-            offs = max(offs, start_offset);
-        // beginning of the next record
-        } else if contlen == 0 {
-            let page_offs = offs % XLOG_BLCKSZ;
-            let xl_tot_len = LittleEndian::read_u32(&buf[page_offs..page_offs + 4]) as usize;
-            trace!("offs=0x{:x}: new record, xl_tot_len={}", offs, xl_tot_len);
-            if xl_tot_len == 0 {
-                info!(
-                    "find_end_of_wal_segment reached zeros at {:?}, last records ends at {:?}",
-                    Lsn(XLogSegNoOffsetToRecPtr(segno, offs as u32, wal_seg_size)),
-                    Lsn(XLogSegNoOffsetToRecPtr(
-                        segno,
-                        last_valid_rec_pos as u32,
-                        wal_seg_size
-                    ))
-                );
-                break; // zeros, reached the end
-            }
-            if skipping_first_contrecord {
-                skipping_first_contrecord = false;
-                trace!("  first contrecord has been just completed");
-            } else {
-                trace!(
-                    "  updating last_valid_rec_pos: 0x{:x} --> 0x{:x}",
-                    last_valid_rec_pos,
-                    offs
-                );
-                last_valid_rec_pos = offs;
-            }
-            offs += 4;
-            rec_offs = 4;
-            contlen = xl_tot_len - 4;
-            trace!(
-                "  reading rec_hdr[0..4] <-- [0x{:x}; 0x{:x})",
-                page_offs,
-                page_offs + 4
-            );
-            rec_hdr[0..4].copy_from_slice(&buf[page_offs..page_offs + 4]);
-        } else {
-            // we're continuing a record, possibly from previous page.
-            let page_offs = offs % XLOG_BLCKSZ;
-            let pageleft = XLOG_BLCKSZ - page_offs;
-
-            // read the rest of the record, or as much as fits on this page.
-            let n = min(contlen, pageleft);
-            trace!(
-                "offs=0x{:x}, record continuation, pageleft={}, contlen={}",
-                offs,
-                pageleft,
-                contlen
-            );
-            // fill rec_hdr header up to (but not including) xl_crc field
-            trace!(
-                "  rec_offs={}, XLOG_RECORD_CRC_OFFS={}, XLOG_SIZE_OF_XLOG_RECORD={}",
-                rec_offs,
-                XLOG_RECORD_CRC_OFFS,
-                XLOG_SIZE_OF_XLOG_RECORD
-            );
-            if rec_offs < XLOG_RECORD_CRC_OFFS {
-                let len = min(XLOG_RECORD_CRC_OFFS - rec_offs, n);
-                trace!(
-                    "  reading rec_hdr[{}..{}] <-- [0x{:x}; 0x{:x})",
-                    rec_offs,
-                    rec_offs + len,
-                    page_offs,
-                    page_offs + len
-                );
-                rec_hdr[rec_offs..rec_offs + len].copy_from_slice(&buf[page_offs..page_offs + len]);
-            }
-            if rec_offs <= XLOG_RECORD_CRC_OFFS && rec_offs + n >= XLOG_SIZE_OF_XLOG_RECORD {
-                let crc_offs = page_offs - rec_offs + XLOG_RECORD_CRC_OFFS;
-                // All records are aligned on 8-byte boundary, so their 8-byte frames
-                // cannot be split between pages. As xl_crc is the last field,
-                // its content is always on the same page.
-                const_assert!(XLOG_RECORD_CRC_OFFS % 8 == 4);
-                // We should always start reading aligned records even in incorrect WALs so if
-                // the condition is false it is likely a bug. However, it is localized somewhere
-                // in this function, hence we do not crash and just report failure instead.
-                ensure!(crc_offs % 8 == 4, "Record is not aligned properly (bug?)");
-                xl_crc = LittleEndian::read_u32(&buf[crc_offs..crc_offs + 4]);
-                trace!(
-                    "  reading xl_crc: [0x{:x}; 0x{:x}) = 0x{:x}",
-                    crc_offs,
-                    crc_offs + 4,
-                    xl_crc
-                );
-                crc = crc32c_append(0, &buf[crc_offs + 4..page_offs + n]);
-                trace!(
-                    "  initializing crc: [0x{:x}; 0x{:x}); crc = 0x{:x}",
-                    crc_offs + 4,
-                    page_offs + n,
-                    crc
-                );
-            } else if rec_offs > XLOG_RECORD_CRC_OFFS {
-                // As all records are 8-byte aligned, the header is already fully read and `crc` is initialized in the branch above.
-                ensure!(rec_offs >= XLOG_SIZE_OF_XLOG_RECORD);
-                let old_crc = crc;
-                crc = crc32c_append(crc, &buf[page_offs..page_offs + n]);
-                trace!(
-                    "  appending to crc: [0x{:x}; 0x{:x}); 0x{:x} --> 0x{:x}",
-                    page_offs,
-                    page_offs + n,
-                    old_crc,
-                    crc
-                );
-            } else {
-                // Correct because of the way conditions are written above.
-                assert!(rec_offs + n < XLOG_SIZE_OF_XLOG_RECORD);
-                // If `skipping_first_contrecord == true`, we may be reading from a middle of a record
-                // which started in the previous segment. Hence there is no point in validating the header.
-                if !skipping_first_contrecord && rec_offs + n > XLOG_RECORD_CRC_OFFS {
-                    info!(
-                        "Curiously corrupted WAL: a record stops inside the header; \
-                             offs=0x{:x}, record continuation, pageleft={}, contlen={}",
-                        offs, pageleft, contlen
-                    );
-                    break;
-                }
-                // Do nothing: we are still reading the header. It's accounted in CRC in the end of the record.
-            }
-            rec_offs += n;
-            offs += n;
-            contlen -= n;
-
-            if contlen == 0 {
-                trace!("  record completed at 0x{:x}", offs);
-                crc = crc32c_append(crc, &rec_hdr);
-                offs = (offs + 7) & !7; // pad on 8 bytes boundary */
-                trace!(
-                    "  padded offs to 0x{:x}, crc is {:x}, expected crc is {:x}",
-                    offs,
-                    crc,
-                    xl_crc
-                );
-                if skipping_first_contrecord {
-                    // do nothing, the flag will go down on next iteration when we're reading new record
-                    trace!("  first conrecord has been just completed");
-                } else if crc == xl_crc {
-                    // record is valid, advance the result to its end (with
-                    // alignment to the next record taken into account)
-                    trace!(
-                        "  updating last_valid_rec_pos: 0x{:x} --> 0x{:x}",
-                        last_valid_rec_pos,
-                        offs
-                    );
-                    last_valid_rec_pos = offs;
-                } else {
-                    info!(
-                        "CRC mismatch {} vs {} at {}",
-                        crc, xl_crc, last_valid_rec_pos
-                    );
-                    break;
-                }
-            }
-        }
-    }
-    trace!("last_valid_rec_pos=0x{:x}", last_valid_rec_pos);
-    Ok(last_valid_rec_pos as u32)
-}
-
-///
-/// Scan a directory that contains PostgreSQL WAL files, for the end of WAL.
-/// If precise, returns end LSN (next insertion point, basically);
-/// otherwise, start of the last segment.
-/// Returns (0, 0) if there is no WAL.
-///
+// Returns (aligned) end_lsn of the last record in data_dir with WAL segments.
+// start_lsn must point to some previously known record boundary (beginning of
+// the next record). If no valid record after is found, start_lsn is returned
+// back.
 pub fn find_end_of_wal(
    data_dir: &Path,
    wal_seg_size: usize,
-    precise: bool,
-    start_lsn: Lsn, // start reading WAL at this point or later
-) -> anyhow::Result<(XLogRecPtr, TimeLineID)> {
-    let mut high_segno: XLogSegNo = 0;
-    let mut high_tli: TimeLineID = 0;
-    let mut high_ispartial = false;
+    start_lsn: Lsn, // start reading WAL at this point; must point at record start_lsn.
+) -> anyhow::Result<Lsn> {
+    let mut result = start_lsn;
+    let mut curr_lsn = start_lsn;
+    let mut buf = [0u8; XLOG_BLCKSZ];
+    let mut decoder = WalStreamDecoder::new(start_lsn);

-    for entry in fs::read_dir(data_dir)?.flatten() {
-        let ispartial: bool;
-        let entry_name = entry.file_name();
-        let fname = entry_name
-            .to_str()
-            .ok_or_else(|| anyhow!("Invalid file name"))?;
-
-        /*
-         * Check if the filename looks like an xlog file, or a .partial file.
-         */
-        if IsXLogFileName(fname) {
-            ispartial = false;
-        } else if IsPartialXLogFileName(fname) {
-            ispartial = true;
-        } else {
-            continue;
-        }
-        let (segno, tli) = XLogFromFileName(fname, wal_seg_size);
-        if !ispartial && entry.metadata()?.len() != wal_seg_size as u64 {
-            continue;
-        }
-        if segno > high_segno
-            || (segno == high_segno && tli > high_tli)
-            || (segno == high_segno && tli == high_tli && high_ispartial && !ispartial)
-        {
-            high_segno = segno;
-            high_tli = tli;
-            high_ispartial = ispartial;
-        }
-    }
-    if high_segno > 0 {
-        let mut high_offs = 0;
-        /*
-         * Move the starting pointer to the start of the next segment, if the
-         * highest one we saw was completed.
-         */
-        if !high_ispartial {
-            high_segno += 1;
-        } else if precise {
-            /* otherwise locate last record in last partial segment */
-            if start_lsn.segment_number(wal_seg_size) > high_segno {
-                bail!(
-                    "provided start_lsn {:?} is beyond highest segno {:?} available",
-                    start_lsn,
-                    high_segno,
+    // loop over segments
+    loop {
+        let segno = curr_lsn.segment_number(wal_seg_size);
+        let seg_file_name = XLogFileName(PG_TLI, segno, wal_seg_size);
+        let seg_file_path = data_dir.join(seg_file_name);
+        match open_wal_segment(&seg_file_path)? {
+            None => {
+                // no more segments
+                info!(
+                    "find_end_of_wal reached end at {:?}, segment {:?} doesn't exist",
+                    result, seg_file_path
                );
+                return Ok(result);
+            }
+            Some(mut segment) => {
+                let seg_offs = curr_lsn.segment_offset(wal_seg_size);
+                segment.seek(SeekFrom::Start(seg_offs as u64))?;
+                // loop inside segment
+                loop {
+                    let bytes_read = segment.read(&mut buf)?;
+                    if bytes_read == 0 {
+                        break; // EOF
+                    }
+                    curr_lsn += bytes_read as u64;
+                    decoder.feed_bytes(&buf[0..bytes_read]);
+
+                    // advance result past all completely read records
+                    loop {
+                        match decoder.poll_decode() {
+                            Ok(Some(record)) => result = record.0,
+                            Err(e) => {
+                                info!(
+                                    "find_end_of_wal reached end at {:?}, decode error: {:?}",
+                                    result, e
+                                );
+                                return Ok(result);
+                            }
+                            Ok(None) => break, // need more data
+                        }
+                    }
+                }
            }
-            let start_offset = if start_lsn.segment_number(wal_seg_size) == high_segno {
-                start_lsn.segment_offset(wal_seg_size)
-            } else {
-                0
-            };
-            high_offs = find_end_of_wal_segment(
-                data_dir,
-                high_segno,
-                high_tli,
-                wal_seg_size,
-                start_offset,
-            )?;
        }
-        let high_ptr = XLogSegNoOffsetToRecPtr(high_segno, high_offs, wal_seg_size);
-        return Ok((high_ptr, high_tli));
    }
-    Ok((0, 0))
+}
+
+// Open .partial or full WAL segment file, if present.
+fn open_wal_segment(seg_file_path: &Path) -> anyhow::Result<Option<File>> {
+    let mut partial_path = seg_file_path.to_owned();
+    partial_path.set_extension("partial");
+    match File::open(partial_path) {
+        Ok(file) => Ok(Some(file)),
+        Err(e) => match e.kind() {
+            ErrorKind::NotFound => {
+                // .partial not found, try full
+                match File::open(seg_file_path) {
+                    Ok(file) => Ok(Some(file)),
+                    Err(e) => match e.kind() {
+                        ErrorKind::NotFound => Ok(None),
+                        _ => Err(e.into()),
+                    },
+                }
+            }
+            _ => Err(e.into()),
+        },
+    }
 }

 pub fn main() {
    let mut data_dir = PathBuf::new();
    data_dir.push(".");
-    let (wal_end, tli) = find_end_of_wal(&data_dir, WAL_SEGMENT_SIZE, true, Lsn(0)).unwrap();
-    println!(
-        "wal_end={:>08X}{:>08X}, tli={}",
-        (wal_end >> 32) as u32,
-        wal_end as u32,
-        tli
-    );
+    let wal_end = find_end_of_wal(&data_dir, WAL_SEGMENT_SIZE, Lsn(0)).unwrap();
+    println!("wal_end={:?}", wal_end);
 }

 impl XLogRecord {
@@ -564,9 +311,9 @@ impl CheckPoint {
 // We need this segment to start compute node.
 //
 pub fn generate_wal_segment(segno: u64, system_id: u64) -> Result<Bytes, SerializeError> {
-    let mut seg_buf = BytesMut::with_capacity(pg_constants::WAL_SEGMENT_SIZE as usize);
+    let mut seg_buf = BytesMut::with_capacity(WAL_SEGMENT_SIZE as usize);

-    let pageaddr = XLogSegNoOffsetToRecPtr(segno, 0, pg_constants::WAL_SEGMENT_SIZE);
+    let pageaddr = XLogSegNoOffsetToRecPtr(segno, 0, WAL_SEGMENT_SIZE);
    let hdr = XLogLongPageHeaderData {
        std: {
            XLogPageHeaderData {
@@ -579,7 +326,7 @@ pub fn generate_wal_segment(segno: u64, system_id: u64) -> Result<Bytes, Seriali
            }
        },
        xlp_sysid: system_id,
-        xlp_seg_size: pg_constants::WAL_SEGMENT_SIZE as u32,
+        xlp_seg_size: WAL_SEGMENT_SIZE as u32,
        xlp_xlog_blcksz: XLOG_BLCKSZ as u32,
    };

@@ -587,37 +334,117 @@ pub fn generate_wal_segment(segno: u64, system_id: u64) -> Result<Bytes, Seriali
    seg_buf.extend_from_slice(&hdr_bytes);

    //zero out the rest of the file
-    seg_buf.resize(pg_constants::WAL_SEGMENT_SIZE, 0);
+    seg_buf.resize(WAL_SEGMENT_SIZE, 0);
    Ok(seg_buf.freeze())
 }

+#[repr(C)]
+#[derive(Serialize)]
+struct XlLogicalMessage {
+    db_id: Oid,
+    transactional: uint32, // bool, takes 4 bytes due to alignment in C structures
+    prefix_size: uint64,
+    message_size: uint64,
+}
+
+impl XlLogicalMessage {
+    pub fn encode(&self) -> Bytes {
+        use utils::bin_ser::LeSer;
+        self.ser().unwrap().into()
+    }
+}
+
+/// Create new WAL record for non-transactional logical message.
+/// Used for creating artificial WAL for tests, as LogicalMessage
+/// record is basically no-op.
+///
+/// NOTE: This leaves the xl_prev field zero. The safekeeper and
+/// pageserver tolerate that, but PostgreSQL does not.
+pub fn encode_logical_message(prefix: &str, message: &str) -> Vec<u8> {
+    let mut prefix_bytes: Vec<u8> = Vec::with_capacity(prefix.len() + 1);
+    prefix_bytes.write_all(prefix.as_bytes()).unwrap();
+    prefix_bytes.push(0);
+
+    let message_bytes = message.as_bytes();
+
+    let logical_message = XlLogicalMessage {
+        db_id: 0,
+        transactional: 0,
+        prefix_size: prefix_bytes.len() as u64,
+        message_size: message_bytes.len() as u64,
+    };
+
+    let mainrdata = logical_message.encode();
+    let mainrdata_len: usize = mainrdata.len() + prefix_bytes.len() + message_bytes.len();
+    // only short mainrdata is supported for now
+    assert!(mainrdata_len <= 255);
+    let mainrdata_len = mainrdata_len as u8;
+
+    let mut data: Vec<u8> = vec![pg_constants::XLR_BLOCK_ID_DATA_SHORT, mainrdata_len];
+    data.extend_from_slice(&mainrdata);
+    data.extend_from_slice(&prefix_bytes);
+    data.extend_from_slice(message_bytes);
+
+    let total_len = XLOG_SIZE_OF_XLOG_RECORD + data.len();
+
+    let mut header = XLogRecord {
+        xl_tot_len: total_len as u32,
+        xl_xid: 0,
+        xl_prev: 0,
+        xl_info: 0,
+        xl_rmid: 21,
+        __bindgen_padding_0: [0u8; 2usize],
+        xl_crc: 0, // crc will be calculated later
+    };
+
+    let header_bytes = header.encode().expect("failed to encode header");
+    let crc = crc32c_append(0, &data);
+    let crc = crc32c_append(crc, &header_bytes[0..XLOG_RECORD_CRC_OFFS]);
+    header.xl_crc = crc;
+
+    let mut wal: Vec<u8> = Vec::new();
+    wal.extend_from_slice(&header.encode().expect("failed to encode header"));
+    wal.extend_from_slice(&data);
+
+    // WAL start position must be aligned at 8 bytes,
+    // this will add padding for the next WAL record.
+    const PADDING: usize = 8;
+    let padding_rem = wal.len() % PADDING;
+    if padding_rem != 0 {
+        wal.resize(wal.len() + PADDING - padding_rem, 0);
+    }
+
+    wal
+}
+
 #[cfg(test)]
 mod tests {
+    use super::super::PG_MAJORVERSION;
    use super::*;
    use regex::Regex;
+    use std::cmp::min;
+    use std::fs;
    use std::{env, str::FromStr};
+    use utils::const_assert;

    fn init_logging() {
-        let _ = env_logger::Builder::from_env(
-            env_logger::Env::default()
-                .default_filter_or("wal_craft=info,postgres_ffi::xlog_utils=trace"),
-        )
+        let _ = env_logger::Builder::from_env(env_logger::Env::default().default_filter_or(
+            format!("wal_craft=info,postgres_ffi::{PG_MAJORVERSION}::xlog_utils=trace"),
+        ))
        .is_test(true)
        .try_init();
    }

-    fn test_end_of_wal<C: wal_craft::Crafter>(
-        test_name: &str,
-        expected_end_of_wal_non_partial: Lsn,
-    ) {
+    fn test_end_of_wal<C: wal_craft::Crafter>(test_name: &str) {
        use wal_craft::*;
+
        // Craft some WAL
        let top_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
            .join("..")
            .join("..");
        let cfg = Conf {
-            pg_distrib_dir: top_path.join("tmp_install"),
-            datadir: top_path.join(format!("test_output/{}", test_name)),
+            pg_distrib_dir: top_path.join(format!("pg_install/{PG_MAJORVERSION}")),
+            datadir: top_path.join(format!("test_output/{}-{PG_MAJORVERSION}", test_name)),
        };
        if cfg.datadir.exists() {
            fs::remove_dir_all(&cfg.datadir).unwrap();
@@ -630,7 +457,7 @@ mod tests {
            .iter()
            .map(|&lsn| u64::from(lsn).into())
            .collect();
-        let expected_end_of_wal_partial: Lsn = u64::from(expected_end_of_wal_partial).into();
+        let expected_end_of_wal: Lsn = u64::from(expected_end_of_wal_partial).into();
        srv.kill();

        // Check find_end_of_wal on the initial WAL
@@ -642,10 +469,10 @@ mod tests {
            .filter(|fname| IsXLogFileName(fname))
            .max()
            .unwrap();
-        check_pg_waldump_end_of_wal(&cfg, &last_segment, expected_end_of_wal_partial);
-        for start_lsn in std::iter::once(Lsn(0))
-            .chain(intermediate_lsns)
-            .chain(std::iter::once(expected_end_of_wal_partial))
+        check_pg_waldump_end_of_wal(&cfg, &last_segment, expected_end_of_wal);
+        for start_lsn in intermediate_lsns
+            .iter()
+            .chain(std::iter::once(&expected_end_of_wal))
        {
            // Erase all WAL before `start_lsn` to ensure it's not used by `find_end_of_wal`.
            // We assume that `start_lsn` is non-decreasing.
@@ -660,7 +487,7 @@ mod tests {
                }
                let (segno, _) = XLogFromFileName(&fname, WAL_SEGMENT_SIZE);
                let seg_start_lsn = XLogSegNoOffsetToRecPtr(segno, 0, WAL_SEGMENT_SIZE);
-                if seg_start_lsn > u64::from(start_lsn) {
+                if seg_start_lsn > u64::from(*start_lsn) {
                    continue;
                }
                let mut f = File::options().write(true).open(file.path()).unwrap();
@@ -668,18 +495,12 @@ mod tests {
                f.write_all(
                    &ZEROS[0..min(
                        WAL_SEGMENT_SIZE,
-                        (u64::from(start_lsn) - seg_start_lsn) as usize,
+                        (u64::from(*start_lsn) - seg_start_lsn) as usize,
                    )],
                )
                .unwrap();
            }
-            check_end_of_wal(
-                &cfg,
-                &last_segment,
-                start_lsn,
-                expected_end_of_wal_non_partial,
-                expected_end_of_wal_partial,
-            );
+            check_end_of_wal(&cfg, &last_segment, *start_lsn, expected_end_of_wal);
        }
    }

@@ -716,18 +537,15 @@ mod tests {
        cfg: &wal_craft::Conf,
        last_segment: &str,
        start_lsn: Lsn,
-        expected_end_of_wal_non_partial: Lsn,
-        expected_end_of_wal_partial: Lsn,
+        expected_end_of_wal: Lsn,
    ) {
        // Check end_of_wal on non-partial WAL segment (we treat it as fully populated)
-        let (wal_end, tli) =
-            find_end_of_wal(&cfg.wal_dir(), WAL_SEGMENT_SIZE, true, start_lsn).unwrap();
-        let wal_end = Lsn(wal_end);
-        info!(
-            "find_end_of_wal returned (wal_end={}, tli={}) with non-partial WAL segment",
-            wal_end, tli
-        );
-        assert_eq!(wal_end, expected_end_of_wal_non_partial);
+        // let wal_end = find_end_of_wal(&cfg.wal_dir(), WAL_SEGMENT_SIZE, start_lsn).unwrap();
+        // info!(
+        //     "find_end_of_wal returned wal_end={} with non-partial WAL segment",
+        //     wal_end
+        // );
+        // assert_eq!(wal_end, expected_end_of_wal_non_partial);

        // Rename file to partial to actually find last valid lsn, then rename it back.
        fs::rename(
@@ -735,14 +553,12 @@ mod tests {
            cfg.wal_dir().join(format!("{}.partial", last_segment)),
        )
        .unwrap();
-        let (wal_end, tli) =
-            find_end_of_wal(&cfg.wal_dir(), WAL_SEGMENT_SIZE, true, start_lsn).unwrap();
-        let wal_end = Lsn(wal_end);
+        let wal_end = find_end_of_wal(&cfg.wal_dir(), WAL_SEGMENT_SIZE, start_lsn).unwrap();
        info!(
-            "find_end_of_wal returned (wal_end={}, tli={}) with partial WAL segment",
-            wal_end, tli
+            "find_end_of_wal returned wal_end={} with partial WAL segment",
+            wal_end
        );
-        assert_eq!(wal_end, expected_end_of_wal_partial);
+        assert_eq!(wal_end, expected_end_of_wal);
        fs::rename(
            cfg.wal_dir().join(format!("{}.partial", last_segment)),
            cfg.wal_dir().join(last_segment),
@@ -755,10 +571,7 @@ mod tests {
    #[test]
    pub fn test_find_end_of_wal_simple() {
        init_logging();
-        test_end_of_wal::<wal_craft::Simple>(
-            "test_find_end_of_wal_simple",
-            "0/2000000".parse::<Lsn>().unwrap(),
-        );
+        test_end_of_wal::<wal_craft::Simple>("test_find_end_of_wal_simple");
    }

    #[test]
@@ -766,17 +579,14 @@ mod tests {
        init_logging();
        test_end_of_wal::<wal_craft::WalRecordCrossingSegmentFollowedBySmallOne>(
            "test_find_end_of_wal_crossing_segment_followed_by_small_one",
-            "0/3000000".parse::<Lsn>().unwrap(),
        );
    }

    #[test]
-    #[ignore = "not yet fixed, needs correct parsing of pre-last segments"] // TODO
    pub fn test_find_end_of_wal_last_crossing_segment() {
        init_logging();
        test_end_of_wal::<wal_craft::LastWalRecordCrossingSegment>(
            "test_find_end_of_wal_last_crossing_segment",
-            "0/3000000".parse::<Lsn>().unwrap(),
        );
    }

@@ -809,4 +619,15 @@ mod tests {
        checkpoint.update_next_xid(1024);
        assert_eq!(checkpoint.nextXid.value, 2048);
    }
+
+    #[test]
+    pub fn test_encode_logical_message() {
+        let expected = [
+            64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 0, 0, 170, 34, 166, 227, 255,
+            38, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 112, 114,
+            101, 102, 105, 120, 0, 109, 101, 115, 115, 97, 103, 101,
+        ];
+        let actual = encode_logical_message("prefix", "message");
+        assert_eq!(expected, actual[..]);
+    }
 }
--- a/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs
+++ b/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs
@@ -37,7 +37,7 @@ fn main() -> Result<()> {
                    Arg::new("pg-distrib-dir")
                        .long("pg-distrib-dir")
                        .takes_value(true)
-                        .help("Directory with Postgres distribution (bin and lib directories, e.g. tmp_install)")
+                        .help("Directory with Postgres distribution (bin and lib directories, e.g. pg_install/v14)")
                        .default_value("/usr/local")
                )
        )
--- a/libs/postgres_ffi/wal_craft/src/lib.rs
+++ b/libs/postgres_ffi/wal_craft/src/lib.rs
@@ -4,10 +4,8 @@ use log::*;
 use once_cell::sync::Lazy;
 use postgres::types::PgLsn;
 use postgres::Client;
-use postgres_ffi::pg_constants::WAL_SEGMENT_SIZE;
-use postgres_ffi::xlog_utils::{
-    XLOG_BLCKSZ, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD,
-};
+use postgres_ffi::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ};
+use postgres_ffi::{XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD};
 use std::cmp::Ordering;
 use std::fs;
 use std::path::{Path, PathBuf};
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -12,10 +12,12 @@ use std::{
    borrow::Cow,
    collections::HashMap,
    ffi::OsStr,
-    fmt::Debug,
+    fmt::{Debug, Display},
    num::{NonZeroU32, NonZeroUsize},
+    ops::Deref,
    path::{Path, PathBuf},
    pin::Pin,
+    sync::Arc,
 };

 use anyhow::{bail, Context};
@@ -24,10 +26,7 @@ use tokio::io;
 use toml_edit::Item;
 use tracing::info;

-pub use self::{
-    local_fs::LocalFs,
-    s3_bucket::{S3Bucket, S3ObjectKey},
-};
+pub use self::{local_fs::LocalFs, s3_bucket::S3Bucket};

 /// How many different timelines can be processed simultaneously when synchronizing layers with the remote storage.
 /// During regular work, pageserver produces one layer file per timeline checkpoint, with bursts of concurrency
@@ -42,28 +41,62 @@ pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10;
 /// https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/
 pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100;

-pub trait RemoteObjectName {
+const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/';
+
+#[derive(Clone, PartialEq, Eq)]
+pub struct RemoteObjectId(String);
+
+///
+/// A key that refers to an object in remote storage. It works much like a Path,
+/// but it's a separate datatype so that you don't accidentally mix local paths
+/// and remote keys.
+///
+impl RemoteObjectId {
    // Needed to retrieve last component for RemoteObjectId.
    // In other words a file name
-    fn object_name(&self) -> Option<&str>;
+    /// Turn a/b/c or a/b/c/ into c
+    pub fn object_name(&self) -> Option<&str> {
+        // corner case, char::to_string is not const, thats why this is more verbose than it needs to be
+        // see https://github.com/rust-lang/rust/issues/88674
+        if self.0.len() == 1 && self.0.chars().next().unwrap() == REMOTE_STORAGE_PREFIX_SEPARATOR {
+            return None;
+        }
+
+        if self.0.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
+            self.0.rsplit(REMOTE_STORAGE_PREFIX_SEPARATOR).nth(1)
+        } else {
+            self.0
+                .rsplit_once(REMOTE_STORAGE_PREFIX_SEPARATOR)
+                .map(|(_, last)| last)
+        }
+    }
+}
+
+impl Debug for RemoteObjectId {
+    fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
+        Debug::fmt(&self.0, fmt)
+    }
+}
+
+impl Display for RemoteObjectId {
+    fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        Display::fmt(&self.0, fmt)
+    }
 }

 /// Storage (potentially remote) API to manage its state.
 /// This storage tries to be unaware of any layered repository context,
 /// providing basic CRUD operations for storage files.
 #[async_trait::async_trait]
-pub trait RemoteStorage: Send + Sync {
-    /// A way to uniquely reference a file in the remote storage.
-    type RemoteObjectId: RemoteObjectName;
-
+pub trait RemoteStorage: Send + Sync + 'static {
    /// Attempts to derive the storage path out of the local path, if the latter is correct.
-    fn remote_object_id(&self, local_path: &Path) -> anyhow::Result<Self::RemoteObjectId>;
+    fn remote_object_id(&self, local_path: &Path) -> anyhow::Result<RemoteObjectId>;

    /// Gets the download path of the given storage file.
-    fn local_path(&self, remote_object_id: &Self::RemoteObjectId) -> anyhow::Result<PathBuf>;
+    fn local_path(&self, remote_object_id: &RemoteObjectId) -> anyhow::Result<PathBuf>;

    /// Lists all items the storage has right now.
-    async fn list(&self) -> anyhow::Result<Vec<Self::RemoteObjectId>>;
+    async fn list(&self) -> anyhow::Result<Vec<RemoteObjectId>>;

    /// Lists all top level subdirectories for a given prefix
    /// Note: here we assume that if the prefix is passed it was obtained via remote_object_id
@@ -71,34 +104,39 @@ pub trait RemoteStorage: Send + Sync {
    /// so this method doesnt need to.
    async fn list_prefixes(
        &self,
-        prefix: Option<Self::RemoteObjectId>,
-    ) -> anyhow::Result<Vec<Self::RemoteObjectId>>;
+        prefix: Option<&RemoteObjectId>,
+    ) -> anyhow::Result<Vec<RemoteObjectId>>;

    /// Streams the local file contents into remote into the remote storage entry.
    async fn upload(
        &self,
-        from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
+        from: Box<(dyn io::AsyncRead + Unpin + Send + Sync + 'static)>,
        // S3 PUT request requires the content length to be specified,
        // otherwise it starts to fail with the concurrent connection count increasing.
        from_size_bytes: usize,
-        to: &Self::RemoteObjectId,
+        to: &RemoteObjectId,
        metadata: Option<StorageMetadata>,
    ) -> anyhow::Result<()>;

    /// Streams the remote storage entry contents into the buffered writer given, returns the filled writer.
    /// Returns the metadata, if any was stored with the file previously.
-    async fn download(&self, from: &Self::RemoteObjectId) -> Result<Download, DownloadError>;
+    async fn download(&self, from: &RemoteObjectId) -> Result<Download, DownloadError>;

    /// Streams a given byte range of the remote storage entry contents into the buffered writer given, returns the filled writer.
    /// Returns the metadata, if any was stored with the file previously.
    async fn download_byte_range(
        &self,
-        from: &Self::RemoteObjectId,
+        from: &RemoteObjectId,
        start_inclusive: u64,
        end_exclusive: Option<u64>,
    ) -> Result<Download, DownloadError>;

-    async fn delete(&self, path: &Self::RemoteObjectId) -> anyhow::Result<()>;
+    async fn delete(&self, path: &RemoteObjectId) -> anyhow::Result<()>;
+
+    /// Downcast to LocalFs implementation. For tests.
+    fn as_local(&self) -> Option<&LocalFs> {
+        None
+    }
 }

 pub struct Download {
@@ -141,26 +179,91 @@ impl std::error::Error for DownloadError {}

 /// Every storage, currently supported.
 /// Serves as a simple way to pass around the [`RemoteStorage`] without dealing with generics.
-pub enum GenericRemoteStorage {
-    Local(LocalFs),
-    S3(S3Bucket),
+#[derive(Clone)]
+pub struct GenericRemoteStorage(Arc<dyn RemoteStorage>);
+
+impl Deref for GenericRemoteStorage {
+    type Target = dyn RemoteStorage;
+
+    fn deref(&self) -> &Self::Target {
+        self.0.as_ref()
+    }
 }

 impl GenericRemoteStorage {
-    pub fn new(
+    pub fn new(storage: impl RemoteStorage) -> Self {
+        Self(Arc::new(storage))
+    }
+
+    pub fn from_config(
        working_directory: PathBuf,
        storage_config: &RemoteStorageConfig,
-    ) -> anyhow::Result<Self> {
-        match &storage_config.storage {
+    ) -> anyhow::Result<GenericRemoteStorage> {
+        Ok(match &storage_config.storage {
            RemoteStorageKind::LocalFs(root) => {
                info!("Using fs root '{}' as a remote storage", root.display());
-                LocalFs::new(root.clone(), working_directory).map(GenericRemoteStorage::Local)
+                GenericRemoteStorage::new(LocalFs::new(root.clone(), working_directory)?)
            }
            RemoteStorageKind::AwsS3(s3_config) => {
                info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}'",
-                    s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint);
-                S3Bucket::new(s3_config, working_directory).map(GenericRemoteStorage::S3)
+                      s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint);
+                GenericRemoteStorage::new(S3Bucket::new(s3_config, working_directory)?)
            }
+        })
+    }
+
+    /// Takes storage object contents and its size and uploads to remote storage,
+    /// mapping `from_path` to the corresponding remote object id in the storage.
+    ///
+    /// The storage object does not have to be present on the `from_path`,
+    /// this path is used for the remote object id conversion only.
+    pub async fn upload_storage_object(
+        &self,
+        from: Box<dyn tokio::io::AsyncRead + Unpin + Send + Sync + 'static>,
+        from_size_bytes: usize,
+        from_path: &Path,
+    ) -> anyhow::Result<()> {
+        let target_storage_path = self.remote_object_id(from_path).with_context(|| {
+            format!(
+                "Failed to get the storage path for source local path '{}'",
+                from_path.display()
+            )
+        })?;
+
+        self.upload(from, from_size_bytes, &target_storage_path, None)
+            .await
+            .with_context(|| {
+                format!(
+                    "Failed to upload from '{}' to storage path '{:?}'",
+                    from_path.display(),
+                    target_storage_path
+                )
+            })
+    }
+
+    /// Downloads the storage object into the `to_path` provided.
+    /// `byte_range` could be specified to dowload only a part of the file, if needed.
+    pub async fn download_storage_object(
+        &self,
+        byte_range: Option<(u64, Option<u64>)>,
+        to_path: &Path,
+    ) -> Result<Download, DownloadError> {
+        let remote_object_path = self
+            .remote_object_id(to_path)
+            .with_context(|| {
+                format!(
+                    "Failed to get the storage path for target local path '{}'",
+                    to_path.display()
+                )
+            })
+            .map_err(DownloadError::BadInput)?;
+
+        match byte_range {
+            Some((start, end)) => {
+                self.download_byte_range(&remote_object_path, start, end)
+                    .await
+            }
+            None => self.download(&remote_object_path).await,
        }
    }
 }
@@ -241,6 +344,8 @@ impl Debug for S3Config {
    }
 }

+/// Adds a suffix to the file(directory) name, either appending the suffux to the end of its extension,
+/// or if there's no extension, creates one and puts a suffix there.
 pub fn path_with_suffix_extension(original_path: impl AsRef<Path>, suffix: &str) -> PathBuf {
    let new_extension = match original_path
        .as_ref()
@@ -365,5 +470,29 @@ mod tests {
            &path_with_suffix_extension(&p, ".temp").to_string_lossy(),
            "/foo/bar.baz..temp"
        );
+        let p = PathBuf::from("/foo/bar/dir/");
+        assert_eq!(
+            &path_with_suffix_extension(&p, ".temp").to_string_lossy(),
+            "/foo/bar/dir..temp"
+        );
+    }
+
+    #[test]
+    fn object_name() {
+        let k = RemoteObjectId("a/b/c".to_owned());
+        assert_eq!(k.object_name(), Some("c"));
+
+        let k = RemoteObjectId("a/b/c/".to_owned());
+        assert_eq!(k.object_name(), Some("c"));
+
+        let k = RemoteObjectId("a/".to_owned());
+        assert_eq!(k.object_name(), Some("a"));
+
+        // XXX is it impossible to have an empty key?
+        let k = RemoteObjectId("".to_owned());
+        assert_eq!(k.object_name(), None);
+
+        let k = RemoteObjectId("/".to_owned());
+        assert_eq!(k.object_name(), None);
    }
 }
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -5,7 +5,6 @@
 //! volume is mounted to the local FS.

 use std::{
-    borrow::Cow,
    future::Future,
    path::{Path, PathBuf},
    pin::Pin,
@@ -18,14 +17,19 @@ use tokio::{
 };
 use tracing::*;

-use crate::{path_with_suffix_extension, Download, DownloadError, RemoteObjectName};
+use crate::{path_with_suffix_extension, Download, DownloadError, RemoteObjectId};

 use super::{strip_path_prefix, RemoteStorage, StorageMetadata};

-impl RemoteObjectName for PathBuf {
-    fn object_name(&self) -> Option<&str> {
-        self.file_stem().and_then(|n| n.to_str())
-    }
+const LOCAL_FS_TEMP_FILE_SUFFIX: &str = "___temp";
+
+/// Convert a Path in the remote storage into a RemoteObjectId
+fn remote_object_id_from_path(path: &Path) -> anyhow::Result<RemoteObjectId> {
+    Ok(RemoteObjectId(
+        path.to_str()
+            .ok_or_else(|| anyhow::anyhow!("unexpected characters found in path"))?
+            .to_string(),
+    ))
 }

 pub struct LocalFs {
@@ -50,11 +54,17 @@ impl LocalFs {
        })
    }

-    fn resolve_in_storage(&self, path: &Path) -> anyhow::Result<PathBuf> {
+    ///
+    /// Get the absolute path in the local filesystem to given remote object.
+    ///
+    /// This is public so that it can be used in tests. Should not be used elsewhere.
+    ///
+    pub fn resolve_in_storage(&self, remote_object_id: &RemoteObjectId) -> anyhow::Result<PathBuf> {
+        let path = PathBuf::from(&remote_object_id.0);
        if path.is_relative() {
            Ok(self.storage_root.join(path))
        } else if path.starts_with(&self.storage_root) {
-            Ok(path.to_path_buf())
+            Ok(path)
        } else {
            bail!(
                "Path '{}' does not belong to the current storage",
@@ -92,41 +102,42 @@ impl LocalFs {

 #[async_trait::async_trait]
 impl RemoteStorage for LocalFs {
-    type RemoteObjectId = PathBuf;
-
-    fn remote_object_id(&self, local_path: &Path) -> anyhow::Result<Self::RemoteObjectId> {
-        Ok(self.storage_root.join(
+    /// Convert a "local" path into a "remote path"
+    fn remote_object_id(&self, local_path: &Path) -> anyhow::Result<RemoteObjectId> {
+        let path = self.storage_root.join(
            strip_path_prefix(&self.working_directory, local_path)
                .context("local path does not belong to this storage")?,
-        ))
+        );
+        remote_object_id_from_path(&path)
    }

-    fn local_path(&self, storage_path: &Self::RemoteObjectId) -> anyhow::Result<PathBuf> {
-        let relative_path = strip_path_prefix(&self.storage_root, storage_path)
+    fn local_path(&self, remote_object_id: &RemoteObjectId) -> anyhow::Result<PathBuf> {
+        let storage_path = PathBuf::from(&remote_object_id.0);
+        let relative_path = strip_path_prefix(&self.storage_root, &storage_path)
            .context("local path does not belong to this storage")?;
        Ok(self.working_directory.join(relative_path))
    }

-    async fn list(&self) -> anyhow::Result<Vec<Self::RemoteObjectId>> {
+    async fn list(&self) -> anyhow::Result<Vec<RemoteObjectId>> {
        get_all_files(&self.storage_root, true).await
    }

    async fn list_prefixes(
        &self,
-        prefix: Option<Self::RemoteObjectId>,
-    ) -> anyhow::Result<Vec<Self::RemoteObjectId>> {
+        prefix: Option<&RemoteObjectId>,
+    ) -> anyhow::Result<Vec<RemoteObjectId>> {
        let path = match prefix {
-            Some(prefix) => Cow::Owned(prefix),
-            None => Cow::Borrowed(&self.storage_root),
+            Some(prefix) => Path::new(&prefix.0),
+            None => &self.storage_root,
        };
-        get_all_files(path.as_ref(), false).await
+        get_all_files(path, false).await
    }

    async fn upload(
        &self,
-        from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
+        from: Box<(dyn io::AsyncRead + Unpin + Send + Sync + 'static)>,
        from_size_bytes: usize,
-        to: &Self::RemoteObjectId,
+        to: &RemoteObjectId,
        metadata: Option<StorageMetadata>,
    ) -> anyhow::Result<()> {
        let target_file_path = self.resolve_in_storage(to)?;
@@ -134,7 +145,8 @@ impl RemoteStorage for LocalFs {
        // We need this dance with sort of durable rename (without fsyncs)
        // to prevent partial uploads. This was really hit when pageserver shutdown
        // cancelled the upload and partial file was left on the fs
-        let temp_file_path = path_with_suffix_extension(&target_file_path, "temp");
+        let temp_file_path =
+            path_with_suffix_extension(&target_file_path, LOCAL_FS_TEMP_FILE_SUFFIX);
        let mut destination = io::BufWriter::new(
            fs::OpenOptions::new()
                .write(true)
@@ -150,8 +162,7 @@ impl RemoteStorage for LocalFs {
        );

        let from_size_bytes = from_size_bytes as u64;
-        // Require to read 1 byte more than the expected to check later, that the stream and its size match.
-        let mut buffer_to_read = from.take(from_size_bytes + 1);
+        let mut buffer_to_read = from.take(from_size_bytes);

        let bytes_read = io::copy(&mut buffer_to_read, &mut destination)
            .await
@@ -162,17 +173,15 @@ impl RemoteStorage for LocalFs {
                )
            })?;

+        if bytes_read < from_size_bytes {
+            bail!("Provided stream was shorter than expected: {bytes_read} vs {from_size_bytes} bytes");
+        }
+        // Check if there is any extra data after the given size.
+        let mut from = buffer_to_read.into_inner();
+        let extra_read = from.read(&mut [1]).await?;
        ensure!(
-            bytes_read == from_size_bytes,
-            "Provided stream has actual size {} fthat is smaller than the given stream size {}",
-            bytes_read,
-            from_size_bytes
-        );
-
-        ensure!(
-            buffer_to_read.read(&mut [0]).await? == 0,
-            "Provided stream has bigger size than the given stream size {}",
-            from_size_bytes
+            extra_read == 0,
+            "Provided stream was larger than expected: expected {from_size_bytes} bytes",
        );

        destination.flush().await.with_context(|| {
@@ -210,7 +219,7 @@ impl RemoteStorage for LocalFs {
        Ok(())
    }

-    async fn download(&self, from: &Self::RemoteObjectId) -> Result<Download, DownloadError> {
+    async fn download(&self, from: &RemoteObjectId) -> Result<Download, DownloadError> {
        let file_path = self
            .resolve_in_storage(from)
            .map_err(DownloadError::BadInput)?;
@@ -244,7 +253,7 @@ impl RemoteStorage for LocalFs {

    async fn download_byte_range(
        &self,
-        from: &Self::RemoteObjectId,
+        from: &RemoteObjectId,
        start_inclusive: u64,
        end_exclusive: Option<u64>,
    ) -> Result<Download, DownloadError> {
@@ -298,7 +307,7 @@ impl RemoteStorage for LocalFs {
        }
    }

-    async fn delete(&self, path: &Self::RemoteObjectId) -> anyhow::Result<()> {
+    async fn delete(&self, path: &RemoteObjectId) -> anyhow::Result<()> {
        let file_path = self.resolve_in_storage(path)?;
        if file_path.exists() && file_path.is_file() {
            Ok(fs::remove_file(file_path).await?)
@@ -309,6 +318,10 @@ impl RemoteStorage for LocalFs {
            )
        }
    }
+
+    fn as_local(&self) -> Option<&LocalFs> {
+        Some(self)
+    }
 }

 fn storage_metadata_path(original_path: &Path) -> PathBuf {
@@ -318,7 +331,7 @@ fn storage_metadata_path(original_path: &Path) -> PathBuf {
 fn get_all_files<'a, P>(
    directory_path: P,
    recursive: bool,
-) -> Pin<Box<dyn Future<Output = anyhow::Result<Vec<PathBuf>>> + Send + Sync + 'a>>
+) -> Pin<Box<dyn Future<Output = anyhow::Result<Vec<RemoteObjectId>>> + Send + Sync + 'a>>
 where
    P: AsRef<Path> + Send + Sync + 'a,
 {
@@ -335,12 +348,12 @@ where
                        debug!("{:?} us a symlink, skipping", entry_path)
                    } else if file_type.is_dir() {
                        if recursive {
-                            paths.extend(get_all_files(entry_path, true).await?.into_iter())
+                            paths.extend(get_all_files(&entry_path, true).await?.into_iter())
                        } else {
-                            paths.push(dir_entry.path())
+                            paths.push(remote_object_id_from_path(&dir_entry.path())?)
                        }
                    } else {
-                        paths.push(dir_entry.path());
+                        paths.push(remote_object_id_from_path(&dir_entry.path())?);
                    }
                }
                Ok(paths)
@@ -402,9 +415,15 @@ mod pure_tests {
            .join("file_name");
        let expected_path = storage_root.join(local_path.strip_prefix(&workdir)?);

+        let actual_path = PathBuf::from(
+            storage
+                .remote_object_id(&local_path)
+                .expect("Matching path should map to storage path normally")
+                .0,
+        );
        assert_eq!(
            expected_path,
-            storage.remote_object_id(&local_path).expect("Matching path should map to storage path normally"),
+            actual_path,
            "File paths from workdir should be stored in local fs storage with the same path they have relative to the workdir"
        );

@@ -465,7 +484,9 @@ mod pure_tests {
        assert_eq!(
            local_path,
            storage
-                .local_path(&storage_root.join(local_path.strip_prefix(&workdir)?))
+                .local_path(&remote_object_id_from_path(
+                    &storage_root.join(local_path.strip_prefix(&workdir)?)
+                )?)
                .expect("For a valid input, valid local path should be parsed"),
            "Should be able to parse metadata out of the correctly named remote delta file"
        );
@@ -489,8 +510,7 @@ mod pure_tests {
    #[test]
    fn local_path_negatives() -> anyhow::Result<()> {
        #[track_caller]
-        #[allow(clippy::ptr_arg)] // have to use &PathBuf due to `storage.local_path` parameter requirements
-        fn local_path_error(storage: &LocalFs, storage_path: &PathBuf) -> String {
+        fn local_path_error(storage: &LocalFs, storage_path: &RemoteObjectId) -> String {
            match storage.local_path(storage_path) {
                Ok(wrong_path) => panic!(
                    "Expected local path input {:?} to cause an error, but got file path: {:?}",
@@ -507,7 +527,8 @@ mod pure_tests {
        };

        let totally_wrong_path = "wrong_wrong_wrong";
-        let error_message = local_path_error(&storage, &PathBuf::from(totally_wrong_path));
+        let error_message =
+            local_path_error(&storage, &RemoteObjectId(totally_wrong_path.to_string()));
        assert!(error_message.contains(totally_wrong_path));

        Ok(())
@@ -550,7 +571,7 @@ mod fs_tests {
        storage: &LocalFs,
        #[allow(clippy::ptr_arg)]
        // have to use &PathBuf due to `storage.local_path` parameter requirements
-        remote_storage_path: &PathBuf,
+        remote_storage_path: &RemoteObjectId,
        expected_metadata: Option<&StorageMetadata>,
    ) -> anyhow::Result<String> {
        let mut download = storage
@@ -581,12 +602,20 @@ mod fs_tests {
            "whatever_contents",
        )
        .await?;
-        let target_path = PathBuf::from("/").join("somewhere").join("else");
-        match storage.upload(file, size, &target_path, None).await {
+        let target_path = "/somewhere/else";
+        match storage
+            .upload(
+                Box::new(file),
+                size,
+                &RemoteObjectId(target_path.to_string()),
+                None,
+            )
+            .await
+        {
            Ok(()) => panic!("Should not allow storing files with wrong target path"),
            Err(e) => {
                let message = format!("{:?}", e);
-                assert!(message.contains(&target_path.display().to_string()));
+                assert!(message.contains(target_path));
                assert!(message.contains("does not belong to the current storage"));
            }
        }
@@ -609,6 +638,34 @@ mod fs_tests {
        Ok(())
    }

+    #[tokio::test]
+    async fn upload_file_negatives() -> anyhow::Result<()> {
+        let storage = create_storage()?;
+
+        let id = storage.remote_object_id(&storage.working_directory.join("dummy"))?;
+        let content = std::io::Cursor::new(b"12345");
+
+        // Check that you get an error if the size parameter doesn't match the actual
+        // size of the stream.
+        storage
+            .upload(Box::new(content.clone()), 0, &id, None)
+            .await
+            .expect_err("upload with zero size succeeded");
+        storage
+            .upload(Box::new(content.clone()), 4, &id, None)
+            .await
+            .expect_err("upload with too short size succeeded");
+        storage
+            .upload(Box::new(content.clone()), 6, &id, None)
+            .await
+            .expect_err("upload with too large size succeeded");
+
+        // Correct size is 5, this should succeed.
+        storage.upload(Box::new(content), 5, &id, None).await?;
+
+        Ok(())
+    }
+
    fn create_storage() -> anyhow::Result<LocalFs> {
        LocalFs::new(tempdir()?.path().to_owned(), tempdir()?.path().to_owned())
    }
@@ -628,8 +685,8 @@ mod fs_tests {
            "We should upload and download the same contents"
        );

-        let non_existing_path = PathBuf::from("somewhere").join("else");
-        match storage.download(&non_existing_path).await {
+        let non_existing_path = "somewhere/else";
+        match storage.download(&RemoteObjectId(non_existing_path.to_string())).await {
            Err(DownloadError::NotFound) => {} // Should get NotFound for non existing keys
            other => panic!("Should get a NotFound error when downloading non-existing storage files, but got: {other:?}"),
        }
@@ -768,7 +825,7 @@ mod fs_tests {
            Err(e) => {
                let error_string = e.to_string();
                assert!(error_string.contains("does not exist"));
-                assert!(error_string.contains(&upload_target.display().to_string()));
+                assert!(error_string.contains(&upload_target.0));
            }
        }
        Ok(())
@@ -829,15 +886,19 @@ mod fs_tests {
        storage: &LocalFs,
        name: &str,
        metadata: Option<StorageMetadata>,
-    ) -> anyhow::Result<PathBuf> {
+    ) -> anyhow::Result<RemoteObjectId> {
        let timeline_path = workdir.join("timelines").join("some_timeline");
        let relative_timeline_path = timeline_path.strip_prefix(&workdir)?;
        let storage_path = storage.storage_root.join(relative_timeline_path).join(name);
+        let remote_object_id = RemoteObjectId(storage_path.to_str().unwrap().to_string());

        let from_path = storage.working_directory.join(name);
        let (file, size) = create_file_for_upload(&from_path, &dummy_contents(name)).await?;
-        storage.upload(file, size, &storage_path, metadata).await?;
-        Ok(storage_path)
+
+        storage
+            .upload(Box::new(file), size, &remote_object_id, metadata)
+            .await?;
+        remote_object_id_from_path(&storage_path)
    }

    async fn create_file_for_upload(
@@ -862,9 +923,9 @@ mod fs_tests {
        format!("contents for {name}")
    }

-    async fn list_files_sorted(storage: &LocalFs) -> anyhow::Result<Vec<PathBuf>> {
+    async fn list_files_sorted(storage: &LocalFs) -> anyhow::Result<Vec<RemoteObjectId>> {
        let mut files = storage.list().await?;
-        files.sort();
+        files.sort_by(|a, b| a.0.cmp(&b.0));
        Ok(files)
    }
 }
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -20,7 +20,8 @@ use tokio_util::io::ReaderStream;
 use tracing::debug;

 use crate::{
-    strip_path_prefix, Download, DownloadError, RemoteObjectName, RemoteStorage, S3Config,
+    strip_path_prefix, Download, DownloadError, RemoteObjectId, RemoteStorage, S3Config,
+    REMOTE_STORAGE_PREFIX_SEPARATOR,
 };

 use super::StorageMetadata;
@@ -90,52 +91,26 @@ pub(super) mod metrics {
    }
 }

-const S3_PREFIX_SEPARATOR: char = '/';
+fn download_destination(
+    id: &RemoteObjectId,
+    workdir: &Path,
+    prefix_to_strip: Option<&str>,
+) -> PathBuf {
+    let path_without_prefix = match prefix_to_strip {
+        Some(prefix) => id.0.strip_prefix(prefix).unwrap_or_else(|| {
+            panic!(
+                "Could not strip prefix '{}' from S3 object key '{}'",
+                prefix, id.0
+            )
+        }),
+        None => &id.0,
+    };

-#[derive(Debug, Eq, PartialEq, PartialOrd, Ord, Hash)]
-pub struct S3ObjectKey(String);
-
-impl S3ObjectKey {
-    fn key(&self) -> &str {
-        &self.0
-    }
-
-    fn download_destination(&self, workdir: &Path, prefix_to_strip: Option<&str>) -> PathBuf {
-        let path_without_prefix = match prefix_to_strip {
-            Some(prefix) => self.0.strip_prefix(prefix).unwrap_or_else(|| {
-                panic!(
-                    "Could not strip prefix '{}' from S3 object key '{}'",
-                    prefix, self.0
-                )
-            }),
-            None => &self.0,
-        };
-
-        workdir.join(
-            path_without_prefix
-                .split(S3_PREFIX_SEPARATOR)
-                .collect::<PathBuf>(),
-        )
-    }
-}
-
-impl RemoteObjectName for S3ObjectKey {
-    /// Turn a/b/c or a/b/c/ into c
-    fn object_name(&self) -> Option<&str> {
-        // corner case, char::to_string is not const, thats why this is more verbose than it needs to be
-        // see https://github.com/rust-lang/rust/issues/88674
-        if self.0.len() == 1 && self.0.chars().next().unwrap() == S3_PREFIX_SEPARATOR {
-            return None;
-        }
-
-        if self.0.ends_with(S3_PREFIX_SEPARATOR) {
-            self.0.rsplit(S3_PREFIX_SEPARATOR).nth(1)
-        } else {
-            self.0
-                .rsplit_once(S3_PREFIX_SEPARATOR)
-                .map(|(_, last)| last)
-        }
-    }
+    workdir.join(
+        path_without_prefix
+            .split(REMOTE_STORAGE_PREFIX_SEPARATOR)
+            .collect::<PathBuf>(),
+    )
 }

 /// AWS S3 storage.
@@ -197,12 +172,12 @@ impl S3Bucket {

        let prefix_in_bucket = aws_config.prefix_in_bucket.as_deref().map(|prefix| {
            let mut prefix = prefix;
-            while prefix.starts_with(S3_PREFIX_SEPARATOR) {
+            while prefix.starts_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
                prefix = &prefix[1..]
            }

            let mut prefix = prefix.to_string();
-            while prefix.ends_with(S3_PREFIX_SEPARATOR) {
+            while prefix.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
                prefix.pop();
            }
            prefix
@@ -253,23 +228,25 @@ impl S3Bucket {

 #[async_trait::async_trait]
 impl RemoteStorage for S3Bucket {
-    type RemoteObjectId = S3ObjectKey;
-
-    fn remote_object_id(&self, local_path: &Path) -> anyhow::Result<Self::RemoteObjectId> {
+    fn remote_object_id(&self, local_path: &Path) -> anyhow::Result<RemoteObjectId> {
        let relative_path = strip_path_prefix(&self.workdir, local_path)?;
        let mut key = self.prefix_in_bucket.clone().unwrap_or_default();
        for segment in relative_path {
-            key.push(S3_PREFIX_SEPARATOR);
+            key.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
            key.push_str(&segment.to_string_lossy());
        }
-        Ok(S3ObjectKey(key))
+        Ok(RemoteObjectId(key))
    }

-    fn local_path(&self, storage_path: &Self::RemoteObjectId) -> anyhow::Result<PathBuf> {
-        Ok(storage_path.download_destination(&self.workdir, self.prefix_in_bucket.as_deref()))
+    fn local_path(&self, storage_path: &RemoteObjectId) -> anyhow::Result<PathBuf> {
+        Ok(download_destination(
+            storage_path,
+            &self.workdir,
+            self.prefix_in_bucket.as_deref(),
+        ))
    }

-    async fn list(&self) -> anyhow::Result<Vec<Self::RemoteObjectId>> {
+    async fn list(&self) -> anyhow::Result<Vec<RemoteObjectId>> {
        let mut document_keys = Vec::new();

        let mut continuation_token = None;
@@ -300,7 +277,7 @@ impl RemoteStorage for S3Bucket {
                    .contents
                    .unwrap_or_default()
                    .into_iter()
-                    .filter_map(|o| Some(S3ObjectKey(o.key?))),
+                    .filter_map(|o| Some(RemoteObjectId(o.key?))),
            );

            match fetch_response.continuation_token {
@@ -316,17 +293,17 @@ impl RemoteStorage for S3Bucket {
    /// Note: it wont include empty "directories"
    async fn list_prefixes(
        &self,
-        prefix: Option<Self::RemoteObjectId>,
-    ) -> anyhow::Result<Vec<Self::RemoteObjectId>> {
+        prefix: Option<&RemoteObjectId>,
+    ) -> anyhow::Result<Vec<RemoteObjectId>> {
        // get the passed prefix or if it is not set use prefix_in_bucket value
        let list_prefix = prefix
-            .map(|p| p.0)
+            .map(|p| p.0.clone())
            .or_else(|| self.prefix_in_bucket.clone())
            .map(|mut p| {
                // required to end with a separator
                // otherwise request will return only the entry of a prefix
-                if !p.ends_with(S3_PREFIX_SEPARATOR) {
-                    p.push(S3_PREFIX_SEPARATOR);
+                if !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
+                    p.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
                }
                p
            });
@@ -349,7 +326,7 @@ impl RemoteStorage for S3Bucket {
                    bucket: self.bucket_name.clone(),
                    prefix: list_prefix.clone(),
                    continuation_token,
-                    delimiter: Some(S3_PREFIX_SEPARATOR.to_string()),
+                    delimiter: Some(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string()),
                    ..ListObjectsV2Request::default()
                })
                .await
@@ -363,7 +340,7 @@ impl RemoteStorage for S3Bucket {
                    .common_prefixes
                    .unwrap_or_default()
                    .into_iter()
-                    .filter_map(|o| Some(S3ObjectKey(o.prefix?))),
+                    .filter_map(|o| Some(RemoteObjectId(o.prefix?))),
            );

            match fetch_response.continuation_token {
@@ -377,9 +354,9 @@ impl RemoteStorage for S3Bucket {

    async fn upload(
        &self,
-        from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
+        from: Box<(dyn io::AsyncRead + Unpin + Send + Sync + 'static)>,
        from_size_bytes: usize,
-        to: &Self::RemoteObjectId,
+        to: &RemoteObjectId,
        metadata: Option<StorageMetadata>,
    ) -> anyhow::Result<()> {
        let _guard = self
@@ -396,7 +373,7 @@ impl RemoteStorage for S3Bucket {
                    from_size_bytes,
                )),
                bucket: self.bucket_name.clone(),
-                key: to.key().to_owned(),
+                key: to.0.to_owned(),
                metadata: metadata.map(|m| m.0),
                ..PutObjectRequest::default()
            })
@@ -408,10 +385,10 @@ impl RemoteStorage for S3Bucket {
        Ok(())
    }

-    async fn download(&self, from: &Self::RemoteObjectId) -> Result<Download, DownloadError> {
+    async fn download(&self, from: &RemoteObjectId) -> Result<Download, DownloadError> {
        self.download_object(GetObjectRequest {
            bucket: self.bucket_name.clone(),
-            key: from.key().to_owned(),
+            key: from.0.to_owned(),
            ..GetObjectRequest::default()
        })
        .await
@@ -419,7 +396,7 @@ impl RemoteStorage for S3Bucket {

    async fn download_byte_range(
        &self,
-        from: &Self::RemoteObjectId,
+        from: &RemoteObjectId,
        start_inclusive: u64,
        end_exclusive: Option<u64>,
    ) -> Result<Download, DownloadError> {
@@ -433,14 +410,14 @@ impl RemoteStorage for S3Bucket {

        self.download_object(GetObjectRequest {
            bucket: self.bucket_name.clone(),
-            key: from.key().to_owned(),
+            key: from.0.to_owned(),
            range,
            ..GetObjectRequest::default()
        })
        .await
    }

-    async fn delete(&self, path: &Self::RemoteObjectId) -> anyhow::Result<()> {
+    async fn delete(&self, remote_object_id: &RemoteObjectId) -> anyhow::Result<()> {
        let _guard = self
            .concurrency_limiter
            .acquire()
@@ -452,7 +429,7 @@ impl RemoteStorage for S3Bucket {
        self.client
            .delete_object(DeleteObjectRequest {
                bucket: self.bucket_name.clone(),
-                key: path.key().to_owned(),
+                key: remote_object_id.0.to_owned(),
                ..DeleteObjectRequest::default()
            })
            .await
@@ -471,43 +448,24 @@ mod tests {
    use super::*;

    #[test]
-    fn object_name() {
-        let k = S3ObjectKey("a/b/c".to_owned());
-        assert_eq!(k.object_name(), Some("c"));
-
-        let k = S3ObjectKey("a/b/c/".to_owned());
-        assert_eq!(k.object_name(), Some("c"));
-
-        let k = S3ObjectKey("a/".to_owned());
-        assert_eq!(k.object_name(), Some("a"));
-
-        // XXX is it impossible to have an empty key?
-        let k = S3ObjectKey("".to_owned());
-        assert_eq!(k.object_name(), None);
-
-        let k = S3ObjectKey("/".to_owned());
-        assert_eq!(k.object_name(), None);
-    }
-
-    #[test]
-    fn download_destination() -> anyhow::Result<()> {
+    fn test_download_destination() -> anyhow::Result<()> {
        let workdir = tempdir()?.path().to_owned();
        let local_path = workdir.join("one").join("two").join("test_name");
        let relative_path = local_path.strip_prefix(&workdir)?;

-        let key = S3ObjectKey(format!(
+        let key = RemoteObjectId(format!(
            "{}{}",
-            S3_PREFIX_SEPARATOR,
+            REMOTE_STORAGE_PREFIX_SEPARATOR,
            relative_path
                .iter()
                .map(|segment| segment.to_str().unwrap())
                .collect::<Vec<_>>()
-                .join(&S3_PREFIX_SEPARATOR.to_string()),
+                .join(&REMOTE_STORAGE_PREFIX_SEPARATOR.to_string()),
        ));

        assert_eq!(
            local_path,
-            key.download_destination(&workdir, None),
+            download_destination(&key, &workdir, None),
            "Download destination should consist of s3 path joined with the workdir prefix"
        );

@@ -524,8 +482,8 @@ mod tests {

        let storage = dummy_storage(workdir);

-        let expected_key = S3ObjectKey(format!(
-            "{}{S3_PREFIX_SEPARATOR}{segment_1}{S3_PREFIX_SEPARATOR}{segment_2}",
+        let expected_key = RemoteObjectId(format!(
+            "{}{REMOTE_STORAGE_PREFIX_SEPARATOR}{segment_1}{REMOTE_STORAGE_PREFIX_SEPARATOR}{segment_2}",
            storage.prefix_in_bucket.as_deref().unwrap_or_default(),
        ));

@@ -596,7 +554,7 @@ mod tests {
            storage.prefix_in_bucket.as_deref(),
        );
        assert_eq!(
-            s3_key.download_destination(&workdir, storage.prefix_in_bucket.as_deref()),
+            download_destination(&s3_key, &workdir, storage.prefix_in_bucket.as_deref()),
            storage
                .local_path(&s3_key)
                .expect("For a valid input, valid S3 info should be parsed"),
@@ -608,7 +566,7 @@ mod tests {
            storage.prefix_in_bucket.as_deref(),
        );
        assert_eq!(
-            s3_key.download_destination(&workdir, storage.prefix_in_bucket.as_deref()),
+            download_destination(&s3_key, &workdir, storage.prefix_in_bucket.as_deref()),
            storage
                .local_path(&s3_key)
                .expect("For a valid input, valid S3 info should be parsed"),
@@ -649,11 +607,11 @@ mod tests {
        }
    }

-    fn create_s3_key(relative_file_path: &Path, prefix: Option<&str>) -> S3ObjectKey {
-        S3ObjectKey(relative_file_path.iter().fold(
+    fn create_s3_key(relative_file_path: &Path, prefix: Option<&str>) -> RemoteObjectId {
+        RemoteObjectId(relative_file_path.iter().fold(
            prefix.unwrap_or_default().to_string(),
            |mut path_string, segment| {
-                path_string.push(S3_PREFIX_SEPARATOR);
+                path_string.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
                path_string.push_str(segment.to_str().unwrap());
                path_string
            },
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -4,6 +4,7 @@ version = "0.1.0"
 edition = "2021"

 [dependencies]
+async-trait = "0.1"
 anyhow = "1.0"
 bincode = "1.3"
 bytes = "1.0.1"
@@ -16,6 +17,7 @@ serde = { version = "1.0", features = ["derive"] }
 serde_json = "1"
 thiserror = "1.0"
 tokio = { version = "1.17", features = ["macros"]}
+tokio-rustls = "0.23"
 tracing = "0.1"
 tracing-subscriber = { version = "0.3", features = ["env-filter"] }
 nix = "0.23.0"
@@ -39,7 +41,7 @@ bytes = "1.0.1"
 hex-literal = "0.3"
 tempfile = "3.2"
 criterion = "0.3"
-rustls-pemfile = "0.2.1"
+rustls-pemfile = "1"

 [[bench]]
 name = "benchmarks"
--- a/libs/utils/src/bin_ser.rs
+++ b/libs/utils/src/bin_ser.rs
@@ -265,7 +265,7 @@ mod tests {
    use serde::{Deserialize, Serialize};
    use std::io::Cursor;

-    #[derive(Debug, PartialEq, Serialize, Deserialize)]
+    #[derive(Debug, PartialEq, Eq, Serialize, Deserialize)]
    pub struct ShortStruct {
        a: u8,
        b: u32,
@@ -286,7 +286,7 @@ mod tests {
    const SHORT2_ENC_LE: &[u8] = &[8, 0, 0, 3, 7];
    const SHORT2_ENC_LE_TRAILING: &[u8] = &[8, 0, 0, 3, 7, 0xff, 0xff, 0xff];

-    #[derive(Debug, PartialEq, Serialize, Deserialize)]
+    #[derive(Debug, PartialEq, Eq, Serialize, Deserialize)]
    pub struct LongMsg {
        pub tag: u8,
        pub blockpos: u32,
--- a/libs/utils/src/http/request.rs
+++ b/libs/utils/src/http/request.rs
@@ -10,12 +10,10 @@ pub fn get_request_param<'a>(
 ) -> Result<&'a str, ApiError> {
    match request.param(param_name) {
        Some(arg) => Ok(arg),
-        None => {
-            return Err(ApiError::BadRequest(format!(
-                "no {} specified in path param",
-                param_name
-            )))
-        }
+        None => Err(ApiError::BadRequest(format!(
+            "no {} specified in path param",
+            param_name
+        ))),
    }
 }

--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -8,14 +8,15 @@ pub mod lsn;
 /// SeqWait allows waiting for a future sequence number to arrive
 pub mod seqwait;

+/// A simple Read-Copy-Update implementation.
+pub mod simple_rcu;
+
 /// append only ordered map implemented with a Vec
 pub mod vec_map;

-// Async version of SeqWait. Currently unused.
-// pub mod seqwait_async;
-
 pub mod bin_ser;
 pub mod postgres_backend;
+pub mod postgres_backend_async;
 pub mod pq_proto;

 // dealing with connstring parsing and handy access to it's parts
--- a/libs/utils/src/lsn.rs
+++ b/libs/utils/src/lsn.rs
@@ -18,7 +18,7 @@ pub const XLOG_BLCKSZ: u32 = 8192;
 pub struct Lsn(pub u64);

 /// We tried to parse an LSN from a string, but failed
-#[derive(Debug, PartialEq, thiserror::Error)]
+#[derive(Debug, PartialEq, Eq, thiserror::Error)]
 #[error("LsnParseError")]
 pub struct LsnParseError;

--- a/libs/utils/src/postgres_backend.rs
+++ b/libs/utils/src/postgres_backend.rs
@@ -50,7 +50,7 @@ pub trait Handler {

 /// PostgresBackend protocol state.
 /// XXX: The order of the constructors matters.
-#[derive(Clone, Copy, PartialEq, PartialOrd)]
+#[derive(Clone, Copy, PartialEq, Eq, PartialOrd)]
 pub enum ProtoState {
    Initialization,
    Encrypted,
@@ -163,14 +163,9 @@ pub fn is_socket_read_timed_out(error: &anyhow::Error) -> bool {
    false
 }

-// Truncate 0 from C string in Bytes and stringify it (returns slice, no allocations)
-// PG protocol strings are always C strings.
-fn cstr_to_str(b: &Bytes) -> Result<&str> {
-    let without_null = if b.last() == Some(&0) {
-        &b[..b.len() - 1]
-    } else {
-        &b[..]
-    };
+// Cast a byte slice to a string slice, dropping null terminator if there's one.
+fn cstr_to_str(bytes: &[u8]) -> Result<&str> {
+    let without_null = bytes.strip_suffix(&[0]).unwrap_or(bytes);
    std::str::from_utf8(without_null).map_err(|e| e.into())
 }

@@ -423,9 +418,9 @@ impl PostgresBackend {
                self.state = ProtoState::Established;
            }

-            FeMessage::Query(m) => {
+            FeMessage::Query(body) => {
                // remove null terminator
-                let query_string = cstr_to_str(&m.body)?;
+                let query_string = cstr_to_str(&body)?;

                trace!("got query {:?}", query_string);
                // xxx distinguish fatal and recoverable errors?
--- a/libs/utils/src/postgres_backend_async.rs
+++ b/libs/utils/src/postgres_backend_async.rs
@@ -0,0 +1,485 @@
+//! Server-side asynchronous Postgres connection, as limited as we need.
+//! To use, create PostgresBackend and run() it, passing the Handler
+//! implementation determining how to process the queries. Currently its API
+//! is rather narrow, but we can extend it once required.
+
+use crate::postgres_backend::AuthType;
+use crate::pq_proto::{BeMessage, BeParameterStatusMessage, FeMessage, FeStartupPacket};
+use anyhow::{bail, Context, Result};
+use bytes::{Bytes, BytesMut};
+use rand::Rng;
+use std::future::Future;
+use std::net::SocketAddr;
+use std::pin::Pin;
+use std::sync::Arc;
+use std::task::Poll;
+use tracing::{debug, error, trace};
+
+use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt};
+use tokio_rustls::TlsAcceptor;
+
+#[async_trait::async_trait]
+pub trait Handler {
+    /// Handle single query.
+    /// postgres_backend will issue ReadyForQuery after calling this (this
+    /// might be not what we want after CopyData streaming, but currently we don't
+    /// care).
+    async fn process_query(&mut self, pgb: &mut PostgresBackend, query_string: &str) -> Result<()>;
+
+    /// Called on startup packet receival, allows to process params.
+    ///
+    /// If Ok(false) is returned postgres_backend will skip auth -- that is needed for new users
+    /// creation is the proxy code. That is quite hacky and ad-hoc solution, may be we could allow
+    /// to override whole init logic in implementations.
+    fn startup(&mut self, _pgb: &mut PostgresBackend, _sm: &FeStartupPacket) -> Result<()> {
+        Ok(())
+    }
+
+    /// Check auth md5
+    fn check_auth_md5(&mut self, _pgb: &mut PostgresBackend, _md5_response: &[u8]) -> Result<()> {
+        bail!("MD5 auth failed")
+    }
+
+    /// Check auth jwt
+    fn check_auth_jwt(&mut self, _pgb: &mut PostgresBackend, _jwt_response: &[u8]) -> Result<()> {
+        bail!("JWT auth failed")
+    }
+}
+
+/// PostgresBackend protocol state.
+/// XXX: The order of the constructors matters.
+#[derive(Clone, Copy, PartialEq, Eq, PartialOrd)]
+pub enum ProtoState {
+    Initialization,
+    Encrypted,
+    Authentication,
+    Established,
+    Closed,
+}
+
+#[derive(Clone, Copy)]
+pub enum ProcessMsgResult {
+    Continue,
+    Break,
+}
+
+/// Always-writeable sock_split stream.
+/// May not be readable. See [`PostgresBackend::take_stream_in`]
+pub enum Stream {
+    Unencrypted(tokio::net::TcpStream),
+    Tls(Box<tokio_rustls::server::TlsStream<tokio::net::TcpStream>>),
+    Broken,
+}
+
+impl AsyncWrite for Stream {
+    fn poll_write(
+        self: Pin<&mut Self>,
+        cx: &mut std::task::Context<'_>,
+        buf: &[u8],
+    ) -> Poll<Result<usize, std::io::Error>> {
+        match self.get_mut() {
+            Self::Unencrypted(stream) => Pin::new(stream).poll_write(cx, buf),
+            Self::Tls(stream) => Pin::new(stream).poll_write(cx, buf),
+            Self::Broken => unreachable!(),
+        }
+    }
+    fn poll_flush(
+        self: Pin<&mut Self>,
+        cx: &mut std::task::Context<'_>,
+    ) -> Poll<Result<(), std::io::Error>> {
+        match self.get_mut() {
+            Self::Unencrypted(stream) => Pin::new(stream).poll_flush(cx),
+            Self::Tls(stream) => Pin::new(stream).poll_flush(cx),
+            Self::Broken => unreachable!(),
+        }
+    }
+    fn poll_shutdown(
+        self: Pin<&mut Self>,
+        cx: &mut std::task::Context<'_>,
+    ) -> Poll<Result<(), std::io::Error>> {
+        match self.get_mut() {
+            Self::Unencrypted(stream) => Pin::new(stream).poll_shutdown(cx),
+            Self::Tls(stream) => Pin::new(stream).poll_shutdown(cx),
+            Self::Broken => unreachable!(),
+        }
+    }
+}
+impl AsyncRead for Stream {
+    fn poll_read(
+        self: Pin<&mut Self>,
+        cx: &mut std::task::Context<'_>,
+        buf: &mut tokio::io::ReadBuf<'_>,
+    ) -> Poll<Result<(), std::io::Error>> {
+        match self.get_mut() {
+            Self::Unencrypted(stream) => Pin::new(stream).poll_read(cx, buf),
+            Self::Tls(stream) => Pin::new(stream).poll_read(cx, buf),
+            Self::Broken => unreachable!(),
+        }
+    }
+}
+
+pub struct PostgresBackend {
+    stream: Stream,
+    // Output buffer. c.f. BeMessage::write why we are using BytesMut here.
+    buf_out: BytesMut,
+
+    pub state: ProtoState,
+
+    md5_salt: [u8; 4],
+    auth_type: AuthType,
+
+    peer_addr: SocketAddr,
+    pub tls_config: Option<Arc<rustls::ServerConfig>>,
+}
+
+pub fn query_from_cstring(query_string: Bytes) -> Vec<u8> {
+    let mut query_string = query_string.to_vec();
+    if let Some(ch) = query_string.last() {
+        if *ch == 0 {
+            query_string.pop();
+        }
+    }
+    query_string
+}
+
+// Cast a byte slice to a string slice, dropping null terminator if there's one.
+fn cstr_to_str(bytes: &[u8]) -> Result<&str> {
+    let without_null = bytes.strip_suffix(&[0]).unwrap_or(bytes);
+    std::str::from_utf8(without_null).map_err(|e| e.into())
+}
+
+impl PostgresBackend {
+    pub fn new(
+        socket: tokio::net::TcpStream,
+        auth_type: AuthType,
+        tls_config: Option<Arc<rustls::ServerConfig>>,
+    ) -> std::io::Result<Self> {
+        let peer_addr = socket.peer_addr()?;
+
+        Ok(Self {
+            stream: Stream::Unencrypted(socket),
+            buf_out: BytesMut::with_capacity(10 * 1024),
+            state: ProtoState::Initialization,
+            md5_salt: [0u8; 4],
+            auth_type,
+            tls_config,
+            peer_addr,
+        })
+    }
+
+    pub fn get_peer_addr(&self) -> &SocketAddr {
+        &self.peer_addr
+    }
+
+    /// Read full message or return None if connection is closed.
+    pub async fn read_message(&mut self) -> Result<Option<FeMessage>> {
+        use ProtoState::*;
+        match self.state {
+            Initialization | Encrypted => FeStartupPacket::read_fut(&mut self.stream).await,
+            Authentication | Established => FeMessage::read_fut(&mut self.stream).await,
+            Closed => Ok(None),
+        }
+    }
+
+    /// Flush output buffer into the socket.
+    pub async fn flush(&mut self) -> std::io::Result<&mut Self> {
+        self.stream.write_all(&self.buf_out).await?;
+        self.buf_out.clear();
+        Ok(self)
+    }
+
+    /// Write message into internal output buffer.
+    pub fn write_message(&mut self, message: &BeMessage<'_>) -> Result<&mut Self, std::io::Error> {
+        BeMessage::write(&mut self.buf_out, message)?;
+        Ok(self)
+    }
+
+    // Wrapper for run_message_loop() that shuts down socket when we are done
+    pub async fn run<F, S>(mut self, handler: &mut impl Handler, shutdown_watcher: F) -> Result<()>
+    where
+        F: Fn() -> S,
+        S: Future,
+    {
+        let ret = self.run_message_loop(handler, shutdown_watcher).await;
+        let _ = self.stream.shutdown();
+        ret
+    }
+
+    async fn run_message_loop<F, S>(
+        &mut self,
+        handler: &mut impl Handler,
+        shutdown_watcher: F,
+    ) -> Result<()>
+    where
+        F: Fn() -> S,
+        S: Future,
+    {
+        trace!("postgres backend to {:?} started", self.peer_addr);
+
+        tokio::select!(
+            biased;
+
+            _ = shutdown_watcher() => {
+                // We were requested to shut down.
+                tracing::info!("shutdown request received during handshake");
+                return Ok(())
+            },
+
+            result = async {
+                while self.state < ProtoState::Established {
+                    if let Some(msg) = self.read_message().await? {
+                        trace!("got message {msg:?} during handshake");
+
+                        match self.process_handshake_message(handler, msg).await? {
+                            ProcessMsgResult::Continue => {
+                                self.flush().await?;
+                                continue;
+                            }
+                            ProcessMsgResult::Break => {
+                                trace!("postgres backend to {:?} exited during handshake", self.peer_addr);
+                                return Ok(());
+                            }
+                        }
+                    } else {
+                        trace!("postgres backend to {:?} exited during handshake", self.peer_addr);
+                        return Ok(());
+                    }
+                }
+                Ok::<(), anyhow::Error>(())
+            } => {
+                // Handshake complete.
+                result?;
+            }
+        );
+
+        // Authentication completed
+        let mut query_string = Bytes::new();
+        while let Some(msg) = tokio::select!(
+            biased;
+            _ = shutdown_watcher() => {
+                // We were requested to shut down.
+                tracing::info!("shutdown request received in run_message_loop");
+                Ok(None)
+            },
+            msg = self.read_message() => { msg },
+        )? {
+            trace!("got message {:?}", msg);
+
+            let result = self.process_message(handler, msg, &mut query_string).await;
+            self.flush().await?;
+            match result? {
+                ProcessMsgResult::Continue => {
+                    self.flush().await?;
+                    continue;
+                }
+                ProcessMsgResult::Break => break,
+            }
+        }
+
+        trace!("postgres backend to {:?} exited", self.peer_addr);
+        Ok(())
+    }
+
+    async fn start_tls(&mut self) -> anyhow::Result<()> {
+        if let Stream::Unencrypted(plain_stream) =
+            std::mem::replace(&mut self.stream, Stream::Broken)
+        {
+            let acceptor = TlsAcceptor::from(self.tls_config.clone().unwrap());
+            let tls_stream = acceptor.accept(plain_stream).await?;
+
+            self.stream = Stream::Tls(Box::new(tls_stream));
+            return Ok(());
+        };
+        bail!("TLS already started");
+    }
+
+    async fn process_handshake_message(
+        &mut self,
+        handler: &mut impl Handler,
+        msg: FeMessage,
+    ) -> Result<ProcessMsgResult> {
+        assert!(self.state < ProtoState::Established);
+        let have_tls = self.tls_config.is_some();
+        match msg {
+            FeMessage::StartupPacket(m) => {
+                trace!("got startup message {m:?}");
+
+                match m {
+                    FeStartupPacket::SslRequest => {
+                        debug!("SSL requested");
+
+                        self.write_message(&BeMessage::EncryptionResponse(have_tls))?;
+                        if have_tls {
+                            self.start_tls().await?;
+                            self.state = ProtoState::Encrypted;
+                        }
+                    }
+                    FeStartupPacket::GssEncRequest => {
+                        debug!("GSS requested");
+                        self.write_message(&BeMessage::EncryptionResponse(false))?;
+                    }
+                    FeStartupPacket::StartupMessage { .. } => {
+                        if have_tls && !matches!(self.state, ProtoState::Encrypted) {
+                            self.write_message(&BeMessage::ErrorResponse("must connect with TLS"))?;
+                            bail!("client did not connect with TLS");
+                        }
+
+                        // NB: startup() may change self.auth_type -- we are using that in proxy code
+                        // to bypass auth for new users.
+                        handler.startup(self, &m)?;
+
+                        match self.auth_type {
+                            AuthType::Trust => {
+                                self.write_message(&BeMessage::AuthenticationOk)?
+                                    .write_message(&BeParameterStatusMessage::encoding())?
+                                    // The async python driver requires a valid server_version
+                                    .write_message(&BeMessage::ParameterStatus(
+                                        BeParameterStatusMessage::ServerVersion("14.1"),
+                                    ))?
+                                    .write_message(&BeMessage::ReadyForQuery)?;
+                                self.state = ProtoState::Established;
+                            }
+                            AuthType::MD5 => {
+                                rand::thread_rng().fill(&mut self.md5_salt);
+                                self.write_message(&BeMessage::AuthenticationMD5Password(
+                                    self.md5_salt,
+                                ))?;
+                                self.state = ProtoState::Authentication;
+                            }
+                            AuthType::ZenithJWT => {
+                                self.write_message(&BeMessage::AuthenticationCleartextPassword)?;
+                                self.state = ProtoState::Authentication;
+                            }
+                        }
+                    }
+                    FeStartupPacket::CancelRequest { .. } => {
+                        self.state = ProtoState::Closed;
+                        return Ok(ProcessMsgResult::Break);
+                    }
+                }
+            }
+
+            FeMessage::PasswordMessage(m) => {
+                trace!("got password message '{:?}'", m);
+
+                assert!(self.state == ProtoState::Authentication);
+
+                match self.auth_type {
+                    AuthType::Trust => unreachable!(),
+                    AuthType::MD5 => {
+                        let (_, md5_response) = m.split_last().context("protocol violation")?;
+
+                        if let Err(e) = handler.check_auth_md5(self, md5_response) {
+                            self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?;
+                            bail!("auth failed: {}", e);
+                        }
+                    }
+                    AuthType::ZenithJWT => {
+                        let (_, jwt_response) = m.split_last().context("protocol violation")?;
+
+                        if let Err(e) = handler.check_auth_jwt(self, jwt_response) {
+                            self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?;
+                            bail!("auth failed: {}", e);
+                        }
+                    }
+                }
+                self.write_message(&BeMessage::AuthenticationOk)?
+                    .write_message(&BeParameterStatusMessage::encoding())?
+                    .write_message(&BeMessage::ReadyForQuery)?;
+                self.state = ProtoState::Established;
+            }
+
+            _ => {
+                self.state = ProtoState::Closed;
+                return Ok(ProcessMsgResult::Break);
+            }
+        }
+        Ok(ProcessMsgResult::Continue)
+    }
+
+    async fn process_message(
+        &mut self,
+        handler: &mut impl Handler,
+        msg: FeMessage,
+        unnamed_query_string: &mut Bytes,
+    ) -> Result<ProcessMsgResult> {
+        // Allow only startup and password messages during auth. Otherwise client would be able to bypass auth
+        // TODO: change that to proper top-level match of protocol state with separate message handling for each state
+        assert!(self.state == ProtoState::Established);
+
+        match msg {
+            FeMessage::StartupPacket(_) | FeMessage::PasswordMessage(_) => {
+                bail!("protocol violation");
+            }
+
+            FeMessage::Query(body) => {
+                // remove null terminator
+                let query_string = cstr_to_str(&body)?;
+
+                trace!("got query {:?}", query_string);
+                // xxx distinguish fatal and recoverable errors?
+                if let Err(e) = handler.process_query(self, query_string).await {
+                    // ":?" uses the alternate formatting style, which makes anyhow display the
+                    // full cause of the error, not just the top-level context + its trace.
+                    // We don't want to send that in the ErrorResponse though,
+                    // because it's not relevant to the compute node logs.
+                    error!("query handler for '{}' failed: {:?}", query_string, e);
+                    self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?;
+                    // TODO: untangle convoluted control flow
+                    if e.to_string().contains("failed to run") {
+                        return Ok(ProcessMsgResult::Break);
+                    }
+                }
+                self.write_message(&BeMessage::ReadyForQuery)?;
+            }
+
+            FeMessage::Parse(m) => {
+                *unnamed_query_string = m.query_string;
+                self.write_message(&BeMessage::ParseComplete)?;
+            }
+
+            FeMessage::Describe(_) => {
+                self.write_message(&BeMessage::ParameterDescription)?
+                    .write_message(&BeMessage::NoData)?;
+            }
+
+            FeMessage::Bind(_) => {
+                self.write_message(&BeMessage::BindComplete)?;
+            }
+
+            FeMessage::Close(_) => {
+                self.write_message(&BeMessage::CloseComplete)?;
+            }
+
+            FeMessage::Execute(_) => {
+                let query_string = cstr_to_str(unnamed_query_string)?;
+                trace!("got execute {:?}", query_string);
+                // xxx distinguish fatal and recoverable errors?
+                if let Err(e) = handler.process_query(self, query_string).await {
+                    error!("query handler for '{}' failed: {:?}", query_string, e);
+                    self.write_message(&BeMessage::ErrorResponse(&e.to_string()))?;
+                }
+                // NOTE there is no ReadyForQuery message. This handler is used
+                // for basebackup and it uses CopyOut which doesn't require
+                // ReadyForQuery message and backend just switches back to
+                // processing mode after sending CopyDone or ErrorResponse.
+            }
+
+            FeMessage::Sync => {
+                self.write_message(&BeMessage::ReadyForQuery)?;
+            }
+
+            FeMessage::Terminate => {
+                return Ok(ProcessMsgResult::Break);
+            }
+
+            // We prefer explicit pattern matching to wildcards, because
+            // this helps us spot the places where new variants are missing
+            FeMessage::CopyData(_) | FeMessage::CopyDone | FeMessage::CopyFail => {
+                bail!("unexpected message type: {:?}", msg);
+            }
+        }
+
+        Ok(ProcessMsgResult::Continue)
+    }
+}
--- a/libs/utils/src/pq_proto.rs
+++ b/libs/utils/src/pq_proto.rs
@@ -7,11 +7,14 @@ use anyhow::{bail, ensure, Context, Result};
 use bytes::{Buf, BufMut, Bytes, BytesMut};
 use postgres_protocol::PG_EPOCH;
 use serde::{Deserialize, Serialize};
-use std::collections::HashMap;
-use std::future::Future;
-use std::io::{self, Cursor};
-use std::str;
-use std::time::{Duration, SystemTime};
+use std::{
+    borrow::Cow,
+    collections::HashMap,
+    future::Future,
+    io::{self, Cursor},
+    str,
+    time::{Duration, SystemTime},
+};
 use tokio::io::AsyncReadExt;
 use tracing::{trace, warn};

@@ -25,8 +28,10 @@ pub const TEXT_OID: Oid = 25;
 #[derive(Debug)]
 pub enum FeMessage {
    StartupPacket(FeStartupPacket),
-    Query(FeQueryMessage), // Simple query
-    Parse(FeParseMessage), // Extended query protocol
+    // Simple query.
+    Query(Bytes),
+    // Extended query protocol.
+    Parse(FeParseMessage),
    Describe(FeDescribeMessage),
    Bind(FeBindMessage),
    Execute(FeExecuteMessage),
@@ -51,7 +56,67 @@ pub enum FeStartupPacket {
    },
 }

-pub type StartupMessageParams = HashMap<String, String>;
+#[derive(Debug)]
+pub struct StartupMessageParams {
+    params: HashMap<String, String>,
+}
+
+impl StartupMessageParams {
+    /// Get parameter's value by its name.
+    pub fn get(&self, name: &str) -> Option<&str> {
+        self.params.get(name).map(|s| s.as_str())
+    }
+
+    /// Split command-line options according to PostgreSQL's logic,
+    /// taking into account all escape sequences but leaving them as-is.
+    /// [`None`] means that there's no `options` in [`Self`].
+    pub fn options_raw(&self) -> Option<impl Iterator<Item = &str>> {
+        // See `postgres: pg_split_opts`.
+        let mut last_was_escape = false;
+        let iter = self
+            .get("options")?
+            .split(move |c: char| {
+                // We split by non-escaped whitespace symbols.
+                let should_split = c.is_ascii_whitespace() && !last_was_escape;
+                last_was_escape = c == '\\' && !last_was_escape;
+                should_split
+            })
+            .filter(|s| !s.is_empty());
+
+        Some(iter)
+    }
+
+    /// Split command-line options according to PostgreSQL's logic,
+    /// applying all escape sequences (using owned strings as needed).
+    /// [`None`] means that there's no `options` in [`Self`].
+    pub fn options_escaped(&self) -> Option<impl Iterator<Item = Cow<'_, str>>> {
+        // See `postgres: pg_split_opts`.
+        let iter = self.options_raw()?.map(|s| {
+            let mut preserve_next_escape = false;
+            let escape = |c| {
+                // We should remove '\\' unless it's preceded by '\\'.
+                let should_remove = c == '\\' && !preserve_next_escape;
+                preserve_next_escape = should_remove;
+                should_remove
+            };
+
+            match s.contains('\\') {
+                true => Cow::Owned(s.replace(escape, "")),
+                false => Cow::Borrowed(s),
+            }
+        });
+
+        Some(iter)
+    }
+
+    // This function is mostly useful in tests.
+    #[doc(hidden)]
+    pub fn new<'a, const N: usize>(pairs: [(&'a str, &'a str); N]) -> Self {
+        Self {
+            params: pairs.map(|(k, v)| (k.to_owned(), v.to_owned())).into(),
+        }
+    }
+}

 #[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)]
 pub struct CancelKeyData {
@@ -69,11 +134,6 @@ impl Distribution<CancelKeyData> for Standard {
    }
 }

-#[derive(Debug)]
-pub struct FeQueryMessage {
-    pub body: Bytes,
-}
-
 // We only support the simple case of Parse on unnamed prepared statement and
 // no params
 #[derive(Debug)]
@@ -89,7 +149,7 @@ pub struct FeDescribeMessage {

 // we only support unnamed prepared stmt and portal
 #[derive(Debug)]
-pub struct FeBindMessage {}
+pub struct FeBindMessage;

 // we only support unnamed prepared stmt or portal
 #[derive(Debug)]
@@ -100,7 +160,7 @@ pub struct FeExecuteMessage {

 // we only support unnamed prepared stmt and portal
 #[derive(Debug)]
-pub struct FeCloseMessage {}
+pub struct FeCloseMessage;

 /// Retry a read on EINTR
 ///
@@ -163,22 +223,20 @@ impl FeMessage {
                Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => return Ok(None),
                Err(e) => return Err(e.into()),
            };
-            let len = retry_read!(stream.read_u32().await)?;

-            // The message length includes itself, so it better be at least 4
-            let bodylen = len
+            // The message length includes itself, so it better be at least 4.
+            let len = retry_read!(stream.read_u32().await)?
                .checked_sub(4)
-                .context("invalid message length: parsing u32")?;
+                .context("invalid message length")?;

-            // Read message body
-            let mut body_buf: Vec<u8> = vec![0; bodylen as usize];
-            stream.read_exact(&mut body_buf).await?;
+            let body = {
+                let mut buffer = vec![0u8; len as usize];
+                stream.read_exact(&mut buffer).await?;
+                Bytes::from(buffer)
+            };

-            let body = Bytes::from(body_buf);
-
-            // Parse it
            match tag {
-                b'Q' => Ok(Some(FeMessage::Query(FeQueryMessage { body }))),
+                b'Q' => Ok(Some(FeMessage::Query(body))),
                b'P' => Ok(Some(FeParseMessage::parse(body)?)),
                b'D' => Ok(Some(FeDescribeMessage::parse(body)?)),
                b'E' => Ok(Some(FeExecuteMessage::parse(body)?)),
@@ -242,9 +300,9 @@ impl FeStartupPacket {
            stream.read_exact(params_bytes.as_mut()).await?;

            // Parse params depending on request code
-            let most_sig_16_bits = request_code >> 16;
-            let least_sig_16_bits = request_code & ((1 << 16) - 1);
-            let message = match (most_sig_16_bits, least_sig_16_bits) {
+            let req_hi = request_code >> 16;
+            let req_lo = request_code & ((1 << 16) - 1);
+            let message = match (req_hi, req_lo) {
                (RESERVED_INVALID_MAJOR_VERSION, CANCEL_REQUEST_CODE) => {
                    ensure!(params_len == 8, "expected 8 bytes for CancelRequest params");
                    let mut cursor = Cursor::new(params_bytes);
@@ -253,173 +311,115 @@ impl FeStartupPacket {
                        cancel_key: cursor.read_i32().await?,
                    })
                }
-                (RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_SSL_CODE) => FeStartupPacket::SslRequest,
+                (RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_SSL_CODE) => {
+                    // Requested upgrade to SSL (aka TLS)
+                    FeStartupPacket::SslRequest
+                }
                (RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_GSS_CODE) => {
+                    // Requested upgrade to GSSAPI
                    FeStartupPacket::GssEncRequest
                }
                (RESERVED_INVALID_MAJOR_VERSION, unrecognized_code) => {
                    bail!("Unrecognized request code {}", unrecognized_code)
                }
+                // TODO bail if protocol major_version is not 3?
                (major_version, minor_version) => {
-                    // TODO bail if protocol major_version is not 3?
-                    // Parse null-terminated (String) pairs of param name / param value
-                    let params_str = str::from_utf8(&params_bytes).unwrap();
-                    let mut params_tokens = params_str.split('\0');
-                    let mut params: HashMap<String, String> = HashMap::new();
-                    while let Some(name) = params_tokens.next() {
-                        let value = params_tokens
+                    // Parse pairs of null-terminated strings (key, value).
+                    // See `postgres: ProcessStartupPacket, build_startup_packet`.
+                    let mut tokens = str::from_utf8(&params_bytes)
+                        .context("StartupMessage params: invalid utf-8")?
+                        .strip_suffix('\0') // drop packet's own null terminator
+                        .context("StartupMessage params: missing null terminator")?
+                        .split_terminator('\0');
+
+                    let mut params = HashMap::new();
+                    while let Some(name) = tokens.next() {
+                        let value = tokens
                            .next()
-                            .context("expected even number of params in StartupMessage")?;
-                        if name == "options" {
-                            // parsing options arguments "...&options=<var0>%3D<val0>+<var1>=<var1>..."
-                            // '%3D' is '=' and '+' is ' '
+                            .context("StartupMessage params: key without value")?;

-                            // Note: we allow users that don't have SNI capabilities,
-                            // to pass a special keyword argument 'project'
-                            // to be used to determine the cluster name by the proxy.
-
-                            //TODO: write unit test for this and refactor in its own function.
-                            for cmdopt in value.split(' ') {
-                                let nameval: Vec<&str> = cmdopt.split('=').collect();
-                                if nameval.len() == 2 {
-                                    params.insert(nameval[0].to_string(), nameval[1].to_string());
-                                }
-                            }
-                        } else {
-                            params.insert(name.to_string(), value.to_string());
-                        }
+                        params.insert(name.to_owned(), value.to_owned());
                    }
+
                    FeStartupPacket::StartupMessage {
                        major_version,
                        minor_version,
-                        params,
+                        params: StartupMessageParams { params },
                    }
                }
            };
+
            Ok(Some(FeMessage::StartupPacket(message)))
        })
    }
 }

 impl FeParseMessage {
-    pub fn parse(mut buf: Bytes) -> anyhow::Result<FeMessage> {
-        let _pstmt_name = read_null_terminated(&mut buf)?;
-        let query_string = read_null_terminated(&mut buf)?;
-        let nparams = buf.get_i16();
-
+    fn parse(mut buf: Bytes) -> anyhow::Result<FeMessage> {
        // FIXME: the rust-postgres driver uses a named prepared statement
        // for copy_out(). We're not prepared to handle that correctly. For
        // now, just ignore the statement name, assuming that the client never
        // uses more than one prepared statement at a time.
-        /*
-        if !pstmt_name.is_empty() {
-            return Err(io::Error::new(
-                io::ErrorKind::InvalidInput,
-                "named prepared statements not implemented in Parse",
-            ));
-        }
-         */

-        if nparams != 0 {
-            bail!("query params not implemented");
-        }
+        let _pstmt_name = read_cstr(&mut buf)?;
+        let query_string = read_cstr(&mut buf)?;
+        let nparams = buf.get_i16();
+
+        ensure!(nparams == 0, "query params not implemented");

        Ok(FeMessage::Parse(FeParseMessage { query_string }))
    }
 }

 impl FeDescribeMessage {
-    pub fn parse(mut buf: Bytes) -> anyhow::Result<FeMessage> {
+    fn parse(mut buf: Bytes) -> anyhow::Result<FeMessage> {
        let kind = buf.get_u8();
-        let _pstmt_name = read_null_terminated(&mut buf)?;
+        let _pstmt_name = read_cstr(&mut buf)?;

        // FIXME: see FeParseMessage::parse
-        /*
-        if !pstmt_name.is_empty() {
-            return Err(io::Error::new(
-                io::ErrorKind::InvalidInput,
-                "named prepared statements not implemented in Describe",
-            ));
-        }
-        */
-
-        if kind != b'S' {
-            bail!("only prepared statmement Describe is implemented");
-        }
+        ensure!(
+            kind == b'S',
+            "only prepared statemement Describe is implemented"
+        );

        Ok(FeMessage::Describe(FeDescribeMessage { kind }))
    }
 }

 impl FeExecuteMessage {
-    pub fn parse(mut buf: Bytes) -> anyhow::Result<FeMessage> {
-        let portal_name = read_null_terminated(&mut buf)?;
+    fn parse(mut buf: Bytes) -> anyhow::Result<FeMessage> {
+        let portal_name = read_cstr(&mut buf)?;
        let maxrows = buf.get_i32();

-        if !portal_name.is_empty() {
-            bail!("named portals not implemented");
-        }
-
-        if maxrows != 0 {
-            bail!("row limit in Execute message not supported");
-        }
+        ensure!(portal_name.is_empty(), "named portals not implemented");
+        ensure!(maxrows == 0, "row limit in Execute message not implemented");

        Ok(FeMessage::Execute(FeExecuteMessage { maxrows }))
    }
 }

 impl FeBindMessage {
-    pub fn parse(mut buf: Bytes) -> anyhow::Result<FeMessage> {
-        let portal_name = read_null_terminated(&mut buf)?;
-        let _pstmt_name = read_null_terminated(&mut buf)?;
-
-        if !portal_name.is_empty() {
-            bail!("named portals not implemented");
-        }
+    fn parse(mut buf: Bytes) -> anyhow::Result<FeMessage> {
+        let portal_name = read_cstr(&mut buf)?;
+        let _pstmt_name = read_cstr(&mut buf)?;

        // FIXME: see FeParseMessage::parse
-        /*
-        if !pstmt_name.is_empty() {
-            return Err(io::Error::new(
-                io::ErrorKind::InvalidInput,
-                "named prepared statements not implemented",
-            ));
-        }
-        */
+        ensure!(portal_name.is_empty(), "named portals not implemented");

-        Ok(FeMessage::Bind(FeBindMessage {}))
+        Ok(FeMessage::Bind(FeBindMessage))
    }
 }

 impl FeCloseMessage {
-    pub fn parse(mut buf: Bytes) -> anyhow::Result<FeMessage> {
+    fn parse(mut buf: Bytes) -> anyhow::Result<FeMessage> {
        let _kind = buf.get_u8();
-        let _pstmt_or_portal_name = read_null_terminated(&mut buf)?;
+        let _pstmt_or_portal_name = read_cstr(&mut buf)?;

        // FIXME: we do nothing with Close
-
-        Ok(FeMessage::Close(FeCloseMessage {}))
+        Ok(FeMessage::Close(FeCloseMessage))
    }
 }

-fn read_null_terminated(buf: &mut Bytes) -> anyhow::Result<Bytes> {
-    let mut result = BytesMut::new();
-
-    loop {
-        if !buf.has_remaining() {
-            bail!("no null-terminator in string");
-        }
-
-        let byte = buf.get_u8();
-
-        if byte == 0 {
-            break;
-        }
-        result.put_u8(byte);
-    }
-    Ok(result.freeze())
-}
-
 // Backend

 #[derive(Debug)]
@@ -441,7 +441,7 @@ pub enum BeMessage<'a> {
    // None means column is NULL
    DataRow(&'a [Option<&'a [u8]>]),
    ErrorResponse(&'a str),
-    // single byte - used in response to SSLRequest/GSSENCRequest
+    /// Single byte - used in response to SSLRequest/GSSENCRequest.
    EncryptionResponse(bool),
    NoData,
    ParameterDescription,
@@ -554,49 +554,22 @@ pub static SINGLE_COL_ROWDESC: BeMessage = BeMessage::RowDescription(&[RowDescri
    formatcode: 0,
 }]);

-// Safe usize -> i32|i16 conversion, from rust-postgres
-trait FromUsize: Sized {
-    fn from_usize(x: usize) -> Result<Self, io::Error>;
-}
-
-macro_rules! from_usize {
-    ($t:ty) => {
-        impl FromUsize for $t {
-            #[inline]
-            fn from_usize(x: usize) -> io::Result<$t> {
-                if x > <$t>::max_value() as usize {
-                    Err(io::Error::new(
-                        io::ErrorKind::InvalidInput,
-                        "value too large to transmit",
-                    ))
-                } else {
-                    Ok(x as $t)
-                }
-            }
-        }
-    };
-}
-
-from_usize!(i32);
-
 /// Call f() to write body of the message and prepend it with 4-byte len as
 /// prescribed by the protocol.
-fn write_body<F>(buf: &mut BytesMut, f: F) -> io::Result<()>
-where
-    F: FnOnce(&mut BytesMut) -> io::Result<()>,
-{
+fn write_body<R>(buf: &mut BytesMut, f: impl FnOnce(&mut BytesMut) -> R) -> R {
    let base = buf.len();
    buf.extend_from_slice(&[0; 4]);

-    f(buf)?;
+    let res = f(buf);

-    let size = i32::from_usize(buf.len() - base)?;
+    let size = i32::try_from(buf.len() - base).expect("message too big to transmit");
    (&mut buf[base..]).put_slice(&size.to_be_bytes());
-    Ok(())
+
+    res
 }

 /// Safe write of s into buf as cstring (String in the protocol).
-pub fn write_cstr(s: &[u8], buf: &mut BytesMut) -> Result<(), io::Error> {
+fn write_cstr(s: &[u8], buf: &mut BytesMut) -> Result<(), io::Error> {
    if s.contains(&0) {
        return Err(io::Error::new(
            io::ErrorKind::InvalidInput,
@@ -608,15 +581,11 @@ pub fn write_cstr(s: &[u8], buf: &mut BytesMut) -> Result<(), io::Error> {
    Ok(())
 }

-// Truncate 0 from C string in Bytes and stringify it (returns slice, no allocations)
-// PG protocol strings are always C strings.
-fn cstr_to_str(b: &Bytes) -> Result<&str> {
-    let without_null = if b.last() == Some(&0) {
-        &b[..b.len() - 1]
-    } else {
-        &b[..]
-    };
-    std::str::from_utf8(without_null).map_err(|e| e.into())
+fn read_cstr(buf: &mut Bytes) -> anyhow::Result<Bytes> {
+    let pos = buf.iter().position(|x| *x == 0);
+    let result = buf.split_to(pos.context("missing terminator")?);
+    buf.advance(1); // drop the null terminator
+    Ok(result)
 }

 impl<'a> BeMessage<'a> {
@@ -631,18 +600,14 @@ impl<'a> BeMessage<'a> {
                buf.put_u8(b'R');
                write_body(buf, |buf| {
                    buf.put_i32(0); // Specifies that the authentication was successful.
-                    Ok::<_, io::Error>(())
-                })
-                .unwrap(); // write into BytesMut can't fail
+                });
            }

            BeMessage::AuthenticationCleartextPassword => {
                buf.put_u8(b'R');
                write_body(buf, |buf| {
                    buf.put_i32(3); // Specifies that clear text password is required.
-                    Ok::<_, io::Error>(())
-                })
-                .unwrap(); // write into BytesMut can't fail
+                });
            }

            BeMessage::AuthenticationMD5Password(salt) => {
@@ -650,9 +615,7 @@ impl<'a> BeMessage<'a> {
                write_body(buf, |buf| {
                    buf.put_i32(5); // Specifies that an MD5-encrypted password is required.
                    buf.put_slice(&salt[..]);
-                    Ok::<_, io::Error>(())
-                })
-                .unwrap(); // write into BytesMut can't fail
+                });
            }

            BeMessage::AuthenticationSasl(msg) => {
@@ -677,8 +640,7 @@ impl<'a> BeMessage<'a> {
                        }
                    }
                    Ok::<_, io::Error>(())
-                })
-                .unwrap()
+                })?;
            }

            BeMessage::BackendKeyData(key_data) => {
@@ -686,77 +648,64 @@ impl<'a> BeMessage<'a> {
                write_body(buf, |buf| {
                    buf.put_i32(key_data.backend_pid);
                    buf.put_i32(key_data.cancel_key);
-                    Ok(())
-                })
-                .unwrap();
+                });
            }

            BeMessage::BindComplete => {
                buf.put_u8(b'2');
-                write_body(buf, |_| Ok::<(), io::Error>(())).unwrap();
+                write_body(buf, |_| {});
            }

            BeMessage::CloseComplete => {
                buf.put_u8(b'3');
-                write_body(buf, |_| Ok::<(), io::Error>(())).unwrap();
+                write_body(buf, |_| {});
            }

            BeMessage::CommandComplete(cmd) => {
                buf.put_u8(b'C');
-                write_body(buf, |buf| {
-                    write_cstr(cmd, buf)?;
-                    Ok::<_, io::Error>(())
-                })?;
+                write_body(buf, |buf| write_cstr(cmd, buf))?;
            }

            BeMessage::CopyData(data) => {
                buf.put_u8(b'd');
                write_body(buf, |buf| {
                    buf.put_slice(data);
-                    Ok::<_, io::Error>(())
-                })
-                .unwrap();
+                });
            }

            BeMessage::CopyDone => {
                buf.put_u8(b'c');
-                write_body(buf, |_| Ok::<(), io::Error>(())).unwrap();
+                write_body(buf, |_| {});
            }

            BeMessage::CopyFail => {
                buf.put_u8(b'f');
-                write_body(buf, |_| Ok::<(), io::Error>(())).unwrap();
+                write_body(buf, |_| {});
            }

            BeMessage::CopyInResponse => {
                buf.put_u8(b'G');
                write_body(buf, |buf| {
-                    buf.put_u8(1); /* copy_is_binary */
-                    buf.put_i16(0); /* numAttributes */
-                    Ok::<_, io::Error>(())
-                })
-                .unwrap();
+                    buf.put_u8(1); // copy_is_binary
+                    buf.put_i16(0); // numAttributes
+                });
            }

            BeMessage::CopyOutResponse => {
                buf.put_u8(b'H');
                write_body(buf, |buf| {
-                    buf.put_u8(0); /* copy_is_binary */
-                    buf.put_i16(0); /* numAttributes */
-                    Ok::<_, io::Error>(())
-                })
-                .unwrap();
+                    buf.put_u8(0); // copy_is_binary
+                    buf.put_i16(0); // numAttributes
+                });
            }

            BeMessage::CopyBothResponse => {
                buf.put_u8(b'W');
                write_body(buf, |buf| {
                    // doesn't matter, used only for replication
-                    buf.put_u8(0); /* copy_is_binary */
-                    buf.put_i16(0); /* numAttributes */
-                    Ok::<_, io::Error>(())
-                })
-                .unwrap();
+                    buf.put_u8(0); // copy_is_binary
+                    buf.put_i16(0); // numAttributes
+                });
            }

            BeMessage::DataRow(vals) => {
@@ -771,9 +720,7 @@ impl<'a> BeMessage<'a> {
                            buf.put_i32(-1);
                        }
                    }
-                    Ok::<_, io::Error>(())
-                })
-                .unwrap();
+                });
            }

            // ErrorResponse is a zero-terminated array of zero-terminated fields.
@@ -788,18 +735,17 @@ impl<'a> BeMessage<'a> {
                buf.put_u8(b'E');
                write_body(buf, |buf| {
                    buf.put_u8(b'S'); // severity
-                    write_cstr(&Bytes::from("ERROR"), buf)?;
+                    buf.put_slice(b"ERROR\0");

                    buf.put_u8(b'C'); // SQLSTATE error code
-                    write_cstr(&Bytes::from("CXX000"), buf)?;
+                    buf.put_slice(b"CXX000\0");

                    buf.put_u8(b'M'); // the message
                    write_cstr(error_msg.as_bytes(), buf)?;

                    buf.put_u8(0); // terminator
                    Ok::<_, io::Error>(())
-                })
-                .unwrap();
+                })?;
            }

            // NoticeResponse has the same format as ErrorResponse. From doc: "The frontend should display the
@@ -812,23 +758,22 @@ impl<'a> BeMessage<'a> {
                buf.put_u8(b'N');
                write_body(buf, |buf| {
                    buf.put_u8(b'S'); // severity
-                    write_cstr(&Bytes::from("NOTICE"), buf)?;
+                    buf.put_slice(b"NOTICE\0");

                    buf.put_u8(b'C'); // SQLSTATE error code
-                    write_cstr(&Bytes::from("CXX000"), buf)?;
+                    buf.put_slice(b"CXX000\0");

                    buf.put_u8(b'M'); // the message
                    write_cstr(error_msg.as_bytes(), buf)?;

                    buf.put_u8(0); // terminator
                    Ok::<_, io::Error>(())
-                })
-                .unwrap();
+                })?;
            }

            BeMessage::NoData => {
                buf.put_u8(b'n');
-                write_body(buf, |_| Ok::<(), io::Error>(())).unwrap();
+                write_body(buf, |_| {});
            }

            BeMessage::EncryptionResponse(should_negotiate) => {
@@ -853,9 +798,7 @@ impl<'a> BeMessage<'a> {
                buf.put_u8(b'S');
                write_body(buf, |buf| {
                    buf.put_slice(&buffer[..cnt]);
-                    Ok::<_, io::Error>(())
-                })
-                .unwrap();
+                });
            }

            BeMessage::ParameterDescription => {
@@ -863,23 +806,19 @@ impl<'a> BeMessage<'a> {
                write_body(buf, |buf| {
                    // we don't support params, so always 0
                    buf.put_i16(0);
-                    Ok::<_, io::Error>(())
-                })
-                .unwrap();
+                });
            }

            BeMessage::ParseComplete => {
                buf.put_u8(b'1');
-                write_body(buf, |_| Ok::<(), io::Error>(())).unwrap();
+                write_body(buf, |_| {});
            }

            BeMessage::ReadyForQuery => {
                buf.put_u8(b'Z');
                write_body(buf, |buf| {
                    buf.put_u8(b'I');
-                    Ok::<_, io::Error>(())
-                })
-                .unwrap();
+                });
            }

            BeMessage::RowDescription(rows) => {
@@ -907,9 +846,7 @@ impl<'a> BeMessage<'a> {
                    buf.put_u64(body.wal_end);
                    buf.put_i64(body.timestamp);
                    buf.put_slice(body.data);
-                    Ok::<_, io::Error>(())
-                })
-                .unwrap();
+                });
            }

            BeMessage::KeepAlive(req) => {
@@ -918,10 +855,8 @@ impl<'a> BeMessage<'a> {
                    buf.put_u8(b'k');
                    buf.put_u64(req.sent_ptr);
                    buf.put_i64(req.timestamp);
-                    buf.put_u8(if req.request_reply { 1u8 } else { 0u8 });
-                    Ok::<_, io::Error>(())
-                })
-                .unwrap();
+                    buf.put_u8(if req.request_reply { 1 } else { 0 });
+                });
            }
        }
        Ok(())
@@ -930,7 +865,7 @@ impl<'a> BeMessage<'a> {

 // Neon extension of postgres replication protocol
 // See NEON_STATUS_UPDATE_TAG_BYTE
-#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
 pub struct ReplicationFeedback {
    // Last known size of the timeline. Used to enforce timeline size limit.
    pub current_timeline_size: u64,
@@ -968,17 +903,17 @@ impl ReplicationFeedback {
    // value itself
    pub fn serialize(&self, buf: &mut BytesMut) -> Result<()> {
        buf.put_u8(REPLICATION_FEEDBACK_FIELDS_NUMBER); // # of keys
-        write_cstr(&Bytes::from("current_timeline_size"), buf)?;
+        buf.put_slice(b"current_timeline_size\0");
        buf.put_i32(8);
        buf.put_u64(self.current_timeline_size);

-        write_cstr(&Bytes::from("ps_writelsn"), buf)?;
+        buf.put_slice(b"ps_writelsn\0");
        buf.put_i32(8);
        buf.put_u64(self.ps_writelsn);
-        write_cstr(&Bytes::from("ps_flushlsn"), buf)?;
+        buf.put_slice(b"ps_flushlsn\0");
        buf.put_i32(8);
        buf.put_u64(self.ps_flushlsn);
-        write_cstr(&Bytes::from("ps_applylsn"), buf)?;
+        buf.put_slice(b"ps_applylsn\0");
        buf.put_i32(8);
        buf.put_u64(self.ps_applylsn);

@@ -988,7 +923,7 @@ impl ReplicationFeedback {
            .expect("failed to serialize pg_replytime earlier than PG_EPOCH")
            .as_micros() as i64;

-        write_cstr(&Bytes::from("ps_replytime"), buf)?;
+        buf.put_slice(b"ps_replytime\0");
        buf.put_i32(8);
        buf.put_i64(timestamp);
        Ok(())
@@ -998,33 +933,30 @@ impl ReplicationFeedback {
    pub fn parse(mut buf: Bytes) -> ReplicationFeedback {
        let mut zf = ReplicationFeedback::empty();
        let nfields = buf.get_u8();
-        let mut i = 0;
-        while i < nfields {
-            i += 1;
-            let key_cstr = read_null_terminated(&mut buf).unwrap();
-            let key = cstr_to_str(&key_cstr).unwrap();
-            match key {
-                "current_timeline_size" => {
+        for _ in 0..nfields {
+            let key = read_cstr(&mut buf).unwrap();
+            match key.as_ref() {
+                b"current_timeline_size" => {
                    let len = buf.get_i32();
                    assert_eq!(len, 8);
                    zf.current_timeline_size = buf.get_u64();
                }
-                "ps_writelsn" => {
+                b"ps_writelsn" => {
                    let len = buf.get_i32();
                    assert_eq!(len, 8);
                    zf.ps_writelsn = buf.get_u64();
                }
-                "ps_flushlsn" => {
+                b"ps_flushlsn" => {
                    let len = buf.get_i32();
                    assert_eq!(len, 8);
                    zf.ps_flushlsn = buf.get_u64();
                }
-                "ps_applylsn" => {
+                b"ps_applylsn" => {
                    let len = buf.get_i32();
                    assert_eq!(len, 8);
                    zf.ps_applylsn = buf.get_u64();
                }
-                "ps_replytime" => {
+                b"ps_replytime" => {
                    let len = buf.get_i32();
                    assert_eq!(len, 8);
                    let raw_time = buf.get_i64();
@@ -1037,8 +969,8 @@ impl ReplicationFeedback {
                _ => {
                    let len = buf.get_i32();
                    warn!(
-                        "ReplicationFeedback parse. unknown key {} of len {}. Skip it.",
-                        key, len
+                        "ReplicationFeedback parse. unknown key {} of len {len}. Skip it.",
+                        String::from_utf8_lossy(key.as_ref())
                    );
                    buf.advance(len as usize);
                }
@@ -1084,7 +1016,7 @@ mod tests {
            *first = REPLICATION_FEEDBACK_FIELDS_NUMBER + 1;
        }

-        write_cstr(&Bytes::from("new_field_one"), &mut data).unwrap();
+        data.put_slice(b"new_field_one\0");
        data.put_i32(8);
        data.put_u64(42);

@@ -1093,6 +1025,33 @@ mod tests {
        assert_eq!(zf, zf_parsed);
    }

+    #[test]
+    fn test_startup_message_params_options_escaped() {
+        fn split_options(params: &StartupMessageParams) -> Vec<Cow<'_, str>> {
+            params
+                .options_escaped()
+                .expect("options are None")
+                .collect()
+        }
+
+        let make_params = |options| StartupMessageParams::new([("options", options)]);
+
+        let params = StartupMessageParams::new([]);
+        assert!(matches!(params.options_escaped(), None));
+
+        let params = make_params("");
+        assert!(split_options(&params).is_empty());
+
+        let params = make_params("foo");
+        assert_eq!(split_options(&params), ["foo"]);
+
+        let params = make_params(" foo  bar ");
+        assert_eq!(split_options(&params), ["foo", "bar"]);
+
+        let params = make_params("foo\\ bar \\ \\\\ baz\\  lol");
+        assert_eq!(split_options(&params), ["foo bar", " \\", "baz ", "lol"]);
+    }
+
    // Make sure that `read` is sync/async callable
    async fn _assert(stream: &mut (impl tokio::io::AsyncRead + Unpin)) {
        let _ = FeMessage::read(&mut [].as_ref());
--- a/libs/utils/src/seqwait.rs
+++ b/libs/utils/src/seqwait.rs
@@ -4,12 +4,13 @@ use std::cmp::{Eq, Ordering, PartialOrd};
 use std::collections::BinaryHeap;
 use std::fmt::Debug;
 use std::mem;
-use std::sync::mpsc::{channel, Receiver, Sender};
 use std::sync::Mutex;
 use std::time::Duration;
+use tokio::sync::watch::{channel, Receiver, Sender};
+use tokio::time::timeout;

 /// An error happened while waiting for a number
-#[derive(Debug, PartialEq, thiserror::Error)]
+#[derive(Debug, PartialEq, Eq, thiserror::Error)]
 #[error("SeqWaitError")]
 pub enum SeqWaitError {
    /// The wait timeout was reached
@@ -141,10 +142,10 @@ where
    ///
    /// This call won't complete until someone has called `advance`
    /// with a number greater than or equal to the one we're waiting for.
-    pub fn wait_for(&self, num: V) -> Result<(), SeqWaitError> {
+    pub async fn wait_for(&self, num: V) -> Result<(), SeqWaitError> {
        match self.queue_for_wait(num) {
            Ok(None) => Ok(()),
-            Ok(Some(rx)) => rx.recv().map_err(|_| SeqWaitError::Shutdown),
+            Ok(Some(mut rx)) => rx.changed().await.map_err(|_| SeqWaitError::Shutdown),
            Err(e) => Err(e),
        }
    }
@@ -156,13 +157,18 @@ where
    ///
    /// If that hasn't happened after the specified timeout duration,
    /// [`SeqWaitError::Timeout`] will be returned.
-    pub fn wait_for_timeout(&self, num: V, timeout_duration: Duration) -> Result<(), SeqWaitError> {
+    pub async fn wait_for_timeout(
+        &self,
+        num: V,
+        timeout_duration: Duration,
+    ) -> Result<(), SeqWaitError> {
        match self.queue_for_wait(num) {
            Ok(None) => Ok(()),
-            Ok(Some(rx)) => rx.recv_timeout(timeout_duration).map_err(|e| match e {
-                std::sync::mpsc::RecvTimeoutError::Timeout => SeqWaitError::Timeout,
-                std::sync::mpsc::RecvTimeoutError::Disconnected => SeqWaitError::Shutdown,
-            }),
+            Ok(Some(mut rx)) => match timeout(timeout_duration, rx.changed()).await {
+                Ok(Ok(())) => Ok(()),
+                Ok(Err(_)) => Err(SeqWaitError::Shutdown),
+                Err(_) => Err(SeqWaitError::Timeout),
+            },
            Err(e) => Err(e),
        }
    }
@@ -179,7 +185,7 @@ where
        }

        // Create a new channel.
-        let (tx, rx) = channel();
+        let (tx, rx) = channel(());
        internal.waiters.push(Waiter {
            wake_num: num,
            wake_channel: tx,
@@ -235,7 +241,6 @@ mod tests {
    use super::*;
    use std::sync::Arc;
    use std::thread::sleep;
-    use std::thread::spawn;
    use std::time::Duration;

    impl MonotonicCounter<i32> for i32 {
@@ -248,25 +253,25 @@ mod tests {
        }
    }

-    #[test]
-    fn seqwait() {
+    #[tokio::test]
+    async fn seqwait() {
        let seq = Arc::new(SeqWait::new(0));
        let seq2 = Arc::clone(&seq);
        let seq3 = Arc::clone(&seq);
-        spawn(move || {
-            seq2.wait_for(42).expect("wait_for 42");
+        tokio::task::spawn(async move {
+            seq2.wait_for(42).await.expect("wait_for 42");
            let old = seq2.advance(100);
            assert_eq!(old, 99);
-            seq2.wait_for(999).expect_err("no 999");
+            seq2.wait_for(999).await.expect_err("no 999");
        });
-        spawn(move || {
-            seq3.wait_for(42).expect("wait_for 42");
-            seq3.wait_for(0).expect("wait_for 0");
+        tokio::task::spawn(async move {
+            seq3.wait_for(42).await.expect("wait_for 42");
+            seq3.wait_for(0).await.expect("wait_for 0");
        });
        sleep(Duration::from_secs(1));
        let old = seq.advance(99);
        assert_eq!(old, 0);
-        seq.wait_for(100).expect("wait_for 100");
+        seq.wait_for(100).await.expect("wait_for 100");

        // Calling advance with a smaller value is a no-op
        assert_eq!(seq.advance(98), 100);
@@ -275,16 +280,16 @@ mod tests {
        seq.shutdown();
    }

-    #[test]
-    fn seqwait_timeout() {
+    #[tokio::test]
+    async fn seqwait_timeout() {
        let seq = Arc::new(SeqWait::new(0));
        let seq2 = Arc::clone(&seq);
-        spawn(move || {
+        tokio::task::spawn(async move {
            let timeout = Duration::from_millis(1);
-            let res = seq2.wait_for_timeout(42, timeout);
+            let res = seq2.wait_for_timeout(42, timeout).await;
            assert_eq!(res, Err(SeqWaitError::Timeout));
        });
-        sleep(Duration::from_secs(1));
+        tokio::time::sleep(Duration::from_secs(1)).await;
        // This will attempt to wake, but nothing will happen
        // because the waiter already dropped its Receiver.
        let old = seq.advance(99);
--- a/libs/utils/src/seqwait_async.rs
+++ b/libs/utils/src/seqwait_async.rs
@@ -1,224 +0,0 @@
-///
-/// Async version of 'seqwait.rs'
-///
-/// NOTE: This is currently unused. If you need this, you'll need to uncomment this in lib.rs.
-///
-
-#![warn(missing_docs)]
-
-use std::collections::BTreeMap;
-use std::fmt::Debug;
-use std::mem;
-use std::sync::Mutex;
-use std::time::Duration;
-use tokio::sync::watch::{channel, Receiver, Sender};
-use tokio::time::timeout;
-
-/// An error happened while waiting for a number
-#[derive(Debug, PartialEq, thiserror::Error)]
-#[error("SeqWaitError")]
-pub enum SeqWaitError {
-    /// The wait timeout was reached
-    Timeout,
-    /// [`SeqWait::shutdown`] was called
-    Shutdown,
-}
-
-/// Internal components of a `SeqWait`
-struct SeqWaitInt<T>
-where
-    T: Ord,
-{
-    waiters: BTreeMap<T, (Sender<()>, Receiver<()>)>,
-    current: T,
-    shutdown: bool,
-}
-
-/// A tool for waiting on a sequence number
-///
-/// This provides a way to await the arrival of a number.
-/// As soon as the number arrives by another caller calling
-/// [`advance`], then the waiter will be woken up.
-///
-/// This implementation takes a blocking Mutex on both [`wait_for`]
-/// and [`advance`], meaning there may be unexpected executor blocking
-/// due to thread scheduling unfairness. There are probably better
-/// implementations, but we can probably live with this for now.
-///
-/// [`wait_for`]: SeqWait::wait_for
-/// [`advance`]: SeqWait::advance
-///
-pub struct SeqWait<T>
-where
-    T: Ord,
-{
-    internal: Mutex<SeqWaitInt<T>>,
-}
-
-impl<T> SeqWait<T>
-where
-    T: Ord + Debug + Copy,
-{
-    /// Create a new `SeqWait`, initialized to a particular number
-    pub fn new(starting_num: T) -> Self {
-        let internal = SeqWaitInt {
-            waiters: BTreeMap::new(),
-            current: starting_num,
-            shutdown: false,
-        };
-        SeqWait {
-            internal: Mutex::new(internal),
-        }
-    }
-
-    /// Shut down a `SeqWait`, causing all waiters (present and
-    /// future) to return an error.
-    pub fn shutdown(&self) {
-        let waiters = {
-            // Prevent new waiters; wake all those that exist.
-            // Wake everyone with an error.
-            let mut internal = self.internal.lock().unwrap();
-
-            // This will steal the entire waiters map.
-            // When we drop it all waiters will be woken.
-            mem::take(&mut internal.waiters)
-
-            // Drop the lock as we exit this scope.
-        };
-
-        // When we drop the waiters list, each Receiver will
-        // be woken with an error.
-        // This drop doesn't need to be explicit; it's done
-        // here to make it easier to read the code and understand
-        // the order of events.
-        drop(waiters);
-    }
-
-    /// Wait for a number to arrive
-    ///
-    /// This call won't complete until someone has called `advance`
-    /// with a number greater than or equal to the one we're waiting for.
-    pub async fn wait_for(&self, num: T) -> Result<(), SeqWaitError> {
-        let mut rx = {
-            let mut internal = self.internal.lock().unwrap();
-            if internal.current >= num {
-                return Ok(());
-            }
-            if internal.shutdown {
-                return Err(SeqWaitError::Shutdown);
-            }
-
-            // If we already have a channel for waiting on this number, reuse it.
-            if let Some((_, rx)) = internal.waiters.get_mut(&num) {
-                // an Err from changed() means the sender was dropped.
-                rx.clone()
-            } else {
-                // Create a new channel.
-                let (tx, rx) = channel(());
-                internal.waiters.insert(num, (tx, rx.clone()));
-                rx
-            }
-            // Drop the lock as we exit this scope.
-        };
-        rx.changed().await.map_err(|_| SeqWaitError::Shutdown)
-    }
-
-    /// Wait for a number to arrive
-    ///
-    /// This call won't complete until someone has called `advance`
-    /// with a number greater than or equal to the one we're waiting for.
-    ///
-    /// If that hasn't happened after the specified timeout duration,
-    /// [`SeqWaitError::Timeout`] will be returned.
-    pub async fn wait_for_timeout(
-        &self,
-        num: T,
-        timeout_duration: Duration,
-    ) -> Result<(), SeqWaitError> {
-        timeout(timeout_duration, self.wait_for(num))
-            .await
-            .unwrap_or(Err(SeqWaitError::Timeout))
-    }
-
-    /// Announce a new number has arrived
-    ///
-    /// All waiters at this value or below will be woken.
-    ///
-    /// `advance` will panic if you send it a lower number than
-    /// a previous call.
-    pub fn advance(&self, num: T) {
-        let wake_these = {
-            let mut internal = self.internal.lock().unwrap();
-
-            if internal.current > num {
-                panic!(
-                    "tried to advance backwards, from {:?} to {:?}",
-                    internal.current, num
-                );
-            }
-            internal.current = num;
-
-            // split_off will give me all the high-numbered waiters,
-            // so split and then swap. Everything at or above `num`
-            // stays.
-            let mut split = internal.waiters.split_off(&num);
-            std::mem::swap(&mut split, &mut internal.waiters);
-
-            // `split_at` didn't get the value at `num`; if it's
-            // there take that too.
-            if let Some(sleeper) = internal.waiters.remove(&num) {
-                split.insert(num, sleeper);
-            }
-
-            split
-        };
-
-        for (_wake_num, (tx, _rx)) in wake_these {
-            // This can fail if there are no receivers.
-            // We don't care; discard the error.
-            let _ = tx.send(());
-        }
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use std::sync::Arc;
-    use tokio::time::{sleep, Duration};
-
-    #[tokio::test]
-    async fn seqwait() {
-        let seq = Arc::new(SeqWait::new(0));
-        let seq2 = Arc::clone(&seq);
-        let seq3 = Arc::clone(&seq);
-        tokio::spawn(async move {
-            seq2.wait_for(42).await.expect("wait_for 42");
-            seq2.advance(100);
-            seq2.wait_for(999).await.expect_err("no 999");
-        });
-        tokio::spawn(async move {
-            seq3.wait_for(42).await.expect("wait_for 42");
-            seq3.wait_for(0).await.expect("wait_for 0");
-        });
-        sleep(Duration::from_secs(1)).await;
-        seq.advance(99);
-        seq.wait_for(100).await.expect("wait_for 100");
-        seq.shutdown();
-    }
-
-    #[tokio::test]
-    async fn seqwait_timeout() {
-        let seq = Arc::new(SeqWait::new(0));
-        let seq2 = Arc::clone(&seq);
-        tokio::spawn(async move {
-            let timeout = Duration::from_millis(1);
-            let res = seq2.wait_for_timeout(42, timeout).await;
-            assert_eq!(res, Err(SeqWaitError::Timeout));
-        });
-        sleep(Duration::from_secs(1)).await;
-        // This will attempt to wake, but nothing will happen
-        // because the waiter already dropped its Receiver.
-        seq.advance(99);
-    }
-}
--- a/libs/utils/src/simple_rcu.rs
+++ b/libs/utils/src/simple_rcu.rs
@@ -0,0 +1,289 @@
+//!
+//! RCU stands for Read-Copy-Update. It's a synchronization mechanism somewhat
+//! similar to a lock, but it allows readers to "hold on" to an old value of RCU
+//! without blocking writers, and allows writing a new values without blocking
+//! readers. When you update the new value, the new value is immediately visible
+//! to new readers, but the update waits until all existing readers have
+//! finishe, so that no one sees the old value anymore.
+//!
+//! This implementation isn't wait-free; it uses an RwLock that is held for a
+//! short duration when the value is read or updated.
+//!
+//! # Examples
+//!
+//! Read a value and do things with it while holding the guard:
+//!
+//! ```
+//! # let rcu = utils::simple_rcu::Rcu::new(1);
+//! {
+//!     let read = rcu.read();
+//!     println!("the current value is {}", *read);
+//!     // exiting the scope drops the read-guard, and allows concurrent writers
+//!     // to finish.
+//! }
+//! ```
+//!
+//! Increment the value by one, and wait for old readers to finish:
+//!
+//! ```
+//! # let rcu = utils::simple_rcu::Rcu::new(1);
+//! let write_guard = rcu.lock_for_write();
+//!
+//! // NB: holding `write_guard` blocks new readers and writers. Keep this section short!
+//! let new_value = *write_guard + 1;
+//!
+//! let waitlist = write_guard.store_and_unlock(new_value); // consumes `write_guard`
+//!
+//! // Concurrent reads and writes are now possible again. Wait for all the readers
+//! // that still observe the old value to finish.
+//! waitlist.wait();
+//! ```
+//!
+#![warn(missing_docs)]
+
+use std::ops::Deref;
+use std::sync::mpsc::{sync_channel, Receiver, SyncSender};
+use std::sync::{Arc, Weak};
+use std::sync::{Mutex, RwLock, RwLockWriteGuard};
+
+///
+/// Rcu allows multiple readers to read and hold onto a value without blocking
+/// (for very long).  Storing to the Rcu updates the value, making new readers
+/// immediately see the new value, but it also waits for all current readers to
+/// finish.
+///
+pub struct Rcu<V> {
+    inner: RwLock<RcuInner<V>>,
+}
+
+struct RcuInner<V> {
+    current_cell: Arc<RcuCell<V>>,
+    old_cells: Vec<Weak<RcuCell<V>>>,
+}
+
+///
+/// RcuCell holds one value. It can be the latest one, or an old one.
+///
+struct RcuCell<V> {
+    value: V,
+
+    /// A dummy channel. We never send anything to this channel. The point is
+    /// that when the RcuCell is dropped, any cloned Senders will be notified
+    /// that the channel is closed. Updaters can use this to wait out until the
+    /// RcuCell has been dropped, i.e. until the old value is no longer in use.
+    ///
+    /// We never do anything with the receiver, we just need to hold onto it so
+    /// that the Senders will be notified when it's dropped. But because it's
+    /// not Sync, we need a Mutex on it.
+    watch: (SyncSender<()>, Mutex<Receiver<()>>),
+}
+
+impl<V> RcuCell<V> {
+    fn new(value: V) -> Self {
+        let (watch_sender, watch_receiver) = sync_channel(0);
+        RcuCell {
+            value,
+            watch: (watch_sender, Mutex::new(watch_receiver)),
+        }
+    }
+}
+
+impl<V> Rcu<V> {
+    /// Create a new `Rcu`, initialized to `starting_val`
+    pub fn new(starting_val: V) -> Self {
+        let inner = RcuInner {
+            current_cell: Arc::new(RcuCell::new(starting_val)),
+            old_cells: Vec::new(),
+        };
+        Self {
+            inner: RwLock::new(inner),
+        }
+    }
+
+    ///
+    /// Read current value. Any store() calls will block until the returned
+    /// guard object is dropped.
+    ///
+    pub fn read(&self) -> RcuReadGuard<V> {
+        let current_cell = Arc::clone(&self.inner.read().unwrap().current_cell);
+        RcuReadGuard { cell: current_cell }
+    }
+
+    ///
+    /// Lock the current value for updating. Returns a guard object that can be
+    /// used to read the current value, and to store a new value.
+    ///
+    /// Note: holding the write-guard blocks concurrent readers, so you should
+    /// finish the update and drop the guard quickly! Multiple writers can be
+    /// waiting on the RcuWriteGuard::store step at the same time, however.
+    ///
+    pub fn lock_for_write(&self) -> RcuWriteGuard<'_, V> {
+        let inner = self.inner.write().unwrap();
+        RcuWriteGuard { inner }
+    }
+}
+
+///
+/// Read guard returned by `read`
+///
+pub struct RcuReadGuard<V> {
+    cell: Arc<RcuCell<V>>,
+}
+
+impl<V> Deref for RcuReadGuard<V> {
+    type Target = V;
+
+    fn deref(&self) -> &V {
+        &self.cell.value
+    }
+}
+
+///
+/// Write guard returned by `write`
+///
+/// NB: Holding this guard blocks all concurrent `read` and `write` calls, so
+/// it should only be held for a short duration!
+///
+/// Calling `store` consumes the guard, making new reads and new writes possible
+/// again.
+///
+pub struct RcuWriteGuard<'a, V> {
+    inner: RwLockWriteGuard<'a, RcuInner<V>>,
+}
+
+impl<'a, V> Deref for RcuWriteGuard<'a, V> {
+    type Target = V;
+
+    fn deref(&self) -> &V {
+        &self.inner.current_cell.value
+    }
+}
+
+impl<'a, V> RcuWriteGuard<'a, V> {
+    ///
+    /// Store a new value. The new value will be written to the Rcu immediately,
+    /// and will be immediately seen by any `read` calls that start afterwards.
+    ///
+    /// Returns a list of readers that can see old values. You can call `wait()`
+    /// on it to wait for them to finish.
+    ///
+    pub fn store_and_unlock(mut self, new_val: V) -> RcuWaitList {
+        let new_cell = Arc::new(RcuCell::new(new_val));
+
+        let mut watches = Vec::new();
+        {
+            let old = std::mem::replace(&mut self.inner.current_cell, new_cell);
+            self.inner.old_cells.push(Arc::downgrade(&old));
+
+            // cleanup old cells that no longer have any readers, and collect
+            // the watches for any that do.
+            self.inner.old_cells.retain(|weak| {
+                if let Some(cell) = weak.upgrade() {
+                    watches.push(cell.watch.0.clone());
+                    true
+                } else {
+                    false
+                }
+            });
+        }
+        RcuWaitList(watches)
+    }
+}
+
+///
+/// List of readers who can still see old values.
+///
+pub struct RcuWaitList(Vec<SyncSender<()>>);
+
+impl RcuWaitList {
+    ///
+    /// Wait for old readers to finish.
+    ///
+    pub fn wait(mut self) {
+        // after all the old_cells are no longer in use, we're done
+        for w in self.0.iter_mut() {
+            // This will block until the Receiver is closed. That happens when
+            // the RcuCell is dropped.
+            #[allow(clippy::single_match)]
+            match w.send(()) {
+                Ok(_) => panic!("send() unexpectedly succeeded on dummy channel"),
+                Err(_) => {
+                    // closed, which means that the cell has been dropped, and
+                    // its value is no longer in use
+                }
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::sync::{Arc, Mutex};
+    use std::thread::{sleep, spawn};
+    use std::time::Duration;
+
+    #[test]
+    fn two_writers() {
+        let rcu = Rcu::new(1);
+
+        let read1 = rcu.read();
+        assert_eq!(*read1, 1);
+
+        let write2 = rcu.lock_for_write();
+        assert_eq!(*write2, 1);
+        let wait2 = write2.store_and_unlock(2);
+
+        let read2 = rcu.read();
+        assert_eq!(*read2, 2);
+
+        let write3 = rcu.lock_for_write();
+        assert_eq!(*write3, 2);
+        let wait3 = write3.store_and_unlock(3);
+
+        // new reader can see the new value, and old readers continue to see the old values.
+        let read3 = rcu.read();
+        assert_eq!(*read3, 3);
+        assert_eq!(*read2, 2);
+        assert_eq!(*read1, 1);
+
+        let log = Arc::new(Mutex::new(Vec::new()));
+        // Wait for the old readers to finish in separate threads.
+        let log_clone = Arc::clone(&log);
+        let thread2 = spawn(move || {
+            wait2.wait();
+            log_clone.lock().unwrap().push("wait2 done");
+        });
+        let log_clone = Arc::clone(&log);
+        let thread3 = spawn(move || {
+            wait3.wait();
+            log_clone.lock().unwrap().push("wait3 done");
+        });
+
+        // without this sleep the test can pass on accident if the writer is slow
+        sleep(Duration::from_millis(500));
+
+        // Release first reader. This allows first write to finish, but calling
+        // wait() on the second one would still block.
+        log.lock().unwrap().push("dropping read1");
+        drop(read1);
+        thread2.join().unwrap();
+
+        sleep(Duration::from_millis(500));
+
+        // Release second reader, and finish second writer.
+        log.lock().unwrap().push("dropping read2");
+        drop(read2);
+        thread3.join().unwrap();
+
+        assert_eq!(
+            log.lock().unwrap().as_slice(),
+            &[
+                "dropping read1",
+                "wait2 done",
+                "dropping read2",
+                "wait3 done"
+            ]
+        );
+    }
+}
--- a/libs/utils/tests/bin_ser_test.rs
+++ b/libs/utils/tests/bin_ser_test.rs
@@ -4,7 +4,7 @@ use serde::Deserialize;
 use std::io::Read;
 use utils::bin_ser::LeSer;

-#[derive(Debug, PartialEq, Deserialize)]
+#[derive(Debug, PartialEq, Eq, Deserialize)]
 pub struct HeaderData {
    magic: u16,
    info: u16,
--- a/libs/utils/tests/ssl_test.rs
+++ b/libs/utils/tests/ssl_test.rs
@@ -30,6 +30,9 @@ static CERT: Lazy<rustls::Certificate> = Lazy::new(|| {
 });

 #[test]
+// [false-positive](https://github.com/rust-lang/rust-clippy/issues/9274),
+// we resize the vector so doing some modifications after all
+#[allow(clippy::read_zero_byte_vec)]
 fn ssl() {
    let (mut client_sock, server_sock) = make_tcp_pair();

--- a/neon_local/Cargo.toml
+++ b/neon_local/Cargo.toml
@@ -1,19 +0,0 @@
-[package]
-name = "neon_local"
-version = "0.1.0"
-edition = "2021"
-
-[dependencies]
-clap = "3.0"
-anyhow = "1.0"
-serde_json = "1"
-comfy-table = "5.0.1"
-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
-git-version = "0.3.5"
-
-# FIXME: 'pageserver' is needed for BranchInfo. Refactor
-pageserver = { path = "../pageserver" }
-control_plane = { path = "../control_plane" }
-safekeeper = { path = "../safekeeper" }
-utils = { path = "../libs/utils" }
-workspace_hack = { version = "0.1", path = "../workspace_hack" }
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -12,10 +12,12 @@ profiling = ["pprof"]
 failpoints = ["fail/failpoints"]

 [dependencies]
+async-stream = "0.3"
+async-trait = "0.1"
 chrono = "0.4.19"
 rand = "0.8.3"
 regex = "1.4.5"
-bytes = { version = "1.0.1", features = ['serde'] }
+bytes = "1.0.1"
 byteorder = "1.4.3"
 futures = "0.3.13"
 hex = "0.4.3"
@@ -24,6 +26,7 @@ itertools = "0.10.3"
 clap = "3.0"
 daemonize = "0.4.1"
 tokio = { version = "1.17", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] }
+tokio-util = { version = "0.7.3", features = ["io", "io-util"] }
 postgres-types = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
 postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
 postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
@@ -43,7 +46,7 @@ pprof = { git = "https://github.com/neondatabase/pprof-rs.git", branch = "wallcl
 toml_edit = { version = "0.13", features = ["easy"] }
 scopeguard = "1.1.0"
 const_format = "0.2.21"
-tracing = "0.1.27"
+tracing = "0.1.36"
 signal-hook = "0.3.10"
 url = "2"
 nix = "0.23"
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -22,22 +22,26 @@ use std::time::SystemTime;
 use tar::{Builder, EntryType, Header};
 use tracing::*;

+use crate::layered_repository::Timeline;
 use crate::reltag::{RelTag, SlruKind};
-use crate::DatadirTimeline;
-use postgres_ffi::xlog_utils::*;
-use postgres_ffi::*;
+
+use postgres_ffi::v14::pg_constants;
+use postgres_ffi::v14::xlog_utils::{generate_wal_segment, normalize_lsn, XLogFileName};
+use postgres_ffi::v14::{CheckPoint, ControlFileData};
+use postgres_ffi::TransactionId;
+use postgres_ffi::PG_TLI;
+use postgres_ffi::{BLCKSZ, RELSEG_SIZE, WAL_SEGMENT_SIZE};
 use utils::lsn::Lsn;

 /// This is short-living object only for the time of tarball creation,
 /// created mostly to avoid passing a lot of parameters between various functions
 /// used for constructing tarball.
-pub struct Basebackup<'a, W, T>
+pub struct Basebackup<'a, W>
 where
    W: Write,
-    T: DatadirTimeline,
 {
    ar: Builder<AbortableWrite<W>>,
-    timeline: &'a Arc<T>,
+    timeline: &'a Arc<Timeline>,
    pub lsn: Lsn,
    prev_record_lsn: Lsn,
    full_backup: bool,
@@ -52,18 +56,17 @@ where
 //  * When working without safekeepers. In this situation it is important to match the lsn
 //    we are taking basebackup on with the lsn that is used in pageserver's walreceiver
 //    to start the replication.
-impl<'a, W, T> Basebackup<'a, W, T>
+impl<'a, W> Basebackup<'a, W>
 where
    W: Write,
-    T: DatadirTimeline,
 {
    pub fn new(
        write: W,
-        timeline: &'a Arc<T>,
+        timeline: &'a Arc<Timeline>,
        req_lsn: Option<Lsn>,
        prev_lsn: Option<Lsn>,
        full_backup: bool,
-    ) -> Result<Basebackup<'a, W, T>> {
+    ) -> Result<Basebackup<'a, W>> {
        // Compute postgres doesn't have any previous WAL files, but the first
        // record that it's going to write needs to include the LSN of the
        // previous record (xl_prev). We include prev_record_lsn in the
@@ -78,9 +81,8 @@ where
        // an old LSN and it doesn't have any WAL of its own yet. We will set
        // prev_lsn to Lsn(0) if we cannot provide the correct value.
        let (backup_prev, backup_lsn) = if let Some(req_lsn) = req_lsn {
-            // Backup was requested at a particular LSN. Wait for it to arrive.
-            info!("waiting for {}", req_lsn);
-            timeline.wait_lsn(req_lsn)?;
+            // Backup was requested at a particular LSN. The caller should've
+            // already checked that it's a valid LSN.

            // If the requested point is the end of the timeline, we can
            // provide prev_lsn. (get_last_record_rlsn() might return it as
@@ -183,7 +185,7 @@ where
    }

    fn add_rel(&mut self, tag: RelTag) -> anyhow::Result<()> {
-        let nblocks = self.timeline.get_rel_size(tag, self.lsn)?;
+        let nblocks = self.timeline.get_rel_size(tag, self.lsn, false)?;

        // Function that adds relation segment data to archive
        let mut add_file = |segment_index, data: &Vec<u8>| -> anyhow::Result<()> {
@@ -200,11 +202,13 @@ where
        }

        // Add a file for each chunk of blocks (aka segment)
-        let chunks = (0..nblocks).chunks(pg_constants::RELSEG_SIZE as usize);
+        let chunks = (0..nblocks).chunks(RELSEG_SIZE as usize);
        for (seg, blocks) in chunks.into_iter().enumerate() {
            let mut segment_data: Vec<u8> = vec![];
            for blknum in blocks {
-                let img = self.timeline.get_rel_page_at_lsn(tag, blknum, self.lsn)?;
+                let img = self
+                    .timeline
+                    .get_rel_page_at_lsn(tag, blknum, self.lsn, false)?;
                segment_data.extend_from_slice(&img[..]);
            }

@@ -220,23 +224,19 @@ where
    fn add_slru_segment(&mut self, slru: SlruKind, segno: u32) -> anyhow::Result<()> {
        let nblocks = self.timeline.get_slru_segment_size(slru, segno, self.lsn)?;

-        let mut slru_buf: Vec<u8> =
-            Vec::with_capacity(nblocks as usize * pg_constants::BLCKSZ as usize);
+        let mut slru_buf: Vec<u8> = Vec::with_capacity(nblocks as usize * BLCKSZ as usize);
        for blknum in 0..nblocks {
            let img = self
                .timeline
                .get_slru_page_at_lsn(slru, segno, blknum, self.lsn)?;

            if slru == SlruKind::Clog {
-                ensure!(
-                    img.len() == pg_constants::BLCKSZ as usize
-                        || img.len() == pg_constants::BLCKSZ as usize + 8
-                );
+                ensure!(img.len() == BLCKSZ as usize || img.len() == BLCKSZ as usize + 8);
            } else {
-                ensure!(img.len() == pg_constants::BLCKSZ as usize);
+                ensure!(img.len() == BLCKSZ as usize);
            }

-            slru_buf.extend_from_slice(&img[..pg_constants::BLCKSZ as usize]);
+            slru_buf.extend_from_slice(&img[..BLCKSZ as usize]);
        }

        let segname = format!("{}/{:>04X}", slru.to_str(), segno);
@@ -267,8 +267,11 @@ where
            None
        };

+        // TODO pass this as a parameter
+        let pg_version = "14";
+
        if spcnode == pg_constants::GLOBALTABLESPACE_OID {
-            let version_bytes = pg_constants::PG_MAJORVERSION.as_bytes();
+            let version_bytes = pg_version.as_bytes();
            let header = new_tar_header("PG_VERSION", version_bytes.len() as u64)?;
            self.ar.append(&header, version_bytes)?;

@@ -311,7 +314,7 @@ where

            if let Some(img) = relmap_img {
                let dst_path = format!("base/{}/PG_VERSION", dbnode);
-                let version_bytes = pg_constants::PG_MAJORVERSION.as_bytes();
+                let version_bytes = pg_version.as_bytes();
                let header = new_tar_header(&dst_path, version_bytes.len() as u64)?;
                self.ar.append(&header, version_bytes)?;

@@ -357,7 +360,7 @@ where
        let mut checkpoint = CheckPoint::decode(&checkpoint_bytes)?;

        // Generate new pg_control needed for bootstrap
-        checkpoint.redo = normalize_lsn(self.lsn, pg_constants::WAL_SEGMENT_SIZE).0;
+        checkpoint.redo = normalize_lsn(self.lsn, WAL_SEGMENT_SIZE).0;

        //reset some fields we don't want to preserve
        //TODO Check this.
@@ -391,22 +394,21 @@ where
        self.ar.append(&header, &pg_control_bytes[..])?;

        //send wal segment
-        let segno = self.lsn.segment_number(pg_constants::WAL_SEGMENT_SIZE);
-        let wal_file_name = XLogFileName(PG_TLI, segno, pg_constants::WAL_SEGMENT_SIZE);
+        let segno = self.lsn.segment_number(WAL_SEGMENT_SIZE);
+        let wal_file_name = XLogFileName(PG_TLI, segno, WAL_SEGMENT_SIZE);
        let wal_file_path = format!("pg_wal/{}", wal_file_name);
-        let header = new_tar_header(&wal_file_path, pg_constants::WAL_SEGMENT_SIZE as u64)?;
+        let header = new_tar_header(&wal_file_path, WAL_SEGMENT_SIZE as u64)?;
        let wal_seg = generate_wal_segment(segno, pg_control.system_identifier)
            .map_err(|e| anyhow!(e).context("Failed generating wal segment"))?;
-        ensure!(wal_seg.len() == pg_constants::WAL_SEGMENT_SIZE);
+        ensure!(wal_seg.len() == WAL_SEGMENT_SIZE);
        self.ar.append(&header, &wal_seg[..])?;
        Ok(())
    }
 }

-impl<'a, W, T> Drop for Basebackup<'a, W, T>
+impl<'a, W> Drop for Basebackup<'a, W>
 where
    W: Write,
-    T: DatadirTimeline,
 {
    /// If the basebackup was not finished, prevent the Archive::drop() from
    /// writing the end-of-archive marker.
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -1,9 +1,10 @@
 //! Main entry point for the Page Server executable.

-use std::{env, path::Path, str::FromStr};
+use remote_storage::GenericRemoteStorage;
+use std::{env, ops::ControlFlow, path::Path, str::FromStr};
 use tracing::*;

-use anyhow::{bail, Context, Result};
+use anyhow::{anyhow, bail, Context, Result};

 use clap::{App, Arg};
 use daemonize::Daemonize;
@@ -11,20 +12,21 @@ use daemonize::Daemonize;
 use fail::FailScenario;
 use pageserver::{
    config::{defaults::*, PageServerConf},
-    http, page_cache, page_service, profiling, tenant_mgr, thread_mgr,
-    thread_mgr::ThreadKind,
-    timelines, virtual_file, LOG_FILE_NAME,
+    http, page_cache, page_service, profiling, task_mgr,
+    task_mgr::TaskKind,
+    task_mgr::{
+        BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME, WALRECEIVER_RUNTIME,
+    },
+    tenant_mgr, virtual_file, LOG_FILE_NAME,
 };
 use utils::{
    auth::JwtAuth,
-    http::endpoint,
    logging,
    postgres_backend::AuthType,
    project_git_version,
    shutdown::exit_now,
    signals::{self, Signal},
    tcp_listener,
-    zid::{ZTenantId, ZTimelineId},
 };

 project_git_version!(GIT_VERSION);
@@ -42,6 +44,7 @@ fn main() -> anyhow::Result<()> {
        .about("Materializes WAL stream to pages and serves them to the postgres")
        .version(&*version())
        .arg(
+
            Arg::new("daemonize")
                .short('d')
                .long("daemonize")
@@ -52,7 +55,7 @@ fn main() -> anyhow::Result<()> {
            Arg::new("init")
                .long("init")
                .takes_value(false)
-                .help("Initialize pageserver service: creates an initial config, tenant and timeline, if specified"),
+                .help("Initialize pageserver with all given config overrides"),
        )
        .arg(
            Arg::new("workdir")
@@ -61,20 +64,6 @@ fn main() -> anyhow::Result<()> {
                .takes_value(true)
                .help("Working directory for the pageserver"),
        )
-        .arg(
-            Arg::new("create-tenant")
-                .long("create-tenant")
-                .takes_value(true)
-                .help("Create tenant during init")
-                .requires("init"),
-        )
-        .arg(
-            Arg::new("initial-timeline-id")
-                .long("initial-timeline-id")
-                .takes_value(true)
-                .help("Use a specific timeline id during init and tenant creation")
-                .requires("create-tenant"),
-        )
        // See `settings.md` for more details on the extra configuration patameters pageserver can process
        .arg(
            Arg::new("config-override")
@@ -85,6 +74,9 @@ fn main() -> anyhow::Result<()> {
                .help("Additional configuration overrides of the ones from the toml config file (or new ones to add there).
                Any option has to be a valid toml document, example: `-c=\"foo='hey'\"` `-c=\"foo={value=1}\"`"),
        )
+        .arg(Arg::new("update-config").long("update-config").takes_value(false).help(
+            "Update the config file when started",
+        ))
        .arg(
            Arg::new("enabled-features")
                .long("enabled-features")
@@ -110,18 +102,6 @@ fn main() -> anyhow::Result<()> {
        .with_context(|| format!("Error opening workdir '{}'", workdir.display()))?;
    let cfg_file_path = workdir.join("pageserver.toml");

-    let init = arg_matches.is_present("init");
-    let create_tenant = arg_matches
-        .value_of("create-tenant")
-        .map(ZTenantId::from_str)
-        .transpose()
-        .context("Failed to parse tenant id from the arguments")?;
-    let initial_timeline_id = arg_matches
-        .value_of("initial-timeline-id")
-        .map(ZTimelineId::from_str)
-        .transpose()
-        .context("Failed to parse timeline id from the arguments")?;
-
    // Set CWD to workdir for non-daemon modes
    env::set_current_dir(&workdir).with_context(|| {
        format!(
@@ -131,30 +111,86 @@ fn main() -> anyhow::Result<()> {
    })?;

    let daemonize = arg_matches.is_present("daemonize");
-    if init && daemonize {
-        bail!("--daemonize cannot be used with --init")
-    }

-    let mut toml = if init {
-        // We're initializing the repo, so there's no config file yet
-        DEFAULT_CONFIG_FILE
-            .parse::<toml_edit::Document>()
-            .context("could not parse built-in config file")?
-    } else {
-        // Supplement the CLI arguments with the config file
-        let cfg_file_contents = std::fs::read_to_string(&cfg_file_path)
-            .with_context(|| format!("No pageserver config at '{}'", cfg_file_path.display()))?;
-        cfg_file_contents
-            .parse::<toml_edit::Document>()
-            .with_context(|| {
-                format!(
-                    "Failed to read '{}' as pageserver config",
-                    cfg_file_path.display()
-                )
-            })?
+    let conf = match initialize_config(&cfg_file_path, arg_matches, &workdir)? {
+        ControlFlow::Continue(conf) => conf,
+        ControlFlow::Break(()) => {
+            info!("Pageserver config init successful");
+            return Ok(());
+        }
+    };
+
+    let tenants_path = conf.tenants_path();
+    if !tenants_path.exists() {
+        utils::crashsafe_dir::create_dir_all(conf.tenants_path()).with_context(|| {
+            format!(
+                "Failed to create tenants root dir at '{}'",
+                tenants_path.display()
+            )
+        })?;
+    }
+
+    // Initialize up failpoints support
+    let scenario = FailScenario::setup();
+
+    // Basic initialization of things that don't change after startup
+    virtual_file::init(conf.max_file_descriptors);
+    page_cache::init(conf.page_cache_size);
+
+    start_pageserver(conf, daemonize).context("Failed to start pageserver")?;
+
+    scenario.teardown();
+    Ok(())
+}
+
+fn initialize_config(
+    cfg_file_path: &Path,
+    arg_matches: clap::ArgMatches,
+    workdir: &Path,
+) -> anyhow::Result<ControlFlow<(), &'static PageServerConf>> {
+    let init = arg_matches.is_present("init");
+    let update_config = init || arg_matches.is_present("update-config");
+
+    let (mut toml, config_file_exists) = if cfg_file_path.is_file() {
+        if init {
+            anyhow::bail!(
+                "Config file '{}' already exists, cannot init it, use --update-config to update it",
+                cfg_file_path.display()
+            );
+        }
+        // Supplement the CLI arguments with the config file
+        let cfg_file_contents = std::fs::read_to_string(&cfg_file_path).with_context(|| {
+            format!(
+                "Failed to read pageserver config at '{}'",
+                cfg_file_path.display()
+            )
+        })?;
+        (
+            cfg_file_contents
+                .parse::<toml_edit::Document>()
+                .with_context(|| {
+                    format!(
+                        "Failed to parse '{}' as pageserver config",
+                        cfg_file_path.display()
+                    )
+                })?,
+            true,
+        )
+    } else if cfg_file_path.exists() {
+        anyhow::bail!(
+            "Config file '{}' exists but is not a regular file",
+            cfg_file_path.display()
+        );
+    } else {
+        // We're initializing the repo, so there's no config file yet
+        (
+            DEFAULT_CONFIG_FILE
+                .parse::<toml_edit::Document>()
+                .context("could not parse built-in config file")?,
+            false,
+        )
    };

-    // Process any extra options given with -c
    if let Some(values) = arg_matches.values_of("config-override") {
        for option_line in values {
            let doc = toml_edit::Document::from_str(option_line).with_context(|| {
@@ -165,49 +201,38 @@ fn main() -> anyhow::Result<()> {
            })?;

            for (key, item) in doc.iter() {
-                if key == "id" {
-                    anyhow::ensure!(
-                        init,
-                        "node id can only be set during pageserver init and cannot be overridden"
-                    );
+                if config_file_exists && update_config && key == "id" && toml.contains_key(key) {
+                    anyhow::bail!("Pageserver config file exists at '{}' and has node id already, it cannot be overridden", cfg_file_path.display());
                }
                toml.insert(key, item.clone());
            }
        }
    }
-    trace!("Resulting toml: {}", toml);
-    let conf = PageServerConf::parse_and_validate(&toml, &workdir)
+
+    debug!("Resulting toml: {toml}");
+    let conf = PageServerConf::parse_and_validate(&toml, workdir)
        .context("Failed to parse pageserver configuration")?;

-    // The configuration is all set up now. Turn it into a 'static
-    // that can be freely stored in structs and passed across threads
-    // as a ref.
-    let conf: &'static PageServerConf = Box::leak(Box::new(conf));
+    if update_config {
+        info!("Writing pageserver config to '{}'", cfg_file_path.display());

-    // Initialize up failpoints support
-    let scenario = FailScenario::setup();
-
-    // Basic initialization of things that don't change after startup
-    virtual_file::init(conf.max_file_descriptors);
-    page_cache::init(conf.page_cache_size);
-
-    // Create repo and exit if init was requested
-    if init {
-        timelines::init_pageserver(conf, create_tenant, initial_timeline_id)
-            .context("Failed to init pageserver")?;
-        // write the config file
        std::fs::write(&cfg_file_path, toml.to_string()).with_context(|| {
            format!(
-                "Failed to initialize pageserver config at '{}'",
+                "Failed to write pageserver config to '{}'",
                cfg_file_path.display()
            )
        })?;
-    } else {
-        start_pageserver(conf, daemonize).context("Failed to start pageserver")?;
+        info!(
+            "Config successfully written to '{}'",
+            cfg_file_path.display()
+        )
    }

-    scenario.teardown();
-    Ok(())
+    Ok(if init {
+        ControlFlow::Break(())
+    } else {
+        ControlFlow::Continue(Box::leak(Box::new(conf)))
+    })
 }

 fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()> {
@@ -263,7 +288,7 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()
    // start profiler (if enabled)
    let profiler_guard = profiling::init_profiler(conf);

-    pageserver::tenant_tasks::init_tenant_task_pool()?;
+    WALRECEIVER_RUNTIME.block_on(pageserver::walreceiver::init_etcd_client(conf))?;

    // initialize authentication for incoming connections
    let auth = match &conf.auth_type {
@@ -276,34 +301,62 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()
    };
    info!("Using auth: {:#?}", conf.auth_type);

-    let remote_index = tenant_mgr::init_tenant_mgr(conf)?;
+    let remote_storage = conf
+        .remote_storage_config
+        .as_ref()
+        .map(|storage_config| {
+            GenericRemoteStorage::from_config(conf.workdir.clone(), storage_config)
+        })
+        .transpose()
+        .context("Failed to init generic remote storage")?;
+    let remote_index = {
+        let _rt_guard = BACKGROUND_RUNTIME.enter();
+        tenant_mgr::init_tenant_mgr(conf, remote_storage.clone())?
+    };

-    // Spawn a new thread for the http endpoint
+    // Spawn all HTTP related tasks in the MGMT_REQUEST_RUNTIME.
    // bind before launching separate thread so the error reported before startup exits
-    let auth_cloned = auth.clone();
-    thread_mgr::spawn(
-        ThreadKind::HttpEndpointListener,
-        None,
-        None,
-        "http_endpoint_thread",
-        true,
-        move || {
-            let router = http::make_router(conf, auth_cloned, remote_index)?;
-            endpoint::serve_thread_main(router, http_listener, thread_mgr::shutdown_watcher())
-        },
-    )?;

-    // Spawn a thread to listen for libpq connections. It will spawn further threads
+    // Create a Service from the router above to handle incoming requests.
+    {
+        let _rt_guard = MGMT_REQUEST_RUNTIME.enter();
+
+        let router = http::make_router(conf, auth.clone(), remote_index, remote_storage)?;
+        let service =
+            utils::http::RouterService::new(router.build().map_err(|err| anyhow!(err))?).unwrap();
+        let server = hyper::Server::from_tcp(http_listener)?
+            .serve(service)
+            .with_graceful_shutdown(task_mgr::shutdown_watcher());
+
+        task_mgr::spawn(
+            MGMT_REQUEST_RUNTIME.handle(),
+            TaskKind::HttpEndpointListener,
+            None,
+            None,
+            "http endpoint listener",
+            true,
+            async {
+                server.await?;
+                Ok(())
+            },
+        );
+    }
+
+    // Spawn a task to listen for libpq connections. It will spawn further tasks
    // for each connection.
-    thread_mgr::spawn(
-        ThreadKind::LibpqEndpointListener,
+    task_mgr::spawn(
+        COMPUTE_REQUEST_RUNTIME.handle(),
+        TaskKind::LibpqEndpointListener,
        None,
        None,
-        "libpq endpoint thread",
+        "libpq endpoint listener",
        true,
-        move || page_service::thread_main(conf, auth, pageserver_listener, conf.auth_type),
-    )?;
+        async move {
+            page_service::libpq_listener_main(conf, auth, pageserver_listener, conf.auth_type).await
+        },
+    );

+    // All started up! Now just sit and wait for shutdown signal.
    signals.handle(|signal| match signal {
        Signal::Quit => {
            info!(
@@ -320,7 +373,7 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()
                signal.name()
            );
            profiling::exit_profiler(conf, &profiler_guard);
-            pageserver::shutdown_pageserver(0);
+            BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(0));
            unreachable!()
        }
    })
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -205,7 +205,7 @@ impl Default for PageServerConfigBuilder {
            workdir: Set(PathBuf::new()),
            pg_distrib_dir: Set(env::current_dir()
                .expect("cannot access current directory")
-                .join("tmp_install")),
+                .join("pg_install/v14")),
            auth_type: Set(AuthType::Trust),
            auth_validation_public_key_path: Set(None),
            remote_storage_config: Set(None),
--- a/pageserver/src/http/models.rs
+++ b/pageserver/src/http/models.rs
@@ -8,7 +8,6 @@ use utils::{
 };

 // These enums are used in the API response fields.
-use crate::repository::LocalTimelineState;
 use crate::tenant_mgr::TenantState;

 #[serde_as]
@@ -129,11 +128,10 @@ pub struct LocalTimelineInfo {
    pub latest_gc_cutoff_lsn: Lsn,
    #[serde_as(as = "DisplayFromStr")]
    pub disk_consistent_lsn: Lsn,
-    pub current_logical_size: Option<usize>, // is None when timeline is Unloaded
-    pub current_physical_size: Option<u64>,  // is None when timeline is Unloaded
-    pub current_logical_size_non_incremental: Option<usize>,
+    pub current_logical_size: Option<u64>, // is None when timeline is Unloaded
+    pub current_physical_size: Option<u64>, // is None when timeline is Unloaded
+    pub current_logical_size_non_incremental: Option<u64>,
    pub current_physical_size_non_incremental: Option<u64>,
-    pub timeline_state: LocalTimelineState,

    pub wal_source_connstr: Option<String>,
    #[serde_as(as = "Option<DisplayFromStr>")]
@@ -150,6 +148,9 @@ pub struct RemoteTimelineInfo {
    pub awaits_download: bool,
 }

+///
+/// This represents the output of the "timeline_detail" API call.
+///
 #[serde_as]
 #[derive(Debug, Serialize, Deserialize, Clone)]
 pub struct TimelineInfo {
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -257,51 +257,6 @@ paths:
              schema:
                $ref: "#/components/schemas/Error"

-  /v1/tenant/{tenant_id}/timeline/{timeline_id}/detach:
-    parameters:
-      - name: tenant_id
-        in: path
-        required: true
-        schema:
-          type: string
-          format: hex
-      - name: timeline_id
-        in: path
-        required: true
-        schema:
-          type: string
-          format: hex
-    post:
-      description: Deprecated, use DELETE /v1/tenant/{tenant_id}/timeline/{timeline_id} instead
-      deprecated: true
-      responses:
-        "200":
-          description: Ok
-        "400":
-          description: Error when no tenant id found in path or no timeline id
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-
  /v1/tenant/{tenant_id}/detach:
    parameters:
      - name: tenant_id
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -11,14 +11,10 @@ use super::models::{
    StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo,
    TimelineCreateRequest,
 };
-use crate::layered_repository::metadata::TimelineMetadata;
-use crate::pgdatadir_mapping::DatadirTimeline;
-use crate::repository::{LocalTimelineState, RepositoryTimeline};
-use crate::repository::{Repository, Timeline};
+use crate::layered_repository::Timeline;
 use crate::storage_sync;
 use crate::storage_sync::index::{RemoteIndex, RemoteTimeline};
 use crate::tenant_config::TenantConfOpt;
-use crate::TimelineImpl;
 use crate::{config::PageServerConf, tenant_mgr, timelines};
 use utils::{
    auth::JwtAuth,
@@ -46,20 +42,12 @@ impl State {
        conf: &'static PageServerConf,
        auth: Option<Arc<JwtAuth>>,
        remote_index: RemoteIndex,
+        remote_storage: Option<GenericRemoteStorage>,
    ) -> anyhow::Result<Self> {
        let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml"]
            .iter()
            .map(|v| v.parse().unwrap())
            .collect::<Vec<_>>();
-        // Note that this remote storage is created separately from the main one in the sync_loop.
-        // It's fine since it's stateless and some code duplication saves us from bloating the code around with generics.
-        let remote_storage = conf
-            .remote_storage_config
-            .as_ref()
-            .map(|storage_config| GenericRemoteStorage::new(conf.workdir.clone(), storage_config))
-            .transpose()
-            .context("Failed to init generic remote storage")?;
-
        Ok(Self {
            conf,
            auth,
@@ -85,8 +73,8 @@ fn get_config(request: &Request<Body>) -> &'static PageServerConf {

 // Helper functions to construct a LocalTimelineInfo struct for a timeline

-fn local_timeline_info_from_loaded_timeline(
-    timeline: &TimelineImpl,
+fn local_timeline_info_from_timeline(
+    timeline: &Arc<Timeline>,
    include_non_incremental_logical_size: bool,
    include_non_incremental_physical_size: bool,
 ) -> anyhow::Result<LocalTimelineInfo> {
@@ -116,8 +104,11 @@ fn local_timeline_info_from_loaded_timeline(
        last_record_lsn,
        prev_record_lsn: Some(timeline.get_prev_record_lsn()),
        latest_gc_cutoff_lsn: *timeline.get_latest_gc_cutoff_lsn(),
-        timeline_state: LocalTimelineState::Loaded,
-        current_logical_size: Some(timeline.get_current_logical_size()),
+        current_logical_size: Some(
+            timeline
+                .get_current_logical_size()
+                .context("Timeline info creation failed to get current logical size")?,
+        ),
        current_physical_size: Some(timeline.get_physical_size()),
        current_logical_size_non_incremental: if include_non_incremental_logical_size {
            Some(timeline.get_current_logical_size_non_incremental(last_record_lsn)?)
@@ -136,61 +127,20 @@ fn local_timeline_info_from_loaded_timeline(
    Ok(info)
 }

-fn local_timeline_info_from_unloaded_timeline(metadata: &TimelineMetadata) -> LocalTimelineInfo {
-    LocalTimelineInfo {
-        ancestor_timeline_id: metadata.ancestor_timeline(),
-        ancestor_lsn: {
-            match metadata.ancestor_lsn() {
-                Lsn(0) => None,
-                lsn @ Lsn(_) => Some(lsn),
-            }
-        },
-        disk_consistent_lsn: metadata.disk_consistent_lsn(),
-        last_record_lsn: metadata.disk_consistent_lsn(),
-        prev_record_lsn: metadata.prev_record_lsn(),
-        latest_gc_cutoff_lsn: metadata.latest_gc_cutoff_lsn(),
-        timeline_state: LocalTimelineState::Unloaded,
-        current_logical_size: None,
-        current_physical_size: None,
-        current_logical_size_non_incremental: None,
-        current_physical_size_non_incremental: None,
-        wal_source_connstr: None,
-        last_received_msg_lsn: None,
-        last_received_msg_ts: None,
-    }
-}
-
-fn local_timeline_info_from_repo_timeline(
-    repo_timeline: &RepositoryTimeline<TimelineImpl>,
-    include_non_incremental_logical_size: bool,
-    include_non_incremental_physical_size: bool,
-) -> anyhow::Result<LocalTimelineInfo> {
-    match repo_timeline {
-        RepositoryTimeline::Loaded(timeline) => local_timeline_info_from_loaded_timeline(
-            &*timeline,
-            include_non_incremental_logical_size,
-            include_non_incremental_physical_size,
-        ),
-        RepositoryTimeline::Unloaded { metadata } => {
-            Ok(local_timeline_info_from_unloaded_timeline(metadata))
-        }
-    }
-}
-
 fn list_local_timelines(
    tenant_id: ZTenantId,
    include_non_incremental_logical_size: bool,
    include_non_incremental_physical_size: bool,
 ) -> Result<Vec<(ZTimelineId, LocalTimelineInfo)>> {
    let repo = tenant_mgr::get_repository_for_tenant(tenant_id)
-        .with_context(|| format!("Failed to get repo for tenant {}", tenant_id))?;
+        .with_context(|| format!("Failed to get repo for tenant {tenant_id}"))?;
    let repo_timelines = repo.list_timelines();

    let mut local_timeline_info = Vec::with_capacity(repo_timelines.len());
    for (timeline_id, repository_timeline) in repo_timelines {
        local_timeline_info.push((
            timeline_id,
-            local_timeline_info_from_repo_timeline(
+            local_timeline_info_from_timeline(
                &repository_timeline,
                include_non_incremental_logical_size,
                include_non_incremental_physical_size,
@@ -209,25 +159,22 @@ async fn status_handler(request: Request<Body>) -> Result<Response<Body>, ApiErr
 async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
    let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?;
    let request_data: TimelineCreateRequest = json_request(&mut request).await?;
-
    check_permission(&request, Some(tenant_id))?;

-    let new_timeline_info = tokio::task::spawn_blocking(move || {
-        let _enter = info_span!("/timeline_create", tenant = %tenant_id, new_timeline = ?request_data.new_timeline_id, lsn=?request_data.ancestor_start_lsn).entered();
-
+    let new_timeline_info = async {
        match timelines::create_timeline(
            get_config(&request),
            tenant_id,
            request_data.new_timeline_id.map(ZTimelineId::from),
            request_data.ancestor_timeline_id.map(ZTimelineId::from),
            request_data.ancestor_start_lsn,
-        ) {
-            Ok(Some((new_timeline_id, new_timeline))) => {
+        ).await {
+            Ok(Some(new_timeline)) => {
                // Created. Construct a TimelineInfo for it.
-                let local_info = local_timeline_info_from_loaded_timeline(new_timeline.as_ref(), false, false)?;
+                let local_info = local_timeline_info_from_timeline(&new_timeline, false, false)?;
                Ok(Some(TimelineInfo {
                    tenant_id,
-                    timeline_id: new_timeline_id,
+                    timeline_id: new_timeline.timeline_id,
                    local: Some(local_info),
                    remote: None,
                }))
@@ -235,9 +182,10 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<
            Ok(None) => Ok(None), // timeline already exists
            Err(err) => Err(err),
        }
-    })
-    .await
-        .map_err(ApiError::from_err)??;
+    }
+    .instrument(info_span!("timeline_create", tenant = %tenant_id, new_timeline = ?request_data.new_timeline_id, lsn=?request_data.ancestor_start_lsn))
+        .await
+            .map_err(ApiError::from_err)?;

    Ok(match new_timeline_info {
        Some(info) => json_response(StatusCode::CREATED, info)?,
@@ -247,11 +195,12 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<

 async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
    let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?;
-    check_permission(&request, Some(tenant_id))?;
    let include_non_incremental_logical_size =
        query_param_present(&request, "include-non-incremental-logical-size");
    let include_non_incremental_physical_size =
        query_param_present(&request, "include-non-incremental-physical-size");
+    check_permission(&request, Some(tenant_id))?;
+
    let local_timeline_infos = tokio::task::spawn_blocking(move || {
        let _enter = info_span!("timeline_list", tenant = %tenant_id).entered();
        list_local_timelines(
@@ -302,13 +251,12 @@ fn query_param_present(request: &Request<Body>, param: &str) -> bool {

 async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
    let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?;
-    check_permission(&request, Some(tenant_id))?;
-
    let timeline_id: ZTimelineId = parse_request_param(&request, "timeline_id")?;
    let include_non_incremental_logical_size =
        query_param_present(&request, "include-non-incremental-logical-size");
    let include_non_incremental_physical_size =
        query_param_present(&request, "include-non-incremental-physical-size");
+    check_permission(&request, Some(tenant_id))?;

    let (local_timeline_info, remote_timeline_info) = async {
        // any error here will render local timeline as None
@@ -319,7 +267,7 @@ async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body
                repo.get_timeline(timeline_id)
                    .as_ref()
                    .map(|timeline| {
-                        local_timeline_info_from_repo_timeline(
+                        local_timeline_info_from_timeline(
                            timeline,
                            include_non_incremental_logical_size,
                            include_non_incremental_physical_size,
@@ -348,7 +296,7 @@ async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body
        };
        (local_timeline_info, remote_timeline_info)
    }
-    .instrument(info_span!("timeline_detail_handler", tenant = %tenant_id, timeline = %timeline_id))
+    .instrument(info_span!("timeline_detail", tenant = %tenant_id, timeline = %timeline_id))
    .await;

    if local_timeline_info.is_none() && remote_timeline_info.is_none() {
@@ -372,7 +320,7 @@ async fn tenant_attach_handler(request: Request<Body>) -> Result<Response<Body>,
    let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?;
    check_permission(&request, Some(tenant_id))?;

-    info!("Handling tenant attach {}", tenant_id,);
+    info!("Handling tenant attach {}", tenant_id);

    tokio::task::spawn_blocking(move || {
        if tenant_mgr::get_tenant_state(tenant_id).is_some() {
@@ -452,16 +400,8 @@ async fn gather_tenant_timelines_index_parts(
    tenant_id: ZTenantId,
 ) -> anyhow::Result<Option<Vec<(ZTimelineId, RemoteTimeline)>>> {
    let index_parts = match state.remote_storage.as_ref() {
-        Some(GenericRemoteStorage::Local(local_storage)) => {
-            storage_sync::gather_tenant_timelines_index_parts(state.conf, local_storage, tenant_id)
-                .await
-        }
-        // FIXME here s3 storage contains its own limits, that are separate from sync storage thread ones
-        //       because it is a different instance. We can move this limit to some global static
-        //       or use one instance everywhere.
-        Some(GenericRemoteStorage::S3(s3_storage)) => {
-            storage_sync::gather_tenant_timelines_index_parts(state.conf, s3_storage, tenant_id)
-                .await
+        Some(storage) => {
+            storage_sync::gather_tenant_timelines_index_parts(state.conf, storage, tenant_id).await
        }
        None => return Ok(None),
    }
@@ -481,17 +421,14 @@ async fn gather_tenant_timelines_index_parts(

 async fn timeline_delete_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
    let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?;
+    let timeline_id: ZTimelineId = parse_request_param(&request, "timeline_id")?;
    check_permission(&request, Some(tenant_id))?;

-    let timeline_id: ZTimelineId = parse_request_param(&request, "timeline_id")?;
-
    let state = get_state(&request);
-    tokio::task::spawn_blocking(move || {
-        let _enter = info_span!("tenant_detach_handler", tenant = %tenant_id).entered();
-        tenant_mgr::delete_timeline(tenant_id, timeline_id)
-    })
-    .await
-    .map_err(ApiError::from_err)??;
+    tenant_mgr::delete_timeline(tenant_id, timeline_id)
+        .instrument(info_span!("timeline_delete", tenant = %tenant_id))
+        .await
+        .map_err(ApiError::from_err)?;

    let mut remote_index = state.remote_index.write().await;
    remote_index.remove_timeline_entry(ZTenantTimelineId {
@@ -508,12 +445,10 @@ async fn tenant_detach_handler(request: Request<Body>) -> Result<Response<Body>,

    let state = get_state(&request);
    let conf = state.conf;
-    tokio::task::spawn_blocking(move || {
-        let _enter = info_span!("tenant_detach_handler", tenant = %tenant_id).entered();
-        tenant_mgr::detach_tenant(conf, tenant_id)
-    })
-    .await
-    .map_err(ApiError::from_err)??;
+    tenant_mgr::detach_tenant(conf, tenant_id)
+        .instrument(info_span!("tenant_detach", tenant = %tenant_id))
+        .await
+        .map_err(ApiError::from_err)?;

    let mut remote_index = state.remote_index.write().await;
    remote_index.remove_tenant_entry(&tenant_id);
@@ -522,7 +457,6 @@ async fn tenant_detach_handler(request: Request<Body>) -> Result<Response<Body>,
 }

 async fn tenant_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
-    // check for management permission
    check_permission(&request, None)?;

    let state = get_state(&request);
@@ -531,7 +465,7 @@ async fn tenant_list_handler(request: Request<Body>) -> Result<Response<Body>, A

    let response_data = tokio::task::spawn_blocking(move || {
        let _enter = info_span!("tenant_list").entered();
-        crate::tenant_mgr::list_tenants(&remote_index)
+        crate::tenant_mgr::list_tenant_info(&remote_index)
    })
    .await
    .map_err(ApiError::from_err)?;
@@ -590,7 +524,6 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
 }

 async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
-    // check for management permission
    check_permission(&request, None)?;

    let request_data: TenantCreateRequest = json_request(&mut request).await?;
@@ -645,7 +578,7 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
        let _enter = info_span!("tenant_create", tenant = ?target_tenant_id).entered();
        let conf = get_config(&request);

-        tenant_mgr::create_tenant_repository(conf, tenant_conf, target_tenant_id, remote_index)
+        tenant_mgr::create_tenant(conf, tenant_conf, target_tenant_id, remote_index)
    })
    .await
    .map_err(ApiError::from_err)??;
@@ -659,7 +592,6 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
 async fn tenant_config_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
    let request_data: TenantConfigRequest = json_request(&mut request).await?;
    let tenant_id = request_data.tenant_id;
-    // check for management permission
    check_permission(&request, Some(tenant_id))?;

    let mut tenant_conf: TenantConfOpt = Default::default();
@@ -703,7 +635,8 @@ async fn tenant_config_handler(mut request: Request<Body>) -> Result<Response<Bo
    tokio::task::spawn_blocking(move || {
        let _enter = info_span!("tenant_config", tenant = ?tenant_id).entered();

-        tenant_mgr::update_tenant_config(tenant_conf, tenant_id)
+        let state = get_state(&request);
+        tenant_mgr::update_tenant_config(state.conf, tenant_conf, tenant_id)
    })
    .await
    .map_err(ApiError::from_err)??;
@@ -722,6 +655,7 @@ pub fn make_router(
    conf: &'static PageServerConf,
    auth: Option<Arc<JwtAuth>>,
    remote_index: RemoteIndex,
+    remote_storage: Option<GenericRemoteStorage>,
 ) -> anyhow::Result<RouterBuilder<hyper::Body, ApiError>> {
    let spec = include_bytes!("openapi_spec.yml");
    let mut router = attach_openapi_ui(endpoint::make_router(), spec, "/swagger.yml", "/v1/doc");
@@ -738,7 +672,8 @@ pub fn make_router(

    Ok(router
        .data(Arc::new(
-            State::new(conf, auth, remote_index).context("Failed to initialize router state")?,
+            State::new(conf, auth, remote_index, remote_storage)
+                .context("Failed to initialize router state")?,
        ))
        .get("/v1/status", status_handler)
        .get("/v1/tenant", tenant_list_handler)
@@ -757,10 +692,5 @@ pub fn make_router(
            "/v1/tenant/:tenant_id/timeline/:timeline_id",
            timeline_delete_handler,
        )
-        // for backward compatibility
-        .post(
-            "/v1/tenant/:tenant_id/timeline/:timeline_id/detach",
-            timeline_delete_handler,
-        )
        .any(handler_404))
 }
--- a/pageserver/src/import_datadir.rs
+++ b/pageserver/src/import_datadir.rs
@@ -11,26 +11,38 @@ use bytes::Bytes;
 use tracing::*;
 use walkdir::WalkDir;

+use crate::layered_repository::Timeline;
 use crate::pgdatadir_mapping::*;
 use crate::reltag::{RelTag, SlruKind};
 use crate::walingest::WalIngest;
 use crate::walrecord::DecodedWALRecord;
-use postgres_ffi::relfile_utils::*;
-use postgres_ffi::waldecoder::*;
-use postgres_ffi::xlog_utils::*;
+use postgres_ffi::v14::relfile_utils::*;
+use postgres_ffi::v14::waldecoder::*;
+use postgres_ffi::v14::xlog_utils::*;
+use postgres_ffi::v14::{pg_constants, ControlFileData, DBState_DB_SHUTDOWNED};
 use postgres_ffi::Oid;
-use postgres_ffi::{pg_constants, ControlFileData, DBState_DB_SHUTDOWNED};
+use postgres_ffi::{BLCKSZ, WAL_SEGMENT_SIZE};
 use utils::lsn::Lsn;

+// Returns checkpoint LSN from controlfile
+pub fn get_lsn_from_controlfile(path: &Path) -> Result<Lsn> {
+    // Read control file to extract the LSN
+    let controlfile_path = path.join("global").join("pg_control");
+    let controlfile = ControlFileData::decode(&std::fs::read(controlfile_path)?)?;
+    let lsn = controlfile.checkPoint;
+
+    Ok(Lsn(lsn))
+}
+
 ///
 /// Import all relation data pages from local disk into the repository.
 ///
 /// This is currently only used to import a cluster freshly created by initdb.
 /// The code that deals with the checkpoint would not work right if the
 /// cluster was not shut down cleanly.
-pub fn import_timeline_from_postgres_datadir<T: DatadirTimeline>(
+pub fn import_timeline_from_postgres_datadir(
    path: &Path,
-    tline: &T,
+    tline: &Timeline,
    lsn: Lsn,
 ) -> Result<()> {
    let mut pg_control: Option<ControlFileData> = None;
@@ -88,8 +100,8 @@ pub fn import_timeline_from_postgres_datadir<T: DatadirTimeline>(
 }

 // subroutine of import_timeline_from_postgres_datadir(), to load one relation file.
-fn import_rel<T: DatadirTimeline, Reader: Read>(
-    modification: &mut DatadirModification<T>,
+fn import_rel<Reader: Read>(
+    modification: &mut DatadirModification,
    path: &Path,
    spcoid: Oid,
    dboid: Oid,
@@ -110,8 +122,8 @@ fn import_rel<T: DatadirTimeline, Reader: Read>(

    let mut buf: [u8; 8192] = [0u8; 8192];

-    ensure!(len % pg_constants::BLCKSZ as usize == 0);
-    let nblocks = len / pg_constants::BLCKSZ as usize;
+    ensure!(len % BLCKSZ as usize == 0);
+    let nblocks = len / BLCKSZ as usize;

    let rel = RelTag {
        spcnode: spcoid,
@@ -120,7 +132,7 @@ fn import_rel<T: DatadirTimeline, Reader: Read>(
        forknum,
    };

-    let mut blknum: u32 = segno * (1024 * 1024 * 1024 / pg_constants::BLCKSZ as u32);
+    let mut blknum: u32 = segno * (1024 * 1024 * 1024 / BLCKSZ as u32);

    // Call put_rel_creation for every segment of the relation,
    // because there is no guarantee about the order in which we are processing segments.
@@ -144,8 +156,7 @@ fn import_rel<T: DatadirTimeline, Reader: Read>(
            Err(err) => match err.kind() {
                std::io::ErrorKind::UnexpectedEof => {
                    // reached EOF. That's expected.
-                    let relative_blknum =
-                        blknum - segno * (1024 * 1024 * 1024 / pg_constants::BLCKSZ as u32);
+                    let relative_blknum = blknum - segno * (1024 * 1024 * 1024 / BLCKSZ as u32);
                    ensure!(relative_blknum == nblocks as u32, "unexpected EOF");
                    break;
                }
@@ -168,8 +179,8 @@ fn import_rel<T: DatadirTimeline, Reader: Read>(

 /// Import an SLRU segment file
 ///
-fn import_slru<T: DatadirTimeline, Reader: Read>(
-    modification: &mut DatadirModification<T>,
+fn import_slru<Reader: Read>(
+    modification: &mut DatadirModification,
    slru: SlruKind,
    path: &Path,
    mut reader: Reader,
@@ -184,8 +195,8 @@ fn import_slru<T: DatadirTimeline, Reader: Read>(
        .to_string_lossy();
    let segno = u32::from_str_radix(filename, 16)?;

-    ensure!(len % pg_constants::BLCKSZ as usize == 0); // we assume SLRU block size is the same as BLCKSZ
-    let nblocks = len / pg_constants::BLCKSZ as usize;
+    ensure!(len % BLCKSZ as usize == 0); // we assume SLRU block size is the same as BLCKSZ
+    let nblocks = len / BLCKSZ as usize;

    ensure!(nblocks <= pg_constants::SLRU_PAGES_PER_SEGMENT as usize);

@@ -224,23 +235,18 @@ fn import_slru<T: DatadirTimeline, Reader: Read>(

 /// Scan PostgreSQL WAL files in given directory and load all records between
 /// 'startpoint' and 'endpoint' into the repository.
-fn import_wal<T: DatadirTimeline>(
-    walpath: &Path,
-    tline: &T,
-    startpoint: Lsn,
-    endpoint: Lsn,
-) -> Result<()> {
+fn import_wal(walpath: &Path, tline: &Timeline, startpoint: Lsn, endpoint: Lsn) -> Result<()> {
    let mut waldecoder = WalStreamDecoder::new(startpoint);

-    let mut segno = startpoint.segment_number(pg_constants::WAL_SEGMENT_SIZE);
-    let mut offset = startpoint.segment_offset(pg_constants::WAL_SEGMENT_SIZE);
+    let mut segno = startpoint.segment_number(WAL_SEGMENT_SIZE);
+    let mut offset = startpoint.segment_offset(WAL_SEGMENT_SIZE);
    let mut last_lsn = startpoint;

    let mut walingest = WalIngest::new(tline, startpoint)?;

    while last_lsn <= endpoint {
        // FIXME: assume postgresql tli 1 for now
-        let filename = XLogFileName(1, segno, pg_constants::WAL_SEGMENT_SIZE);
+        let filename = XLogFileName(1, segno, WAL_SEGMENT_SIZE);
        let mut buf = Vec::new();

        // Read local file
@@ -259,7 +265,7 @@ fn import_wal<T: DatadirTimeline>(
        }

        let nread = file.read_to_end(&mut buf)?;
-        if nread != pg_constants::WAL_SEGMENT_SIZE - offset as usize {
+        if nread != WAL_SEGMENT_SIZE - offset as usize {
            // Maybe allow this for .partial files?
            error!("read only {} bytes from WAL file", nread);
        }
@@ -295,12 +301,12 @@ fn import_wal<T: DatadirTimeline>(
    Ok(())
 }

-pub fn import_basebackup_from_tar<T: DatadirTimeline, Reader: Read>(
-    tline: &T,
+pub fn import_basebackup_from_tar<Reader: Read>(
+    tline: &Timeline,
    reader: Reader,
    base_lsn: Lsn,
 ) -> Result<()> {
-    info!("importing base at {}", base_lsn);
+    info!("importing base at {base_lsn}");
    let mut modification = tline.begin_modification(base_lsn);
    modification.init_empty()?;

@@ -325,7 +331,11 @@ pub fn import_basebackup_from_tar<T: DatadirTimeline, Reader: Read>(
                debug!("directory {:?}", file_path);
            }
            _ => {
-                panic!("tar::EntryType::?? {}", file_path.display());
+                bail!(
+                    "entry {} in backup tar archive is of unexpected type: {:?}",
+                    file_path.display(),
+                    header.entry_type()
+                );
            }
        }
    }
@@ -337,16 +347,16 @@ pub fn import_basebackup_from_tar<T: DatadirTimeline, Reader: Read>(
    Ok(())
 }

-pub fn import_wal_from_tar<T: DatadirTimeline, Reader: Read>(
-    tline: &T,
+pub fn import_wal_from_tar<Reader: Read>(
+    tline: &Timeline,
    reader: Reader,
    start_lsn: Lsn,
    end_lsn: Lsn,
 ) -> Result<()> {
    // Set up walingest mutable state
    let mut waldecoder = WalStreamDecoder::new(start_lsn);
-    let mut segno = start_lsn.segment_number(pg_constants::WAL_SEGMENT_SIZE);
-    let mut offset = start_lsn.segment_offset(pg_constants::WAL_SEGMENT_SIZE);
+    let mut segno = start_lsn.segment_number(WAL_SEGMENT_SIZE);
+    let mut offset = start_lsn.segment_offset(WAL_SEGMENT_SIZE);
    let mut last_lsn = start_lsn;
    let mut walingest = WalIngest::new(tline, start_lsn)?;

@@ -363,7 +373,7 @@ pub fn import_wal_from_tar<T: DatadirTimeline, Reader: Read>(
            match header.entry_type() {
                tar::EntryType::Regular => {
                    // FIXME: assume postgresql tli 1 for now
-                    let expected_filename = XLogFileName(1, segno, pg_constants::WAL_SEGMENT_SIZE);
+                    let expected_filename = XLogFileName(1, segno, WAL_SEGMENT_SIZE);
                    let file_name = file_path
                        .file_name()
                        .expect("missing wal filename")
@@ -378,7 +388,11 @@ pub fn import_wal_from_tar<T: DatadirTimeline, Reader: Read>(
                    continue;
                }
                _ => {
-                    panic!("tar::EntryType::?? {}", file_path.display());
+                    bail!(
+                        "entry {} in WAL tar archive is of unexpected type: {:?}",
+                        file_path.display(),
+                        header.entry_type()
+                    );
                }
            }
        };
@@ -418,14 +432,12 @@ pub fn import_wal_from_tar<T: DatadirTimeline, Reader: Read>(
    Ok(())
 }

-pub fn import_file<T: DatadirTimeline, Reader: Read>(
-    modification: &mut DatadirModification<T>,
+fn import_file<Reader: Read>(
+    modification: &mut DatadirModification,
    file_path: &Path,
    reader: Reader,
    len: usize,
 ) -> Result<Option<ControlFileData>> {
-    debug!("looking at {:?}", file_path);
-
    if file_path.starts_with("global") {
        let spcnode = pg_constants::GLOBALTABLESPACE_OID;
        let dbnode = 0;
@@ -547,7 +559,10 @@ pub fn import_file<T: DatadirTimeline, Reader: Read>(
        // this to import arbitrary postgres databases.
        bail!("Importing pg_tblspc is not implemented");
    } else {
-        debug!("ignored");
+        debug!(
+            "ignoring unrecognized file \"{}\" in tar archive",
+            file_path.display()
+        );
    }

    Ok(None)
--- a/pageserver/src/keyspace.rs
+++ b/pageserver/src/keyspace.rs
@@ -1,5 +1,5 @@
 use crate::repository::{key_range_size, singleton_range, Key};
-use postgres_ffi::pg_constants;
+use postgres_ffi::BLCKSZ;
 use std::ops::Range;

 ///
@@ -19,7 +19,7 @@ impl KeySpace {
    ///
    pub fn partition(&self, target_size: u64) -> KeyPartitioning {
        // Assume that each value is 8k in size.
-        let target_nblocks = (target_size / pg_constants::BLCKSZ as u64) as usize;
+        let target_nblocks = (target_size / BLCKSZ as u64) as usize;

        let mut parts = Vec::new();
        let mut current_part = Vec::new();
--- a/pageserver/src/layered_repository.rs
+++ b/pageserver/src/layered_repository.rs
--- a/pageserver/src/layered_repository/block_io.rs
+++ b/pageserver/src/layered_repository/block_io.rs
@@ -157,7 +157,14 @@ where
        // Look up the right page
        let cache = page_cache::get();
        loop {
-            match cache.read_immutable_buf(self.file_id, blknum) {
+            match cache
+                .read_immutable_buf(self.file_id, blknum)
+                .map_err(|e| {
+                    std::io::Error::new(
+                        std::io::ErrorKind::Other,
+                        format!("Failed to read immutable buf: {e:#}"),
+                    )
+                })? {
                ReadBufResult::Found(guard) => break Ok(guard),
                ReadBufResult::NotFound(mut write_guard) => {
                    // Read the page from disk into the buffer
--- a/pageserver/src/layered_repository/delta_layer.rs
+++ b/pageserver/src/layered_repository/delta_layer.rs
@@ -34,7 +34,7 @@ use crate::layered_repository::storage_layer::{
 use crate::page_cache::{PageReadGuard, PAGE_SZ};
 use crate::repository::{Key, Value, KEY_SIZE};
 use crate::virtual_file::VirtualFile;
-use crate::walrecord;
+use crate::{walrecord, TEMP_FILE_SUFFIX};
 use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
 use anyhow::{bail, ensure, Context, Result};
 use rand::{distributions::Alphanumeric, Rng};
@@ -447,11 +447,12 @@ impl DeltaLayer {
            .collect();

        conf.timeline_path(&timelineid, &tenantid).join(format!(
-            "{}-XXX__{:016X}-{:016X}.{}.temp",
+            "{}-XXX__{:016X}-{:016X}.{}.{}",
            key_start,
            u64::from(lsn_range.start),
            u64::from(lsn_range.end),
-            rand_string
+            rand_string,
+            TEMP_FILE_SUFFIX,
        ))
    }

--- a/pageserver/src/layered_repository/disk_btree.rs
+++ b/pageserver/src/layered_repository/disk_btree.rs
@@ -209,7 +209,7 @@ where
    reader: R,
 }

-#[derive(Clone, Copy, Debug, PartialEq)]
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
 pub enum VisitDirection {
    Forwards,
    Backwards,
--- a/pageserver/src/layered_repository/ephemeral_file.rs
+++ b/pageserver/src/layered_repository/ephemeral_file.rs
@@ -12,7 +12,7 @@ use once_cell::sync::Lazy;
 use std::cmp::min;
 use std::collections::HashMap;
 use std::fs::OpenOptions;
-use std::io::{Error, ErrorKind};
+use std::io::{self, ErrorKind};
 use std::ops::DerefMut;
 use std::path::PathBuf;
 use std::sync::{Arc, RwLock};
@@ -51,7 +51,7 @@ impl EphemeralFile {
        conf: &PageServerConf,
        tenantid: ZTenantId,
        timelineid: ZTimelineId,
-    ) -> Result<EphemeralFile, std::io::Error> {
+    ) -> Result<EphemeralFile, io::Error> {
        let mut l = EPHEMERAL_FILES.write().unwrap();
        let file_id = l.next_file_id;
        l.next_file_id += 1;
@@ -76,7 +76,7 @@ impl EphemeralFile {
        })
    }

-    fn fill_buffer(&self, buf: &mut [u8], blkno: u32) -> Result<(), Error> {
+    fn fill_buffer(&self, buf: &mut [u8], blkno: u32) -> Result<(), io::Error> {
        let mut off = 0;
        while off < PAGE_SZ {
            let n = self
@@ -96,10 +96,13 @@ impl EphemeralFile {
        Ok(())
    }

-    fn get_buf_for_write(&self, blkno: u32) -> Result<page_cache::PageWriteGuard, Error> {
+    fn get_buf_for_write(&self, blkno: u32) -> Result<page_cache::PageWriteGuard, io::Error> {
        // Look up the right page
        let cache = page_cache::get();
-        let mut write_guard = match cache.write_ephemeral_buf(self.file_id, blkno) {
+        let mut write_guard = match cache
+            .write_ephemeral_buf(self.file_id, blkno)
+            .map_err(|e| to_io_error(e, "Failed to write ephemeral buf"))?
+        {
            WriteBufResult::Found(guard) => guard,
            WriteBufResult::NotFound(mut guard) => {
                // Read the page from disk into the buffer
@@ -127,7 +130,7 @@ pub fn is_ephemeral_file(filename: &str) -> bool {
 }

 impl FileExt for EphemeralFile {
-    fn read_at(&self, dstbuf: &mut [u8], offset: u64) -> Result<usize, Error> {
+    fn read_at(&self, dstbuf: &mut [u8], offset: u64) -> Result<usize, io::Error> {
        // Look up the right page
        let blkno = (offset / PAGE_SZ as u64) as u32;
        let off = offset as usize % PAGE_SZ;
@@ -137,7 +140,10 @@ impl FileExt for EphemeralFile {
        let mut write_guard;

        let cache = page_cache::get();
-        let buf = match cache.read_ephemeral_buf(self.file_id, blkno) {
+        let buf = match cache
+            .read_ephemeral_buf(self.file_id, blkno)
+            .map_err(|e| to_io_error(e, "Failed to read ephemeral buf"))?
+        {
            ReadBufResult::Found(guard) => {
                read_guard = guard;
                read_guard.as_ref()
@@ -158,7 +164,7 @@ impl FileExt for EphemeralFile {
        Ok(len)
    }

-    fn write_at(&self, srcbuf: &[u8], offset: u64) -> Result<usize, Error> {
+    fn write_at(&self, srcbuf: &[u8], offset: u64) -> Result<usize, io::Error> {
        // Look up the right page
        let blkno = (offset / PAGE_SZ as u64) as u32;
        let off = offset as usize % PAGE_SZ;
@@ -166,7 +172,10 @@ impl FileExt for EphemeralFile {

        let mut write_guard;
        let cache = page_cache::get();
-        let buf = match cache.write_ephemeral_buf(self.file_id, blkno) {
+        let buf = match cache
+            .write_ephemeral_buf(self.file_id, blkno)
+            .map_err(|e| to_io_error(e, "Failed to write ephemeral buf"))?
+        {
            WriteBufResult::Found(guard) => {
                write_guard = guard;
                write_guard.deref_mut()
@@ -190,7 +199,7 @@ impl FileExt for EphemeralFile {
 }

 impl BlobWriter for EphemeralFile {
-    fn write_blob(&mut self, srcbuf: &[u8]) -> Result<u64, Error> {
+    fn write_blob(&mut self, srcbuf: &[u8]) -> Result<u64, io::Error> {
        let pos = self.size;

        let mut blknum = (self.size / PAGE_SZ as u64) as u32;
@@ -268,11 +277,11 @@ impl Drop for EphemeralFile {
    }
 }

-pub fn writeback(file_id: u64, blkno: u32, buf: &[u8]) -> Result<(), std::io::Error> {
+pub fn writeback(file_id: u64, blkno: u32, buf: &[u8]) -> Result<(), io::Error> {
    if let Some(file) = EPHEMERAL_FILES.read().unwrap().files.get(&file_id) {
        match file.write_all_at(buf, blkno as u64 * PAGE_SZ as u64) {
            Ok(_) => Ok(()),
-            Err(e) => Err(std::io::Error::new(
+            Err(e) => Err(io::Error::new(
                ErrorKind::Other,
                format!(
                    "failed to write back to ephemeral file at {} error: {}",
@@ -282,7 +291,7 @@ pub fn writeback(file_id: u64, blkno: u32, buf: &[u8]) -> Result<(), std::io::Er
            )),
        }
    } else {
-        Err(std::io::Error::new(
+        Err(io::Error::new(
            ErrorKind::Other,
            "could not write back page, not found in ephemeral files hash",
        ))
@@ -292,11 +301,14 @@ pub fn writeback(file_id: u64, blkno: u32, buf: &[u8]) -> Result<(), std::io::Er
 impl BlockReader for EphemeralFile {
    type BlockLease = page_cache::PageReadGuard<'static>;

-    fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, std::io::Error> {
+    fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, io::Error> {
        // Look up the right page
        let cache = page_cache::get();
        loop {
-            match cache.read_ephemeral_buf(self.file_id, blknum) {
+            match cache
+                .read_ephemeral_buf(self.file_id, blknum)
+                .map_err(|e| to_io_error(e, "Failed to read ephemeral buf"))?
+            {
                ReadBufResult::Found(guard) => return Ok(guard),
                ReadBufResult::NotFound(mut write_guard) => {
                    // Read the page from disk into the buffer
@@ -311,6 +323,10 @@ impl BlockReader for EphemeralFile {
    }
 }

+fn to_io_error(e: anyhow::Error, context: &str) -> io::Error {
+    io::Error::new(ErrorKind::Other, format!("{context}: {e:#}"))
+}
+
 #[cfg(test)]
 mod tests {
    use super::*;
@@ -322,7 +338,7 @@ mod tests {

    fn repo_harness(
        test_name: &str,
-    ) -> Result<(&'static PageServerConf, ZTenantId, ZTimelineId), Error> {
+    ) -> Result<(&'static PageServerConf, ZTenantId, ZTimelineId), io::Error> {
        let repo_dir = PageServerConf::test_repo_dir(test_name);
        let _ = fs::remove_dir_all(&repo_dir);
        let conf = PageServerConf::dummy_conf(repo_dir);
@@ -339,7 +355,7 @@ mod tests {

    // Helper function to slurp contents of a file, starting at the current position,
    // into a string
-    fn read_string(efile: &EphemeralFile, offset: u64, len: usize) -> Result<String, Error> {
+    fn read_string(efile: &EphemeralFile, offset: u64, len: usize) -> Result<String, io::Error> {
        let mut buf = Vec::new();
        buf.resize(len, 0u8);

@@ -351,7 +367,7 @@ mod tests {
    }

    #[test]
-    fn test_ephemeral_files() -> Result<(), Error> {
+    fn test_ephemeral_files() -> Result<(), io::Error> {
        let (conf, tenantid, timelineid) = repo_harness("ephemeral_files")?;

        let file_a = EphemeralFile::create(conf, tenantid, timelineid)?;
@@ -382,7 +398,7 @@ mod tests {
    }

    #[test]
-    fn test_ephemeral_blobs() -> Result<(), Error> {
+    fn test_ephemeral_blobs() -> Result<(), io::Error> {
        let (conf, tenantid, timelineid) = repo_harness("ephemeral_blobs")?;

        let mut file = EphemeralFile::create(conf, tenantid, timelineid)?;
--- a/pageserver/src/layered_repository/filename.rs
+++ b/pageserver/src/layered_repository/filename.rs
@@ -10,7 +10,7 @@ use std::path::PathBuf;

 use utils::lsn::Lsn;

-// Note: LayeredTimeline::load_layer_map() relies on this sort order
+// Note: Timeline::load_layer_map() relies on this sort order
 #[derive(Debug, PartialEq, Eq, Clone)]
 pub struct DeltaFileName {
    pub key_range: Range<Key>,
--- a/pageserver/src/layered_repository/image_layer.rs
+++ b/pageserver/src/layered_repository/image_layer.rs
@@ -30,7 +30,7 @@ use crate::layered_repository::storage_layer::{
 use crate::page_cache::PAGE_SZ;
 use crate::repository::{Key, Value, KEY_SIZE};
 use crate::virtual_file::VirtualFile;
-use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION};
+use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
 use anyhow::{bail, ensure, Context, Result};
 use bytes::Bytes;
 use hex;
@@ -255,7 +255,7 @@ impl ImageLayer {
            .collect();

        conf.timeline_path(&timelineid, &tenantid)
-            .join(format!("{}.{}.temp", fname, rand_string))
+            .join(format!("{fname}.{rand_string}.{TEMP_FILE_SUFFIX}"))
    }

    ///
--- a/pageserver/src/layered_repository/layer_map.rs
+++ b/pageserver/src/layered_repository/layer_map.rs
@@ -13,21 +13,15 @@
 use crate::layered_repository::inmemory_layer::InMemoryLayer;
 use crate::layered_repository::storage_layer::Layer;
 use crate::layered_repository::storage_layer::{range_eq, range_overlaps};
+use crate::metrics::NUM_ONDISK_LAYERS;
 use crate::repository::Key;
 use anyhow::Result;
-use metrics::{register_int_gauge, IntGauge};
-use once_cell::sync::Lazy;
 use std::collections::VecDeque;
 use std::ops::Range;
 use std::sync::Arc;
 use tracing::*;
 use utils::lsn::Lsn;

-static NUM_ONDISK_LAYERS: Lazy<IntGauge> = Lazy::new(|| {
-    register_int_gauge!("pageserver_ondisk_layers", "Number of layers on-disk")
-        .expect("failed to define a metric")
-});
-
 ///
 /// LayerMap tracks what layers exist on a timeline.
 ///
--- a/pageserver/src/layered_repository/metadata.rs
+++ b/pageserver/src/layered_repository/metadata.rs
@@ -1,4 +1,4 @@
-//! Every image of a certain timeline from [`crate::layered_repository::LayeredRepository`]
+//! Every image of a certain timeline from [`crate::layered_repository::Repository`]
 //! has a metadata that needs to be stored persistently.
 //!
 //! Later, the file gets is used in [`crate::remote_storage::storage_sync`] as a part of
@@ -6,10 +6,13 @@
 //!
 //! The module contains all structs and related helper methods related to timeline metadata.

+use std::fs::{File, OpenOptions};
+use std::io::Write;
 use std::path::PathBuf;

-use anyhow::ensure;
+use anyhow::{bail, ensure, Context};
 use serde::{Deserialize, Serialize};
+use tracing::info_span;
 use utils::{
    bin_ser::BeSer,
    lsn::Lsn,
@@ -17,6 +20,7 @@ use utils::{
 };

 use crate::config::PageServerConf;
+use crate::virtual_file::VirtualFile;
 use crate::STORAGE_FORMAT_VERSION;

 /// We assume that a write of up to METADATA_MAX_SIZE bytes is atomic.
@@ -30,7 +34,7 @@ pub const METADATA_FILE_NAME: &str = "metadata";

 /// Metadata stored on disk for each timeline
 ///
-/// The fields correspond to the values we hold in memory, in LayeredTimeline.
+/// The fields correspond to the values we hold in memory, in Timeline.
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct TimelineMetadata {
    hdr: TimelineMetadataHeader,
@@ -65,17 +69,6 @@ struct TimelineMetadataBody {
    initdb_lsn: Lsn,
 }

-/// Points to a place in pageserver's local directory,
-/// where certain timeline's metadata file should be located.
-pub fn metadata_path(
-    conf: &'static PageServerConf,
-    timelineid: ZTimelineId,
-    tenantid: ZTenantId,
-) -> PathBuf {
-    conf.timeline_path(&timelineid, &tenantid)
-        .join(METADATA_FILE_NAME)
-}
-
 impl TimelineMetadata {
    pub fn new(
        disk_consistent_lsn: Lsn,
@@ -173,11 +166,57 @@ impl TimelineMetadata {
    }
 }

+/// Points to a place in pageserver's local directory,
+/// where certain timeline's metadata file should be located.
+pub fn metadata_path(
+    conf: &'static PageServerConf,
+    timelineid: ZTimelineId,
+    tenantid: ZTenantId,
+) -> PathBuf {
+    conf.timeline_path(&timelineid, &tenantid)
+        .join(METADATA_FILE_NAME)
+}
+
+/// Save timeline metadata to file
+pub fn save_metadata(
+    conf: &'static PageServerConf,
+    timelineid: ZTimelineId,
+    tenantid: ZTenantId,
+    data: &TimelineMetadata,
+    first_save: bool,
+) -> anyhow::Result<()> {
+    let _enter = info_span!("saving metadata").entered();
+    let path = metadata_path(conf, timelineid, tenantid);
+    // use OpenOptions to ensure file presence is consistent with first_save
+    let mut file = VirtualFile::open_with_options(
+        &path,
+        OpenOptions::new().write(true).create_new(first_save),
+    )?;
+
+    let metadata_bytes = data.to_bytes().context("Failed to get metadata bytes")?;
+
+    if file.write(&metadata_bytes)? != metadata_bytes.len() {
+        bail!("Could not write all the metadata bytes in a single call");
+    }
+    file.sync_all()?;
+
+    // fsync the parent directory to ensure the directory entry is durable
+    if first_save {
+        let timeline_dir = File::open(
+            &path
+                .parent()
+                .expect("Metadata should always have a parent dir"),
+        )?;
+        timeline_dir.sync_all()?;
+    }
+
+    Ok(())
+}
+
 #[cfg(test)]
 mod tests {
-    use crate::repository::repo_harness::TIMELINE_ID;
-
    use super::*;
+    use crate::layered_repository::repo_harness::TIMELINE_ID;

    #[test]
    fn metadata_serializes_correctly() {
--- a/pageserver/src/layered_repository/timeline.rs
+++ b/pageserver/src/layered_repository/timeline.rs
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -4,6 +4,7 @@ pub mod http;
 pub mod import_datadir;
 pub mod keyspace;
 pub mod layered_repository;
+pub mod metrics;
 pub mod page_cache;
 pub mod page_service;
 pub mod pgdatadir_mapping;
@@ -11,10 +12,10 @@ pub mod profiling;
 pub mod reltag;
 pub mod repository;
 pub mod storage_sync;
+pub mod task_mgr;
 pub mod tenant_config;
 pub mod tenant_mgr;
 pub mod tenant_tasks;
-pub mod thread_mgr;
 pub mod timelines;
 pub mod virtual_file;
 pub mod walingest;
@@ -22,14 +23,12 @@ pub mod walreceiver;
 pub mod walrecord;
 pub mod walredo;

-use once_cell::sync::Lazy;
+use std::collections::HashMap;
+
 use tracing::info;
+use utils::zid::{ZTenantId, ZTimelineId};

-use crate::thread_mgr::ThreadKind;
-use metrics::{register_int_gauge_vec, IntGaugeVec};
-
-use layered_repository::LayeredRepository;
-use pgdatadir_mapping::DatadirTimeline;
+use crate::task_mgr::TaskKind;

 /// Current storage format version
 ///
@@ -42,15 +41,6 @@ pub const STORAGE_FORMAT_VERSION: u16 = 3;
 pub const IMAGE_FILE_MAGIC: u16 = 0x5A60;
 pub const DELTA_FILE_MAGIC: u16 = 0x5A61;

-static LIVE_CONNECTIONS_COUNT: Lazy<IntGaugeVec> = Lazy::new(|| {
-    register_int_gauge_vec!(
-        "pageserver_live_connections",
-        "Number of live network connections",
-        &["pageserver_connection_kind"]
-    )
-    .expect("failed to define a metric")
-});
-
 pub const LOG_FILE_NAME: &str = "pageserver.log";

 /// Config for the Repository checkpointer
@@ -62,33 +52,31 @@ pub enum CheckpointConfig {
    Forced,
 }

-pub type RepositoryImpl = LayeredRepository;
-pub type TimelineImpl = <LayeredRepository as repository::Repository>::Timeline;
-
-pub fn shutdown_pageserver(exit_code: i32) {
-    // Shut down the libpq endpoint thread. This prevents new connections from
+pub async fn shutdown_pageserver(exit_code: i32) {
+    // Shut down the libpq endpoint task. This prevents new connections from
    // being accepted.
-    thread_mgr::shutdown_threads(Some(ThreadKind::LibpqEndpointListener), None, None);
+    task_mgr::shutdown_tasks(Some(TaskKind::LibpqEndpointListener), None, None).await;

-    // Shut down any page service threads.
-    thread_mgr::shutdown_threads(Some(ThreadKind::PageRequestHandler), None, None);
+    // Shut down any page service tasks.
+    task_mgr::shutdown_tasks(Some(TaskKind::PageRequestHandler), None, None).await;

    // Shut down all the tenants. This flushes everything to disk and kills
-    // the checkpoint and GC threads.
-    tenant_mgr::shutdown_all_tenants();
+    // the checkpoint and GC tasks.
+    tenant_mgr::shutdown_all_tenants().await;

    // Stop syncing with remote storage.
    //
-    // FIXME: Does this wait for the sync thread to finish syncing what's queued up?
+    // FIXME: Does this wait for the sync tasks to finish syncing what's queued up?
    // Should it?
-    thread_mgr::shutdown_threads(Some(ThreadKind::StorageSync), None, None);
+    task_mgr::shutdown_tasks(Some(TaskKind::StorageSync), None, None).await;

    // Shut down the HTTP endpoint last, so that you can still check the server's
    // status while it's shutting down.
-    thread_mgr::shutdown_threads(Some(ThreadKind::HttpEndpointListener), None, None);
+    // FIXME: We should probably stop accepting commands like attach/detach earlier.
+    task_mgr::shutdown_tasks(Some(TaskKind::HttpEndpointListener), None, None).await;

    // There should be nothing left, but let's be sure
-    thread_mgr::shutdown_threads(None, None, None);
+    task_mgr::shutdown_tasks(None, None, None).await;

    info!("Shut down successfully completed");
    std::process::exit(exit_code);
@@ -116,6 +104,50 @@ fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds
    }
 }

+/// A newtype to store arbitrary data grouped by tenant and timeline ids.
+/// One could use [`utils::zid::ZTenantTimelineId`] for grouping, but that would
+/// not include the cases where a certain tenant has zero timelines.
+/// This is sometimes important: a tenant could be registered during initial load from FS,
+/// even if he has no timelines on disk.
+#[derive(Debug)]
+pub struct TenantTimelineValues<T>(HashMap<ZTenantId, HashMap<ZTimelineId, T>>);
+
+impl<T> TenantTimelineValues<T> {
+    fn new() -> Self {
+        Self(HashMap::new())
+    }
+
+    fn with_capacity(capacity: usize) -> Self {
+        Self(HashMap::with_capacity(capacity))
+    }
+
+    /// A convenience method to map certain values and omit some of them, if needed.
+    /// Tenants that won't have any timeline entries due to the filtering, will still be preserved
+    /// in the structure.
+    fn filter_map<F, NewT>(self, map: F) -> TenantTimelineValues<NewT>
+    where
+        F: Fn(T) -> Option<NewT>,
+    {
+        let capacity = self.0.len();
+        self.0.into_iter().fold(
+            TenantTimelineValues::<NewT>::with_capacity(capacity),
+            |mut new_values, (tenant_id, old_values)| {
+                let new_timeline_values = new_values.0.entry(tenant_id).or_default();
+                for (timeline_id, old_value) in old_values {
+                    if let Some(new_value) = map(old_value) {
+                        new_timeline_values.insert(timeline_id, new_value);
+                    }
+                }
+                new_values
+            },
+        )
+    }
+}
+
+/// A suffix to be used during file sync from the remote storage,
+/// to ensure that we do not leave corrupted files that pretend to be layers.
+const TEMP_FILE_SUFFIX: &str = "___temp";
+
 #[cfg(test)]
 mod backoff_defaults_tests {
    use super::*;
@@ -146,3 +178,35 @@ mod backoff_defaults_tests {
        );
    }
 }
+
+#[cfg(test)]
+mod tests {
+    use crate::layered_repository::repo_harness::TIMELINE_ID;
+
+    use super::*;
+
+    #[test]
+    fn tenant_timeline_value_mapping() {
+        let first_tenant = ZTenantId::generate();
+        let second_tenant = ZTenantId::generate();
+        assert_ne!(first_tenant, second_tenant);
+
+        let mut initial = TenantTimelineValues::new();
+        initial
+            .0
+            .entry(first_tenant)
+            .or_default()
+            .insert(TIMELINE_ID, "test_value");
+        let _ = initial.0.entry(second_tenant).or_default();
+        assert_eq!(initial.0.len(), 2, "Should have entries for both tenants");
+
+        let filtered = initial.filter_map(|_| None::<&str>).0;
+        assert_eq!(
+            filtered.len(),
+            2,
+            "Should have entries for both tenants even after filtering away all entries"
+        );
+        assert!(filtered.contains_key(&first_tenant));
+        assert!(filtered.contains_key(&second_tenant));
+    }
+}
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -0,0 +1,419 @@
+use metrics::core::{AtomicU64, GenericCounter};
+use metrics::{
+    register_histogram, register_histogram_vec, register_int_counter, register_int_counter_vec,
+    register_int_gauge, register_int_gauge_vec, register_uint_gauge_vec, Histogram, HistogramVec,
+    IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
+};
+use once_cell::sync::Lazy;
+use utils::zid::{ZTenantId, ZTimelineId};
+
+/// Prometheus histogram buckets (in seconds) that capture the majority of
+/// latencies in the microsecond range but also extend far enough up to distinguish
+/// "bad" from "really bad".
+fn get_buckets_for_critical_operations() -> Vec<f64> {
+    let buckets_per_digit = 5;
+    let min_exponent = -6;
+    let max_exponent = 2;
+
+    let mut buckets = vec![];
+    // Compute 10^(exp / buckets_per_digit) instead of 10^(1/buckets_per_digit)^exp
+    // because it's more numerically stable and doesn't result in numbers like 9.999999
+    for exp in (min_exponent * buckets_per_digit)..=(max_exponent * buckets_per_digit) {
+        buckets.push(10_f64.powf(exp as f64 / buckets_per_digit as f64))
+    }
+    buckets
+}
+
+// Metrics collected on operations on the storage repository.
+const STORAGE_TIME_OPERATIONS: &[&str] = &[
+    "layer flush",
+    "compact",
+    "create images",
+    "init logical size",
+    "load layer map",
+    "gc",
+];
+
+pub static STORAGE_TIME: Lazy<HistogramVec> = Lazy::new(|| {
+    register_histogram_vec!(
+        "pageserver_storage_operations_seconds",
+        "Time spent on storage operations",
+        &["operation", "tenant_id", "timeline_id"],
+        get_buckets_for_critical_operations(),
+    )
+    .expect("failed to define a metric")
+});
+
+// Metrics collected on operations on the storage repository.
+static RECONSTRUCT_TIME: Lazy<HistogramVec> = Lazy::new(|| {
+    register_histogram_vec!(
+        "pageserver_getpage_reconstruct_seconds",
+        "Time spent in reconstruct_value",
+        &["tenant_id", "timeline_id"],
+        get_buckets_for_critical_operations(),
+    )
+    .expect("failed to define a metric")
+});
+
+static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "pageserver_materialized_cache_hits_total",
+        "Number of cache hits from materialized page cache",
+        &["tenant_id", "timeline_id"]
+    )
+    .expect("failed to define a metric")
+});
+
+static WAIT_LSN_TIME: Lazy<HistogramVec> = Lazy::new(|| {
+    register_histogram_vec!(
+        "pageserver_wait_lsn_seconds",
+        "Time spent waiting for WAL to arrive",
+        &["tenant_id", "timeline_id"],
+        get_buckets_for_critical_operations(),
+    )
+    .expect("failed to define a metric")
+});
+
+static LAST_RECORD_LSN: Lazy<IntGaugeVec> = Lazy::new(|| {
+    register_int_gauge_vec!(
+        "pageserver_last_record_lsn",
+        "Last record LSN grouped by timeline",
+        &["tenant_id", "timeline_id"]
+    )
+    .expect("failed to define a metric")
+});
+
+// Metrics for determining timeline's physical size.
+// A layered timeline's physical is defined as the total size of
+// (delta/image) layer files on disk.
+static CURRENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
+    register_uint_gauge_vec!(
+        "pageserver_current_physical_size",
+        "Current physical size grouped by timeline",
+        &["tenant_id", "timeline_id"]
+    )
+    .expect("failed to define a metric")
+});
+
+static CURRENT_LOGICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
+    register_uint_gauge_vec!(
+        "pageserver_current_logical_size",
+        "Current logical size grouped by timeline",
+        &["tenant_id", "timeline_id"]
+    )
+    .expect("failed to define current logical size metric")
+});
+
+// Metrics for cloud upload. These metrics reflect data uploaded to cloud storage,
+// or in testing they estimate how much we would upload if we did.
+static NUM_PERSISTENT_FILES_CREATED: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "pageserver_created_persistent_files_total",
+        "Number of files created that are meant to be uploaded to cloud storage",
+    )
+    .expect("failed to define a metric")
+});
+
+static PERSISTENT_BYTES_WRITTEN: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "pageserver_written_persistent_bytes_total",
+        "Total bytes written that are meant to be uploaded to cloud storage",
+    )
+    .expect("failed to define a metric")
+});
+
+// Metrics collected on disk IO operations
+const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
+    0.000001, // 1 usec
+    0.00001,  // 10 usec
+    0.0001,   // 100 usec
+    0.001,    // 1 msec
+    0.01,     // 10 msec
+    0.1,      // 100 msec
+    1.0,      // 1 sec
+];
+
+const STORAGE_IO_TIME_OPERATIONS: &[&str] =
+    &["open", "close", "read", "write", "seek", "fsync", "gc"];
+
+const STORAGE_IO_SIZE_OPERATIONS: &[&str] = &["read", "write"];
+
+pub static STORAGE_IO_TIME: Lazy<HistogramVec> = Lazy::new(|| {
+    register_histogram_vec!(
+        "pageserver_io_operations_seconds",
+        "Time spent in IO operations",
+        &["operation", "tenant_id", "timeline_id"],
+        STORAGE_IO_TIME_BUCKETS.into()
+    )
+    .expect("failed to define a metric")
+});
+
+pub static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
+    register_int_gauge_vec!(
+        "pageserver_io_operations_bytes_total",
+        "Total amount of bytes read/written in IO operations",
+        &["operation", "tenant_id", "timeline_id"]
+    )
+    .expect("failed to define a metric")
+});
+
+const SMGR_QUERY_TIME_OPERATIONS: &[&str] = &[
+    "get_rel_exists",
+    "get_rel_size",
+    "get_page_at_lsn",
+    "get_db_size",
+];
+
+const SMGR_QUERY_TIME_BUCKETS: &[f64] = &[
+    0.00001, // 1/100000 s
+    0.0001, 0.00015, 0.0002, 0.00025, 0.0003, 0.00035, 0.0005, 0.00075, // 1/10000 s
+    0.001, 0.0025, 0.005, 0.0075, // 1/1000 s
+    0.01, 0.0125, 0.015, 0.025, 0.05, // 1/100 s
+    0.1,  // 1/10 s
+];
+
+pub static SMGR_QUERY_TIME: Lazy<HistogramVec> = Lazy::new(|| {
+    register_histogram_vec!(
+        "pageserver_smgr_query_seconds",
+        "Time spent on smgr query handling",
+        &["smgr_query_type", "tenant_id", "timeline_id"],
+        SMGR_QUERY_TIME_BUCKETS.into()
+    )
+    .expect("failed to define a metric")
+});
+
+pub static LIVE_CONNECTIONS_COUNT: Lazy<IntGaugeVec> = Lazy::new(|| {
+    register_int_gauge_vec!(
+        "pageserver_live_connections",
+        "Number of live network connections",
+        &["pageserver_connection_kind"]
+    )
+    .expect("failed to define a metric")
+});
+
+pub static NUM_ONDISK_LAYERS: Lazy<IntGauge> = Lazy::new(|| {
+    register_int_gauge!("pageserver_ondisk_layers", "Number of layers on-disk")
+        .expect("failed to define a metric")
+});
+
+pub static REMAINING_SYNC_ITEMS: Lazy<IntGauge> = Lazy::new(|| {
+    register_int_gauge!(
+        "pageserver_remote_storage_remaining_sync_items",
+        "Number of storage sync items left in the queue"
+    )
+    .expect("failed to register pageserver remote storage remaining sync items int gauge")
+});
+
+pub static IMAGE_SYNC_TIME: Lazy<HistogramVec> = Lazy::new(|| {
+    register_histogram_vec!(
+        "pageserver_remote_storage_image_sync_seconds",
+        "Time took to synchronize (download or upload) a whole pageserver image. \
+        Grouped by tenant and timeline ids, `operation_kind` (upload|download) and `status` (success|failure)",
+        &["tenant_id", "timeline_id", "operation_kind", "status"],
+        vec![0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 3.0, 10.0, 20.0]
+    )
+    .expect("failed to register pageserver image sync time histogram vec")
+});
+
+pub static REMOTE_INDEX_UPLOAD: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "pageserver_remote_storage_remote_index_uploads_total",
+        "Number of remote index uploads",
+        &["tenant_id", "timeline_id"],
+    )
+    .expect("failed to register pageserver remote index upload vec")
+});
+
+pub static NO_LAYERS_UPLOAD: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "pageserver_remote_storage_no_layers_uploads_total",
+        "Number of skipped uploads due to no layers",
+        &["tenant_id", "timeline_id"],
+    )
+    .expect("failed to register pageserver no layers upload vec")
+});
+
+pub static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "pageserver_tenant_task_events",
+        "Number of task start/stop/fail events.",
+        &["event"],
+    )
+    .expect("Failed to register tenant_task_events metric")
+});
+
+// Metrics collected on WAL redo operations
+//
+// We collect the time spent in actual WAL redo ('redo'), and time waiting
+// for access to the postgres process ('wait') since there is only one for
+// each tenant.
+
+/// Time buckets are small because we want to be able to measure the
+/// smallest redo processing times. These buckets allow us to measure down
+/// to 5us, which equates to 200'000 pages/sec, which equates to 1.6GB/sec.
+/// This is much better than the previous 5ms aka 200 pages/sec aka 1.6MB/sec.
+macro_rules! redo_histogram_time_buckets {
+    () => {
+        vec![
+            0.000_005, 0.000_010, 0.000_025, 0.000_050, 0.000_100, 0.000_250, 0.000_500, 0.001_000,
+            0.002_500, 0.005_000, 0.010_000, 0.025_000, 0.050_000,
+        ]
+    };
+}
+
+/// While we're at it, also measure the amount of records replayed in each
+/// operation. We have a global 'total replayed' counter, but that's not
+/// as useful as 'what is the skew for how many records we replay in one
+/// operation'.
+macro_rules! redo_histogram_count_buckets {
+    () => {
+        vec![0.0, 1.0, 2.0, 5.0, 10.0, 25.0, 50.0, 100.0, 250.0, 500.0]
+    };
+}
+
+pub static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
+    register_histogram!(
+        "pageserver_wal_redo_seconds",
+        "Time spent on WAL redo",
+        redo_histogram_time_buckets!()
+    )
+    .expect("failed to define a metric")
+});
+
+pub static WAL_REDO_WAIT_TIME: Lazy<Histogram> = Lazy::new(|| {
+    register_histogram!(
+        "pageserver_wal_redo_wait_seconds",
+        "Time spent waiting for access to the WAL redo process",
+        redo_histogram_time_buckets!(),
+    )
+    .expect("failed to define a metric")
+});
+
+pub static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
+    register_histogram!(
+        "pageserver_wal_redo_records_histogram",
+        "Histogram of number of records replayed per redo",
+        redo_histogram_count_buckets!(),
+    )
+    .expect("failed to define a metric")
+});
+
+pub static WAL_REDO_RECORD_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "pageserver_replayed_wal_records_total",
+        "Number of WAL records replayed in WAL redo process"
+    )
+    .unwrap()
+});
+
+#[derive(Debug)]
+pub struct TimelineMetrics {
+    tenant_id: String,
+    timeline_id: String,
+    pub reconstruct_time_histo: Histogram,
+    pub materialized_page_cache_hit_counter: GenericCounter<AtomicU64>,
+    pub flush_time_histo: Histogram,
+    pub compact_time_histo: Histogram,
+    pub create_images_time_histo: Histogram,
+    pub init_logical_size_histo: Histogram,
+    pub load_layer_map_histo: Histogram,
+    pub last_record_gauge: IntGauge,
+    pub wait_lsn_time_histo: Histogram,
+    pub current_physical_size_gauge: UIntGauge,
+    /// copy of LayeredTimeline.current_logical_size
+    pub current_logical_size_gauge: UIntGauge,
+    pub num_persistent_files_created: IntCounter,
+    pub persistent_bytes_written: IntCounter,
+}
+
+impl TimelineMetrics {
+    pub fn new(tenant_id: &ZTenantId, timeline_id: &ZTimelineId) -> Self {
+        let tenant_id = tenant_id.to_string();
+        let timeline_id = timeline_id.to_string();
+        let reconstruct_time_histo = RECONSTRUCT_TIME
+            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
+            .unwrap();
+        let materialized_page_cache_hit_counter = MATERIALIZED_PAGE_CACHE_HIT
+            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
+            .unwrap();
+        let flush_time_histo = STORAGE_TIME
+            .get_metric_with_label_values(&["layer flush", &tenant_id, &timeline_id])
+            .unwrap();
+        let compact_time_histo = STORAGE_TIME
+            .get_metric_with_label_values(&["compact", &tenant_id, &timeline_id])
+            .unwrap();
+        let create_images_time_histo = STORAGE_TIME
+            .get_metric_with_label_values(&["create images", &tenant_id, &timeline_id])
+            .unwrap();
+        let init_logical_size_histo = STORAGE_TIME
+            .get_metric_with_label_values(&["init logical size", &tenant_id, &timeline_id])
+            .unwrap();
+        let load_layer_map_histo = STORAGE_TIME
+            .get_metric_with_label_values(&["load layer map", &tenant_id, &timeline_id])
+            .unwrap();
+        let last_record_gauge = LAST_RECORD_LSN
+            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
+            .unwrap();
+        let wait_lsn_time_histo = WAIT_LSN_TIME
+            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
+            .unwrap();
+        let current_physical_size_gauge = CURRENT_PHYSICAL_SIZE
+            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
+            .unwrap();
+        let current_logical_size_gauge = CURRENT_LOGICAL_SIZE
+            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
+            .unwrap();
+        let num_persistent_files_created = NUM_PERSISTENT_FILES_CREATED.clone();
+        let persistent_bytes_written = PERSISTENT_BYTES_WRITTEN.clone();
+
+        TimelineMetrics {
+            tenant_id,
+            timeline_id,
+            reconstruct_time_histo,
+            materialized_page_cache_hit_counter,
+            flush_time_histo,
+            compact_time_histo,
+            create_images_time_histo,
+            init_logical_size_histo,
+            load_layer_map_histo,
+            last_record_gauge,
+            wait_lsn_time_histo,
+            current_physical_size_gauge,
+            current_logical_size_gauge,
+            num_persistent_files_created,
+            persistent_bytes_written,
+        }
+    }
+}
+
+impl Drop for TimelineMetrics {
+    fn drop(&mut self) {
+        let tenant_id = &self.tenant_id;
+        let timeline_id = &self.timeline_id;
+        let _ = RECONSTRUCT_TIME.remove_label_values(&[tenant_id, timeline_id]);
+        let _ = MATERIALIZED_PAGE_CACHE_HIT.remove_label_values(&[tenant_id, timeline_id]);
+        let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]);
+        let _ = WAIT_LSN_TIME.remove_label_values(&[tenant_id, timeline_id]);
+        let _ = CURRENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
+        let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
+
+        for op in STORAGE_TIME_OPERATIONS {
+            let _ = STORAGE_TIME.remove_label_values(&[op, tenant_id, timeline_id]);
+        }
+        for op in STORAGE_IO_TIME_OPERATIONS {
+            let _ = STORAGE_IO_TIME.remove_label_values(&[op, tenant_id, timeline_id]);
+        }
+
+        for op in STORAGE_IO_SIZE_OPERATIONS {
+            let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, timeline_id]);
+        }
+
+        for op in SMGR_QUERY_TIME_OPERATIONS {
+            let _ = SMGR_QUERY_TIME.remove_label_values(&[op, tenant_id, timeline_id]);
+        }
+    }
+}
+
+pub fn remove_tenant_metrics(tenant_id: &ZTenantId) {
+    let _ = STORAGE_TIME.remove_label_values(&["gc", &tenant_id.to_string(), "-"]);
+}
--- a/pageserver/src/page_cache.rs
+++ b/pageserver/src/page_cache.rs
@@ -45,6 +45,7 @@ use std::{
    },
 };

+use anyhow::Context;
 use once_cell::sync::OnceCell;
 use tracing::error;
 use utils::{
@@ -83,7 +84,7 @@ pub fn get() -> &'static PageCache {
    }
 }

-pub const PAGE_SZ: usize = postgres_ffi::pg_constants::BLCKSZ as usize;
+pub const PAGE_SZ: usize = postgres_ffi::BLCKSZ as usize;
 const MAX_USAGE_COUNT: u8 = 5;

 ///
@@ -342,7 +343,7 @@ impl PageCache {
        key: Key,
        lsn: Lsn,
        img: &[u8],
-    ) {
+    ) -> anyhow::Result<()> {
        let cache_key = CacheKey::MaterializedPage {
            hash_key: MaterializedPageHashKey {
                tenant_id,
@@ -352,7 +353,7 @@ impl PageCache {
            lsn,
        };

-        match self.lock_for_write(&cache_key) {
+        match self.lock_for_write(&cache_key)? {
            WriteBufResult::Found(write_guard) => {
                // We already had it in cache. Another thread must've put it there
                // concurrently. Check that it had the same contents that we
@@ -364,17 +365,19 @@ impl PageCache {
                write_guard.mark_valid();
            }
        }
+
+        Ok(())
    }

    // Section 1.2: Public interface functions for working with Ephemeral pages.

-    pub fn read_ephemeral_buf(&self, file_id: u64, blkno: u32) -> ReadBufResult {
+    pub fn read_ephemeral_buf(&self, file_id: u64, blkno: u32) -> anyhow::Result<ReadBufResult> {
        let mut cache_key = CacheKey::EphemeralPage { file_id, blkno };

        self.lock_for_read(&mut cache_key)
    }

-    pub fn write_ephemeral_buf(&self, file_id: u64, blkno: u32) -> WriteBufResult {
+    pub fn write_ephemeral_buf(&self, file_id: u64, blkno: u32) -> anyhow::Result<WriteBufResult> {
        let cache_key = CacheKey::EphemeralPage { file_id, blkno };

        self.lock_for_write(&cache_key)
@@ -402,7 +405,7 @@ impl PageCache {

    // Section 1.3: Public interface functions for working with immutable file pages.

-    pub fn read_immutable_buf(&self, file_id: u64, blkno: u32) -> ReadBufResult {
+    pub fn read_immutable_buf(&self, file_id: u64, blkno: u32) -> anyhow::Result<ReadBufResult> {
        let mut cache_key = CacheKey::ImmutableFilePage { file_id, blkno };

        self.lock_for_read(&mut cache_key)
@@ -495,15 +498,16 @@ impl PageCache {
    /// }
    /// ```
    ///
-    fn lock_for_read(&self, cache_key: &mut CacheKey) -> ReadBufResult {
+    fn lock_for_read(&self, cache_key: &mut CacheKey) -> anyhow::Result<ReadBufResult> {
        loop {
            // First check if the key already exists in the cache.
            if let Some(read_guard) = self.try_lock_for_read(cache_key) {
-                return ReadBufResult::Found(read_guard);
+                return Ok(ReadBufResult::Found(read_guard));
            }

            // Not found. Find a victim buffer
-            let (slot_idx, mut inner) = self.find_victim();
+            let (slot_idx, mut inner) =
+                self.find_victim().context("Failed to find evict victim")?;

            // Insert mapping for this. At this point, we may find that another
            // thread did the same thing concurrently. In that case, we evicted
@@ -526,10 +530,10 @@ impl PageCache {
            inner.dirty = false;
            slot.usage_count.store(1, Ordering::Relaxed);

-            return ReadBufResult::NotFound(PageWriteGuard {
+            return Ok(ReadBufResult::NotFound(PageWriteGuard {
                inner,
                valid: false,
-            });
+            }));
        }
    }

@@ -556,15 +560,16 @@ impl PageCache {
    ///
    /// Similar to lock_for_read(), but the returned buffer is write-locked and
    /// may be modified by the caller even if it's already found in the cache.
-    fn lock_for_write(&self, cache_key: &CacheKey) -> WriteBufResult {
+    fn lock_for_write(&self, cache_key: &CacheKey) -> anyhow::Result<WriteBufResult> {
        loop {
            // First check if the key already exists in the cache.
            if let Some(write_guard) = self.try_lock_for_write(cache_key) {
-                return WriteBufResult::Found(write_guard);
+                return Ok(WriteBufResult::Found(write_guard));
            }

            // Not found. Find a victim buffer
-            let (slot_idx, mut inner) = self.find_victim();
+            let (slot_idx, mut inner) =
+                self.find_victim().context("Failed to find evict victim")?;

            // Insert mapping for this. At this point, we may find that another
            // thread did the same thing concurrently. In that case, we evicted
@@ -587,10 +592,10 @@ impl PageCache {
            inner.dirty = false;
            slot.usage_count.store(1, Ordering::Relaxed);

-            return WriteBufResult::NotFound(PageWriteGuard {
+            return Ok(WriteBufResult::NotFound(PageWriteGuard {
                inner,
                valid: false,
-            });
+            }));
        }
    }

@@ -754,7 +759,7 @@ impl PageCache {
    /// Find a slot to evict.
    ///
    /// On return, the slot is empty and write-locked.
-    fn find_victim(&self) -> (usize, RwLockWriteGuard<SlotInner>) {
+    fn find_victim(&self) -> anyhow::Result<(usize, RwLockWriteGuard<SlotInner>)> {
        let iter_limit = self.slots.len() * 10;
        let mut iters = 0;
        loop {
@@ -767,7 +772,7 @@ impl PageCache {
                let mut inner = match slot.inner.try_write() {
                    Ok(inner) => inner,
                    Err(TryLockError::Poisoned(err)) => {
-                        panic!("buffer lock was poisoned: {:?}", err)
+                        anyhow::bail!("buffer lock was poisoned: {err:?}")
                    }
                    Err(TryLockError::WouldBlock) => {
                        // If we have looped through the whole buffer pool 10 times
@@ -777,7 +782,7 @@ impl PageCache {
                        // there are buffers in the pool. In practice, with a reasonably
                        // large buffer pool it really shouldn't happen.
                        if iters > iter_limit {
-                            panic!("could not find a victim buffer to evict");
+                            anyhow::bail!("exceeded evict iter limit");
                        }
                        continue;
                    }
@@ -804,7 +809,7 @@ impl PageCache {
                    inner.dirty = false;
                    inner.key = None;
                }
-                return (slot_idx, inner);
+                return Ok((slot_idx, inner));
            }
        }
    }
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -7,16 +7,17 @@
 //! Clarify that)
 //!
 use crate::keyspace::{KeySpace, KeySpaceAccum};
+use crate::layered_repository::Timeline;
 use crate::reltag::{RelTag, SlruKind};
-use crate::repository::Timeline;
 use crate::repository::*;
 use crate::walrecord::ZenithWalRecord;
 use anyhow::{bail, ensure, Result};
 use bytes::{Buf, Bytes};
-use postgres_ffi::xlog_utils::TimestampTz;
-use postgres_ffi::{pg_constants, Oid, TransactionId};
+use postgres_ffi::v14::pg_constants;
+use postgres_ffi::BLCKSZ;
+use postgres_ffi::{Oid, TimestampTz, TransactionId};
 use serde::{Deserialize, Serialize};
-use std::collections::{HashMap, HashSet};
+use std::collections::{hash_map, HashMap, HashSet};
 use std::ops::Range;
 use tracing::{debug, trace, warn};
 use utils::{bin_ser::BeSer, lsn::Lsn};
@@ -33,23 +34,13 @@ pub enum LsnForTimestamp {
 }

 ///
-/// This trait provides all the functionality to store PostgreSQL relations, SLRUs,
+/// This impl provides all the functionality to store PostgreSQL relations, SLRUs,
 /// and other special kinds of files, in a versioned key-value store. The
-/// Timeline trait provides the key-value store.
+/// Timeline struct provides the key-value store.
 ///
-/// This is a trait, so that we can easily include all these functions in a Timeline
-/// implementation. You're not expected to have different implementations of this trait,
-/// rather, this provides an interface and implementation, over Timeline.
-///
-/// If you wanted to store other kinds of data in the Neon repository, e.g.
-/// flat files or MySQL, you would create a new trait like this, with all the
-/// functions that make sense for the kind of data you're storing. For flat files,
-/// for example, you might have a function like "fn read(path, offset, size)".
-/// We might also have that situation in the future, to support multiple PostgreSQL
-/// versions, if there are big changes in how the data is organized in the data
-/// directory, or if new special files are introduced.
-///
-pub trait DatadirTimeline: Timeline {
+/// This is a separate impl, so that we can easily include all these functions in a Timeline
+/// implementation, and might be moved into a separate struct later.
+impl Timeline {
    /// Start ingesting a WAL record, or other atomic modification of
    /// the timeline.
    ///
@@ -73,7 +64,7 @@ pub trait DatadirTimeline: Timeline {
    /// functions of the timeline until you finish! And if you update the
    /// same page twice, the last update wins.
    ///
-    fn begin_modification(&self, lsn: Lsn) -> DatadirModification<Self>
+    pub fn begin_modification(&self, lsn: Lsn) -> DatadirModification
    where
        Self: Sized,
    {
@@ -91,10 +82,16 @@ pub trait DatadirTimeline: Timeline {
    //------------------------------------------------------------------------------

    /// Look up given page version.
-    fn get_rel_page_at_lsn(&self, tag: RelTag, blknum: BlockNumber, lsn: Lsn) -> Result<Bytes> {
+    pub fn get_rel_page_at_lsn(
+        &self,
+        tag: RelTag,
+        blknum: BlockNumber,
+        lsn: Lsn,
+        latest: bool,
+    ) -> Result<Bytes> {
        ensure!(tag.relnode != 0, "invalid relnode");

-        let nblocks = self.get_rel_size(tag, lsn)?;
+        let nblocks = self.get_rel_size(tag, lsn, latest)?;
        if blknum >= nblocks {
            debug!(
                "read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page",
@@ -108,20 +105,20 @@ pub trait DatadirTimeline: Timeline {
    }

    // Get size of a database in blocks
-    fn get_db_size(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result<usize> {
+    pub fn get_db_size(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn, latest: bool) -> Result<usize> {
        let mut total_blocks = 0;

        let rels = self.list_rels(spcnode, dbnode, lsn)?;

        for rel in rels {
-            let n_blocks = self.get_rel_size(rel, lsn)?;
+            let n_blocks = self.get_rel_size(rel, lsn, latest)?;
            total_blocks += n_blocks as usize;
        }
        Ok(total_blocks)
    }

    /// Get size of a relation file
-    fn get_rel_size(&self, tag: RelTag, lsn: Lsn) -> Result<BlockNumber> {
+    pub fn get_rel_size(&self, tag: RelTag, lsn: Lsn, latest: bool) -> Result<BlockNumber> {
        ensure!(tag.relnode != 0, "invalid relnode");

        if let Some(nblocks) = self.get_cached_rel_size(&tag, lsn) {
@@ -130,7 +127,7 @@ pub trait DatadirTimeline: Timeline {

        if (tag.forknum == pg_constants::FSM_FORKNUM
            || tag.forknum == pg_constants::VISIBILITYMAP_FORKNUM)
-            && !self.get_rel_exists(tag, lsn)?
+            && !self.get_rel_exists(tag, lsn, latest)?
        {
            // FIXME: Postgres sometimes calls smgrcreate() to create
            // FSM, and smgrnblocks() on it immediately afterwards,
@@ -143,13 +140,21 @@ pub trait DatadirTimeline: Timeline {
        let mut buf = self.get(key, lsn)?;
        let nblocks = buf.get_u32_le();

-        // Update relation size cache
-        self.update_cached_rel_size(tag, lsn, nblocks);
+        if latest {
+            // Update relation size cache only if "latest" flag is set.
+            // This flag is set by compute when it is working with most recent version of relation.
+            // Typically master compute node always set latest=true.
+            // Please notice, that even if compute node "by mistake" specifies old LSN but set
+            // latest=true, then it can not cause cache corruption, because with latest=true
+            // pageserver choose max(request_lsn, last_written_lsn) and so cached value will be
+            // associated with most recent value of LSN.
+            self.update_cached_rel_size(tag, lsn, nblocks);
+        }
        Ok(nblocks)
    }

    /// Does relation exist?
-    fn get_rel_exists(&self, tag: RelTag, lsn: Lsn) -> Result<bool> {
+    pub fn get_rel_exists(&self, tag: RelTag, lsn: Lsn, _latest: bool) -> Result<bool> {
        ensure!(tag.relnode != 0, "invalid relnode");

        // first try to lookup relation in cache
@@ -167,7 +172,7 @@ pub trait DatadirTimeline: Timeline {
    }

    /// Get a list of all existing relations in given tablespace and database.
-    fn list_rels(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result<HashSet<RelTag>> {
+    pub fn list_rels(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result<HashSet<RelTag>> {
        // fetch directory listing
        let key = rel_dir_to_key(spcnode, dbnode);
        let buf = self.get(key, lsn)?;
@@ -185,7 +190,7 @@ pub trait DatadirTimeline: Timeline {
    }

    /// Look up given SLRU page version.
-    fn get_slru_page_at_lsn(
+    pub fn get_slru_page_at_lsn(
        &self,
        kind: SlruKind,
        segno: u32,
@@ -197,14 +202,19 @@ pub trait DatadirTimeline: Timeline {
    }

    /// Get size of an SLRU segment
-    fn get_slru_segment_size(&self, kind: SlruKind, segno: u32, lsn: Lsn) -> Result<BlockNumber> {
+    pub fn get_slru_segment_size(
+        &self,
+        kind: SlruKind,
+        segno: u32,
+        lsn: Lsn,
+    ) -> Result<BlockNumber> {
        let key = slru_segment_size_to_key(kind, segno);
        let mut buf = self.get(key, lsn)?;
        Ok(buf.get_u32_le())
    }

    /// Get size of an SLRU segment
-    fn get_slru_segment_exists(&self, kind: SlruKind, segno: u32, lsn: Lsn) -> Result<bool> {
+    pub fn get_slru_segment_exists(&self, kind: SlruKind, segno: u32, lsn: Lsn) -> Result<bool> {
        // fetch directory listing
        let key = slru_dir_to_key(kind);
        let buf = self.get(key, lsn)?;
@@ -221,7 +231,7 @@ pub trait DatadirTimeline: Timeline {
    /// so it's not well defined which LSN you get if there were multiple commits
    /// "in flight" at that point in time.
    ///
-    fn find_lsn_for_timestamp(&self, search_timestamp: TimestampTz) -> Result<LsnForTimestamp> {
+    pub fn find_lsn_for_timestamp(&self, search_timestamp: TimestampTz) -> Result<LsnForTimestamp> {
        let gc_cutoff_lsn_guard = self.get_latest_gc_cutoff_lsn();
        let min_lsn = *gc_cutoff_lsn_guard;
        let max_lsn = self.get_last_record_lsn();
@@ -284,7 +294,7 @@ pub trait DatadirTimeline: Timeline {
    /// Additionally, sets 'found_smaller'/'found_Larger, if encounters any commits
    /// with a smaller/larger timestamp.
    ///
-    fn is_latest_commit_timestamp_ge_than(
+    pub fn is_latest_commit_timestamp_ge_than(
        &self,
        search_timestamp: TimestampTz,
        probe_lsn: Lsn,
@@ -297,9 +307,9 @@ pub trait DatadirTimeline: Timeline {
                let clog_page =
                    self.get_slru_page_at_lsn(SlruKind::Clog, segno, blknum, probe_lsn)?;

-                if clog_page.len() == pg_constants::BLCKSZ as usize + 8 {
+                if clog_page.len() == BLCKSZ as usize + 8 {
                    let mut timestamp_bytes = [0u8; 8];
-                    timestamp_bytes.copy_from_slice(&clog_page[pg_constants::BLCKSZ as usize..]);
+                    timestamp_bytes.copy_from_slice(&clog_page[BLCKSZ as usize..]);
                    let timestamp = TimestampTz::from_be_bytes(timestamp_bytes);

                    if timestamp >= search_timestamp {
@@ -315,7 +325,7 @@ pub trait DatadirTimeline: Timeline {
    }

    /// Get a list of SLRU segments
-    fn list_slru_segments(&self, kind: SlruKind, lsn: Lsn) -> Result<HashSet<u32>> {
+    pub fn list_slru_segments(&self, kind: SlruKind, lsn: Lsn) -> Result<HashSet<u32>> {
        // fetch directory entry
        let key = slru_dir_to_key(kind);

@@ -325,14 +335,14 @@ pub trait DatadirTimeline: Timeline {
        Ok(dir.segments)
    }

-    fn get_relmap_file(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result<Bytes> {
+    pub fn get_relmap_file(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result<Bytes> {
        let key = relmap_file_key(spcnode, dbnode);

        let buf = self.get(key, lsn)?;
        Ok(buf)
    }

-    fn list_dbdirs(&self, lsn: Lsn) -> Result<HashMap<(Oid, Oid), bool>> {
+    pub fn list_dbdirs(&self, lsn: Lsn) -> Result<HashMap<(Oid, Oid), bool>> {
        // fetch directory entry
        let buf = self.get(DBDIR_KEY, lsn)?;
        let dir = DbDirectory::des(&buf)?;
@@ -340,13 +350,13 @@ pub trait DatadirTimeline: Timeline {
        Ok(dir.dbdirs)
    }

-    fn get_twophase_file(&self, xid: TransactionId, lsn: Lsn) -> Result<Bytes> {
+    pub fn get_twophase_file(&self, xid: TransactionId, lsn: Lsn) -> Result<Bytes> {
        let key = twophase_file_key(xid);
        let buf = self.get(key, lsn)?;
        Ok(buf)
    }

-    fn list_twophase_files(&self, lsn: Lsn) -> Result<HashSet<TransactionId>> {
+    pub fn list_twophase_files(&self, lsn: Lsn) -> Result<HashSet<TransactionId>> {
        // fetch directory entry
        let buf = self.get(TWOPHASEDIR_KEY, lsn)?;
        let dir = TwoPhaseDirectory::des(&buf)?;
@@ -354,11 +364,11 @@ pub trait DatadirTimeline: Timeline {
        Ok(dir.xids)
    }

-    fn get_control_file(&self, lsn: Lsn) -> Result<Bytes> {
+    pub fn get_control_file(&self, lsn: Lsn) -> Result<Bytes> {
        self.get(CONTROLFILE_KEY, lsn)
    }

-    fn get_checkpoint(&self, lsn: Lsn) -> Result<Bytes> {
+    pub fn get_checkpoint(&self, lsn: Lsn) -> Result<Bytes> {
        self.get(CHECKPOINT_KEY, lsn)
    }

@@ -367,29 +377,29 @@ pub trait DatadirTimeline: Timeline {
    ///
    /// Only relation blocks are counted currently. That excludes metadata,
    /// SLRUs, twophase files etc.
-    fn get_current_logical_size_non_incremental(&self, lsn: Lsn) -> Result<usize> {
+    pub fn get_current_logical_size_non_incremental(&self, lsn: Lsn) -> Result<u64> {
        // Fetch list of database dirs and iterate them
        let buf = self.get(DBDIR_KEY, lsn)?;
        let dbdir = DbDirectory::des(&buf)?;

-        let mut total_size: usize = 0;
+        let mut total_size: u64 = 0;
        for (spcnode, dbnode) in dbdir.dbdirs.keys() {
            for rel in self.list_rels(*spcnode, *dbnode, lsn)? {
                let relsize_key = rel_size_to_key(rel);
                let mut buf = self.get(relsize_key, lsn)?;
                let relsize = buf.get_u32_le();

-                total_size += relsize as usize;
+                total_size += relsize as u64;
            }
        }
-        Ok(total_size * pg_constants::BLCKSZ as usize)
+        Ok(total_size * BLCKSZ as u64)
    }

    ///
    /// Get a KeySpace that covers all the Keys that are in use at the given LSN.
    /// Anything that's not listed maybe removed from the underlying storage (from
    /// that LSN forwards).
-    fn collect_keyspace(&self, lsn: Lsn) -> Result<KeySpace> {
+    pub fn collect_keyspace(&self, lsn: Lsn) -> Result<KeySpace> {
        // Iterate through key ranges, greedily packing them into partitions
        let mut result = KeySpaceAccum::new();

@@ -463,27 +473,54 @@ pub trait DatadirTimeline: Timeline {
    }

    /// Get cached size of relation if it not updated after specified LSN
-    fn get_cached_rel_size(&self, tag: &RelTag, lsn: Lsn) -> Option<BlockNumber>;
+    pub fn get_cached_rel_size(&self, tag: &RelTag, lsn: Lsn) -> Option<BlockNumber> {
+        let rel_size_cache = self.rel_size_cache.read().unwrap();
+        if let Some((cached_lsn, nblocks)) = rel_size_cache.get(tag) {
+            if lsn >= *cached_lsn {
+                return Some(*nblocks);
+            }
+        }
+        None
+    }

    /// Update cached relation size if there is no more recent update
-    fn update_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber);
+    pub fn update_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) {
+        let mut rel_size_cache = self.rel_size_cache.write().unwrap();
+        match rel_size_cache.entry(tag) {
+            hash_map::Entry::Occupied(mut entry) => {
+                let cached_lsn = entry.get_mut();
+                if lsn >= cached_lsn.0 {
+                    *cached_lsn = (lsn, nblocks);
+                }
+            }
+            hash_map::Entry::Vacant(entry) => {
+                entry.insert((lsn, nblocks));
+            }
+        }
+    }

    /// Store cached relation size
-    fn set_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber);
+    pub fn set_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) {
+        let mut rel_size_cache = self.rel_size_cache.write().unwrap();
+        rel_size_cache.insert(tag, (lsn, nblocks));
+    }

    /// Remove cached relation size
-    fn remove_cached_rel_size(&self, tag: &RelTag);
+    pub fn remove_cached_rel_size(&self, tag: &RelTag) {
+        let mut rel_size_cache = self.rel_size_cache.write().unwrap();
+        rel_size_cache.remove(tag);
+    }
 }

 /// DatadirModification represents an operation to ingest an atomic set of
 /// updates to the repository. It is created by the 'begin_record'
 /// function. It is called for each WAL record, so that all the modifications
 /// by a one WAL record appear atomic.
-pub struct DatadirModification<'a, T: DatadirTimeline> {
+pub struct DatadirModification<'a> {
    /// The timeline this modification applies to. You can access this to
    /// read the state, but note that any pending updates are *not* reflected
    /// in the state in 'tline' yet.
-    pub tline: &'a T,
+    pub tline: &'a Timeline,

    /// Lsn assigned by begin_modification
    pub lsn: Lsn,
@@ -493,10 +530,10 @@ pub struct DatadirModification<'a, T: DatadirTimeline> {
    // underlying key-value store by the 'finish' function.
    pending_updates: HashMap<Key, Value>,
    pending_deletions: Vec<Range<Key>>,
-    pending_nblocks: isize,
+    pending_nblocks: i64,
 }

-impl<'a, T: DatadirTimeline> DatadirModification<'a, T> {
+impl<'a> DatadirModification<'a> {
    /// Initialize a completely new repository.
    ///
    /// This inserts the directory metadata entries that are assumed to
@@ -636,7 +673,7 @@ impl<'a, T: DatadirTimeline> DatadirModification<'a, T> {
    pub fn drop_dbdir(&mut self, spcnode: Oid, dbnode: Oid) -> Result<()> {
        let req_lsn = self.tline.get_last_record_lsn();

-        let total_blocks = self.tline.get_db_size(spcnode, dbnode, req_lsn)?;
+        let total_blocks = self.tline.get_db_size(spcnode, dbnode, req_lsn, true)?;

        // Remove entry from dbdir
        let buf = self.get(DBDIR_KEY)?;
@@ -652,7 +689,7 @@ impl<'a, T: DatadirTimeline> DatadirModification<'a, T> {
        }

        // Update logical database size.
-        self.pending_nblocks -= total_blocks as isize;
+        self.pending_nblocks -= total_blocks as i64;

        // Delete all relations and metadata files for the spcnode/dnode
        self.delete(dbdir_key_range(spcnode, dbnode));
@@ -695,7 +732,7 @@ impl<'a, T: DatadirTimeline> DatadirModification<'a, T> {
        let buf = nblocks.to_le_bytes();
        self.put(size_key, Value::Image(Bytes::from(buf.to_vec())));

-        self.pending_nblocks += nblocks as isize;
+        self.pending_nblocks += nblocks as i64;

        // Update relation size cache
        self.tline.set_cached_rel_size(rel, self.lsn, nblocks);
@@ -709,7 +746,7 @@ impl<'a, T: DatadirTimeline> DatadirModification<'a, T> {
    pub fn put_rel_truncation(&mut self, rel: RelTag, nblocks: BlockNumber) -> Result<()> {
        ensure!(rel.relnode != 0, "invalid relnode");
        let last_lsn = self.tline.get_last_record_lsn();
-        if self.tline.get_rel_exists(rel, last_lsn)? {
+        if self.tline.get_rel_exists(rel, last_lsn, true)? {
            let size_key = rel_size_to_key(rel);
            // Fetch the old size first
            let old_size = self.get(size_key)?.get_u32_le();
@@ -725,7 +762,7 @@ impl<'a, T: DatadirTimeline> DatadirModification<'a, T> {
            self.tline.set_cached_rel_size(rel, self.lsn, nblocks);

            // Update logical database size.
-            self.pending_nblocks -= old_size as isize - nblocks as isize;
+            self.pending_nblocks -= old_size as i64 - nblocks as i64;
        }
        Ok(())
    }
@@ -747,7 +784,7 @@ impl<'a, T: DatadirTimeline> DatadirModification<'a, T> {
            // Update relation size cache
            self.tline.set_cached_rel_size(rel, self.lsn, nblocks);

-            self.pending_nblocks += nblocks as isize - old_size as isize;
+            self.pending_nblocks += nblocks as i64 - old_size as i64;
        }
        Ok(())
    }
@@ -770,7 +807,7 @@ impl<'a, T: DatadirTimeline> DatadirModification<'a, T> {
        // update logical size
        let size_key = rel_size_to_key(rel);
        let old_size = self.get(size_key)?.get_u32_le();
-        self.pending_nblocks -= old_size as isize;
+        self.pending_nblocks -= old_size as i64;

        // Remove enty from relation size cache
        self.tline.remove_cached_rel_size(&rel);
@@ -912,7 +949,7 @@ impl<'a, T: DatadirTimeline> DatadirModification<'a, T> {
        result?;

        if pending_nblocks != 0 {
-            writer.update_current_logical_size(pending_nblocks * pg_constants::BLCKSZ as isize);
+            writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ));
            self.pending_nblocks = 0;
        }

@@ -924,7 +961,7 @@ impl<'a, T: DatadirTimeline> DatadirModification<'a, T> {
    /// underlying timeline.
    /// All the modifications in this atomic update are stamped by the specified LSN.
    ///
-    pub fn commit(&mut self) -> Result<()> {
+    pub fn commit(&mut self) -> anyhow::Result<()> {
        let writer = self.tline.writer();
        let lsn = self.lsn;
        let pending_nblocks = self.pending_nblocks;
@@ -940,7 +977,7 @@ impl<'a, T: DatadirTimeline> DatadirModification<'a, T> {
        writer.finish_write(lsn);

        if pending_nblocks != 0 {
-            writer.update_current_logical_size(pending_nblocks * pg_constants::BLCKSZ as isize);
+            writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ));
        }

        Ok(())
@@ -1014,7 +1051,7 @@ struct SlruSegmentDirectory {
    segments: HashSet<u32>,
 }

-static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; pg_constants::BLCKSZ as usize]);
+static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]);

 // Layout of the Key address space
 //
@@ -1366,10 +1403,10 @@ fn is_slru_block_key(key: Key) -> bool {
 //

 #[cfg(test)]
-pub fn create_test_timeline<R: Repository>(
-    repo: R,
+pub fn create_test_timeline(
+    repo: &crate::layered_repository::Repository,
    timeline_id: utils::zid::ZTimelineId,
-) -> Result<std::sync::Arc<R::Timeline>> {
+) -> Result<std::sync::Arc<Timeline>> {
    let tline = repo.create_empty_timeline(timeline_id, Lsn(8))?;
    let mut m = tline.begin_modification(Lsn(8));
    m.init_empty()?;
@@ -1475,19 +1512,19 @@ mod tests {
        writer.finish()?;

        // Test read before rel creation. Should error out.
-        assert!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x10)).is_err());
+        assert!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x10), false).is_err());

        // Read block beyond end of relation at different points in time.
        // These reads should fall into different delta, image, and in-memory layers.
-        assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x20))?, ZERO_PAGE);
-        assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x25))?, ZERO_PAGE);
-        assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x30))?, ZERO_PAGE);
-        assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x35))?, ZERO_PAGE);
-        assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x40))?, ZERO_PAGE);
-        assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x45))?, ZERO_PAGE);
-        assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x50))?, ZERO_PAGE);
-        assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x55))?, ZERO_PAGE);
-        assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x60))?, ZERO_PAGE);
+        assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x20), false)?, ZERO_PAGE);
+        assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x25), false)?, ZERO_PAGE);
+        assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x30), false)?, ZERO_PAGE);
+        assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x35), false)?, ZERO_PAGE);
+        assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x40), false)?, ZERO_PAGE);
+        assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x45), false)?, ZERO_PAGE);
+        assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x50), false)?, ZERO_PAGE);
+        assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x55), false)?, ZERO_PAGE);
+        assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x60), false)?, ZERO_PAGE);

        // Test on an in-memory layer with no preceding layer
        let mut writer = tline.begin_record(Lsn(0x70));
@@ -1499,7 +1536,7 @@ mod tests {
        )?;
        writer.finish()?;

-        assert_eq!(tline.get_rel_page_at_lsn(TESTREL_B, 1, Lsn(0x70))?, ZERO_PAGE);
+        assert_eq!(tline.get_rel_page_at_lsn(TESTREL_B, 1, Lsn(0x70), false)?6, ZERO_PAGE);

        Ok(())
    }
--- a/pageserver/src/reltag.rs
+++ b/pageserver/src/reltag.rs
@@ -2,8 +2,9 @@ use serde::{Deserialize, Serialize};
 use std::cmp::Ordering;
 use std::fmt;

-use postgres_ffi::relfile_utils::forknumber_to_name;
-use postgres_ffi::{pg_constants, Oid};
+use postgres_ffi::v14::pg_constants;
+use postgres_ffi::v14::relfile_utils::forknumber_to_name;
+use postgres_ffi::Oid;

 ///
 /// Relation data file segment id throughout the Postgres cluster.
--- a/pageserver/src/repository.rs
+++ b/pageserver/src/repository.rs
@@ -1,19 +1,11 @@
-use crate::layered_repository::metadata::TimelineMetadata;
-use crate::storage_sync::index::RemoteIndex;
 use crate::walrecord::ZenithWalRecord;
-use crate::CheckpointConfig;
 use anyhow::{bail, Result};
 use byteorder::{ByteOrder, BE};
 use bytes::Bytes;
 use serde::{Deserialize, Serialize};
 use std::fmt;
 use std::ops::{AddAssign, Range};
-use std::sync::{Arc, RwLockReadGuard};
 use std::time::Duration;
-use utils::{
-    lsn::{Lsn, RecordLsn},
-    zid::ZTimelineId,
-};

 #[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize)]
 /// Key used in the Repository kv-store.
@@ -181,102 +173,6 @@ impl Value {
    }
 }

-///
-/// A repository corresponds to one .neon directory. One repository holds multiple
-/// timelines, forked off from the same initial call to 'initdb'.
-pub trait Repository: Send + Sync {
-    type Timeline: crate::DatadirTimeline;
-
-    /// Updates timeline based on the `TimelineSyncStatusUpdate`, received from the remote storage synchronization.
-    /// See [`crate::remote_storage`] for more details about the synchronization.
-    fn attach_timeline(&self, timeline_id: ZTimelineId) -> Result<()>;
-
-    /// Get Timeline handle for given zenith timeline ID.
-    /// This function is idempotent. It doesn't change internal state in any way.
-    fn get_timeline(&self, timelineid: ZTimelineId) -> Option<RepositoryTimeline<Self::Timeline>>;
-
-    /// Get Timeline handle for locally available timeline. Load it into memory if it is not loaded.
-    fn get_timeline_load(&self, timelineid: ZTimelineId) -> Result<Arc<Self::Timeline>>;
-
-    /// Lists timelines the repository contains.
-    /// Up to repository's implementation to omit certain timelines that ar not considered ready for use.
-    fn list_timelines(&self) -> Vec<(ZTimelineId, RepositoryTimeline<Self::Timeline>)>;
-
-    /// Create a new, empty timeline. The caller is responsible for loading data into it
-    /// Initdb lsn is provided for timeline impl to be able to perform checks for some operations against it.
-    fn create_empty_timeline(
-        &self,
-        timeline_id: ZTimelineId,
-        initdb_lsn: Lsn,
-    ) -> Result<Arc<Self::Timeline>>;
-
-    /// Branch a timeline
-    fn branch_timeline(
-        &self,
-        src: ZTimelineId,
-        dst: ZTimelineId,
-        start_lsn: Option<Lsn>,
-    ) -> Result<()>;
-
-    /// Flush all data to disk.
-    ///
-    /// this is used at graceful shutdown.
-    fn checkpoint(&self) -> Result<()>;
-
-    /// perform one garbage collection iteration, removing old data files from disk.
-    /// this function is periodically called by gc thread.
-    /// also it can be explicitly requested through page server api 'do_gc' command.
-    ///
-    /// 'timelineid' specifies the timeline to GC, or None for all.
-    /// `horizon` specifies delta from last lsn to preserve all object versions (pitr interval).
-    /// `checkpoint_before_gc` parameter is used to force compaction of storage before GC
-    /// to make tests more deterministic.
-    /// TODO Do we still need it or we can call checkpoint explicitly in tests where needed?
-    fn gc_iteration(
-        &self,
-        timelineid: Option<ZTimelineId>,
-        horizon: u64,
-        pitr: Duration,
-        checkpoint_before_gc: bool,
-    ) -> Result<GcResult>;
-
-    /// Perform one compaction iteration.
-    /// This function is periodically called by compactor thread.
-    /// Also it can be explicitly requested per timeline through page server
-    /// api's 'compact' command.
-    fn compaction_iteration(&self) -> Result<()>;
-
-    /// removes timeline-related in-memory data
-    fn delete_timeline(&self, timeline_id: ZTimelineId) -> anyhow::Result<()>;
-
-    /// Allows to retrieve remote timeline index from the repo. Used in walreceiver to grab remote consistent lsn.
-    fn get_remote_index(&self) -> &RemoteIndex;
-}
-
-/// A timeline, that belongs to the current repository.
-pub enum RepositoryTimeline<T> {
-    /// Timeline, with its files present locally in pageserver's working directory.
-    /// Loaded into pageserver's memory and ready to be used.
-    Loaded(Arc<T>),
-
-    /// All the data is available locally, but not loaded into memory, so loading have to be done before actually using the timeline
-    Unloaded {
-        // It is ok to keep metadata here, because it is not changed when timeline is unloaded.
-        // FIXME can s3 sync actually change it? It can change it when timeline is in awaiting download state.
-        //  but we currently do not download something for the timeline once it is local (even if there are new checkpoints) is it correct?
-        // also it is not that good to keep TimelineMetadata here, because it is layered repo implementation detail
-        metadata: TimelineMetadata,
-    },
-}
-
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
-pub enum LocalTimelineState {
-    // timeline is loaded into memory (with layer map and all the bits),
-    Loaded,
-    // timeline is on disk locally and ready to be loaded into memory.
-    Unloaded,
-}
-
 ///
 /// Result of performing GC
 ///
@@ -304,622 +200,3 @@ impl AddAssign for GcResult {
        self.elapsed += other.elapsed;
    }
 }
-
-pub trait Timeline: Send + Sync {
-    //------------------------------------------------------------------------------
-    // Public GET functions
-    //------------------------------------------------------------------------------
-
-    ///
-    /// Wait until WAL has been received and processed up to this LSN.
-    ///
-    /// You should call this before any of the other get_* or list_* functions. Calling
-    /// those functions with an LSN that has been processed yet is an error.
-    ///
-    fn wait_lsn(&self, lsn: Lsn) -> Result<()>;
-
-    /// Lock and get timeline's GC cuttof
-    fn get_latest_gc_cutoff_lsn(&self) -> RwLockReadGuard<Lsn>;
-
-    /// Look up given page version.
-    ///
-    /// NOTE: It is considered an error to 'get' a key that doesn't exist. The abstraction
-    /// above this needs to store suitable metadata to track what data exists with
-    /// what keys, in separate metadata entries. If a non-existent key is requested,
-    /// the Repository implementation may incorrectly return a value from an ancestor
-    /// branch, for example, or waste a lot of cycles chasing the non-existing key.
-    ///
-    fn get(&self, key: Key, lsn: Lsn) -> Result<Bytes>;
-
-    /// Get the ancestor's timeline id
-    fn get_ancestor_timeline_id(&self) -> Option<ZTimelineId>;
-
-    /// Get the LSN where this branch was created
-    fn get_ancestor_lsn(&self) -> Lsn;
-
-    //------------------------------------------------------------------------------
-    // Public PUT functions, to update the repository with new page versions.
-    //
-    // These are called by the WAL receiver to digest WAL records.
-    //------------------------------------------------------------------------------
-    /// Atomically get both last and prev.
-    fn get_last_record_rlsn(&self) -> RecordLsn;
-
-    /// Get last or prev record separately. Same as get_last_record_rlsn().last/prev.
-    fn get_last_record_lsn(&self) -> Lsn;
-
-    fn get_prev_record_lsn(&self) -> Lsn;
-
-    fn get_disk_consistent_lsn(&self) -> Lsn;
-
-    /// Mutate the timeline with a [`TimelineWriter`].
-    ///
-    /// FIXME: This ought to return &'a TimelineWriter, where TimelineWriter
-    /// is a generic type in this trait. But that doesn't currently work in
-    /// Rust: https://rust-lang.github.io/rfcs/1598-generic_associated_types.html
-    fn writer<'a>(&'a self) -> Box<dyn TimelineWriter + 'a>;
-
-    ///
-    /// Flush to disk all data that was written with the put_* functions
-    ///
-    /// NOTE: This has nothing to do with checkpoint in PostgreSQL. We don't
-    /// know anything about them here in the repository.
-    fn checkpoint(&self, cconf: CheckpointConfig) -> Result<()>;
-
-    ///
-    /// Check that it is valid to request operations with that lsn.
-    fn check_lsn_is_in_scope(
-        &self,
-        lsn: Lsn,
-        latest_gc_cutoff_lsn: &RwLockReadGuard<Lsn>,
-    ) -> Result<()>;
-
-    /// Get the physical size of the timeline at the latest LSN
-    fn get_physical_size(&self) -> u64;
-    /// Get the physical size of the timeline at the latest LSN non incrementally
-    fn get_physical_size_non_incremental(&self) -> Result<u64>;
-}
-
-/// Various functions to mutate the timeline.
-// TODO Currently, Deref is used to allow easy access to read methods from this trait.
-// This is probably considered a bad practice in Rust and should be fixed eventually,
-// but will cause large code changes.
-pub trait TimelineWriter<'a> {
-    /// Put a new page version that can be constructed from a WAL record
-    ///
-    /// This will implicitly extend the relation, if the page is beyond the
-    /// current end-of-file.
-    fn put(&self, key: Key, lsn: Lsn, value: &Value) -> Result<()>;
-
-    fn delete(&self, key_range: Range<Key>, lsn: Lsn) -> Result<()>;
-
-    /// Track the end of the latest digested WAL record.
-    ///
-    /// Call this after you have finished writing all the WAL up to 'lsn'.
-    ///
-    /// 'lsn' must be aligned. This wakes up any wait_lsn() callers waiting for
-    /// the 'lsn' or anything older. The previous last record LSN is stored alongside
-    /// the latest and can be read.
-    fn finish_write(&self, lsn: Lsn);
-
-    fn update_current_logical_size(&self, delta: isize);
-}
-
-#[cfg(test)]
-pub mod repo_harness {
-    use bytes::BytesMut;
-    use once_cell::sync::Lazy;
-    use std::sync::{Arc, RwLock, RwLockReadGuard, RwLockWriteGuard};
-    use std::{fs, path::PathBuf};
-
-    use crate::RepositoryImpl;
-    use crate::{
-        config::PageServerConf,
-        layered_repository::LayeredRepository,
-        walredo::{WalRedoError, WalRedoManager},
-    };
-
-    use super::*;
-    use crate::tenant_config::{TenantConf, TenantConfOpt};
-    use hex_literal::hex;
-    use utils::zid::ZTenantId;
-
-    pub const TIMELINE_ID: ZTimelineId =
-        ZTimelineId::from_array(hex!("11223344556677881122334455667788"));
-    pub const NEW_TIMELINE_ID: ZTimelineId =
-        ZTimelineId::from_array(hex!("AA223344556677881122334455667788"));
-
-    /// Convenience function to create a page image with given string as the only content
-    #[allow(non_snake_case)]
-    pub fn TEST_IMG(s: &str) -> Bytes {
-        let mut buf = BytesMut::new();
-        buf.extend_from_slice(s.as_bytes());
-        buf.resize(64, 0);
-
-        buf.freeze()
-    }
-
-    static LOCK: Lazy<RwLock<()>> = Lazy::new(|| RwLock::new(()));
-
-    impl From<TenantConf> for TenantConfOpt {
-        fn from(tenant_conf: TenantConf) -> Self {
-            Self {
-                checkpoint_distance: Some(tenant_conf.checkpoint_distance),
-                checkpoint_timeout: Some(tenant_conf.checkpoint_timeout),
-                compaction_target_size: Some(tenant_conf.compaction_target_size),
-                compaction_period: Some(tenant_conf.compaction_period),
-                compaction_threshold: Some(tenant_conf.compaction_threshold),
-                gc_horizon: Some(tenant_conf.gc_horizon),
-                gc_period: Some(tenant_conf.gc_period),
-                image_creation_threshold: Some(tenant_conf.image_creation_threshold),
-                pitr_interval: Some(tenant_conf.pitr_interval),
-                walreceiver_connect_timeout: Some(tenant_conf.walreceiver_connect_timeout),
-                lagging_wal_timeout: Some(tenant_conf.lagging_wal_timeout),
-                max_lsn_wal_lag: Some(tenant_conf.max_lsn_wal_lag),
-            }
-        }
-    }
-
-    pub struct RepoHarness<'a> {
-        pub conf: &'static PageServerConf,
-        pub tenant_conf: TenantConf,
-        pub tenant_id: ZTenantId,
-
-        pub lock_guard: (
-            Option<RwLockReadGuard<'a, ()>>,
-            Option<RwLockWriteGuard<'a, ()>>,
-        ),
-    }
-
-    impl<'a> RepoHarness<'a> {
-        pub fn create(test_name: &'static str) -> Result<Self> {
-            Self::create_internal(test_name, false)
-        }
-        pub fn create_exclusive(test_name: &'static str) -> Result<Self> {
-            Self::create_internal(test_name, true)
-        }
-        fn create_internal(test_name: &'static str, exclusive: bool) -> Result<Self> {
-            let lock_guard = if exclusive {
-                (None, Some(LOCK.write().unwrap()))
-            } else {
-                (Some(LOCK.read().unwrap()), None)
-            };
-
-            let repo_dir = PageServerConf::test_repo_dir(test_name);
-            let _ = fs::remove_dir_all(&repo_dir);
-            fs::create_dir_all(&repo_dir)?;
-
-            let conf = PageServerConf::dummy_conf(repo_dir);
-            // Make a static copy of the config. This can never be free'd, but that's
-            // OK in a test.
-            let conf: &'static PageServerConf = Box::leak(Box::new(conf));
-
-            let tenant_conf = TenantConf::dummy_conf();
-
-            let tenant_id = ZTenantId::generate();
-            fs::create_dir_all(conf.tenant_path(&tenant_id))?;
-            fs::create_dir_all(conf.timelines_path(&tenant_id))?;
-
-            Ok(Self {
-                conf,
-                tenant_conf,
-                tenant_id,
-                lock_guard,
-            })
-        }
-
-        pub fn load(&self) -> RepositoryImpl {
-            self.try_load().expect("failed to load test repo")
-        }
-
-        pub fn try_load(&self) -> Result<RepositoryImpl> {
-            let walredo_mgr = Arc::new(TestRedoManager);
-
-            let repo = LayeredRepository::new(
-                self.conf,
-                TenantConfOpt::from(self.tenant_conf),
-                walredo_mgr,
-                self.tenant_id,
-                RemoteIndex::default(),
-                false,
-            );
-            // populate repo with locally available timelines
-            for timeline_dir_entry in fs::read_dir(self.conf.timelines_path(&self.tenant_id))
-                .expect("should be able to read timelines dir")
-            {
-                let timeline_dir_entry = timeline_dir_entry.unwrap();
-                let timeline_id: ZTimelineId = timeline_dir_entry
-                    .path()
-                    .file_name()
-                    .unwrap()
-                    .to_string_lossy()
-                    .parse()
-                    .unwrap();
-
-                repo.attach_timeline(timeline_id)?;
-            }
-
-            Ok(repo)
-        }
-
-        pub fn timeline_path(&self, timeline_id: &ZTimelineId) -> PathBuf {
-            self.conf.timeline_path(timeline_id, &self.tenant_id)
-        }
-    }
-
-    // Mock WAL redo manager that doesn't do much
-    pub struct TestRedoManager;
-
-    impl WalRedoManager for TestRedoManager {
-        fn request_redo(
-            &self,
-            key: Key,
-            lsn: Lsn,
-            base_img: Option<Bytes>,
-            records: Vec<(Lsn, ZenithWalRecord)>,
-        ) -> Result<Bytes, WalRedoError> {
-            let s = format!(
-                "redo for {} to get to {}, with {} and {} records",
-                key,
-                lsn,
-                if base_img.is_some() {
-                    "base image"
-                } else {
-                    "no base image"
-                },
-                records.len()
-            );
-            println!("{}", s);
-
-            Ok(TEST_IMG(&s))
-        }
-    }
-}
-
-///
-/// Tests that should work the same with any Repository/Timeline implementation.
-///
-#[allow(clippy::bool_assert_comparison)]
-#[cfg(test)]
-mod tests {
-    use super::repo_harness::*;
-    use super::*;
-    //use postgres_ffi::{pg_constants, xlog_utils::SIZEOF_CHECKPOINT};
-    //use std::sync::Arc;
-    use bytes::BytesMut;
-    use hex_literal::hex;
-    use once_cell::sync::Lazy;
-
-    static TEST_KEY: Lazy<Key> =
-        Lazy::new(|| Key::from_slice(&hex!("112222222233333333444444445500000001")));
-
-    #[test]
-    fn test_basic() -> Result<()> {
-        let repo = RepoHarness::create("test_basic")?.load();
-        let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
-
-        let writer = tline.writer();
-        writer.put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))?;
-        writer.finish_write(Lsn(0x10));
-        drop(writer);
-
-        let writer = tline.writer();
-        writer.put(*TEST_KEY, Lsn(0x20), &Value::Image(TEST_IMG("foo at 0x20")))?;
-        writer.finish_write(Lsn(0x20));
-        drop(writer);
-
-        assert_eq!(tline.get(*TEST_KEY, Lsn(0x10))?, TEST_IMG("foo at 0x10"));
-        assert_eq!(tline.get(*TEST_KEY, Lsn(0x1f))?, TEST_IMG("foo at 0x10"));
-        assert_eq!(tline.get(*TEST_KEY, Lsn(0x20))?, TEST_IMG("foo at 0x20"));
-
-        Ok(())
-    }
-
-    #[test]
-    fn no_duplicate_timelines() -> Result<()> {
-        let repo = RepoHarness::create("no_duplicate_timelines")?.load();
-        let _ = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
-
-        match repo.create_empty_timeline(TIMELINE_ID, Lsn(0)) {
-            Ok(_) => panic!("duplicate timeline creation should fail"),
-            Err(e) => assert_eq!(e.to_string(), "Timeline already exists"),
-        }
-
-        Ok(())
-    }
-
-    /// Convenience function to create a page image with given string as the only content
-    pub fn test_value(s: &str) -> Value {
-        let mut buf = BytesMut::new();
-        buf.extend_from_slice(s.as_bytes());
-        Value::Image(buf.freeze())
-    }
-
-    ///
-    /// Test branch creation
-    ///
-    #[test]
-    fn test_branch() -> Result<()> {
-        let repo = RepoHarness::create("test_branch")?.load();
-        let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
-        let writer = tline.writer();
-        use std::str::from_utf8;
-
-        #[allow(non_snake_case)]
-        let TEST_KEY_A: Key = Key::from_hex("112222222233333333444444445500000001").unwrap();
-        #[allow(non_snake_case)]
-        let TEST_KEY_B: Key = Key::from_hex("112222222233333333444444445500000002").unwrap();
-
-        // Insert a value on the timeline
-        writer.put(TEST_KEY_A, Lsn(0x20), &test_value("foo at 0x20"))?;
-        writer.put(TEST_KEY_B, Lsn(0x20), &test_value("foobar at 0x20"))?;
-        writer.finish_write(Lsn(0x20));
-
-        writer.put(TEST_KEY_A, Lsn(0x30), &test_value("foo at 0x30"))?;
-        writer.finish_write(Lsn(0x30));
-        writer.put(TEST_KEY_A, Lsn(0x40), &test_value("foo at 0x40"))?;
-        writer.finish_write(Lsn(0x40));
-
-        //assert_current_logical_size(&tline, Lsn(0x40));
-
-        // Branch the history, modify relation differently on the new timeline
-        repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x30)))?;
-        let newtline = repo
-            .get_timeline_load(NEW_TIMELINE_ID)
-            .expect("Should have a local timeline");
-        let new_writer = newtline.writer();
-        new_writer.put(TEST_KEY_A, Lsn(0x40), &test_value("bar at 0x40"))?;
-        new_writer.finish_write(Lsn(0x40));
-
-        // Check page contents on both branches
-        assert_eq!(
-            from_utf8(&tline.get(TEST_KEY_A, Lsn(0x40))?)?,
-            "foo at 0x40"
-        );
-        assert_eq!(
-            from_utf8(&newtline.get(TEST_KEY_A, Lsn(0x40))?)?,
-            "bar at 0x40"
-        );
-        assert_eq!(
-            from_utf8(&newtline.get(TEST_KEY_B, Lsn(0x40))?)?,
-            "foobar at 0x20"
-        );
-
-        //assert_current_logical_size(&tline, Lsn(0x40));
-
-        Ok(())
-    }
-
-    fn make_some_layers<T: Timeline>(tline: &T, start_lsn: Lsn) -> Result<()> {
-        let mut lsn = start_lsn;
-        #[allow(non_snake_case)]
-        {
-            let writer = tline.writer();
-            // Create a relation on the timeline
-            writer.put(
-                *TEST_KEY,
-                lsn,
-                &Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
-            )?;
-            writer.finish_write(lsn);
-            lsn += 0x10;
-            writer.put(
-                *TEST_KEY,
-                lsn,
-                &Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
-            )?;
-            writer.finish_write(lsn);
-            lsn += 0x10;
-        }
-        tline.checkpoint(CheckpointConfig::Forced)?;
-        {
-            let writer = tline.writer();
-            writer.put(
-                *TEST_KEY,
-                lsn,
-                &Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
-            )?;
-            writer.finish_write(lsn);
-            lsn += 0x10;
-            writer.put(
-                *TEST_KEY,
-                lsn,
-                &Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
-            )?;
-            writer.finish_write(lsn);
-        }
-        tline.checkpoint(CheckpointConfig::Forced)
-    }
-
-    #[test]
-    fn test_prohibit_branch_creation_on_garbage_collected_data() -> Result<()> {
-        let repo =
-            RepoHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")?.load();
-        let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
-        make_some_layers(tline.as_ref(), Lsn(0x20))?;
-
-        // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50
-        // FIXME: this doesn't actually remove any layer currently, given how the checkpointing
-        // and compaction works. But it does set the 'cutoff' point so that the cross check
-        // below should fail.
-        repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?;
-
-        // try to branch at lsn 25, should fail because we already garbage collected the data
-        match repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) {
-            Ok(_) => panic!("branching should have failed"),
-            Err(err) => {
-                assert!(err.to_string().contains("invalid branch start lsn"));
-                assert!(err
-                    .source()
-                    .unwrap()
-                    .to_string()
-                    .contains("we might've already garbage collected needed data"))
-            }
-        }
-
-        Ok(())
-    }
-
-    #[test]
-    fn test_prohibit_branch_creation_on_pre_initdb_lsn() -> Result<()> {
-        let repo = RepoHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn")?.load();
-
-        repo.create_empty_timeline(TIMELINE_ID, Lsn(0x50))?;
-        // try to branch at lsn 0x25, should fail because initdb lsn is 0x50
-        match repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) {
-            Ok(_) => panic!("branching should have failed"),
-            Err(err) => {
-                assert!(&err.to_string().contains("invalid branch start lsn"));
-                assert!(&err
-                    .source()
-                    .unwrap()
-                    .to_string()
-                    .contains("is earlier than latest GC horizon"));
-            }
-        }
-
-        Ok(())
-    }
-
-    /*
-    // FIXME: This currently fails to error out. Calling GC doesn't currently
-    // remove the old value, we'd need to work a little harder
-    #[test]
-    fn test_prohibit_get_for_garbage_collected_data() -> Result<()> {
-        let repo =
-            RepoHarness::create("test_prohibit_get_for_garbage_collected_data")?
-            .load();
-
-        let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
-        make_some_layers(tline.as_ref(), Lsn(0x20))?;
-
-        repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?;
-        let latest_gc_cutoff_lsn = tline.get_latest_gc_cutoff_lsn();
-        assert!(*latest_gc_cutoff_lsn > Lsn(0x25));
-        match tline.get(*TEST_KEY, Lsn(0x25)) {
-            Ok(_) => panic!("request for page should have failed"),
-            Err(err) => assert!(err.to_string().contains("not found at")),
-        }
-        Ok(())
-    }
-     */
-
-    #[test]
-    fn test_retain_data_in_parent_which_is_needed_for_child() -> Result<()> {
-        let repo =
-            RepoHarness::create("test_retain_data_in_parent_which_is_needed_for_child")?.load();
-        let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
-        make_some_layers(tline.as_ref(), Lsn(0x20))?;
-
-        repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?;
-        let newtline = repo
-            .get_timeline_load(NEW_TIMELINE_ID)
-            .expect("Should have a local timeline");
-        // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50
-        repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?;
-        assert!(newtline.get(*TEST_KEY, Lsn(0x25)).is_ok());
-
-        Ok(())
-    }
-    #[test]
-    fn test_parent_keeps_data_forever_after_branching() -> Result<()> {
-        let repo = RepoHarness::create("test_parent_keeps_data_forever_after_branching")?.load();
-        let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
-        make_some_layers(tline.as_ref(), Lsn(0x20))?;
-
-        repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?;
-        let newtline = repo
-            .get_timeline_load(NEW_TIMELINE_ID)
-            .expect("Should have a local timeline");
-
-        make_some_layers(newtline.as_ref(), Lsn(0x60))?;
-
-        // run gc on parent
-        repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?;
-
-        // Check that the data is still accessible on the branch.
-        assert_eq!(
-            newtline.get(*TEST_KEY, Lsn(0x50))?,
-            TEST_IMG(&format!("foo at {}", Lsn(0x40)))
-        );
-
-        Ok(())
-    }
-
-    #[test]
-    fn timeline_load() -> Result<()> {
-        const TEST_NAME: &str = "timeline_load";
-        let harness = RepoHarness::create(TEST_NAME)?;
-        {
-            let repo = harness.load();
-            let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0x8000))?;
-            make_some_layers(tline.as_ref(), Lsn(0x8000))?;
-            tline.checkpoint(CheckpointConfig::Forced)?;
-        }
-
-        let repo = harness.load();
-        let tline = repo
-            .get_timeline(TIMELINE_ID)
-            .expect("cannot load timeline");
-        assert!(matches!(tline, RepositoryTimeline::Unloaded { .. }));
-
-        assert!(repo.get_timeline_load(TIMELINE_ID).is_ok());
-
-        let tline = repo
-            .get_timeline(TIMELINE_ID)
-            .expect("cannot load timeline");
-        assert!(matches!(tline, RepositoryTimeline::Loaded(_)));
-
-        Ok(())
-    }
-
-    #[test]
-    fn timeline_load_with_ancestor() -> Result<()> {
-        const TEST_NAME: &str = "timeline_load_with_ancestor";
-        let harness = RepoHarness::create(TEST_NAME)?;
-        // create two timelines
-        {
-            let repo = harness.load();
-            let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
-
-            make_some_layers(tline.as_ref(), Lsn(0x20))?;
-            tline.checkpoint(CheckpointConfig::Forced)?;
-
-            repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?;
-
-            let newtline = repo
-                .get_timeline_load(NEW_TIMELINE_ID)
-                .expect("Should have a local timeline");
-
-            make_some_layers(newtline.as_ref(), Lsn(0x60))?;
-            tline.checkpoint(CheckpointConfig::Forced)?;
-        }
-
-        // check that both of them are initially unloaded
-        let repo = harness.load();
-        {
-            let tline = repo.get_timeline(TIMELINE_ID).expect("cannot get timeline");
-            assert!(matches!(tline, RepositoryTimeline::Unloaded { .. }));
-
-            let tline = repo
-                .get_timeline(NEW_TIMELINE_ID)
-                .expect("cannot get timeline");
-            assert!(matches!(tline, RepositoryTimeline::Unloaded { .. }));
-        }
-        // load only child timeline
-        let _ = repo
-            .get_timeline_load(NEW_TIMELINE_ID)
-            .expect("cannot load timeline");
-
-        // check that both, child and ancestor are loaded
-        let tline = repo
-            .get_timeline(NEW_TIMELINE_ID)
-            .expect("cannot get timeline");
-        assert!(matches!(tline, RepositoryTimeline::Loaded(_)));
-
-        let tline = repo.get_timeline(TIMELINE_ID).expect("cannot get timeline");
-        assert!(matches!(tline, RepositoryTimeline::Loaded(_)));
-
-        Ok(())
-    }
-}
--- a/pageserver/src/storage_sync.rs
+++ b/pageserver/src/storage_sync.rs
@@ -37,7 +37,7 @@
 //!                                                            | access to this storage |
 //!                                                            +------------------------+
 //!
-//! First, during startup, the pageserver inits the storage sync thread with the async loop, or leaves the loop uninitialised, if configured so.
+//! First, during startup, the pageserver inits the storage sync task with the async loop, or leaves the loop uninitialised, if configured so.
 //! The loop inits the storage connection and checks the remote files stored.
 //! This is done once at startup only, relying on the fact that pageserver uses the storage alone (ergo, nobody else uploads the files to the storage but this server).
 //! Based on the remote storage data, the sync logic immediately schedules sync tasks for local timelines and reports about remote only timelines to pageserver, so it can
@@ -145,21 +145,19 @@ mod upload;

 use std::{
    collections::{hash_map, HashMap, HashSet, VecDeque},
-    ffi::OsStr,
    fmt::Debug,
    num::{NonZeroU32, NonZeroUsize},
    ops::ControlFlow,
    path::{Path, PathBuf},
-    sync::{Arc, Condvar, Mutex},
+    sync::{Condvar, Mutex},
 };

 use anyhow::{anyhow, bail, Context};
 use futures::stream::{FuturesUnordered, StreamExt};
-use once_cell::sync::{Lazy, OnceCell};
-use remote_storage::{GenericRemoteStorage, RemoteStorage};
+use once_cell::sync::OnceCell;
+use remote_storage::GenericRemoteStorage;
 use tokio::{
    fs,
-    runtime::Runtime,
    time::{Duration, Instant},
 };
 use tracing::*;
@@ -173,288 +171,54 @@ use self::{
 use crate::{
    config::PageServerConf,
    exponential_backoff,
-    layered_repository::{
-        ephemeral_file::is_ephemeral_file,
-        metadata::{metadata_path, TimelineMetadata, METADATA_FILE_NAME},
-    },
-    storage_sync::{self, index::RemoteIndex},
-    tenant_mgr::attach_downloaded_tenants,
-    thread_mgr,
-    thread_mgr::ThreadKind,
+    layered_repository::metadata::{metadata_path, TimelineMetadata},
+    storage_sync::index::RemoteIndex,
+    task_mgr,
+    task_mgr::TaskKind,
+    task_mgr::BACKGROUND_RUNTIME,
+    tenant_mgr::attach_local_tenants,
+};
+use crate::{
+    metrics::{IMAGE_SYNC_TIME, REMAINING_SYNC_ITEMS, REMOTE_INDEX_UPLOAD},
+    TenantTimelineValues,
 };

-use metrics::{
-    register_histogram_vec, register_int_counter_vec, register_int_gauge, HistogramVec,
-    IntCounterVec, IntGauge,
-};
 use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId};

 use self::download::download_index_parts;
 pub use self::download::gather_tenant_timelines_index_parts;
-pub use self::download::TEMP_DOWNLOAD_EXTENSION;
-
-static REMAINING_SYNC_ITEMS: Lazy<IntGauge> = Lazy::new(|| {
-    register_int_gauge!(
-        "pageserver_remote_storage_remaining_sync_items",
-        "Number of storage sync items left in the queue"
-    )
-    .expect("failed to register pageserver remote storage remaining sync items int gauge")
-});
-
-static IMAGE_SYNC_TIME: Lazy<HistogramVec> = Lazy::new(|| {
-    register_histogram_vec!(
-        "pageserver_remote_storage_image_sync_seconds",
-        "Time took to synchronize (download or upload) a whole pageserver image. \
-        Grouped by tenant and timeline ids, `operation_kind` (upload|download) and `status` (success|failure)",
-        &["tenant_id", "timeline_id", "operation_kind", "status"],
-        vec![0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 3.0, 10.0, 20.0]
-    )
-    .expect("failed to register pageserver image sync time histogram vec")
-});
-
-static REMOTE_INDEX_UPLOAD: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
-        "pageserver_remote_storage_remote_index_uploads_total",
-        "Number of remote index uploads",
-        &["tenant_id", "timeline_id"],
-    )
-    .expect("failed to register pageserver remote index upload vec")
-});

 static SYNC_QUEUE: OnceCell<SyncQueue> = OnceCell::new();

 /// A timeline status to share with pageserver's sync counterpart,
 /// after comparing local and remote timeline state.
-#[derive(Clone, Copy, Debug)]
+#[derive(Clone)]
 pub enum LocalTimelineInitStatus {
    /// The timeline has every remote layer present locally.
    /// There could be some layers requiring uploading,
    /// but this does not block the timeline from any user interaction.
-    LocallyComplete,
+    LocallyComplete(TimelineMetadata),
    /// A timeline has some files remotely, that are not present locally and need downloading.
    /// Downloading might update timeline's metadata locally and current pageserver logic deals with local layers only,
    /// so the data needs to be downloaded first before the timeline can be used.
    NeedsSync,
 }

-type LocalTimelineInitStatuses = HashMap<ZTenantId, HashMap<ZTimelineId, LocalTimelineInitStatus>>;
+impl std::fmt::Debug for LocalTimelineInitStatus {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            Self::LocallyComplete(_) => write!(f, "LocallyComplete"),
+            Self::NeedsSync => write!(f, "NeedsSync"),
+        }
+    }
+}

 /// A structure to combine all synchronization data to share with pageserver after a successful sync loop initialization.
 /// Successful initialization includes a case when sync loop is not started, in which case the startup data is returned still,
 /// to simplify the received code.
 pub struct SyncStartupData {
    pub remote_index: RemoteIndex,
-    pub local_timeline_init_statuses: LocalTimelineInitStatuses,
-}
-
-/// Based on the config, initiates the remote storage connection and starts a separate thread
-/// that ensures that pageserver and the remote storage are in sync with each other.
-/// If no external configuration connection given, no thread or storage initialization is done.
-/// Along with that, scans tenant files local and remote (if the sync gets enabled) to check the initial timeline states.
-pub fn start_local_timeline_sync(
-    config: &'static PageServerConf,
-) -> anyhow::Result<SyncStartupData> {
-    let local_timeline_files = local_tenant_timeline_files(config)
-        .context("Failed to collect local tenant timeline files")?;
-
-    match config.remote_storage_config.as_ref() {
-        Some(storage_config) => {
-            match GenericRemoteStorage::new(config.workdir.clone(), storage_config)
-                .context("Failed to init the generic remote storage")?
-            {
-                GenericRemoteStorage::Local(local_fs_storage) => {
-                    storage_sync::spawn_storage_sync_thread(
-                        config,
-                        local_timeline_files,
-                        local_fs_storage,
-                        storage_config.max_concurrent_syncs,
-                        storage_config.max_sync_errors,
-                    )
-                }
-                GenericRemoteStorage::S3(s3_bucket_storage) => {
-                    storage_sync::spawn_storage_sync_thread(
-                        config,
-                        local_timeline_files,
-                        s3_bucket_storage,
-                        storage_config.max_concurrent_syncs,
-                        storage_config.max_sync_errors,
-                    )
-                }
-            }
-            .context("Failed to spawn the storage sync thread")
-        }
-        None => {
-            info!("No remote storage configured, skipping storage sync, considering all local timelines with correct metadata files enabled");
-            let mut local_timeline_init_statuses = LocalTimelineInitStatuses::new();
-            for (
-                ZTenantTimelineId {
-                    tenant_id,
-                    timeline_id,
-                },
-                _,
-            ) in local_timeline_files
-            {
-                local_timeline_init_statuses
-                    .entry(tenant_id)
-                    .or_default()
-                    .insert(timeline_id, LocalTimelineInitStatus::LocallyComplete);
-            }
-            Ok(SyncStartupData {
-                local_timeline_init_statuses,
-                remote_index: RemoteIndex::default(),
-            })
-        }
-    }
-}
-
-fn local_tenant_timeline_files(
-    config: &'static PageServerConf,
-) -> anyhow::Result<HashMap<ZTenantTimelineId, (TimelineMetadata, HashSet<PathBuf>)>> {
-    let mut local_tenant_timeline_files = HashMap::new();
-    let tenants_dir = config.tenants_path();
-    for tenants_dir_entry in std::fs::read_dir(&tenants_dir)
-        .with_context(|| format!("Failed to list tenants dir {}", tenants_dir.display()))?
-    {
-        match &tenants_dir_entry {
-            Ok(tenants_dir_entry) => {
-                match collect_timelines_for_tenant(config, &tenants_dir_entry.path()) {
-                    Ok(collected_files) => {
-                        local_tenant_timeline_files.extend(collected_files.into_iter())
-                    }
-                    Err(e) => error!(
-                        "Failed to collect tenant files from dir '{}' for entry {:?}, reason: {:#}",
-                        tenants_dir.display(),
-                        tenants_dir_entry,
-                        e
-                    ),
-                }
-            }
-            Err(e) => error!(
-                "Failed to list tenants dir entry {:?} in directory {}, reason: {:?}",
-                tenants_dir_entry,
-                tenants_dir.display(),
-                e
-            ),
-        }
-    }
-
-    Ok(local_tenant_timeline_files)
-}
-
-fn collect_timelines_for_tenant(
-    config: &'static PageServerConf,
-    tenant_path: &Path,
-) -> anyhow::Result<HashMap<ZTenantTimelineId, (TimelineMetadata, HashSet<PathBuf>)>> {
-    let mut timelines = HashMap::new();
-    let tenant_id = tenant_path
-        .file_name()
-        .and_then(OsStr::to_str)
-        .unwrap_or_default()
-        .parse::<ZTenantId>()
-        .context("Could not parse tenant id out of the tenant dir name")?;
-    let timelines_dir = config.timelines_path(&tenant_id);
-
-    for timelines_dir_entry in std::fs::read_dir(&timelines_dir).with_context(|| {
-        format!(
-            "Failed to list timelines dir entry for tenant {}",
-            tenant_id
-        )
-    })? {
-        match timelines_dir_entry {
-            Ok(timelines_dir_entry) => {
-                let timeline_path = timelines_dir_entry.path();
-                match collect_timeline_files(&timeline_path) {
-                    Ok((timeline_id, metadata, timeline_files)) => {
-                        timelines.insert(
-                            ZTenantTimelineId {
-                                tenant_id,
-                                timeline_id,
-                            },
-                            (metadata, timeline_files),
-                        );
-                    }
-                    Err(e) => error!(
-                        "Failed to process timeline dir contents at '{}', reason: {:?}",
-                        timeline_path.display(),
-                        e
-                    ),
-                }
-            }
-            Err(e) => error!(
-                "Failed to list timelines for entry tenant {}, reason: {:?}",
-                tenant_id, e
-            ),
-        }
-    }
-
-    Ok(timelines)
-}
-
-// discover timeline files and extract timeline metadata
-//  NOTE: ephemeral files are excluded from the list
-fn collect_timeline_files(
-    timeline_dir: &Path,
-) -> anyhow::Result<(ZTimelineId, TimelineMetadata, HashSet<PathBuf>)> {
-    let mut timeline_files = HashSet::new();
-    let mut timeline_metadata_path = None;
-
-    let timeline_id = timeline_dir
-        .file_name()
-        .and_then(OsStr::to_str)
-        .unwrap_or_default()
-        .parse::<ZTimelineId>()
-        .context("Could not parse timeline id out of the timeline dir name")?;
-    let timeline_dir_entries =
-        std::fs::read_dir(&timeline_dir).context("Failed to list timeline dir contents")?;
-    for entry in timeline_dir_entries {
-        let entry_path = entry.context("Failed to list timeline dir entry")?.path();
-        if entry_path.is_file() {
-            if entry_path.file_name().and_then(OsStr::to_str) == Some(METADATA_FILE_NAME) {
-                timeline_metadata_path = Some(entry_path);
-            } else if is_ephemeral_file(&entry_path.file_name().unwrap().to_string_lossy()) {
-                debug!("skipping ephemeral file {}", entry_path.display());
-                continue;
-            } else if entry_path.extension().and_then(OsStr::to_str)
-                == Some(TEMP_DOWNLOAD_EXTENSION)
-            {
-                info!("removing temp download file at {}", entry_path.display());
-                std::fs::remove_file(&entry_path).with_context(|| {
-                    format!(
-                        "failed to remove temp download file at {}",
-                        entry_path.display()
-                    )
-                })?;
-            } else if entry_path.extension().and_then(OsStr::to_str) == Some("temp") {
-                info!("removing temp layer file at {}", entry_path.display());
-                std::fs::remove_file(&entry_path).with_context(|| {
-                    format!(
-                        "failed to remove temp layer file at {}",
-                        entry_path.display()
-                    )
-                })?;
-            } else {
-                timeline_files.insert(entry_path);
-            }
-        }
-    }
-
-    // FIXME (rodionov) if attach call succeeded, and then pageserver is restarted before download is completed
-    //   then attach is lost. There would be no retries for that,
-    //   initial collect will fail because there is no metadata.
-    //   We either need to start download if we see empty dir after restart or attach caller should
-    //   be aware of that and retry attach if awaits_download for timeline switched from true to false
-    //   but timelinne didn't appear locally.
-    //   Check what happens with remote index in that case.
-    let timeline_metadata_path = match timeline_metadata_path {
-        Some(path) => path,
-        None => bail!("No metadata file found in the timeline directory"),
-    };
-    let metadata = TimelineMetadata::from_bytes(
-        &std::fs::read(&timeline_metadata_path).context("Failed to read timeline metadata file")?,
-    )
-    .context("Failed to parse timeline metadata file bytes")?;
-
-    Ok((timeline_id, metadata, timeline_files))
+    pub local_timeline_init_statuses: TenantTimelineValues<LocalTimelineInitStatus>,
 }

 /// Global queue of sync tasks.
@@ -500,7 +264,7 @@ impl SyncQueue {
                .unwrap()
                .0;

-            if thread_mgr::is_shutdown_requested() {
+            if task_mgr::is_shutdown_requested() {
                return (HashMap::new(), q.len());
            }
        }
@@ -810,17 +574,13 @@ pub fn schedule_layer_download(tenant_id: ZTenantId, timeline_id: ZTimelineId) {

 /// Launch a thread to perform remote storage sync tasks.
 /// See module docs for loop step description.
-pub(super) fn spawn_storage_sync_thread<P, S>(
+pub fn spawn_storage_sync_task(
    conf: &'static PageServerConf,
-    local_timeline_files: HashMap<ZTenantTimelineId, (TimelineMetadata, HashSet<PathBuf>)>,
-    storage: S,
+    local_timeline_files: TenantTimelineValues<(TimelineMetadata, HashSet<PathBuf>)>,
+    storage: GenericRemoteStorage,
    max_concurrent_timelines_sync: NonZeroUsize,
    max_sync_errors: NonZeroU32,
-) -> anyhow::Result<SyncStartupData>
-where
-    P: Debug + Send + Sync + 'static,
-    S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
-{
+) -> anyhow::Result<SyncStartupData> {
    let sync_queue = SyncQueue::new(max_concurrent_timelines_sync);
    SYNC_QUEUE
        .set(sync_queue)
@@ -830,65 +590,80 @@ where
        None => bail!("Could not get sync queue during the sync loop step, aborting"),
    };

-    let runtime = tokio::runtime::Builder::new_current_thread()
-        .enable_all()
-        .build()
-        .context("Failed to create storage sync runtime")?;
+    // TODO we are able to "attach" empty tenants, but not doing it now since it might require big wait time:
+    // * we need to list every timeline for tenant on S3, that might be a costly operation
+    // * we need to download every timeline for the tenant, to activate it in memory
+    //
+    // When on-demand download gets merged, we're able to do this fast by storing timeline metadata only.
+    let mut empty_tenants = TenantTimelineValues::<LocalTimelineInitStatus>::new();
+    let mut keys_for_index_part_downloads = HashSet::new();
+    let mut timelines_to_sync = HashMap::new();

-    let applicable_index_parts = runtime.block_on(download_index_parts(
+    for (tenant_id, timeline_data) in local_timeline_files.0 {
+        if timeline_data.is_empty() {
+            let _ = empty_tenants.0.entry(tenant_id).or_default();
+        } else {
+            for (timeline_id, timeline_data) in timeline_data {
+                let id = ZTenantTimelineId::new(tenant_id, timeline_id);
+                keys_for_index_part_downloads.insert(id);
+                timelines_to_sync.insert(id, timeline_data);
+            }
+        }
+    }
+
+    let applicable_index_parts = BACKGROUND_RUNTIME.block_on(download_index_parts(
        conf,
        &storage,
-        local_timeline_files.keys().copied().collect(),
+        keys_for_index_part_downloads,
    ));

    let remote_index = RemoteIndex::from_parts(conf, applicable_index_parts)?;

-    let local_timeline_init_statuses = schedule_first_sync_tasks(
-        &mut runtime.block_on(remote_index.write()),
+    let mut local_timeline_init_statuses = schedule_first_sync_tasks(
+        &mut BACKGROUND_RUNTIME.block_on(remote_index.write()),
        sync_queue,
-        local_timeline_files,
+        timelines_to_sync,
    );
+    local_timeline_init_statuses
+        .0
+        .extend(empty_tenants.0.into_iter());

    let remote_index_clone = remote_index.clone();
-    thread_mgr::spawn(
-        ThreadKind::StorageSync,
+    task_mgr::spawn(
+        BACKGROUND_RUNTIME.handle(),
+        TaskKind::StorageSync,
        None,
        None,
-        "Remote storage sync thread",
+        "Remote storage sync task",
        false,
-        move || {
+        async move {
            storage_sync_loop(
-                runtime,
                conf,
-                (Arc::new(storage), remote_index_clone, sync_queue),
+                (storage, remote_index_clone, sync_queue),
                max_sync_errors,
-            );
+            )
+            .await;
            Ok(())
        },
-    )
-    .context("Failed to spawn remote storage sync thread")?;
+    );
    Ok(SyncStartupData {
        remote_index,
        local_timeline_init_statuses,
    })
 }

-fn storage_sync_loop<P, S>(
-    runtime: Runtime,
+async fn storage_sync_loop(
    conf: &'static PageServerConf,
-    (storage, index, sync_queue): (Arc<S>, RemoteIndex, &SyncQueue),
+    (storage, index, sync_queue): (GenericRemoteStorage, RemoteIndex, &SyncQueue),
    max_sync_errors: NonZeroU32,
-) where
-    P: Debug + Send + Sync + 'static,
-    S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
-{
+) {
    info!("Starting remote storage sync loop");
    loop {
-        let loop_storage = Arc::clone(&storage);
+        let loop_storage = storage.clone();

        let (batched_tasks, remaining_queue_length) = sync_queue.next_task_batch();

-        if thread_mgr::is_shutdown_requested() {
+        if task_mgr::is_shutdown_requested() {
            info!("Shutdown requested, stopping");
            break;
        }
@@ -902,20 +677,19 @@ fn storage_sync_loop<P, S>(
        }

        // Concurrently perform all the tasks in the batch
-        let loop_step = runtime.block_on(async {
-            tokio::select! {
-                step = process_batches(
-                    conf,
-                    max_sync_errors,
-                    loop_storage,
-                    &index,
-                    batched_tasks,
-                    sync_queue,
-                )
-                    .instrument(info_span!("storage_sync_loop_step")) => ControlFlow::Continue(step),
-                _ = thread_mgr::shutdown_watcher() => ControlFlow::Break(()),
-            }
-        });
+        let loop_step = tokio::select! {
+            step = process_batches(
+                conf,
+                max_sync_errors,
+                loop_storage,
+                &index,
+                batched_tasks,
+                sync_queue,
+            )
+                .instrument(info_span!("storage_sync_loop_step")) => ControlFlow::Continue(step)
+                ,
+            _ = task_mgr::shutdown_watcher() => ControlFlow::Break(()),
+        };

        match loop_step {
            ControlFlow::Continue(updated_tenants) => {
@@ -926,9 +700,8 @@ fn storage_sync_loop<P, S>(
                        "Sync loop step completed, {} new tenant state update(s)",
                        updated_tenants.len()
                    );
-                    let mut sync_status_updates: HashMap<ZTenantId, HashSet<ZTimelineId>> =
-                        HashMap::new();
-                    let index_accessor = runtime.block_on(index.read());
+                    let mut timelines_to_attach = TenantTimelineValues::new();
+                    let index_accessor = index.read().await;
                    for tenant_id in updated_tenants {
                        let tenant_entry = match index_accessor.tenant_entry(&tenant_id) {
                            Some(tenant_entry) => tenant_entry,
@@ -953,13 +726,20 @@ fn storage_sync_loop<P, S>(
                            // and register them all at once in a repository for download
                            // to be submitted in a single operation to repository
                            // so it can apply them at once to internal timeline map.
-                            sync_status_updates
-                                .insert(tenant_id, tenant_entry.keys().copied().collect());
+                            timelines_to_attach.0.insert(
+                                tenant_id,
+                                tenant_entry
+                                    .iter()
+                                    .map(|(&id, entry)| (id, entry.metadata.clone()))
+                                    .collect(),
+                            );
                        }
                    }
                    drop(index_accessor);
                    // Batch timeline download registration to ensure that the external registration code won't block any running tasks before.
-                    attach_downloaded_tenants(conf, &index, sync_status_updates);
+                    if let Err(e) = attach_local_tenants(conf, &index, timelines_to_attach) {
+                        error!("Failed to attach new timelines: {e:?}");
+                    };
                }
            }
            ControlFlow::Break(()) => {
@@ -979,26 +759,22 @@ enum DownloadStatus {
 #[derive(Debug)]
 enum UploadStatus {
    Uploaded,
-    Failed,
+    Failed(anyhow::Error),
    Nothing,
 }

-async fn process_batches<P, S>(
+async fn process_batches(
    conf: &'static PageServerConf,
    max_sync_errors: NonZeroU32,
-    storage: Arc<S>,
+    storage: GenericRemoteStorage,
    index: &RemoteIndex,
    batched_tasks: HashMap<ZTenantTimelineId, SyncTaskBatch>,
    sync_queue: &SyncQueue,
-) -> HashSet<ZTenantId>
-where
-    P: Debug + Send + Sync + 'static,
-    S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
-{
+) -> HashSet<ZTenantId> {
    let mut sync_results = batched_tasks
        .into_iter()
        .map(|(sync_id, batch)| {
-            let storage = Arc::clone(&storage);
+            let storage = storage.clone();
            let index = index.clone();
            async move {
                let state_update = process_sync_task_batch(
@@ -1030,17 +806,13 @@ where
    downloaded_timelines
 }

-async fn process_sync_task_batch<P, S>(
+async fn process_sync_task_batch(
    conf: &'static PageServerConf,
-    (storage, index, sync_queue): (Arc<S>, RemoteIndex, &SyncQueue),
+    (storage, index, sync_queue): (GenericRemoteStorage, RemoteIndex, &SyncQueue),
    max_sync_errors: NonZeroU32,
    sync_id: ZTenantTimelineId,
    batch: SyncTaskBatch,
-) -> DownloadStatus
-where
-    P: Debug + Send + Sync + 'static,
-    S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
-{
+) -> DownloadStatus {
    let sync_start = Instant::now();
    let current_remote_timeline = { index.read().await.timeline_entry(&sync_id).cloned() };

@@ -1056,41 +828,43 @@ where
    let (upload_status, download_status) = tokio::join!(
        async {
            if let Some(upload_data) = upload_data {
-                match validate_task_retries(upload_data, max_sync_errors)
+                let upload_retries = upload_data.retries;
+                match validate_task_retries(upload_retries, max_sync_errors)
                    .instrument(info_span!("retries_validation"))
                    .await
                {
-                    ControlFlow::Continue(new_upload_data) => {
+                    ControlFlow::Continue(()) => {
                        upload_timeline_data(
                            conf,
-                            (storage.as_ref(), &index, sync_queue),
+                            (&storage, &index, sync_queue),
                            current_remote_timeline.as_ref(),
                            sync_id,
-                            new_upload_data,
+                            upload_data,
                            sync_start,
                            "upload",
                        )
-                        .await;
-                        UploadStatus::Uploaded
-                    }
-                    ControlFlow::Break(failed_upload_data) => {
-                        if let Err(e) = update_remote_data(
-                            conf,
-                            storage.as_ref(),
-                            &index,
-                            sync_id,
-                            RemoteDataUpdate::Upload {
-                                uploaded_data: failed_upload_data.data,
-                                upload_failed: true,
-                            },
-                        )
                        .await
-                        {
-                            error!("Failed to update remote timeline {sync_id}: {e:?}");
-                        }
-
-                        UploadStatus::Failed
                    }
+                    ControlFlow::Break(()) => match update_remote_data(
+                        conf,
+                        &storage,
+                        &index,
+                        sync_id,
+                        RemoteDataUpdate::Upload {
+                            uploaded_data: upload_data.data,
+                            upload_failed: true,
+                        },
+                    )
+                    .await
+                    {
+                        Ok(()) => UploadStatus::Failed(anyhow::anyhow!(
+                            "Aborted after retries validation, current retries: {upload_retries}, max retries allowed: {max_sync_errors}"
+                        )),
+                        Err(e) => {
+                            error!("Failed to update remote timeline {sync_id}: {e:?}");
+                            UploadStatus::Failed(e)
+                        }
+                    },
                }
            } else {
                UploadStatus::Nothing
@@ -1099,23 +873,23 @@ where
        .instrument(info_span!("upload_timeline_data")),
        async {
            if let Some(download_data) = download_data {
-                match validate_task_retries(download_data, max_sync_errors)
+                match validate_task_retries(download_data.retries, max_sync_errors)
                    .instrument(info_span!("retries_validation"))
                    .await
                {
-                    ControlFlow::Continue(new_download_data) => {
+                    ControlFlow::Continue(()) => {
                        return download_timeline_data(
                            conf,
-                            (storage.as_ref(), &index, sync_queue),
+                            (&storage, &index, sync_queue),
                            current_remote_timeline.as_ref(),
                            sync_id,
-                            new_download_data,
+                            download_data,
                            sync_start,
                            "download",
                        )
                        .await;
                    }
-                    ControlFlow::Break(_) => {
+                    ControlFlow::Break(()) => {
                        index
                            .write()
                            .await
@@ -1132,29 +906,29 @@ where
    if let Some(delete_data) = batch.delete {
        match upload_status {
            UploadStatus::Uploaded | UploadStatus::Nothing => {
-                match validate_task_retries(delete_data, max_sync_errors)
+                match validate_task_retries(delete_data.retries, max_sync_errors)
                    .instrument(info_span!("retries_validation"))
                    .await
                {
-                    ControlFlow::Continue(new_delete_data) => {
+                    ControlFlow::Continue(()) => {
                        delete_timeline_data(
                            conf,
-                            (storage.as_ref(), &index, sync_queue),
+                            (&storage, &index, sync_queue),
                            sync_id,
-                            new_delete_data,
+                            delete_data,
                            sync_start,
                            "delete",
                        )
                        .instrument(info_span!("delete_timeline_data"))
                        .await;
                    }
-                    ControlFlow::Break(failed_delete_data) => {
+                    ControlFlow::Break(()) => {
                        if let Err(e) = update_remote_data(
                            conf,
-                            storage.as_ref(),
+                            &storage,
                            &index,
                            sync_id,
-                            RemoteDataUpdate::Delete(&failed_delete_data.data.deleted_layers),
+                            RemoteDataUpdate::Delete(&delete_data.data.deleted_layers),
                        )
                        .await
                        {
@@ -1163,8 +937,8 @@ where
                    }
                }
            }
-            UploadStatus::Failed => {
-                warn!("Skipping delete task due to failed upload tasks, reenqueuing");
+            UploadStatus::Failed(e) => {
+                warn!("Skipping delete task due to failed upload tasks, reenqueuing. Upload data: {:?}, delete data: {delete_data:?}. Upload failure: {e:#}", batch.upload);
                sync_queue.push(sync_id, SyncTask::Delete(delete_data));
            }
        }
@@ -1173,19 +947,15 @@ where
    download_status
 }

-async fn download_timeline_data<P, S>(
+async fn download_timeline_data(
    conf: &'static PageServerConf,
-    (storage, index, sync_queue): (&S, &RemoteIndex, &SyncQueue),
+    (storage, index, sync_queue): (&GenericRemoteStorage, &RemoteIndex, &SyncQueue),
    current_remote_timeline: Option<&RemoteTimeline>,
    sync_id: ZTenantTimelineId,
    new_download_data: SyncData<LayersDownload>,
    sync_start: Instant,
    task_name: &str,
-) -> DownloadStatus
-where
-    P: Debug + Send + Sync + 'static,
-    S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
-{
+) -> DownloadStatus {
    match download_timeline_layers(
        conf,
        storage,
@@ -1296,17 +1066,14 @@ async fn update_local_metadata(
    Ok(())
 }

-async fn delete_timeline_data<P, S>(
+async fn delete_timeline_data(
    conf: &'static PageServerConf,
-    (storage, index, sync_queue): (&S, &RemoteIndex, &SyncQueue),
+    (storage, index, sync_queue): (&GenericRemoteStorage, &RemoteIndex, &SyncQueue),
    sync_id: ZTenantTimelineId,
    mut new_delete_data: SyncData<LayersDeletion>,
    sync_start: Instant,
    task_name: &str,
-) where
-    P: Debug + Send + Sync + 'static,
-    S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
-{
+) {
    let timeline_delete = &mut new_delete_data.data;

    if !timeline_delete.deletion_registered {
@@ -1341,18 +1108,15 @@ async fn read_metadata_file(metadata_path: &Path) -> anyhow::Result<TimelineMeta
    .context("Failed to parse metadata bytes")
 }

-async fn upload_timeline_data<P, S>(
+async fn upload_timeline_data(
    conf: &'static PageServerConf,
-    (storage, index, sync_queue): (&S, &RemoteIndex, &SyncQueue),
+    (storage, index, sync_queue): (&GenericRemoteStorage, &RemoteIndex, &SyncQueue),
    current_remote_timeline: Option<&RemoteTimeline>,
    sync_id: ZTenantTimelineId,
    new_upload_data: SyncData<LayersUpload>,
    sync_start: Instant,
    task_name: &str,
-) where
-    P: Debug + Send + Sync + 'static,
-    S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
-{
+) -> UploadStatus {
    let mut uploaded_data = match upload_timeline_layers(
        storage,
        sync_queue,
@@ -1362,9 +1126,9 @@ async fn upload_timeline_data<P, S>(
    )
    .await
    {
-        UploadedTimeline::FailedAndRescheduled => {
+        UploadedTimeline::FailedAndRescheduled(e) => {
            register_sync_status(sync_id, sync_start, task_name, Some(false));
-            return;
+            return UploadStatus::Failed(e);
        }
        UploadedTimeline::Successful(upload_data) => upload_data,
    };
@@ -1383,12 +1147,14 @@ async fn upload_timeline_data<P, S>(
    {
        Ok(()) => {
            register_sync_status(sync_id, sync_start, task_name, Some(true));
+            UploadStatus::Uploaded
        }
        Err(e) => {
            error!("Failed to update remote timeline {sync_id}: {e:?}");
            uploaded_data.retries += 1;
            sync_queue.push(sync_id, SyncTask::Upload(uploaded_data));
            register_sync_status(sync_id, sync_start, task_name, Some(false));
+            UploadStatus::Failed(e)
        }
    }
 }
@@ -1401,17 +1167,13 @@ enum RemoteDataUpdate<'a> {
    Delete(&'a HashSet<PathBuf>),
 }

-async fn update_remote_data<P, S>(
+async fn update_remote_data(
    conf: &'static PageServerConf,
-    storage: &S,
+    storage: &GenericRemoteStorage,
    index: &RemoteIndex,
    sync_id: ZTenantTimelineId,
    update: RemoteDataUpdate<'_>,
-) -> anyhow::Result<()>
-where
-    P: Debug + Send + Sync + 'static,
-    S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
-{
+) -> anyhow::Result<()> {
    let updated_remote_timeline = {
        let mut index_accessor = index.write().await;

@@ -1491,32 +1253,27 @@ where
        .context("Failed to upload new index part")
 }

-async fn validate_task_retries<T>(
-    sync_data: SyncData<T>,
+async fn validate_task_retries(
+    current_attempt: u32,
    max_sync_errors: NonZeroU32,
-) -> ControlFlow<SyncData<T>, SyncData<T>> {
-    let current_attempt = sync_data.retries;
+) -> ControlFlow<(), ()> {
    let max_sync_errors = max_sync_errors.get();
    if current_attempt >= max_sync_errors {
-        error!(
-            "Aborting task that failed {current_attempt} times, exceeding retries threshold of {max_sync_errors}",
-        );
-        return ControlFlow::Break(sync_data);
+        return ControlFlow::Break(());
    }

    exponential_backoff(current_attempt, 1.0, 30.0).await;
-    ControlFlow::Continue(sync_data)
+    ControlFlow::Continue(())
 }

 fn schedule_first_sync_tasks(
    index: &mut RemoteTimelineIndex,
    sync_queue: &SyncQueue,
    local_timeline_files: HashMap<ZTenantTimelineId, (TimelineMetadata, HashSet<PathBuf>)>,
-) -> LocalTimelineInitStatuses {
-    let mut local_timeline_init_statuses = LocalTimelineInitStatuses::new();
+) -> TenantTimelineValues<LocalTimelineInitStatus> {
+    let mut local_timeline_init_statuses = TenantTimelineValues::new();

-    let mut new_sync_tasks =
-        VecDeque::with_capacity(local_timeline_files.len().max(local_timeline_files.len()));
+    let mut new_sync_tasks = VecDeque::with_capacity(local_timeline_files.len());

    for (sync_id, (local_metadata, local_files)) in local_timeline_files {
        match index.timeline_entry_mut(&sync_id) {
@@ -1528,18 +1285,27 @@ fn schedule_first_sync_tasks(
                    local_files,
                    remote_timeline,
                );
-                let was_there = local_timeline_init_statuses
+                match local_timeline_init_statuses
+                    .0
                    .entry(sync_id.tenant_id)
                    .or_default()
-                    .insert(sync_id.timeline_id, timeline_status);
-
-                if was_there.is_some() {
-                    // defensive check
-                    warn!(
-                        "Overwriting timeline init sync status. Status {timeline_status:?}, timeline {}",
-                        sync_id.timeline_id
-                    );
+                    .entry(sync_id.timeline_id)
+                {
+                    hash_map::Entry::Occupied(mut o) => {
+                        {
+                            // defensive check
+                            warn!(
+                                "Overwriting timeline init sync status. Status {timeline_status:?}, timeline {}",
+                                sync_id.timeline_id
+                            );
+                        }
+                        o.insert(timeline_status);
+                    }
+                    hash_map::Entry::Vacant(v) => {
+                        v.insert(timeline_status);
+                    }
                }
+
                remote_timeline.awaits_download = awaits_download;
            }
            None => {
@@ -1550,15 +1316,16 @@ fn schedule_first_sync_tasks(
                    SyncTask::upload(LayersUpload {
                        layers_to_upload: local_files,
                        uploaded_layers: HashSet::new(),
-                        metadata: Some(local_metadata),
+                        metadata: Some(local_metadata.clone()),
                    }),
                ));
                local_timeline_init_statuses
+                    .0
                    .entry(sync_id.tenant_id)
                    .or_default()
                    .insert(
                        sync_id.timeline_id,
-                        LocalTimelineInitStatus::LocallyComplete,
+                        LocalTimelineInitStatus::LocallyComplete(local_metadata),
                    );
            }
        }
@@ -1592,7 +1359,10 @@ fn compare_local_and_remote_timeline(
        // we do not need to manipulate with remote consistent lsn here
        // because it will be updated when sync will be completed
    } else {
-        (LocalTimelineInitStatus::LocallyComplete, false)
+        (
+            LocalTimelineInitStatus::LocallyComplete(local_metadata.clone()),
+            false,
+        )
    };

    let layers_to_upload = local_files
@@ -1641,7 +1411,7 @@ fn register_sync_status(
 mod test_utils {
    use utils::lsn::Lsn;

-    use crate::repository::repo_harness::RepoHarness;
+    use crate::layered_repository::repo_harness::RepoHarness;

    use super::*;

@@ -1686,7 +1456,7 @@ mod test_utils {
 #[cfg(test)]
 mod tests {
    use super::test_utils::dummy_metadata;
-    use crate::repository::repo_harness::TIMELINE_ID;
+    use crate::layered_repository::repo_harness::TIMELINE_ID;
    use hex_literal::hex;
    use utils::lsn::Lsn;

--- a/Show More
+++ b/Show More