Compare commits


91 Commits

Author SHA1 Message Date
Bojan Serafimov
0705c99fdb Try larger sleep 2022-08-12 09:52:40 -04:00
Bojan Serafimov
21089d5217 Wait for pid death 2022-08-12 09:21:44 -04:00
Bojan Serafimov
bd33ea9fae Add hacky solution 2022-08-12 09:05:51 -04:00
Bojan Serafimov
414279726d Reproduce pageserver.pid lock on restart issue 2022-08-12 09:01:17 -04:00
Thang Pham
6d99b4f1d8 disable test_import_from_pageserver_multisegment (#2258)
This test now fails consistently on `main`. It's better to disable it temporarily to avoid blocking others' PRs while the root cause of the failure is investigated.

See: #2255, #2256
2022-08-12 19:13:42 +07:00
Egor Suvorov
a7bf60631f postgres_ffi/waldecoder: introduce explicit enum State
Previously it was emulated with a combination of nullable fields.
This change should make the logic more readable.
2022-08-12 11:40:46 +03:00
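To illustrate the pattern from the commit above, here is a minimal, hypothetical before/after (these are not the actual waldecoder fields) showing how an explicit enum replaces a combination of nullable fields:

```rust
// Before (schematic): decoder state emulated with nullable fields whose
// validity depends on each other.
struct DecoderBefore {
    partial_record: Option<Vec<u8>>, // Some(..) only while reassembling a record
    bytes_to_skip: Option<u32>,      // only meaningful when partial_record is None
}

// After (schematic): the same states made explicit, so invalid combinations
// cannot be represented at all.
enum State {
    SkippingBytes { bytes_to_skip: u32 },
    ReassemblingRecord { partial_record: Vec<u8> },
}

fn describe(state: &State) -> String {
    match state {
        State::SkippingBytes { bytes_to_skip } => format!("skipping {bytes_to_skip} bytes"),
        State::ReassemblingRecord { partial_record } => {
            format!("reassembling, {} bytes buffered", partial_record.len())
        }
    }
}

fn main() {
    let _old_style = DecoderBefore { partial_record: None, bytes_to_skip: Some(8) };
    println!("{}", describe(&State::SkippingBytes { bytes_to_skip: 8 }));
}
```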
Egor Suvorov
07bb7a2afe postgres_ffi/waldecoder: remove unused startlsn 2022-08-12 11:40:46 +03:00
Egor Suvorov
142e247e85 postgres_ffi/waldecoder: validate more header fields 2022-08-12 11:40:46 +03:00
Thang Pham
7da47d8a0a Fix timeline physical size flaky tests (#2244)
Resolves #2212.

- use `wait_for_last_flush_lsn` in `test_timeline_physical_size_*` tests

## Context
Need to wait for the pageserver to catch up with the compute's last flush LSN because, during the timeline physical size API call, `LayerFlushThread` threads may still be running. These threads flush new layers to disk and hence update the physical size. This results in a mismatch between the physical size reported by the API and the actual physical size on disk.

### Note
The `LayerFlushThread` threads run **concurrently**, so it's possible that the above error still persists even with this patch. However, making the tests wait until all the WAL is processed (not flushed) before calculating the physical size should reduce the "flakiness" significantly.
2022-08-12 14:28:50 +07:00
Thang Pham
dc52436a8f Fix bug when import large (>1GB) relations (#2172)
Resolves #2097 

- use timeline modification's `lsn` and timeline's `last_record_lsn` to determine the corresponding LSN to query data in `DatadirModification::get`
- update `test_import_from_pageserver`. Split the test into 2 variants: `small` and `multisegment`. 
  + `small` is the old test
  + `multisegment` simulates #2097 by inserting a larger number of rows to create multiple segment files for a relation. `multisegment` is configured to run only with a `release` build
2022-08-12 09:24:20 +07:00
Kirill Bulatov
995a2de21e Share exponential backoff code and fix logic for delete task failure (#2252) 2022-08-11 23:21:06 +03:00
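For context, a minimal sketch of a shared exponential-backoff helper of the kind this commit factors out; the function name and constants below are illustrative, not the actual pageserver code:

```rust
use std::time::Duration;

// Generic exponential backoff with a cap. A failing delete task would be
// re-enqueued with these growing delays.
fn backoff_delay(attempt: u32, base_ms: u64, max_ms: u64) -> Duration {
    let delay_ms = base_ms.saturating_mul(2u64.saturating_pow(attempt));
    Duration::from_millis(delay_ms.min(max_ms))
}

fn main() {
    for attempt in 0..6 {
        println!("attempt {attempt}: wait {:?}", backoff_delay(attempt, 100, 5_000));
    }
}
```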
Arseny Sher
e593cbaaba Add pageserver checkpoint_timeout option.
To flush the in-memory layer eventually when no new data arrives, which helps
safekeepers suspend activity (stop pushing to the broker). The default of 10m
should be OK.
2022-08-11 22:54:09 +03:00
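A schematic of the flush decision this option enables (struct and field names below are illustrative stand-ins, not the pageserver's actual types): flush when enough WAL has accumulated, or when the timeout elapses on an idle timeline.

```rust
use std::time::Duration;

struct FlushPolicy {
    checkpoint_distance: u64,     // flush after this many bytes of WAL
    checkpoint_timeout: Duration, // the new option; default 10m
}

impl FlushPolicy {
    fn should_flush(&self, bytes_since_flush: u64, since_last_flush: Duration) -> bool {
        bytes_since_flush >= self.checkpoint_distance
            || since_last_flush >= self.checkpoint_timeout
    }
}

fn main() {
    let policy = FlushPolicy {
        checkpoint_distance: 256 * 1024 * 1024,
        checkpoint_timeout: Duration::from_secs(10 * 60),
    };
    // No new WAL for ~12 minutes: the timeout alone triggers the flush.
    println!("{}", policy.should_flush(0, Duration::from_secs(12 * 60)));
}
```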
Heikki Linnakangas
4b9e02be45 Update vendor/postgres back; it was changed accidentally. (#2251)
Commit 4227cfc96e accidentally reverted vendor/postgres to an older
version. Update it back.
2022-08-11 19:25:08 +03:00
Kirill Bulatov
7a36d06cc2 Fix exponential backoff values 2022-08-11 08:34:57 +03:00
Konstantin Knizhnik
4227cfc96e Safe truncate (#2218)
* Move relation size cache to layered timeline

* Fix obtaining current LSN for relation size cache

* Resolve merge conflicts

* Resolve merge conflicts

* Restore 'lsn' field in DatadirModification

* adjust DatadirModification lsn in ingest_record

* Fix formatting

* Pass lsn to get_relsize

* Fix merge conflict

* Update pageserver/src/pgdatadir_mapping.rs

Co-authored-by: Heikki Linnakangas <heikki@zenith.tech>

* Update pageserver/src/pgdatadir_mapping.rs

Co-authored-by: Heikki Linnakangas <heikki@zenith.tech>

* Check if relation exists before trying to truncate it

Refers to #1932

* Add test reproducing FSM truncate problem

Co-authored-by: Heikki Linnakangas <heikki@zenith.tech>
2022-08-09 22:45:33 +03:00
Dmitry Rodionov
1fc761983f support node id and remote storage params in docker_entrypoint.sh 2022-08-09 18:59:00 +03:00
Stas Kelvich
227d47d2f3 Update CONTRIBUTING.md 2022-08-09 14:18:25 +03:00
Stas Kelvich
0290893bcc Update CONTRIBUTING.md 2022-08-09 14:18:25 +03:00
Heikki Linnakangas
32fd709b34 Fix links to safekeeper protocol docs. (#2188)
safekeeper/README_PROTO.md was moved to docs/safekeeper-protocol.md in
commit 0b14fdb078, as part of reorganizing the docs into 'mdbook' format.

Fixes issue #1475. Thanks to @banks for spotting the outdated references.

In addition to fixing the above issue, this patch also fixes other links that were broken as a result of 0b14fdb078. See https://github.com/neondatabase/neon/pull/2188#pullrequestreview-1055918480.

Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
Co-authored-by: Thang Pham <thang@neon.tech>
2022-08-09 10:19:18 +07:00
Kirill Bulatov
3a9bff81db Fix etcd typos 2022-08-08 19:04:46 +03:00
bojanserafimov
743370de98 Major migration script (#2073)
This script can be used to migrate a tenant across breaking storage versions or, in the future, across postgres version upgrades. See the comment at the top for an overview.

Co-authored-by: Anastasia Lubennikova <anastasia@neon.tech>
2022-08-08 17:52:28 +02:00
Dmitry Rodionov
cdfa9fe705 avoid duplicate parameter, increase timeout 2022-08-08 12:15:16 +03:00
Dmitry Rodionov
7cd68a0c27 increase timeout to pass test with real s3 2022-08-08 12:15:16 +03:00
Dmitry Rodionov
beaa991f81 remove debug log 2022-08-08 12:15:16 +03:00
Dmitry Rodionov
9430abae05 use event so it fires only if workload thread successfully finished 2022-08-08 12:15:16 +03:00
Dmitry Rodionov
4da4c7f769 increase statement timeout 2022-08-08 12:15:16 +03:00
Dmitry Rodionov
0d14d4a1a8 ignore record property warning to fix benchmarks 2022-08-08 12:15:16 +03:00
bojanserafimov
8c8431ebc6 Add more buckets to pageserver latency metrics (#2225) 2022-08-06 11:45:47 +02:00
Ankur Srivastava
84d1bc06a9 refactor: replace lazy-static with once-cell (#2195)
- Replace all occurrences of lazy-static with `once_cell::sync::Lazy`
- fixes #1147

Signed-off-by: Ankur Srivastava <best.ankur@gmail.com>
2022-08-05 19:34:04 +02:00
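For reference, a minimal before/after of the replacement pattern; the static below is a generic example, not one of the statics actually touched in this PR:

```rust
use once_cell::sync::Lazy;
use std::collections::HashMap;

// Previously (with the lazy_static crate):
//
//     lazy_static! {
//         static ref DEFAULTS: HashMap<&'static str, u32> = {
//             let mut m = HashMap::new();
//             m.insert("checkpoint_distance", 256);
//             m
//         };
//     }

// With once_cell, the same lazily-initialized static is plain Rust, no macro required:
static DEFAULTS: Lazy<HashMap<&'static str, u32>> = Lazy::new(|| {
    let mut m = HashMap::new();
    m.insert("checkpoint_distance", 256);
    m
});

fn main() {
    println!("{:?}", DEFAULTS.get("checkpoint_distance"));
}
```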
Konstantin Knizhnik
5133db44e1 Move relation size cache from WalIngest to DatadirTimeline (#2094)
* Move relation size cache to layered timeline

* Fix obtaining current LSN for relation size cache

* Resolve merge conflicts

* Resolve merge conflicts

* Restore 'lsn' field in DatadirModification

* adjust DatadirModification lsn in ingest_record

* Fix formatting

* Pass lsn to get_relsize

* Fix merge conflict

* Update pageserver/src/pgdatadir_mapping.rs

Co-authored-by: Heikki Linnakangas <heikki@zenith.tech>

* Update pageserver/src/pgdatadir_mapping.rs

Co-authored-by: Heikki Linnakangas <heikki@zenith.tech>

Co-authored-by: Heikki Linnakangas <heikki@zenith.tech>
2022-08-05 16:28:59 +03:00
Alexander Bayandin
4cb1074fe5 github/workflows: Fix git dubious ownership (#2223) 2022-08-05 13:44:57 +01:00
Arthur Petukhovsky
0a958b0ea1 Check find_end_of_wal errors instead of unwrap 2022-08-04 17:56:19 +03:00
Vadim Kharitonov
1bbc8090f3 [issue #1591] Add neon_local pageserver status handler 2022-08-04 16:38:29 +03:00
Dmitry Rodionov
f7d8db7e39 silence https://github.com/neondatabase/neon/issues/2211 2022-08-04 16:32:19 +03:00
Dmitry Rodionov
e54941b811 treat pytest warnings as errors 2022-08-04 16:32:19 +03:00
Heikki Linnakangas
52ce1c9d53 Speed up test shutdown, by polling more frequently.
A fair amount of the time in our python tests is spent waiting for the
pageserver and safekeeper processes to shut down. It doesn't matter so
much when you're running a lot of tests in parallel, but it's quite
noticeable when running them sequentially.

A big part of the slowness is that after sending the SIGTERM
signal, we poll to see if the process is still running, and the
polling happened at a 1 s interval. Reduce it to 0.1 s.
2022-08-04 12:57:15 +03:00
Dmitry Rodionov
bc2cb5382b run real s3 tests in CI 2022-08-04 11:14:05 +03:00
Dmitry Rodionov
5f71aa09d3 support running tests against real s3 implementation without mocking 2022-08-04 11:14:05 +03:00
Dmitry Rodionov
b4f2c5b514 run benchmarks conditionally, on main or if run_benchmarks label is set 2022-08-03 01:36:14 +03:00
Alexander Bayandin
71f39bac3d github/workflows: upload artifacts to S3 (#2071) 2022-08-02 13:57:26 +01:00
Stas Kelvich
177d5b1f22 Bump postgres to get uuid extension 2022-08-02 11:16:26 +03:00
dependabot[bot]
8ba41b8c18 Bump pywin32 from 227 to 301 (#2202) 2022-08-01 19:08:09 +01:00
Dmitry Rodionov
1edf3eb2c8 increase timeout so mac os job can finish the build with all cache misses 2022-08-01 18:28:49 +03:00
Dmitry Rodionov
0ebb6bc4b0 Temporary pin Werkzeug version because moto hangs with newer one. See https://github.com/spulec/moto/issues/5341 2022-08-01 18:28:49 +03:00
Dmitry Rodionov
092a9b74d3 use only s3 in boto3-stubs and update mypy
A newer version of mypy fixes a buggy error when trying to update only the boto3
stubs. However, it brings new checks and starts to complain when we index into
cursor.fetchone() without checking for None first. So this introduces a wrapper
to simplify querying for scalar values. I tried to use the cursor_factory connection
argument but without success. There may be a better way to do that,
but this looks the simplest.
2022-08-01 18:28:49 +03:00
Ankur Srivastava
e73b95a09d docs: linked poetry-related step in tests section
Added a link to the dependencies that should be installed
before running the tests.
2022-08-01 18:13:01 +03:00
Alexander Bayandin
539007c173 github/workflows: make bash more strict (#2197) 2022-08-01 12:54:39 +01:00
Heikki Linnakangas
d0494c391a Remove wal_receiver mgmt API endpoint
Move all the fields that were returned by the wal_receiver endpoint into
timeline_detail. Internally, move those fields from the separate global
WAL_RECEIVERS hash into the LayeredTimeline struct. That way, all the
information about a timeline is kept in one place.

In passing, I noted that the 'thread_id' field was removed from
WalReceiverEntry in commit e5cb727572, but that commit forgot to update
openapi_spec.yml. This commit removes the field from the spec too.
2022-07-29 20:51:37 +03:00
Kirill Bulatov
2af5a96f0d Back off when reenqueueing delete tasks 2022-07-29 19:04:40 +03:00
Vadim Kharitonov
9733b24f4a Fix README.md: fix several typos and slightly adjust the documentation for OSX
2022-07-29 19:03:57 +03:00
Heikki Linnakangas
d865892a06 Print full error with stacktrace, if compute node startup fails.
It failed in the staging environment a few times, and all we got in the
logs was:

    ERROR could not start the compute node: failed to get basebackup@0/2D6194F8 from pageserver host=zenith-us-stage-ps-2.local port=6400
    giving control plane 30s to collect the error before shutdown

That's missing all the detail on *why* it failed.
2022-07-29 16:41:55 +03:00
Heikki Linnakangas
a0f76253f8 Bump Postgres version.
This brings in the inclusion of 'uuid-ossp' extension.
2022-07-29 16:30:39 +03:00
Heikki Linnakangas
02afa2762c Move Tenant- and TimelineInfo structs to models.rs.
They are part of the management API response structs. Let's try to
concentrate everything that's part of the API in models.rs.
2022-07-29 15:02:15 +03:00
Heikki Linnakangas
d903dd61bd Rename 'wal_producer_connstr' to 'wal_source_connstr'.
What the WAL receiver really connects to is the safekeeper. The
"producer" term is a bit misleading, as the safekeeper doesn't produce
the WAL, the compute node does.

This change also applies to the name of the field used in the mgmt API
in the response of the
'/v1/tenant/:tenant_id/timeline/:timeline_id/wal_receiver' endpoint.
AFAICS that's not used anywhere else than one python test, so it
should be OK to change it.
2022-07-29 09:09:22 +03:00
Thang Pham
417d9e9db2 Add current physical size to tenant status endpoint (#2173)
Ref #1902
2022-07-28 13:59:20 -04:00
Alexander Bayandin
6ace347175 github/workflows: unpause stress env deployment (#2180)
This reverts commit 4446791397.
2022-07-28 18:37:21 +01:00
Alexander Bayandin
14a027cce5 Makefile: get openssl prefix dynamically (#2179) 2022-07-28 17:05:30 +01:00
Arthur Petukhovsky
09ddd34b2a Fix checkpoints race condition in safekeeper tests (#2175)
We should wait for the WAL to arrive at the pageserver before calling CHECKPOINT
2022-07-28 15:44:02 +03:00
Arthur Petukhovsky
aeb3f0ea07 Refactor test_race_conditions (#2162)
Do not use python multiprocessing, make the test async
2022-07-28 14:38:37 +03:00
Kirill Bulatov
58b04438f0 Tweak backoff numbers to avoid no wal connection threshold trigger 2022-07-27 22:16:40 +03:00
Alexey Kondratov
01f1f1c1bf Add OpenAPI spec for safekeeper HTTP API (neondatabase/cloud#1264, #2061)
This spec is used in the `cloud` repo to generate HTTP client.
2022-07-27 21:29:22 +03:00
Thang Pham
6a664629fa Add timeline physical size tracking (#2126)
Ref #1902.

- Track the layered timeline's `physical_size` using `pageserver_current_physical_size` metric when updating the layer map.
- Report the local timeline's `physical_size` in timeline GET APIs.
- Add `include-non-incremental-physical-size` URL flag to also report the local timeline's `physical_size_non_incremental` (similar to `logical_size_non_incremental`)
- Add a `UIntGaugeVec` and `UIntGauge` to represent `u64` prometheus metrics

Co-authored-by: Dmitry Rodionov <dmitry@neon.tech>
2022-07-27 12:36:46 -04:00
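A hedged sketch of what a u64-backed gauge can look like with the prometheus crate; the type alias and metric name below are illustrative and may not match the exact definitions added in this PR:

```rust
use prometheus::core::{AtomicU64, GenericGauge};

// The stock IntGauge is i64-based; a gauge backed by AtomicU64 represents a
// u64 physical size without sign conversions.
type UIntGauge = GenericGauge<AtomicU64>;

fn main() -> Result<(), prometheus::Error> {
    let physical_size: UIntGauge = UIntGauge::new(
        "pageserver_current_physical_size",
        "Current physical size of a timeline, in bytes",
    )?;
    physical_size.set(123_456_789);
    println!("{}", physical_size.get());
    Ok(())
}
```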
Sergey Melnikov
f6f29f58cd Switch production storage to dedicated etcd (#2169) 2022-07-27 16:41:25 +03:00
Sergey Melnikov
fd46e52e00 Switch staging storage to dedicated etcd (#2164) 2022-07-27 12:28:05 +03:00
Heikki Linnakangas
d6f12cff8e Make DatadirTimeline a trait, implemented by LayeredTimeline.
Previously DatadirTimeline was a separate struct, and there was a 1:1
relationship between each DatadirTimeline and LayeredTimeline. That was
a bit awkward; whenever you created a timeline, you also needed to create
the DatadirTimeline wrapper around it, and if you only had a reference
to the LayeredTimeline, you would need to look up the corresponding
DatadirTimeline struct through tenant_mgr::get_local_timeline_with_load().
There were a couple of calls like that from LayeredTimeline itself.

Refactor DatadirTimeline, so that it's a trait, and mark LayeredTimeline
as implementing that trait. That way, there's only one object,
LayeredTimeline, and you can call both Timeline and DatadirTimeline
functions on that. You can now also call DatadirTimeline functions from
LayeredTimeline itself.

I considered just moving all the functions from DatadirTimeline directly
to Timeline/LayeredTimeline, but I still like to have some separation.
Timeline provides a simple key-value API, and handles durably storing
key/value pairs, and branching. Whereas DatadirTimeline is stateless, and
provides an abstraction over the key-value store, to present an interface
with relations, databases, etc. Postgres concepts.

This simplified the logical size calculation fast-path for branch
creation, introduced in commit 28243d68e6. LayeredTimeline can now
access the ancestor's logical size directly, so the caller no longer
needs to pass it in. I moved the fast-path into the init_logical_size()
function itself. It now checks if the ancestor's last LSN is the same
as the branch point, i.e. if there haven't been any changes on the
ancestor after the branch, and copies the size from there. An
additional bonus is that the optimization will now work any time you
have a branch of another branch, with no changes from the ancestor,
not only at a create-branch command.
2022-07-27 10:26:21 +03:00
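A schematic contrast of the two shapes described above, with heavily simplified, illustrative names; the real traits have many more methods:

```rust
// The low-level key-value API that LayeredTimeline already provides.
trait Timeline {
    fn get(&self, key: u64) -> Option<Vec<u8>>;
}

// DatadirTimeline as a trait: stateless helpers expressed on top of the
// key-value API, instead of a separate 1:1 wrapper struct.
trait DatadirTimeline: Timeline {
    fn get_rel_size(&self, rel_key: u64) -> Option<u64> {
        self.get(rel_key).map(|page| page.len() as u64)
    }
}

struct LayeredTimeline;

impl Timeline for LayeredTimeline {
    fn get(&self, _key: u64) -> Option<Vec<u8>> {
        Some(vec![0u8; 8192]) // stand-in for a real page lookup
    }
}

// One object now answers both APIs; no wrapper lookup needed.
impl DatadirTimeline for LayeredTimeline {}

fn main() {
    let timeline = LayeredTimeline;
    println!("{:?}", timeline.get_rel_size(42));
}
```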
Konstantin Knizhnik
5a4394a8df Do not hold timelines lock while calling update_gc_info, to avoid recursive mutex lock and deadlock (#2163) 2022-07-26 22:21:05 +03:00
Heikki Linnakangas
d301b8364c Move LayeredTimeline and related code to separate source file.
The layered_repository.rs file had grown to be very large. Split off
the LayeredTimeline struct and related code to a separate source file to
make it more manageable.

There are plans to move much of the code to track timelines from
tenant_mgr.rs to LayeredRepository. That will make layered_repository.rs
grow again, so now is a good time to split it.

There's a lot more cleanup to do, but this commit intentionally only
moves existing code and avoids doing anything else, for easier review.
2022-07-26 11:47:04 +03:00
Kirill Bulatov
172314155e Compact only once on psql checkpoint call 2022-07-26 11:37:16 +03:00
Konstantin Knizhnik
28243d68e6 Yet another approach to copying logical timeline size during branch creation (#2139)
* Yet another approach to copying logical timeline size during branch creation

* Fix unit tests

* Update pageserver/src/layered_repository.rs

Co-authored-by: Thang Pham <thang@neon.tech>

* Update pageserver/src/layered_repository.rs

Co-authored-by: Thang Pham <thang@neon.tech>

* Update pageserver/src/layered_repository.rs

Co-authored-by: Thang Pham <thang@neon.tech>

Co-authored-by: Thang Pham <thang@neon.tech>
2022-07-26 09:11:10 +03:00
Kirill Bulatov
45680f9a2d Drop CircleCI runs (#2082) 2022-07-25 18:30:30 +03:00
Dmitry Ivanov
5f4ccae5c5 [proxy] Add the password hack authentication flow (#2095)
[proxy] Add the `password hack` authentication flow

This lets us authenticate users which can use neither
SNI (due to old libpq) nor connection string `options`
(due to restrictions in other client libraries).

Note: `PasswordHack` will accept passwords which are not
encoded in base64 via the "password" field. The assumption
is that most user passwords will be valid utf-8 strings,
and the rest may still be passed via "password_".
2022-07-25 17:23:10 +03:00
Thang Pham
39c59b8df5 Fix flaky test_branch_creation_before_gc test (#2142) 2022-07-22 12:44:20 +01:00
Alexander Bayandin
9dcb9ca3da test/performance: ensure we don't have tables that we're creating (#2135) 2022-07-22 11:00:05 +01:00
Dmitry Rodionov
e308265e42 register tenants task thread pool threads in thread_mgr
needed to avoid this warning: is_shutdown_requested() called in an unexpected thread
2022-07-22 11:43:38 +03:00
Thang Pham
ed102f44d9 Reduce memory allocations for page server (#2010)
## Overview

This patch reduces the number of memory allocations when running the page server under a heavy write workload. This mostly helps improve the speed of WAL record ingestion. 

## Changes
- modified `DatadirModification` to allow reusing the struct's allocated memory after each modification
- modified `decode_wal_record` to allow passing in a `DecodedWALRecord` reference. This helps reuse the struct in each `decode_wal_record` call
- added a reusable buffer for serializing objects inside the `InMemoryLayer::put_value` function
- added a performance test simulating a heavy write workload for testing the changes in this patch

### Semi-related changes
- removed redundant serializations when calling `DeltaLayer::put_value` during the `InMemoryLayer::write_to_disk` function call [1]
- removed the info span `info_span!("processing record", lsn = %lsn)` during each WAL ingestion [2]

## Notes
- [1]: in `InMemoryLayer::write_to_disk`, a deserialization is performed
  ```
  let val = Value::des(&buf)?;
  delta_layer_writer.put_value(key, *lsn, val)?;
  ``` 
  `DeltaLayer::put_value` then creates a serialization based on the previous deserialization
  ```
  let off = self.blob_writer.write_blob(&Value::ser(&val)?)?;
  ```
- [2]: related: https://github.com/neondatabase/neon/issues/733
2022-07-21 12:08:26 -04:00
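The reuse pattern described above, reduced to a minimal sketch; the types and function below are illustrative, not the actual `DecodedWALRecord` or `DatadirModification` APIs:

```rust
// A caller-owned, reusable decode buffer: clear it between records instead of
// allocating a fresh struct per WAL record.
#[derive(Default)]
struct DecodedRecord {
    blocks: Vec<u32>,
    main_data: Vec<u8>,
}

impl DecodedRecord {
    fn clear(&mut self) {
        // clear() keeps the allocated capacity, so later records reuse it.
        self.blocks.clear();
        self.main_data.clear();
    }
}

fn decode_record(raw: &[u8], out: &mut DecodedRecord) {
    out.clear();
    out.main_data.extend_from_slice(raw);
    out.blocks.push(raw.len() as u32);
}

fn main() {
    let wal = vec![vec![1u8; 64], vec![2u8; 128], vec![3u8; 32]];
    let mut decoded = DecodedRecord::default();
    for record in &wal {
        decode_record(record, &mut decoded); // no per-record allocation after warm-up
        println!("decoded {} bytes", decoded.main_data.len());
    }
}
```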
Konstantin Knizhnik
572ae74388 More precisely control size of inmem layer (#1927)
* More precisely control size of inmem layer

* Force recompaction of L0 layers if they contain large non-WAL-logged BLOBs, to avoid overly large layers

* Add modified version of the test_hot_update test (test_dup_key.py), which should generate large layers without a large number of tables

* Change test name in test_dup_key

* Add Layer::get_max_key_range function

* Add layer::key_iter method and implement a new approach to splitting layers during compaction, based on the total size of all key values

* Add test_large_schema test for checking layer file size after compaction

* Make clippy happy

* Restore checking LSN distance threshold for checkpoint in-memory layer

* Optimize storage keys iterator

* Update pageserver/src/layered_repository.rs

Co-authored-by: Heikki Linnakangas <heikki@zenith.tech>

* Update pageserver/src/layered_repository.rs

Co-authored-by: Heikki Linnakangas <heikki@zenith.tech>

* Update pageserver/src/layered_repository.rs

Co-authored-by: Heikki Linnakangas <heikki@zenith.tech>

* Update pageserver/src/layered_repository.rs

Co-authored-by: Heikki Linnakangas <heikki@zenith.tech>

* Update pageserver/src/layered_repository.rs

Co-authored-by: Heikki Linnakangas <heikki@zenith.tech>

* Fix code style

* Reduce the number of tables in test_large_schema to make it fit within the timeout with a debug build

* Fix style of test_large_schema.py

* Fix handling of duplicate layers

Co-authored-by: Heikki Linnakangas <heikki@zenith.tech>
2022-07-21 07:45:11 +03:00
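A minimal sketch of the size-based splitting idea from the commit above; the function and threshold below are illustrative, not the actual compaction code:

```rust
// Close the current output layer and start a new one once the accumulated
// value size crosses a target, instead of splitting purely by key range.
fn split_by_size(values: &[(u64, usize)], target_layer_size: usize) -> Vec<Vec<u64>> {
    let mut layers = Vec::new();
    let mut current = Vec::new();
    let mut current_size = 0;
    for &(key, value_size) in values {
        if current_size + value_size > target_layer_size && !current.is_empty() {
            layers.push(std::mem::take(&mut current));
            current_size = 0;
        }
        current.push(key);
        current_size += value_size;
    }
    if !current.is_empty() {
        layers.push(current);
    }
    layers
}

fn main() {
    let values = vec![(1_u64, 400), (2, 300), (3, 500), (4, 200), (5, 700)];
    // With a 1000-byte target this yields [[1, 2], [3, 4], [5]].
    println!("{:?}", split_by_size(&values, 1000));
}
```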
Arthur Petukhovsky
b445cf7665 Refactor test_unavailability (#2134)
Now test_unavailability uses async instead of Process. The test is refactored to fix a possible race condition.
2022-07-20 22:13:05 +03:00
Kirill Bulatov
cc680dd81c Explicitly enable cachepot in Docker builds only 2022-07-20 17:09:36 +03:00
Heikki Linnakangas
f4233fde39 Silence "Module already imported" warning in python tests
We were getting a warning like this from the pg_regress tests:

    =================== warnings summary ===================
    /usr/lib/python3/dist-packages/_pytest/config/__init__.py:663
      /usr/lib/python3/dist-packages/_pytest/config/__init__.py:663: PytestAssertRewriteWarning: Module already imported so cannot be rewritten: fixtures.pg_stats
        self.import_plugin(import_spec)

    -- Docs: https://docs.pytest.org/en/stable/warnings.html
    ------------------ Benchmark results -------------------

To fix, reorder the imports in conftest.py. I'm not sure what exactly
the problem was or why the order matters, but the warning is gone and
that's good enough for me.
2022-07-20 16:55:41 +03:00
Heikki Linnakangas
b4c74c0ecd Clean up unnecessary dependencies.
Just to be tidy.
2022-07-20 16:31:25 +03:00
Heikki Linnakangas
abff15dd7c Fix test to be more robust with slow pageserver.
If the WAL arrives at the pageserver slowly, it's possible that the
branch is created before all the data on the parent branch have
arrived. That results in a failure:

    test_runner/batch_others/test_tenant_relocation.py:259: in test_tenant_relocation
        timeline_id_second, current_lsn_second = populate_branch(pg_second, create_table=False, expected_sum=1001000)
    test_runner/batch_others/test_tenant_relocation.py:133: in populate_branch
        assert cur.fetchone() == (expected_sum, )
    E   assert (500500,) == (1001000,)
    E     At index 0 diff: 500500 != 1001000
    E     Full diff:
    E     - (1001000,)
    E     + (500500,)

To fix, specify the LSN to branch at, so that the pageserver will wait
for it to arrive.

See https://github.com/neondatabase/neon/issues/2063
2022-07-20 15:59:46 +03:00
Thang Pham
160e52ec7e Optimize branch creation (#2101)
Resolves #2054

**Context**: branch creation needs to wait for GC to acquire the `gc_cs` lock, which prevents creating new timelines during GC. However, because an individual timeline's GC iteration also requires the `compaction_cs` lock, branch creation may also need to wait for compactions of multiple timelines. This results in large latency when creating a new branch, which we advertise as *"instant"*.

This PR optimizes the latency of branch creation by separating GC into two phases:
1. Collect GC data (branching points, cutoff LSNs, etc)
2. Perform GC for each timeline

The GC bottleneck comes from step 2, which must wait for compaction of multiple timelines. This PR modifies the branch creation and GC functions to allow GC to hold the GC lock only in step 1. As a result, branch creation doesn't need to wait for compaction to finish but only needs to wait for the GC data collection step, which is fast.
2022-07-19 14:56:25 -04:00
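A schematic of the two-phase split described above; the lock and types below are simplified stand-ins, not the pageserver's actual GC code:

```rust
use std::collections::HashMap;
use std::sync::Mutex;

struct GcInfo {
    cutoff_lsn: u64,
    branch_points: Vec<u64>,
}

fn gc_iteration(gc_cs: &Mutex<()>, timelines: &HashMap<u64, u64>) {
    // Phase 1: hold the shared lock only while collecting cutoffs and branch
    // points; this is cheap, so branch creation is blocked only briefly.
    let gc_data: Vec<(u64, GcInfo)> = {
        let _guard = gc_cs.lock().unwrap();
        timelines
            .iter()
            .map(|(&id, &last_lsn)| {
                (
                    id,
                    GcInfo {
                        cutoff_lsn: last_lsn.saturating_sub(1024),
                        branch_points: Vec::new(),
                    },
                )
            })
            .collect()
    }; // lock released here

    // Phase 2: the slow part (which waits on per-timeline compaction) runs
    // without holding the shared lock.
    for (id, info) in gc_data {
        println!(
            "gc timeline {id}: cutoff {} ({} branch points)",
            info.cutoff_lsn,
            info.branch_points.len()
        );
    }
}

fn main() {
    let gc_cs = Mutex::new(());
    let timelines = HashMap::from([(1_u64, 5_000_u64), (2, 9_000)]);
    gc_iteration(&gc_cs, &timelines);
}
```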
Heikki Linnakangas
98dd2e4f52 Use zstd and multiple threads to compress artifact tarball.
For faster and better compression.
2022-07-19 21:31:34 +03:00
Heikki Linnakangas
71753dd947 Remove github CI 'build_postgres' job, merging it with 'build_neon'
Simplifies the workflow. Makes the overall build a little faster, as
the build_postgres step doesn't need to upload the pg.tgz artifact,
and the build_neon step doesn't need to download it again.

This effectively reverts commit a490f64a68. That commit changed the
workflow so that the Postgres binaries were not included in the
neon.tgz artifact. With this commit, the pg.tgz artifact is gone, and
the Postgres binaries are part of neon.tgz again.
2022-07-19 21:31:22 +03:00
Alexander Bayandin
4446791397 github/workflows: pause stress env deployment (#2122) 2022-07-19 17:40:58 +01:00
Alexander Bayandin
5ff7a7dd8b github/workflows: run periodic benchmarks earlier (#2121) 2022-07-19 16:33:33 +01:00
Heikki Linnakangas
3dce394197 Use the same cargo options for every cargo call.
The "cargo metadata" and "cargo test --no-run" are used in the workflow
to just list names of the final binaries, but unless the same cargo
options like --release or --debug are used in those calls, they will in
fact recompile everything.
2022-07-19 16:36:59 +03:00
Heikki Linnakangas
df7f644822 Move things around in github yml file, for clarity.
Also, this avoids building the list of test binaries in release mode.
They are not included in the neon.tgz tarball in release mode.
2022-07-19 16:36:59 +03:00
Arthur Petukhovsky
bf5333544f Fix missing quotes in GitHub Actions (#2116) 2022-07-19 10:57:24 +03:00
Heikki Linnakangas
0b8049c283 Update core_changes.md, describing Postgres changes.
I went through "git diff REL_14_2" and updated the doc to list all the
changes, categorized into what I think could form a logical set of
patches.
2022-07-19 09:53:12 +03:00
Heikki Linnakangas
f384e20d78 Minor cleanup in layer_repository.rs. 2022-07-19 07:50:55 +03:00
156 changed files with 8971 additions and 5869 deletions


@@ -1,369 +0,0 @@
version: 2.1
executors:
neon-xlarge-executor:
resource_class: xlarge
docker:
# NB: when changed, do not forget to update rust image tag in all Dockerfiles
- image: neondatabase/rust:1.58
neon-executor:
docker:
- image: neondatabase/rust:1.58
jobs:
# A job to build postgres
build-postgres:
executor: neon-xlarge-executor
parameters:
build_type:
type: enum
enum: ["debug", "release"]
environment:
BUILD_TYPE: << parameters.build_type >>
steps:
# Checkout the git repo (circleci doesn't have a flag to enable submodules here)
- checkout
# Grab the postgres git revision to build a cache key.
# Append makefile as it could change the way postgres is built.
# Note this works even though the submodule hasn't been checkout out yet.
- run:
name: Get postgres cache key
command: |
git rev-parse HEAD:vendor/postgres > /tmp/cache-key-postgres
cat Makefile >> /tmp/cache-key-postgres
- restore_cache:
name: Restore postgres cache
keys:
# Restore ONLY if the rev key matches exactly
- v05-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }}
# Build postgres if the restore_cache didn't find a build.
# `make` can't figure out whether the cache is valid, since
# it only compares file timestamps.
- run:
name: build postgres
command: |
if [ ! -e tmp_install/bin/postgres ]; then
# "depth 1" saves some time by not cloning the whole repo
git submodule update --init --depth 1
# bail out on any warnings
COPT='-Werror' mold -run make postgres -j$(nproc)
fi
- save_cache:
name: Save postgres cache
key: v05-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }}
paths:
- tmp_install
# A job to build Neon rust code
build-neon:
executor: neon-xlarge-executor
parameters:
build_type:
type: enum
enum: ["debug", "release"]
environment:
BUILD_TYPE: << parameters.build_type >>
steps:
# Checkout the git repo (without submodules)
- checkout
# Grab the postgres git revision to build a cache key.
# Append makefile as it could change the way postgres is built.
# Note this works even though the submodule hasn't been checkout out yet.
- run:
name: Get postgres cache key
command: |
git rev-parse HEAD:vendor/postgres > /tmp/cache-key-postgres
cat Makefile >> /tmp/cache-key-postgres
- restore_cache:
name: Restore postgres cache
keys:
# Restore ONLY if the rev key matches exactly
- v05-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }}
- restore_cache:
name: Restore rust cache
keys:
# Require an exact match. While an out of date cache might speed up the build,
# there's no way to clean out old packages, so the cache grows every time something
# changes.
- v05-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }}
# Build the rust code, including test binaries
- run:
name: Rust build << parameters.build_type >>
command: |
if [[ $BUILD_TYPE == "debug" ]]; then
CARGO_FLAGS=
elif [[ $BUILD_TYPE == "release" ]]; then
CARGO_FLAGS="--release --features profiling"
fi
export CARGO_INCREMENTAL=0
export CACHEPOT_BUCKET=zenith-rust-cachepot
export RUSTC_WRAPPER=""
export AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}"
export AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}"
mold -run cargo build $CARGO_FLAGS --features failpoints --bins --tests
cachepot -s
- save_cache:
name: Save rust cache
key: v05-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }}
paths:
- ~/.cargo/registry
- ~/.cargo/git
- target
# Run rust unit tests
- run:
name: cargo test
command: |
if [[ $BUILD_TYPE == "debug" ]]; then
CARGO_FLAGS=
elif [[ $BUILD_TYPE == "release" ]]; then
CARGO_FLAGS=--release
fi
cargo test $CARGO_FLAGS
# Install the rust binaries, for use by test jobs
- run:
name: Install rust binaries
command: |
binaries=$(
cargo metadata --format-version=1 --no-deps |
jq -r '.packages[].targets[] | select(.kind | index("bin")) | .name'
)
mkdir -p /tmp/zenith/bin
mkdir -p /tmp/zenith/test_bin
mkdir -p /tmp/zenith/etc
# Install target binaries
for bin in $binaries; do
SRC=target/$BUILD_TYPE/$bin
DST=/tmp/zenith/bin/$bin
cp $SRC $DST
done
# Install the postgres binaries, for use by test jobs
- run:
name: Install postgres binaries
command: |
cp -a tmp_install /tmp/zenith/pg_install
# Save rust binaries for other jobs in the workflow
- persist_to_workspace:
root: /tmp/zenith
paths:
- "*"
check-codestyle-python:
executor: neon-executor
steps:
- checkout
- restore_cache:
keys:
- v2-python-deps-{{ checksum "poetry.lock" }}
- run:
name: Install deps
command: ./scripts/pysync
- save_cache:
key: v2-python-deps-{{ checksum "poetry.lock" }}
paths:
- /home/circleci/.cache/pypoetry/virtualenvs
- run:
name: Print versions
when: always
command: |
poetry run python --version
poetry show
- run:
name: Run yapf to ensure code format
when: always
command: poetry run yapf --recursive --diff .
- run:
name: Run mypy to check types
when: always
command: poetry run mypy .
run-pytest:
executor: neon-executor
parameters:
# pytest args to specify the tests to run.
#
# This can be a test file name, e.g. 'test_pgbench.py, or a subdirectory,
# or '-k foobar' to run tests containing string 'foobar'. See pytest man page
# section SPECIFYING TESTS / SELECTING TESTS for details.
#
# Select the type of Rust build. Must be "release" or "debug".
build_type:
type: string
default: "debug"
# This parameter is required, to prevent the mistake of running all tests in one job.
test_selection:
type: string
default: ""
# Arbitrary parameters to pytest. For example "-s" to prevent capturing stdout/stderr
extra_params:
type: string
default: ""
needs_postgres_source:
type: boolean
default: false
run_in_parallel:
type: boolean
default: true
save_perf_report:
type: boolean
default: false
environment:
BUILD_TYPE: << parameters.build_type >>
steps:
- attach_workspace:
at: /tmp/zenith
- checkout
- when:
condition: << parameters.needs_postgres_source >>
steps:
- run: git submodule update --init --depth 1
- restore_cache:
keys:
- v2-python-deps-{{ checksum "poetry.lock" }}
- run:
name: Install deps
command: ./scripts/pysync
- save_cache:
key: v2-python-deps-{{ checksum "poetry.lock" }}
paths:
- /home/circleci/.cache/pypoetry/virtualenvs
- run:
name: Run pytest
# pytest doesn't output test logs in real time, so CI job may fail with
# `Too long with no output` error, if a test is running for a long time.
# In that case, tests should have internal timeouts that are less than
# no_output_timeout, specified here.
no_output_timeout: 10m
environment:
- NEON_BIN: /tmp/zenith/bin
- POSTGRES_DISTRIB_DIR: /tmp/zenith/pg_install
- TEST_OUTPUT: /tmp/test_output
# this variable will be embedded in perf test report
# and is needed to distinguish different environments
- PLATFORM: zenith-local-ci
command: |
PERF_REPORT_DIR="$(realpath test_runner/perf-report-local)"
rm -rf $PERF_REPORT_DIR
TEST_SELECTION="test_runner/<< parameters.test_selection >>"
EXTRA_PARAMS="<< parameters.extra_params >>"
if [ -z "$TEST_SELECTION" ]; then
echo "test_selection must be set"
exit 1
fi
if << parameters.run_in_parallel >>; then
EXTRA_PARAMS="-n4 $EXTRA_PARAMS"
fi
if << parameters.save_perf_report >>; then
if [[ $CIRCLE_BRANCH == "main" ]]; then
mkdir -p "$PERF_REPORT_DIR"
EXTRA_PARAMS="--out-dir $PERF_REPORT_DIR $EXTRA_PARAMS"
fi
fi
export GITHUB_SHA=$CIRCLE_SHA1
# Run the tests.
#
# The junit.xml file allows CircleCI to display more fine-grained test information
# in its "Tests" tab in the results page.
# --verbose prints name of each test (helpful when there are
# multiple tests in one file)
# -rA prints summary in the end
# -n4 uses four processes to run tests via pytest-xdist
# -s is not used to prevent pytest from capturing output, because tests are running
# in parallel and logs are mixed between different tests
./scripts/pytest \
--junitxml=$TEST_OUTPUT/junit.xml \
--tb=short \
--verbose \
-m "not remote_cluster" \
-rA $TEST_SELECTION $EXTRA_PARAMS
if << parameters.save_perf_report >>; then
if [[ $CIRCLE_BRANCH == "main" ]]; then
export REPORT_FROM="$PERF_REPORT_DIR"
export REPORT_TO=local
scripts/generate_and_push_perf_report.sh
fi
fi
- run:
# CircleCI artifacts are preserved one file at a time, so skipping
# this step isn't a good idea. If you want to extract the
# pageserver state, perhaps a tarball would be a better idea.
name: Delete all data but logs
when: always
command: |
du -sh /tmp/test_output/*
find /tmp/test_output -type f ! -name "*.log" ! -name "regression.diffs" ! -name "junit.xml" ! -name "*.filediff" ! -name "*.stdout" ! -name "*.stderr" ! -name "flamegraph.svg" ! -name "*.metrics" -delete
du -sh /tmp/test_output/*
- store_artifacts:
path: /tmp/test_output
# The store_test_results step tells CircleCI where to find the junit.xml file.
- store_test_results:
path: /tmp/test_output
# Save data (if any)
- persist_to_workspace:
root: /tmp/zenith
paths:
- "*"
workflows:
build_and_test:
jobs:
- check-codestyle-python
- build-postgres:
name: build-postgres-<< matrix.build_type >>
matrix:
parameters:
build_type: ["debug", "release"]
- build-neon:
name: build-neon-<< matrix.build_type >>
matrix:
parameters:
build_type: ["debug", "release"]
requires:
- build-postgres-<< matrix.build_type >>
- run-pytest:
name: pg_regress-tests-<< matrix.build_type >>
matrix:
parameters:
build_type: ["debug", "release"]
test_selection: batch_pg_regress
needs_postgres_source: true
requires:
- build-neon-<< matrix.build_type >>
- run-pytest:
name: other-tests-<< matrix.build_type >>
matrix:
parameters:
build_type: ["debug", "release"]
test_selection: batch_others
requires:
- build-neon-<< matrix.build_type >>
- run-pytest:
name: benchmarks
context: PERF_TEST_RESULT_CONNSTR
build_type: release
test_selection: performance
run_in_parallel: false
save_perf_report: true
requires:
- build-neon-release

.github/actions/download/action.yml (new file, 56 lines)

@@ -0,0 +1,56 @@
name: "Download an artifact"
description: "Custom download action"
inputs:
name:
description: "Artifact name"
required: true
path:
description: "A directory to put artifact into"
default: "."
required: false
skip-if-does-not-exist:
description: "Allow to skip if file doesn't exist, fail otherwise"
default: false
required: false
runs:
using: "composite"
steps:
- name: Download artifact
id: download-artifact
shell: bash -euxo pipefail {0}
env:
TARGET: ${{ inputs.path }}
ARCHIVE: /tmp/downloads/${{ inputs.name }}.tar.zst
SKIP_IF_DOES_NOT_EXIST: ${{ inputs.skip-if-does-not-exist }}
run: |
BUCKET=neon-github-public-dev
PREFIX=artifacts/${GITHUB_RUN_ID}
FILENAME=$(basename $ARCHIVE)
S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${PREFIX} | jq -r '.Contents[].Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
if [ -z "${S3_KEY}" ]; then
if [ "${SKIP_IF_DOES_NOT_EXIST}" = "true" ]; then
echo '::set-output name=SKIPPED::true'
exit 0
else
echo 2>&1 "Neither s3://${BUCKET}/${PREFIX}/${GITHUB_RUN_ATTEMPT}/${FILENAME} nor its version from previous attempts exist"
exit 1
fi
fi
echo '::set-output name=SKIPPED::false'
mkdir -p $(dirname $ARCHIVE)
time aws s3 cp --only-show-errors s3://${BUCKET}/${S3_KEY} ${ARCHIVE}
- name: Extract artifact
if: ${{ steps.download-artifact.outputs.SKIPPED == 'false' }}
shell: bash -euxo pipefail {0}
env:
TARGET: ${{ inputs.path }}
ARCHIVE: /tmp/downloads/${{ inputs.name }}.tar.zst
run: |
mkdir -p ${TARGET}
time tar -xf ${ARCHIVE} -C ${TARGET}
rm -f ${ARCHIVE}


@@ -27,35 +27,35 @@ inputs:
description: 'Whether to upload the performance report'
required: false
default: 'false'
run_with_real_s3:
description: 'Whether to pass real s3 credentials to the test suite'
required: false
default: 'false'
real_s3_bucket:
description: 'Bucket name for real s3 tests'
required: false
default: ''
real_s3_region:
description: 'Region name for real s3 tests'
required: false
default: ''
real_s3_access_key_id:
description: 'Access key id'
required: false
default: ''
real_s3_secret_access_key:
description: 'Secret access key'
required: false
default: ''
runs:
using: "composite"
steps:
- name: Get Neon artifact for restoration
uses: actions/download-artifact@v3
- name: Get Neon artifact
uses: ./.github/actions/download
with:
name: neon-${{ runner.os }}-${{ inputs.build_type }}-${{ inputs.rust_toolchain }}-artifact
path: ./neon-artifact/
- name: Get Postgres artifact for restoration
uses: actions/download-artifact@v3
with:
name: postgres-${{ runner.os }}-${{ inputs.build_type }}-artifact
path: ./pg-artifact/
- name: Extract Neon artifact
shell: bash -ex {0}
run: |
mkdir -p /tmp/neon/
tar -xf ./neon-artifact/neon.tgz -C /tmp/neon/
rm -rf ./neon-artifact/
- name: Extract Postgres artifact
shell: bash -ex {0}
run: |
mkdir -p /tmp/neon/tmp_install
tar -xf ./pg-artifact/pg.tgz -C /tmp/neon/tmp_install
rm -rf ./pg-artifact/
path: /tmp/neon
- name: Checkout
if: inputs.needs_postgres_source == 'true'
@@ -72,18 +72,21 @@ runs:
key: v1-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
- name: Install Python deps
shell: bash -ex {0}
shell: bash -euxo pipefail {0}
run: ./scripts/pysync
- name: Run pytest
env:
NEON_BIN: /tmp/neon/bin
POSTGRES_DISTRIB_DIR: /tmp/neon/tmp_install
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
TEST_OUTPUT: /tmp/test_output
# this variable will be embedded in perf test report
# and is needed to distinguish different environments
PLATFORM: github-actions-selfhosted
shell: bash -ex {0}
BUILD_TYPE: ${{ inputs.build_type }}
AWS_ACCESS_KEY_ID: ${{ inputs.real_s3_access_key_id }}
AWS_SECRET_ACCESS_KEY: ${{ inputs.real_s3_secret_access_key }}
shell: bash -euxo pipefail {0}
run: |
PERF_REPORT_DIR="$(realpath test_runner/perf-report-local)"
rm -rf $PERF_REPORT_DIR
@@ -97,6 +100,14 @@ runs:
if [[ "${{ inputs.run_in_parallel }}" == "true" ]]; then
EXTRA_PARAMS="-n4 $EXTRA_PARAMS"
fi
if [[ "${{ inputs.run_with_real_s3 }}" == "true" ]]; then
echo "REAL S3 ENABLED"
export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
export REMOTE_STORAGE_S3_BUCKET=${{ inputs.real_s3_bucket }}
export REMOTE_STORAGE_S3_REGION=${{ inputs.real_s3_region }}
fi
if [[ "${{ inputs.save_perf_report }}" == "true" ]]; then
if [[ "$GITHUB_REF" == "refs/heads/main" ]]; then
mkdir -p "$PERF_REPORT_DIR"
@@ -112,7 +123,7 @@ runs:
# Run the tests.
#
# The junit.xml file allows CircleCI to display more fine-grained test information
# The junit.xml file allows CI tools to display more fine-grained test information
# in its "Tests" tab in the results page.
# --verbose prints name of each test (helpful when there are
# multiple tests in one file)
@@ -136,7 +147,7 @@ runs:
fi
- name: Delete all data but logs
shell: bash -ex {0}
shell: bash -euxo pipefail {0}
if: always()
run: |
du -sh /tmp/test_output/*
@@ -145,9 +156,7 @@ runs:
- name: Upload python test logs
if: always()
uses: actions/upload-artifact@v3
uses: ./.github/actions/upload
with:
retention-days: 7
if-no-files-found: error
name: python-test-${{ inputs.test_selection }}-${{ runner.os }}-${{ inputs.build_type }}-${{ inputs.rust_toolchain }}-logs
path: /tmp/test_output/


@@ -5,13 +5,18 @@ runs:
using: "composite"
steps:
- name: Merge coverage data
shell: bash -ex {0}
shell: bash -euxo pipefail {0}
run: scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage merge
- name: Upload coverage data
uses: actions/upload-artifact@v3
- name: Download previous coverage data into the same directory
uses: ./.github/actions/download
with:
retention-days: 7
if-no-files-found: error
name: coverage-data-artifact
path: /tmp/coverage/
path: /tmp/coverage
skip-if-does-not-exist: true # skip if there's no previous coverage to download
- name: Upload coverage data
uses: ./.github/actions/upload
with:
name: coverage-data-artifact
path: /tmp/coverage

.github/actions/upload/action.yml (new file, 51 lines)

@@ -0,0 +1,51 @@
name: "Upload an artifact"
description: "Custom upload action"
inputs:
name:
description: "Artifact name"
required: true
path:
description: "A directory or file to upload"
required: true
runs:
using: "composite"
steps:
- name: Prepare artifact
shell: bash -euxo pipefail {0}
env:
SOURCE: ${{ inputs.path }}
ARCHIVE: /tmp/uploads/${{ inputs.name }}.tar.zst
run: |
mkdir -p $(dirname $ARCHIVE)
if [ -f ${ARCHIVE} ]; then
echo 2>&1 "File ${ARCHIVE} already exist. Something went wrong before"
exit 1
fi
ZSTD_NBTHREADS=0
if [ -d ${SOURCE} ]; then
time tar -C ${SOURCE} -cf ${ARCHIVE} --zstd .
elif [ -f ${SOURCE} ]; then
time tar -cf ${ARCHIVE} --zstd ${SOURCE}
else
echo 2>&1 "${SOURCE} neither directory nor file, don't know how to handle it"
fi
- name: Upload artifact
shell: bash -euxo pipefail {0}
env:
SOURCE: ${{ inputs.path }}
ARCHIVE: /tmp/uploads/${{ inputs.name }}.tar.zst
run: |
BUCKET=neon-github-public-dev
PREFIX=artifacts/${GITHUB_RUN_ID}
FILENAME=$(basename $ARCHIVE)
FILESIZE=$(du -sh ${ARCHIVE} | cut -f1)
time aws s3 mv --only-show-errors ${ARCHIVE} s3://${BUCKET}/${PREFIX}/${GITHUB_RUN_ATTEMPT}/${FILENAME}
# Ref https://docs.github.com/en/actions/using-workflows/workflow-commands-for-github-actions#adding-a-job-summary
echo "[${FILENAME}](https://${BUCKET}.s3.amazonaws.com/${PREFIX}/${GITHUB_RUN_ATTEMPT}/${FILENAME}) ${FILESIZE}" >> ${GITHUB_STEP_SUMMARY}


@@ -17,4 +17,4 @@ env_name = prod-1
console_mgmt_base_url = http://console-release.local
bucket_name = zenith-storage-oregon
bucket_region = us-west-2
etcd_endpoints = etcd-release.local:2379
etcd_endpoints = zenith-1-etcd.local:2379


@@ -12,10 +12,9 @@ cat <<EOF | tee /tmp/payload
"version": 1,
"host": "${HOST}",
"port": 6500,
"http_port": 7676,
"region_id": {{ console_region_id }},
"instance_id": "${INSTANCE_ID}",
"http_host": "${HOST}",
"http_port": 7676
"instance_id": "${INSTANCE_ID}"
}
EOF


@@ -17,4 +17,4 @@ env_name = us-stage
console_mgmt_base_url = http://console-staging.local
bucket_name = zenith-staging-storage-us-east-1
bucket_region = us-east-1
etcd_endpoints = etcd-staging.local:2379
etcd_endpoints = zenith-us-stage-etcd.local:2379


@@ -11,7 +11,7 @@ on:
# │ │ ┌───────────── day of the month (1 - 31)
# │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
# │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
- cron: '36 7 * * *' # run once a day, timezone is utc
- cron: '36 4 * * *' # run once a day, timezone is utc
workflow_dispatch: # adds ability to run this manually
@@ -60,7 +60,7 @@ jobs:
- name: Setup cluster
env:
BENCHMARK_CONNSTR: "${{ secrets.BENCHMARK_STAGING_CONNSTR }}"
shell: bash
shell: bash -euxo pipefail {0}
run: |
set -e


@@ -3,13 +3,13 @@ name: Test and Deploy
on:
push:
branches:
- main
- release
- main
- release
pull_request:
defaults:
run:
shell: bash -ex {0}
shell: bash -euxo pipefail {0}
concurrency:
# Allow only one workflow per any non-`main` branch.
@@ -21,8 +21,9 @@ env:
COPT: '-Werror'
jobs:
build-postgres:
runs-on: [ self-hosted, Linux, k8s-runner ]
build-neon:
runs-on: dev
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:2746987948
strategy:
fail-fast: false
matrix:
@@ -31,7 +32,19 @@ jobs:
env:
BUILD_TYPE: ${{ matrix.build_type }}
GIT_VERSION: ${{ github.sha }}
steps:
- name: Fix git ownerwhip
run: |
# Workaround for `fatal: detected dubious ownership in repository at ...`
#
# Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers
# Ref https://github.com/actions/checkout/issues/785
#
git config --global --add safe.directory ${{ github.workspace }}
git config --global --add safe.directory ${GITHUB_WORKSPACE}
- name: Checkout
uses: actions/checkout@v3
with:
@@ -42,58 +55,28 @@ jobs:
id: pg_ver
run: echo ::set-output name=pg_rev::$(git rev-parse HEAD:vendor/postgres)
- name: Cache postgres build
id: cache_pg
uses: actions/cache@v3
with:
path: tmp_install/
key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_ver.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Build postgres
if: steps.cache_pg.outputs.cache-hit != 'true'
run: mold -run make postgres -j$(nproc)
# actions/cache@v3 does not allow concurrently using the same cache across job steps, so use a separate cache
- name: Prepare postgres artifact
run: tar -C tmp_install/ -czf ./pg.tgz .
- name: Upload postgres artifact
uses: actions/upload-artifact@v3
with:
retention-days: 7
if-no-files-found: error
name: postgres-${{ runner.os }}-${{ matrix.build_type }}-artifact
path: ./pg.tgz
build-neon:
runs-on: [ self-hosted, Linux, k8s-runner ]
needs: [ build-postgres ]
strategy:
fail-fast: false
matrix:
build_type: [ debug, release ]
rust_toolchain: [ 1.58 ]
env:
BUILD_TYPE: ${{ matrix.build_type }}
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 1
- name: Get postgres artifact for restoration
uses: actions/download-artifact@v3
with:
name: postgres-${{ runner.os }}-${{ matrix.build_type }}-artifact
path: ./postgres-artifact/
- name: Extract postgres artifact
# Set some environment variables used by all the steps.
#
# CARGO_FLAGS is extra options to pass to "cargo build", "cargo test" etc.
# It also includes --features, if any
#
# CARGO_FEATURES is passed to "cargo metadata". It is separate from CARGO_FLAGS,
# because "cargo metadata" doesn't accept --release or --debug options
#
- name: Set env variables
run: |
mkdir ./tmp_install/
tar -xf ./postgres-artifact/pg.tgz -C ./tmp_install/
rm -rf ./postgres-artifact/
if [[ $BUILD_TYPE == "debug" ]]; then
cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run"
CARGO_FEATURES=""
CARGO_FLAGS=""
elif [[ $BUILD_TYPE == "release" ]]; then
cov_prefix=""
CARGO_FEATURES="--features profiling"
CARGO_FLAGS="--release $CARGO_FEATURES"
fi
echo "cov_prefix=${cov_prefix}" >> $GITHUB_ENV
echo "CARGO_FEATURES=${CARGO_FEATURES}" >> $GITHUB_ENV
echo "CARGO_FLAGS=${CARGO_FLAGS}" >> $GITHUB_ENV
# Don't include the ~/.cargo/registry/src directory. It contains just
# uncompressed versions of the crates in ~/.cargo/registry/cache
@@ -110,59 +93,36 @@ jobs:
target/
# Fall back to older versions of the key, if no cache for current Cargo.lock was found
key: |
v2-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }}
v2-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-
v3-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }}
v3-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-
- name: Cache postgres build
id: cache_pg
uses: actions/cache@v3
with:
path: tmp_install/
key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_ver.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Build postgres
if: steps.cache_pg.outputs.cache-hit != 'true'
run: mold -run make postgres -j$(nproc)
- name: Run cargo build
run: |
if [[ $BUILD_TYPE == "debug" ]]; then
cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
CARGO_FLAGS=
elif [[ $BUILD_TYPE == "release" ]]; then
cov_prefix=()
CARGO_FLAGS="--release --features profiling"
fi
"${cov_prefix[@]}" mold -run cargo build $CARGO_FLAGS --features failpoints --bins --tests
${cov_prefix} mold -run cargo build $CARGO_FLAGS --features failpoints --bins --tests
- name: Run cargo test
run: |
if [[ $BUILD_TYPE == "debug" ]]; then
cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
CARGO_FLAGS=
elif [[ $BUILD_TYPE == "release" ]]; then
cov_prefix=()
CARGO_FLAGS=--release
fi
"${cov_prefix[@]}" cargo test $CARGO_FLAGS
${cov_prefix} cargo test $CARGO_FLAGS
- name: Install rust binaries
run: |
if [[ $BUILD_TYPE == "debug" ]]; then
cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
elif [[ $BUILD_TYPE == "release" ]]; then
cov_prefix=()
fi
# Install target binaries
mkdir -p /tmp/neon/bin/
binaries=$(
"${cov_prefix[@]}" cargo metadata --format-version=1 --no-deps |
${cov_prefix} cargo metadata $CARGO_FEATURES --format-version=1 --no-deps |
jq -r '.packages[].targets[] | select(.kind | index("bin")) | .name'
)
test_exe_paths=$(
"${cov_prefix[@]}" cargo test --message-format=json --no-run |
jq -r '.executable | select(. != null)'
)
mkdir -p /tmp/neon/bin/
mkdir -p /tmp/neon/test_bin/
mkdir -p /tmp/neon/etc/
# Keep bloated coverage data files away from the rest of the artifact
mkdir -p /tmp/coverage/
# Install target binaries
for bin in $binaries; do
SRC=target/$BUILD_TYPE/$bin
DST=/tmp/neon/bin/$bin
@@ -171,9 +131,15 @@ jobs:
# Install test executables and write list of all binaries (for code coverage)
if [[ $BUILD_TYPE == "debug" ]]; then
for bin in $binaries; do
echo "/tmp/neon/bin/$bin" >> /tmp/coverage/binaries.list
done
# Keep bloated coverage data files away from the rest of the artifact
mkdir -p /tmp/coverage/
mkdir -p /tmp/neon/test_bin/
test_exe_paths=$(
${cov_prefix} cargo test $CARGO_FLAGS --message-format=json --no-run |
jq -r '.executable | select(. != null)'
)
for bin in $test_exe_paths; do
SRC=$bin
DST=/tmp/neon/test_bin/$(basename $bin)
@@ -183,27 +149,29 @@ jobs:
strip "$SRC" -o "$DST"
echo "$DST" >> /tmp/coverage/binaries.list
done
for bin in $binaries; do
echo "/tmp/neon/bin/$bin" >> /tmp/coverage/binaries.list
done
fi
- name: Prepare neon artifact
run: tar -C /tmp/neon/ -czf ./neon.tgz .
- name: Install postgres binaries
run: cp -a tmp_install /tmp/neon/pg_install
- name: Upload neon binaries
uses: actions/upload-artifact@v3
- name: Upload Neon artifact
uses: ./.github/actions/upload
with:
retention-days: 7
if-no-files-found: error
name: neon-${{ runner.os }}-${{ matrix.build_type }}-${{ matrix.rust_toolchain }}-artifact
path: ./neon.tgz
path: /tmp/neon
# XXX: keep this after the binaries.list is formed, so the coverage can properly work later
- name: Merge and upload coverage data
if: matrix.build_type == 'debug'
uses: ./.github/actions/save-coverage-data
pg_regress-tests:
runs-on: [ self-hosted, Linux, k8s-runner ]
runs-on: dev
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:2746987948
needs: [ build-neon ]
strategy:
fail-fast: false
@@ -230,7 +198,8 @@ jobs:
uses: ./.github/actions/save-coverage-data
other-tests:
runs-on: [ self-hosted, Linux, k8s-runner ]
runs-on: dev
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:2746987948
needs: [ build-neon ]
strategy:
fail-fast: false
@@ -250,14 +219,20 @@ jobs:
build_type: ${{ matrix.build_type }}
rust_toolchain: ${{ matrix.rust_toolchain }}
test_selection: batch_others
run_with_real_s3: true
real_s3_bucket: ci-tests-s3
real_s3_region: us-west-2
real_s3_access_key_id: "${{ secrets.AWS_ACCESS_KEY_ID_CI_TESTS_S3 }}"
real_s3_secret_access_key: "${{ secrets.AWS_SECRET_ACCESS_KEY_CI_TESTS_S3 }}"
- name: Merge and upload coverage data
if: matrix.build_type == 'debug'
uses: ./.github/actions/save-coverage-data
benchmarks:
runs-on: [ self-hosted, Linux, k8s-runner ]
runs-on: dev
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:2746987948
needs: [ build-neon ]
if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
strategy:
fail-fast: false
matrix:
@@ -285,7 +260,8 @@ jobs:
# while coverage is currently collected for the debug ones
coverage-report:
runs-on: [ self-hosted, Linux, k8s-runner ]
runs-on: dev
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:2746987948
needs: [ other-tests, pg_regress-tests ]
strategy:
fail-fast: false
@@ -308,25 +284,19 @@ jobs:
!~/.cargo/registry/src
~/.cargo/git/
target/
key: v2-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }}
key: v3-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }}
- name: Get Neon artifact for restoration
uses: actions/download-artifact@v3
- name: Get Neon artifact
uses: ./.github/actions/download
with:
name: neon-${{ runner.os }}-${{ matrix.build_type }}-${{ matrix.rust_toolchain }}-artifact
path: ./neon-artifact/
path: /tmp/neon
- name: Extract Neon artifact
run: |
mkdir -p /tmp/neon/
tar -xf ./neon-artifact/neon.tgz -C /tmp/neon/
rm -rf ./neon-artifact/
- name: Restore coverage data
uses: actions/download-artifact@v3
- name: Get coverage artifact
uses: ./.github/actions/download
with:
name: coverage-data-artifact
path: /tmp/coverage/
path: /tmp/coverage
- name: Merge coverage data
run: scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage merge
@@ -364,40 +334,40 @@ jobs:
}"
trigger-e2e-tests:
runs-on: [ self-hosted, Linux, k8s-runner ]
needs: [ build-neon ]
steps:
- name: Set PR's status to pending and request a remote CI test
run: |
COMMIT_SHA=${{ github.event.pull_request.head.sha }}
COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}}
runs-on: [ self-hosted, Linux, k8s-runner ]
needs: [ build-neon ]
steps:
- name: Set PR's status to pending and request a remote CI test
run: |
COMMIT_SHA=${{ github.event.pull_request.head.sha }}
COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}}
REMOTE_REPO="${{ github.repository_owner }}/cloud"
REMOTE_REPO="${{ github.repository_owner }}/cloud"
curl -f -X POST \
https://api.github.com/repos/${{ github.repository }}/statuses/$COMMIT_SHA \
-H "Accept: application/vnd.github.v3+json" \
--user "${{ secrets.CI_ACCESS_TOKEN }}" \
--data \
"{
\"state\": \"pending\",
\"context\": \"neon-cloud-e2e\",
\"description\": \"[$REMOTE_REPO] Remote CI job is about to start\"
}"
curl -f -X POST \
https://api.github.com/repos/${{ github.repository }}/statuses/$COMMIT_SHA \
-H "Accept: application/vnd.github.v3+json" \
--user "${{ secrets.CI_ACCESS_TOKEN }}" \
--data \
"{
\"state\": \"pending\",
\"context\": \"neon-cloud-e2e\",
\"description\": \"[$REMOTE_REPO] Remote CI job is about to start\"
}"
curl -f -X POST \
https://api.github.com/repos/$REMOTE_REPO/actions/workflows/testing.yml/dispatches \
-H "Accept: application/vnd.github.v3+json" \
--user "${{ secrets.CI_ACCESS_TOKEN }}" \
--data \
"{
\"ref\": \"main\",
\"inputs\": {
\"ci_job_name\": \"neon-cloud-e2e\",
\"commit_hash\": \"$COMMIT_SHA\",
\"remote_repo\": \"${{ github.repository }}\"
}
}"
curl -f -X POST \
https://api.github.com/repos/$REMOTE_REPO/actions/workflows/testing.yml/dispatches \
-H "Accept: application/vnd.github.v3+json" \
--user "${{ secrets.CI_ACCESS_TOKEN }}" \
--data \
"{
\"ref\": \"main\",
\"inputs\": {
\"ci_job_name\": \"neon-cloud-e2e\",
\"commit_hash\": \"$COMMIT_SHA\",
\"remote_repo\": \"${{ github.repository }}\"
}
}"
docker-image:
runs-on: [ self-hosted, Linux, k8s-runner ]
@@ -440,9 +410,9 @@ jobs:
- name: Get legacy build tag
run: |
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
echo "::set-output name=tag::latest
echo "::set-output name=tag::latest"
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
echo "::set-output name=tag::release
echo "::set-output name=tag::release"
else
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
exit 1
@@ -502,9 +472,9 @@ jobs:
- name: Get legacy build tag
run: |
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
echo "::set-output name=tag::latest
echo "::set-output name=tag::latest"
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
echo "::set-output name=tag::release
echo "::set-output name=tag::release"
else
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
exit 1

View File

@@ -8,7 +8,7 @@ on:
defaults:
run:
shell: bash -ex {0}
shell: bash -euxo pipefail {0}
concurrency:
# Allow only one workflow per any non-`main` branch.
@@ -27,7 +27,7 @@ jobs:
# Rust toolchains (e.g. nightly or 1.37.0), add them here.
rust_toolchain: [1.58]
os: [ubuntu-latest, macos-latest]
timeout-minutes: 50
timeout-minutes: 60
name: run regression test suite
runs-on: ${{ matrix.os }}
@@ -101,7 +101,7 @@ jobs:
!~/.cargo/registry/src
~/.cargo/git
target
key: ${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust-${{ matrix.rust_toolchain }}
key: v1-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust-${{ matrix.rust_toolchain }}
- name: Run cargo clippy
run: ./run_clippy.sh

View File

@@ -40,7 +40,7 @@ jobs:
key: v1-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
- name: Install Python deps
shell: bash -ex {0}
shell: bash -euxo pipefail {0}
run: ./scripts/pysync
- name: Run pytest
@@ -49,7 +49,7 @@ jobs:
BENCHMARK_CONNSTR: "${{ secrets.BENCHMARK_STAGING_CONNSTR }}"
TEST_OUTPUT: /tmp/test_output
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
shell: bash -ex {0}
shell: bash -euxo pipefail {0}
run: |
# Test framework expects we have psql binary;
# but since we don't really need it in this test, let's mock it

View File

@@ -11,17 +11,15 @@ than it was before.
## Submitting changes
1. Make a PR for every change.
Even seemingly trivial patches can break things in surprising ways.
Use of common sense is OK. If you're only fixing a typo in a comment,
it's probably fine to just push it. But if in doubt, open a PR.
2. Get at least one +1 on your PR before you push.
1. Get at least one +1 on your PR before you push.
For simple patches, it will only take a minute for someone to review
it.
2. Don't force push small changes after making the PR ready for review.
Doing so will force readers to re-read your entire PR, which will delay
the review process.
3. Always keep the CI green.
Do not push, if the CI failed on your PR. Even if you think it's not

Cargo.lock generated
View File

@@ -154,9 +154,9 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
[[package]]
name = "axum"
version = "0.5.12"
version = "0.5.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d16705af05732b7d3258ec0f7b73c03a658a28925e050d8852d5b568ee8bcf4e"
checksum = "6b9496f0c1d1afb7a2af4338bbe1d969cddfead41d87a9fb3aaa6d0bbc7af648"
dependencies = [
"async-trait",
"axum-core",
@@ -317,15 +317,6 @@ dependencies = [
"serde",
]
[[package]]
name = "cast"
version = "0.2.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4c24dab4283a142afa2fdca129b80ad2c6284e073930f964c3a1293c225ee39a"
dependencies = [
"rustc_version",
]
[[package]]
name = "cast"
version = "0.3.0"
@@ -467,7 +458,6 @@ dependencies = [
"clap 3.2.12",
"env_logger",
"hyper",
"libc",
"log",
"postgres",
"regex",
@@ -505,8 +495,8 @@ name = "control_plane"
version = "0.1.0"
dependencies = [
"anyhow",
"lazy_static",
"nix",
"once_cell",
"pageserver",
"postgres",
"regex",
@@ -517,7 +507,6 @@ dependencies = [
"tar",
"thiserror",
"toml",
"url",
"utils",
"workspace_hack",
]
@@ -581,7 +570,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b01d6de93b2b6c65e17c634a26653a29d107b3c98c607c765bf38d041531cd8f"
dependencies = [
"atty",
"cast 0.3.0",
"cast",
"clap 2.34.0",
"criterion-plot",
"csv",
@@ -602,11 +591,11 @@ dependencies = [
[[package]]
name = "criterion-plot"
version = "0.4.4"
version = "0.4.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d00996de9f2f7559f7f4dc286073197f83e92256a59ed395f9aac01fe717da57"
checksum = "2673cc8207403546f45f5fd319a974b1e6983ad1a3ee7e6041650013be041876"
dependencies = [
"cast 0.2.7",
"cast",
"itertools",
]
@@ -682,9 +671,9 @@ dependencies = [
[[package]]
name = "crypto-common"
version = "0.1.5"
version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2ccfd8c0ee4cce11e45b3fd6f9d5e69e0cc62912aa6a0cb1bf4617b0eba5a12f"
checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3"
dependencies = [
"generic-array",
"typenum",
@@ -1118,9 +1107,9 @@ dependencies = [
[[package]]
name = "gimli"
version = "0.26.1"
version = "0.26.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "78cc372d058dcf6d5ecd98510e7fbc9e5aec4d21de70f65fea8fecebcd881bd4"
checksum = "22030e2c5a68ec659fde1e949a745124b48e6fa8b045b7ed5bd1fe4ccc5c4e5d"
[[package]]
name = "git-version"
@@ -1186,9 +1175,9 @@ dependencies = [
[[package]]
name = "hashbrown"
version = "0.12.2"
version = "0.12.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "607c8a29735385251a339424dd462993c0fed8fa09d378f259377df08c126022"
checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888"
[[package]]
name = "heck"
@@ -1390,7 +1379,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "10a35a97730320ffe8e2d410b5d3b69279b98d2c14bdb8b70ea89ecf7888d41e"
dependencies = [
"autocfg",
"hashbrown 0.12.2",
"hashbrown 0.12.3",
]
[[package]]
@@ -1602,7 +1591,6 @@ dependencies = [
name = "metrics"
version = "0.1.0"
dependencies = [
"lazy_static",
"libc",
"once_cell",
"prometheus",
@@ -1677,7 +1665,6 @@ dependencies = [
"git-version",
"pageserver",
"postgres",
"postgres_ffi",
"safekeeper",
"serde_json",
"utils",
@@ -1855,9 +1842,9 @@ dependencies = [
[[package]]
name = "os_str_bytes"
version = "6.1.0"
version = "6.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "21326818e99cfe6ce1e524c2a805c189a99b5ae555a35d19f9a284b427d86afa"
checksum = "648001efe5d5c0102d8cea768e348da85d90af8ba91f0bea908f157951493cd4"
[[package]]
name = "pageserver"
@@ -1883,7 +1870,6 @@ dependencies = [
"humantime-serde",
"hyper",
"itertools",
"lazy_static",
"metrics",
"nix",
"once_cell",
@@ -1905,7 +1891,6 @@ dependencies = [
"thiserror",
"tokio",
"tokio-postgres",
"tokio-stream",
"toml_edit",
"tracing",
"url",
@@ -2130,9 +2115,9 @@ dependencies = [
"crc32c",
"env_logger",
"hex",
"lazy_static",
"log",
"memoffset",
"once_cell",
"postgres",
"rand",
"regex",
@@ -2292,9 +2277,9 @@ dependencies = [
"hex",
"hmac 0.12.1",
"hyper",
"lazy_static",
"md5",
"metrics",
"once_cell",
"parking_lot 0.12.1",
"pin-project-lite",
"rand",
@@ -2740,9 +2725,9 @@ dependencies = [
[[package]]
name = "rustversion"
version = "1.0.7"
version = "1.0.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a0a5f7c728f5d284929a1cccb5bc19884422bfe6ef4d6c409da2c41838983fcf"
checksum = "24c8ad4f0c00e1eb5bc7614d236a7f1300e3dbd76b68cac8e06fb00b015ad8d8"
[[package]]
name = "ryu"
@@ -2764,12 +2749,10 @@ dependencies = [
"daemonize",
"etcd_broker",
"fs2",
"futures",
"git-version",
"hex",
"humantime",
"hyper",
"lazy_static",
"metrics",
"once_cell",
"postgres",
@@ -2784,12 +2767,10 @@ dependencies = [
"tempfile",
"tokio",
"tokio-postgres",
"tokio-util",
"toml_edit",
"tracing",
"url",
"utils",
"walkdir",
"workspace_hack",
]
@@ -3625,9 +3606,9 @@ checksum = "099b7128301d285f79ddd55b9a83d5e6b9e97c92e0ea0daebee7263e932de992"
[[package]]
name = "unicode-ident"
version = "1.0.1"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5bd2fe26506023ed7b5e1e315add59d6f584c621d037f9368fea9cfb988f368c"
checksum = "15c61ba63f9235225a22310255a29b806b907c9b8c964bcbd0a2c70f3f2deea7"
[[package]]
name = "unicode-normalization"
@@ -3688,9 +3669,9 @@ dependencies = [
"hex-literal",
"hyper",
"jsonwebtoken",
"lazy_static",
"metrics",
"nix",
"once_cell",
"pin-project-lite",
"postgres",
"postgres-protocol",

View File

@@ -17,6 +17,10 @@ RUN set -e \
FROM neondatabase/rust:1.58 AS build
ARG GIT_VERSION=local
# Enable https://github.com/paritytech/cachepot to cache Rust crates' compilation results in Docker builds.
# Set up cachepot to use an AWS S3 bucket for cache results, to reuse it between `docker build` invocations.
# cachepot falls back to local filesystem if S3 is misconfigured, not failing the build.
ARG RUSTC_WRAPPER=cachepot
ARG CACHEPOT_BUCKET=zenith-rust-cachepot
ARG AWS_ACCESS_KEY_ID
ARG AWS_SECRET_ACCESS_KEY

View File

@@ -1,7 +1,11 @@
# First transient image to build compute_tools binaries
# NB: keep in sync with rust image version in .circle/config.yml
# NB: keep in sync with rust image version in .github/workflows/build_and_test.yml
FROM neondatabase/rust:1.58 AS rust-build
# Enable https://github.com/paritytech/cachepot to cache Rust crates' compilation results in Docker builds.
# Set up cachepot to use an AWS S3 bucket for cache results, to reuse it between `docker build` invocations.
# cachepot falls back to local filesystem if S3 is misconfigured, not failing the build.
ARG RUSTC_WRAPPER=cachepot
ARG CACHEPOT_BUCKET=zenith-rust-cachepot
ARG AWS_ACCESS_KEY_ID
ARG AWS_SECRET_ACCESS_KEY

View File

@@ -29,9 +29,11 @@ else
endif
# macOS with brew-installed openssl requires explicit paths
# It can be configured with OPENSSL_PREFIX variable
UNAME_S := $(shell uname -s)
ifeq ($(UNAME_S),Darwin)
PG_CONFIGURE_OPTS += --with-includes=$(HOMEBREW_PREFIX)/opt/openssl/include --with-libraries=$(HOMEBREW_PREFIX)/opt/openssl/lib
OPENSSL_PREFIX ?= $(shell brew --prefix openssl@3)
PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib
endif
# Choose whether we should be silent or verbose

View File

@@ -1,6 +1,6 @@
# Neon
Neon is a serverless open source alternative to AWS Aurora Postgres. It separates storage and compute and substitutes PostgreSQL storage layer by redistributing data across a cluster of nodes.
Neon is a serverless open-source alternative to AWS Aurora Postgres. It separates storage and compute and substitutes the PostgreSQL storage layer by redistributing data across a cluster of nodes.
The project used to be called "Zenith". Many of the commands and code comments
still refer to "zenith", but we are in the process of renaming things.
@@ -12,32 +12,31 @@ Alternatively, compile and run the project [locally](#running-local-installation
## Architecture overview
A Neon installation consists of compute nodes and Neon storage engine.
A Neon installation consists of compute nodes and a Neon storage engine.
Compute nodes are stateless PostgreSQL nodes, backed by Neon storage engine.
Compute nodes are stateless PostgreSQL nodes backed by the Neon storage engine.
Neon storage engine consists of two major components:
- Pageserver. Scalable storage backend for compute nodes.
- WAL service. The service that receives WAL from compute node and ensures that it is stored durably.
The Neon storage engine consists of two major components:
- Pageserver. Scalable storage backend for the compute nodes.
- WAL service. The service receives WAL from the compute node and ensures that it is stored durably.
Pageserver consists of:
- Repository - Neon storage implementation.
- WAL receiver - service that receives WAL from WAL service and stores it in the repository.
- Page service - service that communicates with compute nodes and responds with pages from the repository.
- WAL redo - service that builds pages from base images and WAL records on Page service request.
- WAL redo - service that builds pages from base images and WAL records on Page service request
## Running local installation
#### Installing dependencies on Linux
1. Install build dependencies and other useful packages
1. Install build dependencies and other applicable packages
* On Ubuntu or Debian this set of packages should be sufficient to build the code:
* On Ubuntu or Debian, this set of packages should be sufficient to build the code:
```bash
apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \
libssl-dev clang pkg-config libpq-dev etcd cmake postgresql-client
```
* On Fedora these packages are needed:
* On Fedora, these packages are needed:
```bash
dnf install flex bison readline-devel zlib-devel openssl-devel \
libseccomp-devel perl clang cmake etcd postgresql postgresql-contrib
@@ -69,7 +68,7 @@ brew install libpq
brew link --force libpq
```
#### Building on Linux and OSX
#### Building on Linux
1. Build neon and patched postgres
```
@@ -80,19 +79,35 @@ cd neon
# The preferred and default is to make a debug build. This will create a
# demonstrably slower build than a release build. If you want to use a release
# build, utilize "`BUILD_TYPE=release make -j`nproc``"
# build, utilize "BUILD_TYPE=release make -j`nproc`"
make -j`nproc`
```
#### dependency installation notes
#### Building on OSX
1. Build neon and patched postgres
```
# Note: The path to the neon sources can not contain a space.
git clone --recursive https://github.com/neondatabase/neon.git
cd neon
# The preferred and default is to make a debug build. This will create a
# demonstrably slower build than a release build. If you want to use a release
# build, utilize "BUILD_TYPE=release make -j`sysctl -n hw.logicalcpu`"
make -j`sysctl -n hw.logicalcpu`
```
#### Dependency installation notes
To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `tmp_install/bin` and `tmp_install/lib`, respectively.
To run the integration tests or Python scripts (not required to use the code), install
Python (3.9 or higher), and install python3 packages using `./scripts/pysync` (requires poetry) in the project directory.
Python (3.9 or higher), and install python3 packages using `./scripts/pysync` (requires [poetry](https://python-poetry.org/)) in the project directory.
#### running neon database
#### Running neon database
1. Start pageserver and postgres on top of it (should be called from repo root):
```sh
# Create repository in .neon with proper paths to binaries and data
@@ -123,7 +138,7 @@ Starting postgres node at 'host=127.0.0.1 port=55432 user=cloud_admin dbname=pos
main 127.0.0.1:55432 de200bd42b49cc1814412c7e592dd6e9 main 0/16B5BA8 running
```
2. Now it is possible to connect to postgres and run some queries:
2. Now, it is possible to connect to postgres and run some queries:
```text
> psql -p55432 -h 127.0.0.1 -U cloud_admin postgres
postgres=# CREATE TABLE t(key int primary key, value text);
@@ -181,14 +196,16 @@ postgres=# select * from t;
(1 row)
```
4. If you want to run tests afterwards (see below), you have to stop all the running the pageserver, safekeeper and postgres instances
you have just started. You can stop them all with one command:
4. If you want to run tests afterward (see below), you must stop all the running pageserver, safekeeper, and postgres instances
you have just started. You can terminate them all with one command:
```sh
> ./target/debug/neon_local stop
```
## Running tests
Ensure your dependencies are installed as described [here](https://github.com/neondatabase/neon#dependency-installation-notes).
```sh
git clone --recursive https://github.com/neondatabase/neon.git
make # builds also postgres and installs it to ./tmp_install
@@ -205,8 +222,8 @@ To view your `rustdoc` documentation in a browser, try running `cargo doc --no-d
### Postgres-specific terms
Due to Neon's very close relation with PostgreSQL internals, there are numerous specific terms used.
Same applies to certain spelling: i.e. we use MB to denote 1024 * 1024 bytes, while MiB would be technically more correct, it's inconsistent with what PostgreSQL code and its documentation use.
Due to Neon's very close relation with PostgreSQL internals, numerous specific terms are used.
The same applies to certain spelling: i.e. we use MB to denote 1024 * 1024 bytes, while MiB would be technically more correct, it's inconsistent with what PostgreSQL code and its documentation use.
To get more familiar with this aspect, refer to:

View File

@@ -4,7 +4,6 @@ version = "0.1.0"
edition = "2021"
[dependencies]
libc = "0.2"
anyhow = "1.0"
chrono = "0.4"
clap = "3.0"

View File

@@ -157,7 +157,7 @@ fn main() -> Result<()> {
exit(code)
}
Err(error) => {
error!("could not start the compute node: {}", error);
error!("could not start the compute node: {:?}", error);
let mut state = compute.state.write().unwrap();
state.error = Some(format!("{:?}", error));

View File

@@ -9,12 +9,11 @@ postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8
serde = { version = "1.0", features = ["derive"] }
serde_with = "1.12.0"
toml = "0.5"
lazy_static = "1.4"
once_cell = "1.13.0"
regex = "1"
anyhow = "1.0"
thiserror = "1"
nix = "0.23"
url = "2.2.2"
reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] }
pageserver = { path = "../pageserver" }

View File

@@ -30,14 +30,14 @@ pub fn start_etcd_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
let etcd_stdout_file =
fs::File::create(etcd_data_dir.join("etcd.stdout.log")).with_context(|| {
format!(
"Failed to create ectd stout file in directory {}",
"Failed to create etcd stout file in directory {}",
etcd_data_dir.display()
)
})?;
let etcd_stderr_file =
fs::File::create(etcd_data_dir.join("etcd.stderr.log")).with_context(|| {
format!(
"Failed to create ectd stderr file in directory {}",
"Failed to create etcd stderr file in directory {}",
etcd_data_dir.display()
)
})?;

View File

@@ -51,7 +51,11 @@ fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command {
}
fn fill_aws_secrets_vars(mut cmd: &mut Command) -> &mut Command {
for env_key in ["AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"] {
for env_key in [
"AWS_ACCESS_KEY_ID",
"AWS_SECRET_ACCESS_KEY",
"AWS_SESSION_TOKEN",
] {
if let Ok(value) = std::env::var(env_key) {
cmd = cmd.env(env_key, value);
}

View File

@@ -5,7 +5,7 @@
/// enough to extract a few settings we need in Zenith, assuming you don't do
/// funny stuff like include-directives or funny escaping.
use anyhow::{bail, Context, Result};
use lazy_static::lazy_static;
use once_cell::sync::Lazy;
use regex::Regex;
use std::collections::HashMap;
use std::fmt;
@@ -19,9 +19,7 @@ pub struct PostgresConf {
hash: HashMap<String, String>,
}
lazy_static! {
static ref CONF_LINE_RE: Regex = Regex::new(r"^((?:\w|\.)+)\s*=\s*(\S+)$").unwrap();
}
static CONF_LINE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"^((?:\w|\.)+)\s*=\s*(\S+)$").unwrap());
impl PostgresConf {
pub fn new() -> PostgresConf {
@@ -139,10 +137,10 @@ fn escape_str(s: &str) -> String {
//
// This regex is a bit more conservative than the rules in guc-file.l, so we quote some
// strings that PostgreSQL would accept without quoting, but that's OK.
lazy_static! {
static ref UNQUOTED_RE: Regex =
Regex::new(r"(^[-+]?[0-9]+[a-zA-Z]*$)|(^[a-zA-Z][a-zA-Z0-9]*$)").unwrap();
}
static UNQUOTED_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(r"(^[-+]?[0-9]+[a-zA-Z]*$)|(^[a-zA-Z][a-zA-Z0-9]*$)").unwrap());
if UNQUOTED_RE.is_match(s) {
s.to_string()
} else {

View File

@@ -247,7 +247,7 @@ impl SafekeeperNode {
// Shutting down may take a long time,
// if safekeeper flushes a lot of data
let mut tcp_stopped = false;
for _ in 0..100 {
for i in 0..600 {
if !tcp_stopped {
if let Err(err) = TcpStream::connect(&address) {
tcp_stopped = true;
@@ -272,9 +272,11 @@ impl SafekeeperNode {
}
}
}
print!(".");
io::stdout().flush().unwrap();
thread::sleep(Duration::from_secs(1));
if i % 10 == 0 {
print!(".");
io::stdout().flush().unwrap();
}
thread::sleep(Duration::from_millis(100));
}
bail!("Failed to stop safekeeper with pid {}", pid);
@@ -304,10 +306,9 @@ impl SafekeeperNode {
Ok(self
.http_request(
Method::POST,
format!("{}/{}", self.http_base_url, "timeline"),
format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id),
)
.json(&TimelineCreateRequest {
tenant_id,
timeline_id,
peer_ids,
})

View File

@@ -12,9 +12,9 @@ use anyhow::{bail, Context};
use nix::errno::Errno;
use nix::sys::signal::{kill, Signal};
use nix::unistd::Pid;
use pageserver::http::models::{TenantConfigRequest, TenantCreateRequest, TimelineCreateRequest};
use pageserver::tenant_mgr::TenantInfo;
use pageserver::timelines::TimelineInfo;
use pageserver::http::models::{
TenantConfigRequest, TenantCreateRequest, TenantInfo, TimelineCreateRequest, TimelineInfo,
};
use postgres::{Config, NoTls};
use reqwest::blocking::{Client, RequestBuilder, Response};
use reqwest::{IntoUrl, Method};
@@ -318,7 +318,7 @@ impl PageServerNode {
// Shutting down may take a long time,
// if pageserver checkpoints a lot of data
let mut tcp_stopped = false;
for _ in 0..100 {
for i in 0..600 {
if !tcp_stopped {
if let Err(err) = TcpStream::connect(&address) {
tcp_stopped = true;
@@ -344,9 +344,11 @@ impl PageServerNode {
}
}
}
print!(".");
io::stdout().flush().unwrap();
thread::sleep(Duration::from_secs(1));
if i % 10 == 0 {
print!(".");
io::stdout().flush().unwrap();
}
thread::sleep(Duration::from_millis(100));
}
bail!("Failed to stop pageserver with pid {}", pid);
@@ -399,6 +401,7 @@ impl PageServerNode {
.get("checkpoint_distance")
.map(|x| x.parse::<u64>())
.transpose()?,
checkpoint_timeout: settings.get("checkpoint_timeout").map(|x| x.to_string()),
compaction_target_size: settings
.get("compaction_target_size")
.map(|x| x.parse::<u64>())
@@ -453,6 +456,7 @@ impl PageServerNode {
.map(|x| x.parse::<u64>())
.transpose()
.context("Failed to parse 'checkpoint_distance' as an integer")?,
checkpoint_timeout: settings.get("checkpoint_timeout").map(|x| x.to_string()),
compaction_target_size: settings
.get("compaction_target_size")
.map(|x| x.parse::<u64>())

View File

@@ -1,6 +1,8 @@
#!/bin/sh
set -eux
pageserver_id_param="${NODE_ID:-10}"
broker_endpoints_param="${BROKER_ENDPOINT:-absent}"
if [ "$broker_endpoints_param" != "absent" ]; then
broker_endpoints_param="-c broker_endpoints=['$broker_endpoints_param']"
@@ -8,10 +10,12 @@ else
broker_endpoints_param=''
fi
remote_storage_param="${REMOTE_STORAGE:-}"
if [ "$1" = 'pageserver' ]; then
if [ ! -d "/data/tenants" ]; then
echo "Initializing pageserver data directory"
pageserver --init -D /data -c "pg_distrib_dir='/usr/local'" -c "id=10" $broker_endpoints_param
pageserver --init -D /data -c "pg_distrib_dir='/usr/local'" -c "id=${pageserver_id_param}" $broker_endpoints_param $remote_storage_param
fi
echo "Staring pageserver at 0.0.0.0:6400"
pageserver -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" $broker_endpoints_param -D /data

View File

@@ -52,10 +52,8 @@
- [multitenancy.md](./multitenancy.md) — how multitenancy is organized in the pageserver and Zenith CLI.
- [settings.md](./settings.md)
#FIXME: move these under sourcetree.md
#- [pageserver/README.md](/pageserver/README.md)
#- [postgres_ffi/README.md](/libs/postgres_ffi/README.md)
#- [test_runner/README.md](/test_runner/README.md)
#- [safekeeper/README.md](/safekeeper/README.md)
# RFCs

View File

@@ -6,206 +6,514 @@ is to eliminate all these changes, by submitting patches to upstream
and refactoring code into extensions, so that you can run unmodified
PostgreSQL against Neon storage.
In Neon, we run PostgreSQL in the compute nodes, but we also run a special WAL redo process in the
page server. We currently use the same binary for both, with --wal-redo runtime flag to launch it in
the WAL redo mode. Some PostgreSQL changes are needed in the compute node, while others are just for
the WAL redo process.
1. Add t_cid to XLOG record
- Why?
The cmin/cmax on a heap page is a real bummer. I don't see any other way to fix that than bite the bullet and modify the WAL-logging routine to include the cmin/cmax.
In addition to core PostgreSQL changes, there is a Neon extension in contrib/neon, to hook into the
smgr interface. Once all the core changes have been submitted to upstream or eliminated some other
way, the extension could live outside the postgres repository and build against vanilla PostgreSQL.
To recap, the problem is that the XLOG_HEAP_INSERT record does not include the command id of the inserted row. And same with deletion/update. So in the primary, a row is inserted with current xmin + cmin. But in the replica, the cmin is always set to 1. That works, because the command id is only relevant to the inserting transaction itself. After commit/abort, no one cares abut it anymore.
Below is a list of all the PostgreSQL source code changes, categorized into changes needed for
compute, and changes needed for the WAL redo process:
- Alternatives?
I don't know
# Changes for Compute node
2. Add PD_WAL_LOGGED.
- Why?
Postgres sometimes writes data to the page before it is wal-logged. If such page ais swapped out, we will loose this change. The problem is currently solved by setting PD_WAL_LOGGED bit in page header. When page without this bit set is written to the SMGR, then it is forced to be written to the WAL as FPI using log_newpage_copy() function.
## Add t_cid to heap WAL records
There was wrong assumption that it can happen only during construction of some exotic indexes (like gist). It is not true. The same situation can happen with COPY,VACUUM and when record hint bits are set.
```
src/backend/access/heap/heapam.c | 26 +-
src/include/access/heapam_xlog.h | 6 +-
```
- Discussion:
https://discord.com/channels/869525774699462656/882681420986851359
We have added a new t_cid field to heap WAL records. This changes the WAL record format, making Neon WAL format incompatible with vanilla PostgreSQL!
- Alternatives:
Do not store this flag in page header, but associate this bit with shared buffer. Logically it is more correct but in practice we will get not advantages: neither in space, neither in CPU overhead.
### Problem we're trying to solve
The problem is that the XLOG_HEAP_INSERT record does not include the command id of the inserted row. And same with deletion/update. So in the primary, a row is inserted with current xmin + cmin. But in the replica, the cmin is always set to 1. That works in PostgreSQL, because the command id is only relevant to the inserting transaction itself. After commit/abort, no one cares about it anymore. But with Neon, we rely on WAL replay to reconstruct the page, even while the original transaction is still running.
### How to get rid of the patch
Bite the bullet and submit the patch to PostgreSQL, to add the t_cid to the WAL records. It makes the WAL records larger, which could make this unpopular in the PostgreSQL community. However, it might simplify some logical decoding code; Andres Freund briefly mentioned in PGCon 2022 discussion on Heikki's Neon presentation that logical decoding currently needs to jump through some hoops to reconstruct the same information.
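To make the record-format change concrete, here is a minimal standalone sketch. These are not the actual PostgreSQL structs (the real change extends the heap WAL record definitions in heapam_xlog.h); the types and field layout below are illustrative only, showing the idea of carrying the inserting command id in the record so that redo can reconstruct cmin exactly.
```c
/* Standalone sketch only: real Neon code extends xl_heap_insert in
 * src/include/access/heapam_xlog.h; names here are illustrative. */
#include <stdint.h>
#include <stdio.h>

typedef uint16_t OffsetNumber;
typedef uint32_t CommandId;

/* Vanilla-style record: no command id, so a replica has to fake cmin. */
typedef struct
{
    OffsetNumber offnum;
    uint8_t      flags;
} xl_heap_insert_vanilla;

/* Neon-style record: the inserting command id travels with the record,
 * so WAL redo can reconstruct the exact cmin on the page image. */
typedef struct
{
    OffsetNumber offnum;
    uint8_t      flags;
    CommandId    t_cid;     /* command id of the inserting command */
} xl_heap_insert_with_cid;

int
main(void)
{
    printf("record grows from %zu to %zu bytes per insert in this sketch\n",
           sizeof(xl_heap_insert_vanilla), sizeof(xl_heap_insert_with_cid));
    return 0;
}
```
The size difference printed above is also the crux of the upstreaming concern: every insert/update/delete record gets a few bytes larger.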
3. XLogReadBufferForRedo not always loads and pins requested buffer. So we need to add extra checks that buffer is really pinned. Also do not use BufferGetBlockNumber for buffer returned by XLogReadBufferForRedo.
- Why?
XLogReadBufferForRedo is not pinning pages which are not requested by wal-redo. It is specific only for wal-redo Postgres.
### Alternatives
Perhaps we could write an extra WAL record with the t_cid information, when a page is evicted that contains rows that were touched by a transaction that's still running. However, that seems very complicated.
- Alternatives?
No
## ginfast.c
```
diff --git a/src/backend/access/gin/ginfast.c b/src/backend/access/gin/ginfast.c
index e0d9940946..2d964c02e9 100644
--- a/src/backend/access/gin/ginfast.c
+++ b/src/backend/access/gin/ginfast.c
@@ -285,6 +285,17 @@ ginHeapTupleFastInsert(GinState *ginstate, GinTupleCollector *collector)
memset(&sublist, 0, sizeof(GinMetaPageData));
makeSublist(index, collector->tuples, collector->ntuples, &sublist);
+ if (metadata->head != InvalidBlockNumber)
+ {
+ /*
+ * ZENITH: Get buffer before XLogBeginInsert() to avoid recursive call
+ * of XLogBeginInsert(). Reading a new buffer might evict a dirty page from
+ * the buffer cache, and if that page happens to be an FSM or VM page, zenith_write()
+ * will try to WAL-log an image of the page.
+ */
+ buffer = ReadBuffer(index, metadata->tail);
+ }
+
if (needWal)
XLogBeginInsert();
@@ -316,7 +327,6 @@ ginHeapTupleFastInsert(GinState *ginstate, GinTupleCollector *collector)
data.prevTail = metadata->tail;
data.newRightlink = sublist.head;
- buffer = ReadBuffer(index, metadata->tail);
LockBuffer(buffer, GIN_EXCLUSIVE);
page = BufferGetPage(buffer);
```
The problem is explained in the comment above
### How to get rid of the patch
Can we stop WAL-logging FSM or VM pages? Or delay the WAL logging until we're out of the critical
section or something.
Maybe some bigger rewrite of FSM and VM would help to avoid WAL-logging FSM and VM page images?
4. Eliminate reporting of some warnings related with hint bits, for example
"page is not marked all-visible but visibility map bit is set in relation".
- Why?
Hint bit may be not WAL logged.
## Mark index builds that use buffer manager without logging explicitly
- Alternative?
Always wal log any page changes.
```
src/backend/access/gin/gininsert.c | 7 +
src/backend/access/gist/gistbuild.c | 15 +-
src/backend/access/spgist/spginsert.c | 8 +-
also some changes in src/backend/storage/smgr/smgr.c
```
When a GIN index is built, for example, it is built by inserting the entries into the index more or
less normally, but without WAL-logging anything. After the index has been built, we iterate through
all pages and write them to the WAL. That doesn't work for Neon, because if a page is not WAL-logged
and is evicted from the buffer cache, it is lost. We have a check to catch that in the Neon
extension. To fix that, we've added a few functions to track explicitly when we're performing such
an operation: `smgr_start_unlogged_build`, `smgr_finish_unlogged_build_phase_1` and
`smgr_end_unlogged_build`.
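As a rough illustration of this bracketing idea, an unlogged index build can be modeled as a small state machine that the smgr layer consults before evicting pages. Only the three function names come from the text above; the enum, the function bodies, and the eviction check below are invented for the sketch.
```c
/* Toy sketch of the unlogged-build bracketing; the real
 * smgr_start_unlogged_build() etc. live in the smgr layer and the Neon
 * extension, so this state machine is purely illustrative. */
#include <stdio.h>

typedef enum
{
    BUILD_NONE,              /* normal, WAL-logged operation */
    BUILD_UNLOGGED,          /* index pages are being built without WAL */
    BUILD_UNLOGGED_PHASE_2   /* pages are now being WAL-logged and flushed */
} UnloggedBuildPhase;

static UnloggedBuildPhase build_phase = BUILD_NONE;

static void start_unlogged_build(void)          { build_phase = BUILD_UNLOGGED; }
static void finish_unlogged_build_phase_1(void) { build_phase = BUILD_UNLOGGED_PHASE_2; }
static void end_unlogged_build(void)            { build_phase = BUILD_NONE; }

/* During an unlogged build, evicting a page that was never WAL-logged would
 * lose it, so a Neon-like smgr needs to know about the current phase. */
static int
eviction_would_lose_data(void)
{
    return build_phase == BUILD_UNLOGGED;
}

int
main(void)
{
    start_unlogged_build();
    printf("evicting now loses data: %d\n", eviction_would_lose_data());
    finish_unlogged_build_phase_1();
    end_unlogged_build();
    printf("evicting now loses data: %d\n", eviction_would_lose_data());
    return 0;
}
```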
5. Maintain last written LSN.
- Why?
When compute node requests page from page server, we need to specify LSN. Ideally it should be LSN
of WAL record performing last update of this pages. But we do not know it, because we do not have page.
We can use current WAL flush position, but in this case there is high probability that page server
will be blocked until this peace of WAL is delivered.
As better approximation we can keep max LSN of written page. It will be better to take in account LSNs only of evicted pages,
but SMGR API doesn't provide such knowledge.
### How to get rid of the patch
- Alternatives?
Maintain map of LSNs of evicted pages.
I think it would make sense to be more explicit about that in PostgreSQL too. So extract these
changes to a patch and post to pgsql-hackers.
6. Launching Postgres without WAL.
- Why?
According to Zenith architecture compute node is stateless. So when we are launching
compute node, we need to provide some dummy PG_DATADIR. Relation pages
can be requested on demand from page server. But Postgres still need some non-relational data:
control and configuration files, SLRUs,...
It is currently implemented using basebackup (do not mix with pg_basebackup) which is created
by pageserver. It includes in this tarball config/control files, SLRUs and required directories.
As far as pageserver do not have original (non-scattered) WAL segments, it includes in
this tarball dummy WAL segment which contains only SHUTDOWN_CHECKPOINT record at the beginning of segment,
which redo field points to the end of wal. It allows to load checkpoint record in more or less
standard way with minimal changes of Postgres, but then some special handling is needed,
including restoring previous record position from zenith.signal file.
Also we have to correctly initialize header of last WAL page (pointed by checkpoint.redo)
to pass checks performed by XLogReader.
## Track last-written page LSN
- Alternatives?
We may not include fake WAL segment in tarball at all and modify xlog.c to load checkpoint record
in special way. But it may only increase number of changes in xlog.c
```
src/backend/commands/dbcommands.c | 17 +-
7. Add redo_read_buffer_filter callback to XLogReadBufferForRedoExtended
- Why?
We need a way in wal-redo Postgres to ignore pages which are not requested by pageserver.
So wal-redo Postgres reconstructs only requested page and for all other returns BLK_DONE
which means that recovery for them is not needed.
Also one call to SetLastWrittenPageLSN() in spginsert.c, maybe elsewhere too
```
- Alternatives?
No
Whenever a page is evicted from the buffer cache, we remember its LSN, so that we can use the same
LSN in the GetPage@LSN request when reading the page back from the page server. The value is
conservative: it would be correct to always use the last-inserted LSN, but it would be slow because
then the page server would need to wait for the recent WAL to be streamed and processed, before
responding to any GetPage@LSN request.
8. Enforce WAL logging of sequence updates.
- Why?
Due to performance reasons Postgres don't want to log each fetching of a value from a sequence,
so we pre-log a few fetches in advance. In the event of crash we can lose
(skip over) as many values as we pre-logged.
But it doesn't work with Zenith because page with sequence value can be evicted from buffer cache
and we will get a gap in sequence values even without crash.
The last-written page LSN is mostly tracked in the smgrwrite() function, without core code changes,
but there are a few exceptions where we've had to add explicit calls to the Neon-specific
SetLastWrittenPageLSN() function.
- Alternatives:
Do not try to preserve sequential order but avoid performance penalty.
There's an open PR to track the LSN in a more fine-grained fashion:
https://github.com/neondatabase/postgres/pull/177
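The following is a minimal standalone sketch of the coarse-grained variant described above. It uses a fake integer LSN type instead of XLogRecPtr, no locking, and invented function names (only SetLastWrittenPageLSN appears in the real code); it only illustrates why the tracked value is a safe, conservative LSN for GetPage@LSN requests.
```c
/* Minimal sketch, not the Neon implementation: the real code lives in the
 * neon smgr hooks and uses XLogRecPtr plus proper synchronization. */
#include <stdint.h>
#include <stdio.h>

typedef uint64_t FakeLsn;   /* stand-in for XLogRecPtr */

static FakeLsn last_written_lsn = 0;

/* Called on the smgrwrite() path when a dirty page leaves the buffer cache. */
static void
set_last_written_page_lsn(FakeLsn page_lsn)
{
    if (page_lsn > last_written_lsn)
        last_written_lsn = page_lsn;
}

/* Used to build a GetPage@LSN request: asking at this LSN is safe, and does
 * not force the pageserver to wait for the very latest WAL to arrive. */
static FakeLsn
request_lsn_for_read(void)
{
    return last_written_lsn;
}

int
main(void)
{
    set_last_written_page_lsn(0x16B5BA8);
    set_last_written_page_lsn(0x16B0000);   /* older eviction, ignored */
    printf("GetPage@LSN would use %llx\n",
           (unsigned long long) request_lsn_for_read());
    return 0;
}
```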
PostgreSQL v15 introduces a new method to do CREATE DATABASE that WAL-logs the database instead of
relying copying files and checkpoint. With that method, we probably won't need any special handling.
The old method is still available, though.
### How to get rid of the patch
Wait until v15?
9. Treat unlogged tables as normal (permanent) tables.
- Why?
Unlogged tables are not transient, so them have to survive node restart (unlike temporary tables).
But as far as compute node is stateless, we need to persist their data to storage node.
And it can only be done through the WAL.
## Cache relation sizes
- Alternatives?
* Store unlogged tables locally (violates requirement of stateless compute nodes).
* Prohibit unlogged tables at all.
The Neon extension contains a little cache for smgrnblocks() and smgrexists() calls, to avoid going
to the page server every time. It might be useful to cache those in PostgreSQL, maybe in the
relcache? (I think we do cache nblocks in relcache already, check why that's not good enough for
Neon)
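For illustration only, a toy version of such a cache might look like the sketch below. The real cache in the Neon extension keys on the full relation identity and fork number and has proper invalidation, none of which is shown here; the fixed-size direct-mapped table is an assumption made for brevity.
```c
/* Toy sketch of an nblocks cache keyed by relation OID only. */
#include <stdint.h>
#include <stdio.h>

#define CACHE_SLOTS 128

typedef struct
{
    uint32_t rel_oid;    /* 0 means empty slot */
    uint32_t nblocks;
} RelSizeEntry;

static RelSizeEntry rel_size_cache[CACHE_SLOTS];

static void
cache_set_nblocks(uint32_t rel_oid, uint32_t nblocks)
{
    RelSizeEntry *slot = &rel_size_cache[rel_oid % CACHE_SLOTS];
    slot->rel_oid = rel_oid;
    slot->nblocks = nblocks;
}

/* Returns 1 on a hit; a miss means we must ask the pageserver. */
static int
cache_get_nblocks(uint32_t rel_oid, uint32_t *nblocks)
{
    RelSizeEntry *slot = &rel_size_cache[rel_oid % CACHE_SLOTS];
    if (slot->rel_oid != rel_oid)
        return 0;
    *nblocks = slot->nblocks;
    return 1;
}

int
main(void)
{
    uint32_t n;
    cache_set_nblocks(16384, 42);
    if (cache_get_nblocks(16384, &n))
        printf("cached nblocks = %u\n", n);
    return 0;
}
```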
10. Support start Postgres in wal-redo mode
- Why?
To be able to apply WAL record and reconstruct pages at page server.
## Misc change in vacuumlazy.c
- Alternatives?
* Rewrite redo handlers in Rust
* Do not reconstruct pages at page server at all and do it at compute node.
```
index 8aab6e324e..c684c4fbee 100644
--- a/src/backend/access/heap/vacuumlazy.c
+++ b/src/backend/access/heap/vacuumlazy.c
@@ -1487,7 +1487,10 @@ lazy_scan_heap(LVRelState *vacrel, VacuumParams *params, bool aggressive)
else if (all_visible_according_to_vm && !PageIsAllVisible(page)
&& VM_ALL_VISIBLE(vacrel->rel, blkno, &vmbuffer))
{
- elog(WARNING, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u",
+ /* ZENITH-XXX: all visible hint is not wal-logged
+ * FIXME: Replay visibilitymap changes in pageserver
+ */
+ elog(DEBUG1, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u",
vacrel->relname, blkno);
visibilitymap_clear(vacrel->rel, blkno, vmbuffer,
VISIBILITYMAP_VALID_BITS);
```
11. WAL proposer
- Why?
WAL proposer is communicating with safekeeper and ensures WAL durability by quorum writes.
It is currently implemented as patch to standard WAL sender.
- Alternatives?
Can be moved to extension if some extra callbacks will be added to wal sender code.
Is this still needed? If that WARNING happens, it looks like potential corruption that we should
fix!
12. Secure Computing BPF API wrapper.
- Why?
Pageserver delegates complex WAL decoding duties to Postgres,
which means that the latter might fall victim to carefully designed
malicious WAL records and start doing harmful things to the system.
To prevent this, it has been decided to limit possible interactions
with the outside world using the Secure Computing BPF mode.
## Use buffer manager when extending VM or FSM
- Alternatives:
* Rewrite redo handlers in Rust.
* Add more checks to guarantee correctness of WAL records.
* Move seccomp.c to extension
* Many other discussed approaches to neutralize incorrect WAL records vulnerabilities.
```
src/backend/storage/freespace/freespace.c | 14 +-
src/backend/access/heap/visibilitymap.c | 15 +-
diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c
index e198df65d8..addfe93eac 100644
--- a/src/backend/access/heap/visibilitymap.c
+++ b/src/backend/access/heap/visibilitymap.c
@@ -652,10 +652,19 @@ vm_extend(Relation rel, BlockNumber vm_nblocks)
/* Now extend the file */
while (vm_nblocks_now < vm_nblocks)
{
- PageSetChecksumInplace((Page) pg.data, vm_nblocks_now);
+ /*
+ * ZENITH: Initialize VM pages through buffer cache to prevent loading
+ * them from pageserver.
+ */
+ Buffer buffer = ReadBufferExtended(rel, VISIBILITYMAP_FORKNUM, P_NEW,
+ RBM_ZERO_AND_LOCK, NULL);
+ Page page = BufferGetPage(buffer);
+
+ PageInit((Page) page, BLCKSZ, 0);
+ PageSetChecksumInplace(page, vm_nblocks_now);
+ MarkBufferDirty(buffer);
+ UnlockReleaseBuffer(buffer);
- smgrextend(rel->rd_smgr, VISIBILITYMAP_FORKNUM, vm_nblocks_now,
- pg.data, false);
vm_nblocks_now++;
}
```
### Problem we're trying to solve
???
### How to get rid of the patch
Maybe this would be a reasonable change in PostgreSQL too?
13. Callbacks for replica feedbacks
- Why?
Allowing waproposer to interact with walsender code.
## Allow startup without reading checkpoint record
- Alternatives
Copy walsender code to walproposer.
In Neon, the compute node is stateless, so when we launch a compute node we need to provide some dummy PG_DATADIR. Relation pages can be requested on demand from the page server, but Postgres still needs some non-relational data: control and configuration files, SLRUs, and so on. This is currently implemented using a basebackup (not to be confused with pg_basebackup) which is created by the pageserver. The tarball includes config/control files, SLRUs and the required directories.
As the pageserver does not have the original WAL segments, the basebackup tarball includes an empty WAL segment to bootstrap WAL writing, but it doesn't contain the checkpoint record. There are some changes in xlog.c to allow starting the compute node without reading the last checkpoint record from WAL.
This includes code to read the `zenith.signal` file, which tells the startup code the LSN to start
at. When the `zenith.signal` file is present, the startup uses that LSN instead of the last
checkpoint's LSN. The system is known to be consistent at that LSN, without any WAL redo.
14. Support multiple SMGR implementations.
- Why?
Postgres provides abstract API for storage manager but it has only one implementation
and provides no way to replace it with custom storage manager.
### How to get rid of the patch
- Alternatives?
None.
???
15. Calculate database size as sum of all database relations.
- Why?
Postgres is calculating database size by traversing data directory
but as far as Zenith compute node is stateless we can not do it.
### Alternatives
- Alternatives?
Send this request directly to pageserver and calculate real (physical) size
of Zenith representation of database/timeline, rather than sum logical size of all relations.
Include a fake checkpoint record in the tarball. Creating fake WAL is a bit risky, though; I'm
afraid it might accidentally get streamed to the safekeepers and overwrite or corrupt the real WAL.
## Disable sequence caching
```
diff --git a/src/backend/commands/sequence.c b/src/backend/commands/sequence.c
index 0415df9ccb..9f9db3c8bc 100644
--- a/src/backend/commands/sequence.c
+++ b/src/backend/commands/sequence.c
@@ -53,7 +53,9 @@
* so we pre-log a few fetches in advance. In the event of
* crash we can lose (skip over) as many values as we pre-logged.
*/
-#define SEQ_LOG_VALS 32
+/* Zenith XXX: to ensure sequence order of sequence in Zenith we need to WAL log each sequence update. */
+/* #define SEQ_LOG_VALS 32 */
+#define SEQ_LOG_VALS 0
```
For performance reasons, Postgres doesn't want to WAL-log each fetch of a value from a sequence, so it pre-logs a few fetches in advance. In the event of a crash, we can lose (skip over) as many values as we pre-logged. But with Neon, because the page holding the sequence value can be evicted from the buffer cache, we can get a gap in sequence values even without a crash.
### How to get rid of the patch
Maybe we can just remove it, and accept the gaps. Or add some special handling for sequence
relations in the Neon extension, to WAL log the sequence page when it's about to be evicted. It
would be weird if the sequence moved backwards though, think of PITR.
Or add a GUC for the amount to pre-log to PostgreSQL, and force it to 1 in Neon.
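A minimal sketch of that last idea (turning the compile-time SEQ_LOG_VALS constant into a runtime knob) is shown below. The variable and helper are hypothetical and do not use the real PostgreSQL GUC machinery; the value 0 matches what the current Neon patch hard-codes.
```c
/* Sketch only: in vanilla PostgreSQL this is "#define SEQ_LOG_VALS 32"
 * in sequence.c; the runtime knob below is made up for illustration. */
#include <stdio.h>

static int seq_log_vals = 32;

/* A storage engine that cannot tolerate unlogged sequence fetches
 * (like Neon) would force this to 0 so every nextval() is WAL-logged. */
static void
apply_storage_profile(int requires_full_logging)
{
    seq_log_vals = requires_full_logging ? 0 : 32;
}

int
main(void)
{
    apply_storage_profile(1);
    printf("values pre-logged per WAL record: %d\n", seq_log_vals);
    return 0;
}
```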
-----------------------------------------------
Not currently committed but proposed:
## Walproposer
1. Disable ring buffer buffer manager strategies
- Why?
Postgres tries to avoid cache flushing by bulk operations (copy, seqscan, vacuum,...).
Even if there are free space in buffer cache, pages may be evicted.
Negative effect of it can be somehow compensated by file system cache, but in case of Zenith
cost of requesting page from page server is much higher.
```
src/Makefile | 1 +
src/backend/replication/libpqwalproposer/Makefile | 37 +
src/backend/replication/libpqwalproposer/libpqwalproposer.c | 416 ++++++++++++
src/backend/postmaster/bgworker.c | 4 +
src/backend/postmaster/postmaster.c | 6 +
src/backend/replication/Makefile | 4 +-
src/backend/replication/walproposer.c | 2350 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
src/backend/replication/walproposer_utils.c | 402 +++++++++++
src/backend/replication/walreceiver.c | 7 +
src/backend/replication/walsender.c | 320 ++++++---
src/backend/storage/ipc/ipci.c | 6 +
src/include/replication/walproposer.h | 565 ++++++++++++++++
```
- Alternatives?
Instead of just prohibiting ring buffer we may try to implement more flexible eviction policy,
for example copy evicted page from ring buffer to some other buffer if there is free space
in buffer cache.
The WAL proposer communicates with the safekeepers and ensures WAL durability by quorum writes. It is
currently implemented as a patch to the standard WAL sender.
2. Disable marking page as dirty when hint bits are set.
- Why?
Postgres has to modify page twice: first time when some tuple is updated and second time when
hint bits are set. Wal logging hint bits updates requires FPI which significantly increase size of WAL.
### How to get rid of the patch
- Alternatives?
Add special WAL record for setting page hints.
Refactor into an extension. Submit hooks or APIs into upstream if necessary.
3. Prefetching
- Why?
As far as pages in Zenith are loaded on demand, to reduce node startup time
and also speedup some massive queries we need some mechanism for bulk loading to
reduce page request round-trip overhead.
@MMeent did some work on this already: https://github.com/neondatabase/postgres/pull/96
Currently Postgres is supporting prefetching only for bitmap scan.
In Zenith we also use prefetch for sequential and index scan. For sequential scan we prefetch
some number of following pages. For index scan we prefetch pages of heap relation addressed by TIDs.
## Ignore unexpected data beyond EOF in bufmgr.c
4. Prewarming.
- Why?
Short downtime (or, in other words, fast compute node restart time) is one of the key feature of Zenith.
But overhead of request-response round-trip for loading pages on demand can make started node warm-up quite slow.
We can capture state of compute node buffer cache and send bulk request for this pages at startup.
```
@@ -922,11 +928,14 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
*/
bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
if (!PageIsNew((Page) bufBlock))
- ereport(ERROR,
+ {
+ // XXX-ZENITH
+ MemSet((char *) bufBlock, 0, BLCKSZ);
+ ereport(DEBUG1,
(errmsg("unexpected data beyond EOF in block %u of relation %s",
blockNum, relpath(smgr->smgr_rnode, forkNum)),
errhint("This has been seen to occur with buggy kernels; consider updating your system.")));
-
+ }
/*
* We *must* do smgrextend before succeeding, else the page will not
* be reserved by the kernel, and the next P_NEW call will decide to
```
PostgreSQL is a bit sloppy with extending relations. Usually, the relation is extended with zeros
first, then the page is filled, and finally the new page is WAL-logged. But if multiple backends extend
a relation at the same time, the pages can be WAL-logged in a different order.
I'm not sure what scenario exactly required this change in Neon, though.
### How to get rid of the patch
Submit patches to pgsql-hackers, to tighten up the WAL-logging around relation extension. It's a bit
confusing even in PostgreSQL. Maybe WAL log the intention to extend first, then extend the relation,
and finally WAL-log that the extension succeeded.
## Make smgr interface available to extensions
```
src/backend/storage/smgr/smgr.c | 203 +++---
src/include/storage/smgr.h | 72 +-
```
### How to get rid of the patch
Submit to upstream. This could be useful for the Disk Encryption patches too, or for compression.
## Added relpersistence argument to smgropen()
```
src/backend/access/heap/heapam_handler.c | 2 +-
src/backend/catalog/storage.c | 10 +-
src/backend/commands/tablecmds.c | 2 +-
src/backend/storage/smgr/md.c | 4 +-
src/include/utils/rel.h | 3 +-
```
Neon needs to treat unlogged relations differently from others, so the smgrread(), smgrwrite() etc.
implementations need to know the 'relpersistence' of the relation. To get that information where
it's needed, we added a 'relpersistence' argument to smgropen().
### How to get rid of the patch
Maybe 'relpersistence' would be useful in PostgreSQL for debugging purposes? Or simply for the
benefit of extensions like Neon. Should consider this in the patch to make smgr API usable to
extensions.
## Alternatives
Currently in Neon, unlogged tables live on local disk in the compute node, and are wiped away on
compute node restart. One alternative would be to instead WAL-log even unlogged tables, essentially
ignoring the UNLOGGED option. Or prohibit UNLOGGED tables completely. But would we still need the
relpersistence argument to handle index builds? See item on "Mark index builds that use buffer
manager without logging explicitly".
## Use smgr and dbsize_hook for size calculations
```
src/backend/utils/adt/dbsize.c | 61 +-
```
In PostgreSQL, the rel and db-size functions scan the data directory directly. That won't work in Neon.
### How to get rid of the patch
Send patch to PostgreSQL, to use smgr API functions for relation size calculation instead. Maybe as
part of the general smgr API patch.
# WAL redo process changes
Pageserver delegates complex WAL decoding duties to Postgres, which means that the latter might fall
victim to carefully designed malicious WAL records and start doing harmful things to the system. To
prevent this, the redo functions are executed in a separate process that is sandboxed with Linux
Secure Computing mode (see seccomp(2) man page).
As an alternative to having a separate WAL redo process, we could rewrite all redo handlers in Rust.
This is infeasible in practice: it would take a lot of effort to rewrite them and to ensure that the
rewrite is correct, and once that was done, it would be a lot of ongoing maintenance effort to
keep the rewritten code in sync over time, across new PostgreSQL versions. That's why we want to
leverage the PostgreSQL code.
Another alternative would be to harden all the PostgreSQL WAL redo functions so that it would be
safe to call them directly from Rust code, without needing the security sandbox. That's not feasible
for similar reasons as rewriting them in Rust.
## Don't replay changes in XLogReadBufferForRedo that are not for the target page we're replaying
```
src/backend/access/gin/ginxlog.c | 19 +-
Also some changes in xlog.c and xlogutils.c
Example:
@@ -415,21 +416,27 @@ ginRedoSplit(XLogReaderState *record)
if (!isLeaf)
ginRedoClearIncompleteSplit(record, 3);
- if (XLogReadBufferForRedo(record, 0, &lbuffer) != BLK_RESTORED)
+ action = XLogReadBufferForRedo(record, 0, &lbuffer);
+ if (action != BLK_RESTORED && action != BLK_DONE)
elog(ERROR, "GIN split record did not contain a full-page image of left page");
```
### Problem we're trying to solve
In PostgreSQL, if a WAL redo function calls XLogReadBufferForRedo() for a page that has a full-page
image, it always succeeds. However, the Neon WAL redo process is only concerned with replaying changes
to a single page, so replaying any changes for other pages is a waste of cycles. We have modified
XLogReadBufferForRedo() to return BLK_DONE for all other pages, to avoid the overhead. That is
unexpected by code like the above.
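As a standalone illustration of this single-page policy, the decision boils down to comparing each block referenced by a WAL record against the one requested page. The types and functions below are invented for the example; the actual change threads a redo_read_buffer_filter callback through XLogReadBufferForRedoExtended(), as noted elsewhere in this document.
```c
/* Illustrative sketch, not PostgreSQL code. */
#include <stdint.h>
#include <stdio.h>

typedef enum { BLK_NEEDS_REDO, BLK_DONE, BLK_RESTORED, BLK_NOTFOUND } XLogRedoAction;

typedef struct
{
    uint32_t rel_oid;
    uint32_t block_no;
} PageRef;

/* The single page the WAL redo process was asked to reconstruct. */
static PageRef target_page;

/* Only the target page is actually redone; everything else is reported
 * as already done so the redo function skips it. */
static XLogRedoAction
read_buffer_for_redo(PageRef referenced)
{
    if (referenced.rel_oid == target_page.rel_oid &&
        referenced.block_no == target_page.block_no)
        return BLK_NEEDS_REDO;
    return BLK_DONE;
}

int
main(void)
{
    target_page = (PageRef){ .rel_oid = 16384, .block_no = 7 };
    printf("target block action: %d\n", read_buffer_for_redo((PageRef){ 16384, 7 }));
    printf("other block action:  %d\n", read_buffer_for_redo((PageRef){ 16384, 8 }));
    return 0;
}
```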
### How to get rid of the patch
Submit the changes to upstream, hope the community accepts them. There's no harm to PostgreSQL from
these changes, although they don't have any benefit either.
To make these changes useful to upstream PostgreSQL, we could implement a feature to look ahead in the
WAL and detect truncated relations. Even in PostgreSQL, it is a waste of cycles to replay changes
to pages that are later truncated away, so we could have XLogReadBufferForRedo() return BLK_DONE or
BLK_NOTFOUND for pages that are known to be truncated away later in the WAL stream.
### Alternatives
Maybe we could revert this optimization, and restore pages other than the target page too.
## Add predefined_sysidentifier flag to initdb
```
src/backend/bootstrap/bootstrap.c | 13 +-
src/bin/initdb/initdb.c | 4 +
And some changes in xlog.c
```
This is used to help with restoring a database when you have all the WAL, all the way back to
initdb, but no backup. You can reconstruct the missing backup by running initdb again, with the same
sysidentifier.
### How to get rid of the patch
Ignore it. This is only needed for disaster recovery, so once we've eliminated all other Postgres
patches, we can just keep it around as a patch or as a separate branch in a repo.
# Not currently committed but proposed
## Disable ring buffer buffer manager strategies
### Why?
Postgres tries to avoid flushing the buffer cache during bulk operations (copy, seqscan, vacuum, ...).
Even if there is free space in the buffer cache, pages may be evicted.
The negative effect of this can be partly compensated by the file system cache, but in Neon,
the cost of requesting a page from the page server is much higher.
### Alternatives?
Instead of just prohibiting the ring buffer, we could implement a more flexible eviction policy,
for example copying an evicted page from the ring buffer to some other buffer if there is free space
in the buffer cache.
## Disable marking page as dirty when hint bits are set.
### Why?
Postgres has to modify the page twice: first when a tuple is updated and a second time when
hint bits are set. WAL-logging hint bit updates requires an FPI, which significantly increases the size of the WAL.
### Alternatives?
Add special WAL record for setting page hints.
## Prefetching
### Why?
Since pages in Neon are loaded on demand, we need some mechanism for bulk loading to reduce the
page-request round-trip overhead, both to shorten node startup time and to speed up some heavy queries.
Currently Postgres supports prefetching only for bitmap scans.
In Neon we should also use prefetching for sequential and index scans, because the OS is not doing it for us.
For sequential scans we could prefetch some number of following pages. For index scans we could prefetch
the heap pages addressed by TIDs.
## Prewarming
### Why?
Short downtime (in other words, fast compute node restart time) is one of the key features of Zenith.
But the overhead of the request-response round-trip for loading pages on demand can make a freshly started node warm up quite slowly.
We can capture the state of the compute node's buffer cache and send a bulk request for those pages at startup.

View File

@@ -75,7 +75,7 @@ layer's Segment and range of LSNs.
There are two kinds of layers, in-memory and on-disk layers. In-memory
layers are used to ingest incoming WAL, and provide fast access
to the recent page versions. On-disk layers are stored as files on disk, and
are immutable. See pageserver/src/layered_repository/README.md for more.
are immutable. See [pageserver-storage.md](./pageserver-storage.md) for more.
### Layer file (on-disk layer)
@@ -111,7 +111,7 @@ PostgreSQL LSNs and functions to monitor them:
* `pg_last_wal_replay_lsn ()` - Returns the last write-ahead log location that has been replayed during recovery. If recovery is still in progress this will increase monotonically.
[source PostgreSQL documentation](https://www.postgresql.org/docs/devel/functions-admin.html):
Neon safekeeper LSNs. For more check [safekeeper/README_PROTO.md](/safekeeper/README_PROTO.md)
Neon safekeeper LSNs. See [safekeeper protocol section](safekeeper-protocol.md) for more information.
* `CommitLSN`: position in WAL confirmed by quorum safekeepers.
* `RestartLSN`: position in WAL confirmed by all safekeepers.
* `FlushLSN`: part of WAL persisted to the disk by safekeeper.

View File

@@ -68,8 +68,6 @@ There are the following implementations present:
* local filesystem — to use in tests mainly
* AWS S3 - to use in production
Implementation details are covered in the [backup readme](./src/remote_storage/README.md) and corresponding Rust file docs, parameters documentation can be found at [settings docs](../docs/settings.md).
The backup service is disabled by default and can be enabled to interact with a single remote storage.
CLI examples:
@@ -118,7 +116,7 @@ implemented by the LayeredRepository object in
`layered_repository.rs`. There is only that one implementation of the
Repository trait, but it's still a useful abstraction that keeps the
interface for the low-level storage functionality clean. The layered
storage format is described in layered_repository/README.md.
storage format is described in [pageserver-storage.md](./pageserver-storage.md).
Each repository consists of multiple Timelines. Timeline is a
workhorse that accepts page changes from the WAL, and serves

View File

@@ -15,7 +15,7 @@ listen_pg_addr = '127.0.0.1:64000'
listen_http_addr = '127.0.0.1:9898'
checkpoint_distance = '268435456' # in bytes
checkpoint_period = '1 s'
checkpoint_timeout = '10m'
gc_period = '100 s'
gc_horizon = '67108864'
@@ -46,7 +46,7 @@ Note the `[remote_storage]` section: it's a [table](https://toml.io/en/v1.0.0#ta
All values can be passed as an argument to the pageserver binary, using the `-c` parameter and specified as a valid TOML string. All tables should be passed in the inline form.
Example: `${PAGESERVER_BIN} -c "checkpoint_period = '100 s'" -c "remote_storage={local_path='/some/local/path/'}"`
Example: `${PAGESERVER_BIN} -c "checkpoint_timeout = '10 m'" -c "remote_storage={local_path='/some/local/path/'}"`
Note that TOML distinguishes between strings and integers, the former require single or double quotes around them.
@@ -82,6 +82,14 @@ S3.
The unit is # of bytes.
#### checkpoint_timeout
Apart from `checkpoint_distance`, open layer flushing is also triggered
`checkpoint_timeout` after the last flush. This ensures that the WAL is eventually
uploaded to S3 when activity stops.
The default is 10m.
#### compaction_period
Every `compaction_period` seconds, the page server checks if

View File

@@ -28,7 +28,7 @@ The pageserver has a few different duties:
- Receive WAL from the WAL service and decode it.
- Replay WAL that's applicable to the chunks that the Page Server maintains
For more detailed info, see [/pageserver/README](/pageserver/README.md)
For more detailed info, see [pageserver-services.md](./pageserver-services.md)
`/proxy`:
@@ -57,7 +57,7 @@ PostgreSQL extension that contains functions needed for testing and debugging.
The zenith WAL service that receives WAL from a primary compute node and streams it to the pageserver.
It acts as a holding area and redistribution center for recently generated WAL.
For more detailed info, see [/safekeeper/README](/safekeeper/README.md)
For more detailed info, see [walservice.md](./walservice.md)
`/workspace_hack`:
The workspace_hack crate exists only to pin down some dependencies.

View File

@@ -75,8 +75,8 @@ safekeepers. The Paxos and crash recovery algorithm ensures that only
one primary node can be actively streaming WAL to the quorum of
safekeepers.
See README_PROTO.md for a more detailed description of the consensus
protocol. spec/ contains TLA+ specification of it.
See [this section](safekeeper-protocol.md) for a more detailed description of
the consensus protocol. The spec/ directory contains a TLA+ specification of it.
# Q&A

View File

@@ -9,7 +9,7 @@
serde = { version = "1.0", features = ["derive"] }
serde_json = "1"
serde_with = "1.12.0"
once_cell = "1.8.0"
once_cell = "1.13.0"
utils = { path = "../utils" }
workspace_hack = { version = "0.1", path = "../../workspace_hack" }

View File

@@ -6,6 +6,5 @@ edition = "2021"
[dependencies]
prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
libc = "0.2"
lazy_static = "1.4"
once_cell = "1.8.0"
once_cell = "1.13.0"
workspace_hack = { version = "0.1", path = "../../workspace_hack" }

View File

@@ -2,7 +2,10 @@
//! make sure that we use the same dep version everywhere.
//! Otherwise, we might not see all metrics registered via
//! a default registry.
use lazy_static::lazy_static;
use once_cell::sync::Lazy;
use prometheus::core::{AtomicU64, GenericGauge, GenericGaugeVec};
pub use prometheus::opts;
pub use prometheus::register;
pub use prometheus::{core, default_registry, proto};
pub use prometheus::{exponential_buckets, linear_buckets};
pub use prometheus::{register_gauge, Gauge};
@@ -18,6 +21,17 @@ pub use prometheus::{Encoder, TextEncoder};
mod wrappers;
pub use wrappers::{CountedReader, CountedWriter};
pub type UIntGauge = GenericGauge<AtomicU64>;
pub type UIntGaugeVec = GenericGaugeVec<AtomicU64>;
#[macro_export]
macro_rules! register_uint_gauge_vec {
($NAME:expr, $HELP:expr, $LABELS_NAMES:expr $(,)?) => {{
let gauge_vec = UIntGaugeVec::new($crate::opts!($NAME, $HELP), $LABELS_NAMES).unwrap();
$crate::register(Box::new(gauge_vec.clone())).map(|_| gauge_vec)
}};
}
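Hypothetical usage of the macro defined above (the metric name and label are made up for illustration; assumes `UIntGaugeVec` and the macro are in scope from this crate):

use once_cell::sync::Lazy;

// Illustrative metric; not part of the actual codebase.
static EXAMPLE_IO_BYTES: Lazy<UIntGaugeVec> = Lazy::new(|| {
    register_uint_gauge_vec!(
        "example_io_bytes_total",
        "Bytes processed, grouped by operation (illustrative metric)",
        &["operation"]
    )
    .expect("failed to register example_io_bytes_total")
});

fn record_read(nbytes: u64) {
    EXAMPLE_IO_BYTES.with_label_values(&["read"]).add(nbytes);
}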
/// Gathers all Prometheus metrics and records the I/O stats just before that.
///
/// Metrics gathering is a relatively simple and standalone operation, so
@@ -27,19 +41,22 @@ pub fn gather() -> Vec<prometheus::proto::MetricFamily> {
prometheus::gather()
}
lazy_static! {
static ref DISK_IO_BYTES: IntGaugeVec = register_int_gauge_vec!(
static DISK_IO_BYTES: Lazy<IntGaugeVec> = Lazy::new(|| {
register_int_gauge_vec!(
"libmetrics_disk_io_bytes_total",
"Bytes written and read from disk, grouped by the operation (read|write)",
&["io_operation"]
)
.expect("Failed to register disk i/o bytes int gauge vec");
static ref MAXRSS_KB: IntGauge = register_int_gauge!(
.expect("Failed to register disk i/o bytes int gauge vec")
});
static MAXRSS_KB: Lazy<IntGauge> = Lazy::new(|| {
register_int_gauge!(
"libmetrics_maxrss_kb",
"Memory usage (Maximum Resident Set Size)"
)
.expect("Failed to register maxrss_kb int gauge");
}
.expect("Failed to register maxrss_kb int gauge")
});
pub const DISK_WRITE_SECONDS_BUCKETS: &[f64] = &[
0.000_050, 0.000_100, 0.000_500, 0.001, 0.003, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5,

View File

@@ -10,13 +10,13 @@ use std::io::{Read, Result, Write};
/// # use std::io::{Result, Read};
/// # use metrics::{register_int_counter, IntCounter};
/// # use metrics::CountedReader;
/// # use once_cell::sync::Lazy;
/// #
/// # lazy_static::lazy_static! {
/// # static ref INT_COUNTER: IntCounter = register_int_counter!(
/// # static INT_COUNTER: Lazy<IntCounter> = Lazy::new( || { register_int_counter!(
/// # "int_counter",
/// # "let's count something!"
/// # ).unwrap();
/// # }
/// # ).unwrap()
/// # });
/// #
/// fn do_some_reads(stream: impl Read, count: usize) -> Result<Vec<u8>> {
/// let mut reader = CountedReader::new(stream, |cnt| {
@@ -85,13 +85,13 @@ impl<T: Read> Read for CountedReader<'_, T> {
/// # use std::io::{Result, Write};
/// # use metrics::{register_int_counter, IntCounter};
/// # use metrics::CountedWriter;
/// # use once_cell::sync::Lazy;
/// #
/// # lazy_static::lazy_static! {
/// # static ref INT_COUNTER: IntCounter = register_int_counter!(
/// # static INT_COUNTER: Lazy<IntCounter> = Lazy::new( || { register_int_counter!(
/// # "int_counter",
/// # "let's count something!"
/// # ).unwrap();
/// # }
/// # ).unwrap()
/// # });
/// #
/// fn do_some_writes(stream: impl Write, payload: &[u8]) -> Result<()> {
/// let mut writer = CountedWriter::new(stream, |cnt| {

View File

@@ -12,7 +12,7 @@ byteorder = "1.4.3"
anyhow = "1.0"
crc32c = "0.6.0"
hex = "0.4.3"
lazy_static = "1.4"
once_cell = "1.13.0"
log = "0.4.14"
memoffset = "0.6.2"
thiserror = "1.0"

View File

@@ -49,12 +49,12 @@ fn main() {
// Finding the location of C headers for the Postgres server:
// - if POSTGRES_INSTALL_DIR is set, look into it; otherwise look into `<project_root>/tmp_install`
// - if there's a `bin/pg_config` file, use it to get the server include path; otherwise use `<project_root>/tmp_install/include/postgresql/server`
let mut pg_install_dir: PathBuf;
if let Some(postgres_install_dir) = env::var_os("POSTGRES_INSTALL_DIR") {
pg_install_dir = postgres_install_dir.into();
let mut pg_install_dir = if let Some(postgres_install_dir) = env::var_os("POSTGRES_INSTALL_DIR")
{
postgres_install_dir.into()
} else {
pg_install_dir = PathBuf::from("tmp_install")
}
PathBuf::from("tmp_install")
};
if pg_install_dir.is_relative() {
let cwd = env::current_dir().unwrap();

View File

@@ -2,7 +2,7 @@
//! Common utilities for dealing with PostgreSQL relation files.
//!
use crate::pg_constants;
use lazy_static::lazy_static;
use once_cell::sync::OnceCell;
use regex::Regex;
#[derive(Debug, Clone, thiserror::Error, PartialEq)]
@@ -54,11 +54,14 @@ pub fn forknumber_to_name(forknum: u8) -> Option<&'static str> {
/// See functions relpath() and _mdfd_segpath() in PostgreSQL sources.
///
pub fn parse_relfilename(fname: &str) -> Result<(u32, u8, u32), FilePathError> {
lazy_static! {
static ref RELFILE_RE: Regex =
Regex::new(r"^(?P<relnode>\d+)(_(?P<forkname>[a-z]+))?(\.(?P<segno>\d+))?$").unwrap();
}
static RELFILE_RE: OnceCell<Regex> = OnceCell::new();
RELFILE_RE.get_or_init(|| {
Regex::new(r"^(?P<relnode>\d+)(_(?P<forkname>[a-z]+))?(\.(?P<segno>\d+))?$").unwrap()
});
let caps = RELFILE_RE
.get()
.unwrap()
.captures(fname)
.ok_or(FilePathError::InvalidFileName)?;
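For orientation, a hedged example of what the parser above extracts (the wrapper function is illustrative):

fn parse_example() -> Result<(), FilePathError> {
    // "16384_vm.1" names segment 1 of the visibility-map ("vm") fork of relfilenode 16384.
    let (relnode, forknum, segno) = parse_relfilename("16384_vm.1")?;
    assert_eq!((relnode, segno), (16384, 1));
    // forknum holds the numeric id of the "vm" fork; forknumber_to_name maps it back.
    assert_eq!(forknumber_to_name(forknum), Some("vm"));
    Ok(())
}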

View File

@@ -13,24 +13,30 @@ use super::xlog_utils::*;
use super::XLogLongPageHeaderData;
use super::XLogPageHeaderData;
use super::XLogRecord;
use super::XLOG_PAGE_MAGIC;
use bytes::{Buf, BufMut, Bytes, BytesMut};
use crc32c::*;
use log::*;
use std::cmp::min;
use std::num::NonZeroU32;
use thiserror::Error;
use utils::lsn::Lsn;
enum State {
WaitingForRecord,
ReassemblingRecord {
recordbuf: BytesMut,
contlen: NonZeroU32,
},
SkippingEverything {
skip_until_lsn: Lsn,
},
}
pub struct WalStreamDecoder {
lsn: Lsn,
startlsn: Lsn, // LSN where this record starts
contlen: u32,
padlen: u32,
inputbuf: BytesMut,
/// buffer used to reassemble records that cross page boundaries.
recordbuf: BytesMut,
state: State,
}
#[derive(Error, Debug, Clone)]
@@ -48,13 +54,8 @@ impl WalStreamDecoder {
pub fn new(lsn: Lsn) -> WalStreamDecoder {
WalStreamDecoder {
lsn,
startlsn: Lsn(0),
contlen: 0,
padlen: 0,
inputbuf: BytesMut::new(),
recordbuf: BytesMut::new(),
state: State::WaitingForRecord,
}
}
@@ -67,6 +68,58 @@ impl WalStreamDecoder {
self.inputbuf.extend_from_slice(buf);
}
fn validate_page_header(&self, hdr: &XLogPageHeaderData) -> Result<(), WalDecodeError> {
let validate_impl = || {
if hdr.xlp_magic != XLOG_PAGE_MAGIC as u16 {
return Err(format!(
"invalid xlog page header: xlp_magic={}, expected {}",
hdr.xlp_magic, XLOG_PAGE_MAGIC
));
}
if hdr.xlp_pageaddr != self.lsn.0 {
return Err(format!(
"invalid xlog page header: xlp_pageaddr={}, expected {}",
hdr.xlp_pageaddr, self.lsn
));
}
match self.state {
State::WaitingForRecord => {
if hdr.xlp_info & XLP_FIRST_IS_CONTRECORD != 0 {
return Err(
"invalid xlog page header: unexpected XLP_FIRST_IS_CONTRECORD".into(),
);
}
if hdr.xlp_rem_len != 0 {
return Err(format!(
"invalid xlog page header: xlp_rem_len={}, but it's not a contrecord",
hdr.xlp_rem_len
));
}
}
State::ReassemblingRecord { contlen, .. } => {
if hdr.xlp_info & XLP_FIRST_IS_CONTRECORD == 0 {
return Err(
"invalid xlog page header: XLP_FIRST_IS_CONTRECORD expected, not found"
.into(),
);
}
if hdr.xlp_rem_len != contlen.get() {
return Err(format!(
"invalid xlog page header: xlp_rem_len={}, expected {}",
hdr.xlp_rem_len,
contlen.get()
));
}
}
State::SkippingEverything { .. } => {
panic!("Should not be validating page header in the SkippingEverything state");
}
};
Ok(())
};
validate_impl().map_err(|msg| WalDecodeError { msg, lsn: self.lsn })
}
/// Attempt to decode another WAL record from the input that has been fed to the
/// decoder so far.
///
@@ -76,128 +129,121 @@ impl WalStreamDecoder {
/// Err(WalDecodeError): an error occurred while decoding, meaning the input was invalid.
///
pub fn poll_decode(&mut self) -> Result<Option<(Lsn, Bytes)>, WalDecodeError> {
let recordbuf;
// Run state machine that validates page headers, and reassembles records
// that cross page boundaries.
loop {
// parse and verify page boundaries as we go
if self.padlen > 0 {
// We should first skip padding, as we may have to skip some page headers if we're processing the XLOG_SWITCH record.
if self.inputbuf.remaining() < self.padlen as usize {
return Ok(None);
}
// However, we may have to skip some page headers if we're processing the XLOG_SWITCH record or skipping padding for whatever reason.
match self.state {
State::WaitingForRecord | State::ReassemblingRecord { .. } => {
if self.lsn.segment_offset(pg_constants::WAL_SEGMENT_SIZE) == 0 {
// parse long header
// skip padding
self.inputbuf.advance(self.padlen as usize);
self.lsn += self.padlen as u64;
self.padlen = 0;
} else if self.lsn.segment_offset(pg_constants::WAL_SEGMENT_SIZE) == 0 {
// parse long header
if self.inputbuf.remaining() < XLOG_SIZE_OF_XLOG_LONG_PHD {
return Ok(None);
}
if self.inputbuf.remaining() < XLOG_SIZE_OF_XLOG_LONG_PHD {
return Ok(None);
}
let hdr = XLogLongPageHeaderData::from_bytes(&mut self.inputbuf).map_err(
|e| WalDecodeError {
msg: format!("long header deserialization failed {}", e),
lsn: self.lsn,
},
)?;
let hdr = XLogLongPageHeaderData::from_bytes(&mut self.inputbuf).map_err(|e| {
WalDecodeError {
msg: format!("long header deserialization failed {}", e),
lsn: self.lsn,
self.validate_page_header(&hdr.std)?;
self.lsn += XLOG_SIZE_OF_XLOG_LONG_PHD as u64;
} else if self.lsn.block_offset() == 0 {
if self.inputbuf.remaining() < XLOG_SIZE_OF_XLOG_SHORT_PHD {
return Ok(None);
}
let hdr =
XLogPageHeaderData::from_bytes(&mut self.inputbuf).map_err(|e| {
WalDecodeError {
msg: format!("header deserialization failed {}", e),
lsn: self.lsn,
}
})?;
self.validate_page_header(&hdr)?;
self.lsn += XLOG_SIZE_OF_XLOG_SHORT_PHD as u64;
}
})?;
if hdr.std.xlp_pageaddr != self.lsn.0 {
return Err(WalDecodeError {
msg: "invalid xlog segment header".into(),
lsn: self.lsn,
});
}
// TODO: verify the remaining fields in the header
self.lsn += XLOG_SIZE_OF_XLOG_LONG_PHD as u64;
continue;
} else if self.lsn.block_offset() == 0 {
if self.inputbuf.remaining() < XLOG_SIZE_OF_XLOG_SHORT_PHD {
return Ok(None);
}
let hdr = XLogPageHeaderData::from_bytes(&mut self.inputbuf).map_err(|e| {
WalDecodeError {
msg: format!("header deserialization failed {}", e),
lsn: self.lsn,
State::SkippingEverything { .. } => {}
}
match &mut self.state {
State::WaitingForRecord => {
// need to have at least the xl_tot_len field
if self.inputbuf.remaining() < 4 {
return Ok(None);
}
})?;
if hdr.xlp_pageaddr != self.lsn.0 {
return Err(WalDecodeError {
msg: "invalid xlog page header".into(),
lsn: self.lsn,
});
// peek xl_tot_len at the beginning of the record.
// FIXME: assumes little-endian
let xl_tot_len = (&self.inputbuf[0..4]).get_u32_le();
if (xl_tot_len as usize) < XLOG_SIZE_OF_XLOG_RECORD {
return Err(WalDecodeError {
msg: format!("invalid xl_tot_len {}", xl_tot_len),
lsn: self.lsn,
});
}
// Fast path for the common case that the whole record fits on the page.
let pageleft = self.lsn.remaining_in_block() as u32;
if self.inputbuf.remaining() >= xl_tot_len as usize && xl_tot_len <= pageleft {
self.lsn += xl_tot_len as u64;
let recordbuf = self.inputbuf.copy_to_bytes(xl_tot_len as usize);
return Ok(Some(self.complete_record(recordbuf)?));
} else {
// Need to assemble the record from pieces. Remember the size of the
// record, and loop back. On the next iteration, we will reach the
// `ReassemblingRecord` arm below, and copy the part of the record that was on this page
// to 'recordbuf'. Subsequent iterations will skip page headers, and
// append the continuations from the next pages to 'recordbuf'.
self.state = State::ReassemblingRecord {
recordbuf: BytesMut::with_capacity(xl_tot_len as usize),
contlen: NonZeroU32::new(xl_tot_len).unwrap(),
}
}
}
// TODO: verify the remaining fields in the header
State::ReassemblingRecord { recordbuf, contlen } => {
// we're continuing a record, possibly from previous page.
let pageleft = self.lsn.remaining_in_block() as u32;
self.lsn += XLOG_SIZE_OF_XLOG_SHORT_PHD as u64;
continue;
} else if self.contlen == 0 {
assert!(self.recordbuf.is_empty());
// read the rest of the record, or as much as fits on this page.
let n = min(contlen.get(), pageleft) as usize;
// need to have at least the xl_tot_len field
if self.inputbuf.remaining() < 4 {
return Ok(None);
if self.inputbuf.remaining() < n {
return Ok(None);
}
recordbuf.put(self.inputbuf.split_to(n));
self.lsn += n as u64;
*contlen = match NonZeroU32::new(contlen.get() - n as u32) {
Some(x) => x,
None => {
// The record is now complete.
let recordbuf = std::mem::replace(recordbuf, BytesMut::new()).freeze();
return Ok(Some(self.complete_record(recordbuf)?));
}
}
}
// peek xl_tot_len at the beginning of the record.
// FIXME: assumes little-endian
self.startlsn = self.lsn;
let xl_tot_len = (&self.inputbuf[0..4]).get_u32_le();
if (xl_tot_len as usize) < XLOG_SIZE_OF_XLOG_RECORD {
return Err(WalDecodeError {
msg: format!("invalid xl_tot_len {}", xl_tot_len),
lsn: self.lsn,
});
State::SkippingEverything { skip_until_lsn } => {
assert!(*skip_until_lsn >= self.lsn);
let n = skip_until_lsn.0 - self.lsn.0;
if self.inputbuf.remaining() < n as usize {
return Ok(None);
}
self.inputbuf.advance(n as usize);
self.lsn += n;
self.state = State::WaitingForRecord;
}
// Fast path for the common case that the whole record fits on the page.
let pageleft = self.lsn.remaining_in_block() as u32;
if self.inputbuf.remaining() >= xl_tot_len as usize && xl_tot_len <= pageleft {
// Take the record from the 'inputbuf', and validate it.
recordbuf = self.inputbuf.copy_to_bytes(xl_tot_len as usize);
self.lsn += xl_tot_len as u64;
break;
} else {
// Need to assemble the record from pieces. Remember the size of the
// record, and loop back. On next iteration, we will reach the 'else'
// branch below, and copy the part of the record that was on this page
// to 'recordbuf'. Subsequent iterations will skip page headers, and
// append the continuations from the next pages to 'recordbuf'.
self.recordbuf.reserve(xl_tot_len as usize);
self.contlen = xl_tot_len;
continue;
}
} else {
// we're continuing a record, possibly from previous page.
let pageleft = self.lsn.remaining_in_block() as u32;
// read the rest of the record, or as much as fits on this page.
let n = min(self.contlen, pageleft) as usize;
if self.inputbuf.remaining() < n {
return Ok(None);
}
self.recordbuf.put(self.inputbuf.split_to(n));
self.lsn += n as u64;
self.contlen -= n as u32;
if self.contlen == 0 {
// The record is now complete.
recordbuf = std::mem::replace(&mut self.recordbuf, BytesMut::new()).freeze();
break;
}
continue;
}
}
}
fn complete_record(&mut self, recordbuf: Bytes) -> Result<(Lsn, Bytes), WalDecodeError> {
// We now have a record in the 'recordbuf' local variable.
let xlogrec =
XLogRecord::from_slice(&recordbuf[0..XLOG_SIZE_OF_XLOG_RECORD]).map_err(|e| {
@@ -219,18 +265,20 @@ impl WalStreamDecoder {
// XLOG_SWITCH records are special. If we see one, we need to skip
// to the next WAL segment.
if xlogrec.is_xlog_switch_record() {
let next_lsn = if xlogrec.is_xlog_switch_record() {
trace!("saw xlog switch record at {}", self.lsn);
self.padlen = self.lsn.calc_padding(pg_constants::WAL_SEGMENT_SIZE as u64) as u32;
self.lsn + self.lsn.calc_padding(pg_constants::WAL_SEGMENT_SIZE as u64)
} else {
// Pad to an 8-byte boundary
self.padlen = self.lsn.calc_padding(8u32) as u32;
}
self.lsn.align()
};
self.state = State::SkippingEverything {
skip_until_lsn: next_lsn,
};
// We should return the LSN of the next record, not the last byte of this record or
// the byte immediately after. Note that this handles both XLOG_SWITCH and usual
// records; the former "spans" until the next WAL segment (see test_xlog_switch).
let result = (self.lsn + self.padlen as u64, recordbuf);
Ok(Some(result))
Ok((next_lsn, recordbuf))
}
}
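A minimal sketch of how a caller might drive the decoder shown above (the wrapper function is illustrative):

fn decode_stream(start_lsn: Lsn, chunks: &[&[u8]]) -> Result<Vec<(Lsn, Bytes)>, WalDecodeError> {
    let mut decoder = WalStreamDecoder::new(start_lsn);
    let mut records = Vec::new();
    for chunk in chunks {
        decoder.feed_bytes(chunk);
        // Drain every record that is now complete; Ok(None) means "feed more input".
        while let Some((next_lsn, rec)) = decoder.poll_decode()? {
            records.push((next_lsn, rec));
        }
    }
    Ok(records)
}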

View File

@@ -16,7 +16,7 @@ use crate::XLogRecord;
use crate::XLOG_PAGE_MAGIC;
use crate::pg_constants::WAL_SEGMENT_SIZE;
use anyhow::{bail, ensure};
use anyhow::{anyhow, bail, ensure};
use byteorder::{ByteOrder, LittleEndian};
use bytes::BytesMut;
use bytes::{Buf, Bytes};
@@ -159,7 +159,7 @@ fn find_end_of_wal_segment(
let mut buf = [0u8; XLOG_BLCKSZ];
let file_name = XLogFileName(tli, segno, wal_seg_size);
let mut last_valid_rec_pos: usize = start_offset; // assume at given start_offset begins new record
let mut file = File::open(data_dir.join(file_name.clone() + ".partial")).unwrap();
let mut file = File::open(data_dir.join(file_name.clone() + ".partial"))?;
file.seek(SeekFrom::Start(offs as u64))?;
// xl_crc is the last field in XLogRecord, will not be read into rec_hdr
const_assert!(XLOG_RECORD_CRC_OFFS + 4 == XLOG_SIZE_OF_XLOG_RECORD);
@@ -396,10 +396,13 @@ pub fn find_end_of_wal(
let mut high_tli: TimeLineID = 0;
let mut high_ispartial = false;
for entry in fs::read_dir(data_dir).unwrap().flatten() {
for entry in fs::read_dir(data_dir)?.flatten() {
let ispartial: bool;
let entry_name = entry.file_name();
let fname = entry_name.to_str().unwrap();
let fname = entry_name
.to_str()
.ok_or_else(|| anyhow!("Invalid file name"))?;
/*
* Check if the filename looks like an xlog file, or a .partial file.
*/
@@ -411,7 +414,7 @@ pub fn find_end_of_wal(
continue;
}
let (segno, tli) = XLogFromFileName(fname, wal_seg_size);
if !ispartial && entry.metadata().unwrap().len() != wal_seg_size as u64 {
if !ispartial && entry.metadata()?.len() != wal_seg_size as u64 {
continue;
}
if segno > high_segno

View File

@@ -10,7 +10,7 @@ anyhow = "1.0"
clap = "3.0"
env_logger = "0.9"
log = "0.4"
once_cell = "1.8.0"
once_cell = "1.13.0"
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
postgres_ffi = { path = "../" }
tempfile = "3.2"

View File

@@ -7,7 +7,7 @@ edition = "2021"
anyhow = { version = "1.0", features = ["backtrace"] }
async-trait = "0.1"
metrics = { version = "0.1", path = "../metrics" }
once_cell = "1.8.0"
once_cell = "1.13.0"
rusoto_core = "0.48"
rusoto_s3 = "0.48"
serde = { version = "1.0", features = ["derive"] }

View File

@@ -66,6 +66,9 @@ pub trait RemoteStorage: Send + Sync {
async fn list(&self) -> anyhow::Result<Vec<Self::RemoteObjectId>>;
/// Lists all top level subdirectories for a given prefix
/// Note: here we assume that if a prefix is passed, it was obtained via remote_object_id,
/// which already takes into account any kind of global prefix (prefix_in_bucket for S3 or storage_root for LocalFS),
/// so this method doesn't need to.
async fn list_prefixes(
&self,
prefix: Option<Self::RemoteObjectId>,

View File

@@ -116,7 +116,7 @@ impl RemoteStorage for LocalFs {
prefix: Option<Self::RemoteObjectId>,
) -> anyhow::Result<Vec<Self::RemoteObjectId>> {
let path = match prefix {
Some(prefix) => Cow::Owned(self.storage_root.join(prefix)),
Some(prefix) => Cow::Owned(prefix),
None => Cow::Borrowed(&self.storage_root),
};
get_all_files(path.as_ref(), false).await

View File

@@ -171,17 +171,25 @@ impl S3Bucket {
let access_key_id = std::env::var("AWS_ACCESS_KEY_ID").ok();
let secret_access_key = std::env::var("AWS_SECRET_ACCESS_KEY").ok();
// the session token is used when authorizing through SSO,
// which is typically the case when testing locally on a developer machine
let session_token = std::env::var("AWS_SESSION_TOKEN").ok();
let client = if access_key_id.is_none() && secret_access_key.is_none() {
debug!("Using IAM-based AWS access");
S3Client::new_with(request_dispatcher, InstanceMetadataProvider::new(), region)
} else {
debug!("Using credentials-based AWS access");
debug!(
"Using credentials-based AWS access. Session token is set: {}",
session_token.is_some()
);
S3Client::new_with(
request_dispatcher,
StaticProvider::new_minimal(
StaticProvider::new(
access_key_id.unwrap_or_default(),
secret_access_key.unwrap_or_default(),
session_token,
None,
),
region,
)
@@ -304,32 +312,24 @@ impl RemoteStorage for S3Bucket {
Ok(document_keys)
}
/// See the doc for `RemoteStorage::list_prefixes`
/// Note: it won't include empty "directories"
async fn list_prefixes(
&self,
prefix: Option<Self::RemoteObjectId>,
) -> anyhow::Result<Vec<Self::RemoteObjectId>> {
let list_prefix = match prefix {
Some(prefix) => {
let mut prefix_in_bucket = self.prefix_in_bucket.clone().unwrap_or_default();
// if there is no trailing / in default prefix and
// supplied prefix does not start with "/" insert it
if !(prefix_in_bucket.ends_with(S3_PREFIX_SEPARATOR)
|| prefix.0.starts_with(S3_PREFIX_SEPARATOR))
{
prefix_in_bucket.push(S3_PREFIX_SEPARATOR);
}
prefix_in_bucket.push_str(&prefix.0);
// use the passed prefix, or fall back to the prefix_in_bucket value if it is not set
let list_prefix = prefix
.map(|p| p.0)
.or_else(|| self.prefix_in_bucket.clone())
.map(|mut p| {
// the prefix is required to end with a separator,
// otherwise the request will return only the prefix entry itself
if !prefix_in_bucket.ends_with(S3_PREFIX_SEPARATOR) {
prefix_in_bucket.push(S3_PREFIX_SEPARATOR);
if !p.ends_with(S3_PREFIX_SEPARATOR) {
p.push(S3_PREFIX_SEPARATOR);
}
Some(prefix_in_bucket)
}
None => self.prefix_in_bucket.clone(),
};
p
});
let mut document_keys = Vec::new();
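A standalone sketch of the prefix normalization performed above, assuming '/' as the separator (names are illustrative):

fn normalize_list_prefix(explicit: Option<String>, prefix_in_bucket: Option<String>) -> Option<String> {
    // Prefer the explicitly passed prefix, fall back to the bucket-wide one, and
    // make sure the result ends with the separator so the listing returns the
    // entries under the prefix rather than the prefix entry itself.
    explicit.or(prefix_in_bucket).map(|mut p| {
        if !p.ends_with('/') {
            p.push('/');
        }
        p
    })
}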

View File

@@ -8,7 +8,6 @@ anyhow = "1.0"
bincode = "1.3"
bytes = "1.0.1"
hyper = { version = "0.14.7", features = ["full"] }
lazy_static = "1.4.0"
pin-project-lite = "0.2.7"
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
@@ -28,6 +27,8 @@ rustls = "0.20.2"
rustls-split = "0.3.0"
git-version = "0.3.5"
serde_with = "1.12.0"
once_cell = "1.13.0"
metrics = { path = "../metrics" }
workspace_hack = { version = "0.1", path = "../../workspace_hack" }

View File

@@ -4,8 +4,8 @@ use crate::zid::ZTenantId;
use anyhow::anyhow;
use hyper::header::AUTHORIZATION;
use hyper::{header::CONTENT_TYPE, Body, Request, Response, Server};
use lazy_static::lazy_static;
use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder};
use once_cell::sync::Lazy;
use routerify::ext::RequestExt;
use routerify::RequestInfo;
use routerify::{Middleware, Router, RouterBuilder, RouterService};
@@ -16,13 +16,13 @@ use std::net::TcpListener;
use super::error::ApiError;
lazy_static! {
static ref SERVE_METRICS_COUNT: IntCounter = register_int_counter!(
static SERVE_METRICS_COUNT: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"libmetrics_metric_handler_requests_total",
"Number of metric requests made"
)
.expect("failed to define a metric");
}
.expect("failed to define a metric")
});
async fn logger(res: Response<Body>, info: RequestInfo) -> Result<Response<Body>, ApiError> {
info!("{} {} {}", info.method(), info.uri().path(), res.status(),);

View File

@@ -47,10 +47,12 @@ pub enum FeStartupPacket {
StartupMessage {
major_version: u32,
minor_version: u32,
params: HashMap<String, String>,
params: StartupMessageParams,
},
}
pub type StartupMessageParams = HashMap<String, String>;
#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)]
pub struct CancelKeyData {
pub backend_pid: i32,

View File

@@ -7,7 +7,7 @@ use std::{
use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt};
use bytes::{Buf, BufMut, Bytes, BytesMut};
use lazy_static::lazy_static;
use once_cell::sync::Lazy;
use utils::postgres_backend::{AuthType, Handler, PostgresBackend};
@@ -19,16 +19,15 @@ fn make_tcp_pair() -> (TcpStream, TcpStream) {
(server_stream, client_stream)
}
lazy_static! {
static ref KEY: rustls::PrivateKey = {
let mut cursor = Cursor::new(include_bytes!("key.pem"));
rustls::PrivateKey(rustls_pemfile::rsa_private_keys(&mut cursor).unwrap()[0].clone())
};
static ref CERT: rustls::Certificate = {
let mut cursor = Cursor::new(include_bytes!("cert.pem"));
rustls::Certificate(rustls_pemfile::certs(&mut cursor).unwrap()[0].clone())
};
}
static KEY: Lazy<rustls::PrivateKey> = Lazy::new(|| {
let mut cursor = Cursor::new(include_bytes!("key.pem"));
rustls::PrivateKey(rustls_pemfile::rsa_private_keys(&mut cursor).unwrap()[0].clone())
});
static CERT: Lazy<rustls::Certificate> = Lazy::new(|| {
let mut cursor = Cursor::new(include_bytes!("cert.pem"));
rustls::Certificate(rustls_pemfile::certs(&mut cursor).unwrap()[0].clone())
});
#[test]
fn ssl() {

View File

@@ -15,6 +15,5 @@ git-version = "0.3.5"
pageserver = { path = "../pageserver" }
control_plane = { path = "../control_plane" }
safekeeper = { path = "../safekeeper" }
postgres_ffi = { path = "../libs/postgres_ffi" }
utils = { path = "../libs/utils" }
workspace_hack = { version = "0.1", path = "../workspace_hack" }

View File

@@ -9,6 +9,7 @@ use pageserver::config::defaults::{
DEFAULT_HTTP_LISTEN_ADDR as DEFAULT_PAGESERVER_HTTP_ADDR,
DEFAULT_PG_LISTEN_ADDR as DEFAULT_PAGESERVER_PG_ADDR,
};
use pageserver::http::models::TimelineInfo;
use safekeeper::defaults::{
DEFAULT_HTTP_LISTEN_PORT as DEFAULT_SAFEKEEPER_HTTP_PORT,
DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT,
@@ -25,8 +26,6 @@ use utils::{
zid::{NodeId, ZTenantId, ZTenantTimelineId, ZTimelineId},
};
use pageserver::timelines::TimelineInfo;
// Default id of a safekeeper node, if not specified on the command line.
const DEFAULT_SAFEKEEPER_ID: NodeId = NodeId(1);
const DEFAULT_PAGESERVER_ID: NodeId = NodeId(1);
@@ -885,7 +884,7 @@ fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
match sub_match.subcommand() {
Some(("start", start_match)) => {
if let Err(e) = pageserver.start(&pageserver_config_overrides(start_match)) {
eprintln!("pageserver start failed: {}", e);
eprintln!("pageserver start failed: {e}");
exit(1);
}
}
@@ -907,10 +906,19 @@ fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
}
if let Err(e) = pageserver.start(&pageserver_config_overrides(restart_match)) {
eprintln!("pageserver start failed: {}", e);
eprintln!("pageserver start failed: {e}");
exit(1);
}
}
Some(("status", _)) => match PageServerNode::from_env(env).check_status() {
Ok(_) => println!("Page server is up and running"),
Err(err) => {
eprintln!("Page server is not available: {}", err);
exit(1);
}
},
Some((sub_name, _)) => bail!("Unexpected pageserver subcommand '{}'", sub_name),
None => bail!("no pageserver subcommand provided"),
}

View File

@@ -21,7 +21,6 @@ futures = "0.3.13"
hex = "0.4.3"
hyper = "0.14"
itertools = "0.10.3"
lazy_static = "1.4.0"
clap = "3.0"
daemonize = "0.4.1"
tokio = { version = "1.17", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] }
@@ -29,7 +28,6 @@ postgres-types = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d
postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
tokio-stream = "0.1.8"
anyhow = { version = "1.0", features = ["backtrace"] }
crc32c = "0.6.0"
thiserror = "1.0"
@@ -49,7 +47,7 @@ tracing = "0.1.27"
signal-hook = "0.3.10"
url = "2"
nix = "0.23"
once_cell = "1.8.0"
once_cell = "1.13.0"
crossbeam-utils = "0.8.5"
fail = "0.5.0"
git-version = "0.3.5"

View File

@@ -23,8 +23,7 @@ use tar::{Builder, EntryType, Header};
use tracing::*;
use crate::reltag::{RelTag, SlruKind};
use crate::repository::Timeline;
use crate::DatadirTimelineImpl;
use crate::DatadirTimeline;
use postgres_ffi::xlog_utils::*;
use postgres_ffi::*;
use utils::lsn::Lsn;
@@ -32,12 +31,13 @@ use utils::lsn::Lsn;
/// This is a short-lived object that exists only for the duration of tarball creation,
/// created mostly to avoid passing a lot of parameters between the various functions
/// used for constructing the tarball.
pub struct Basebackup<'a, W>
pub struct Basebackup<'a, W, T>
where
W: Write,
T: DatadirTimeline,
{
ar: Builder<AbortableWrite<W>>,
timeline: &'a Arc<DatadirTimelineImpl>,
timeline: &'a Arc<T>,
pub lsn: Lsn,
prev_record_lsn: Lsn,
full_backup: bool,
@@ -52,17 +52,18 @@ where
// * When working without safekeepers. In this situation it is important to match the lsn
// we are taking basebackup on with the lsn that is used in pageserver's walreceiver
// to start the replication.
impl<'a, W> Basebackup<'a, W>
impl<'a, W, T> Basebackup<'a, W, T>
where
W: Write,
T: DatadirTimeline,
{
pub fn new(
write: W,
timeline: &'a Arc<DatadirTimelineImpl>,
timeline: &'a Arc<T>,
req_lsn: Option<Lsn>,
prev_lsn: Option<Lsn>,
full_backup: bool,
) -> Result<Basebackup<'a, W>> {
) -> Result<Basebackup<'a, W, T>> {
// Compute postgres doesn't have any previous WAL files, but the first
// record that it's going to write needs to include the LSN of the
// previous record (xl_prev). We include prev_record_lsn in the
@@ -79,13 +80,13 @@ where
let (backup_prev, backup_lsn) = if let Some(req_lsn) = req_lsn {
// Backup was requested at a particular LSN. Wait for it to arrive.
info!("waiting for {}", req_lsn);
timeline.tline.wait_lsn(req_lsn)?;
timeline.wait_lsn(req_lsn)?;
// If the requested point is the end of the timeline, we can
// provide prev_lsn. (get_last_record_rlsn() might return it as
// zero, though, if no WAL has been generated on this timeline
// yet.)
let end_of_timeline = timeline.tline.get_last_record_rlsn();
let end_of_timeline = timeline.get_last_record_rlsn();
if req_lsn == end_of_timeline.last {
(end_of_timeline.prev, req_lsn)
} else {
@@ -93,7 +94,7 @@ where
}
} else {
// Backup was requested at end of the timeline.
let end_of_timeline = timeline.tline.get_last_record_rlsn();
let end_of_timeline = timeline.get_last_record_rlsn();
(end_of_timeline.prev, end_of_timeline.last)
};
@@ -371,7 +372,7 @@ where
// add zenith.signal file
let mut zenith_signal = String::new();
if self.prev_record_lsn == Lsn(0) {
if self.lsn == self.timeline.tline.get_ancestor_lsn() {
if self.lsn == self.timeline.get_ancestor_lsn() {
write!(zenith_signal, "PREV LSN: none")?;
} else {
write!(zenith_signal, "PREV LSN: invalid")?;
@@ -402,9 +403,10 @@ where
}
}
impl<'a, W> Drop for Basebackup<'a, W>
impl<'a, W, T> Drop for Basebackup<'a, W, T>
where
W: Write,
T: DatadirTimeline,
{
/// If the basebackup was not finished, prevent the Archive::drop() from
/// writing the end-of-archive marker.

View File

@@ -59,6 +59,7 @@ pub mod defaults {
# [tenant_config]
#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
#checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
#compaction_target_size = {DEFAULT_COMPACTION_TARGET_SIZE} # in bytes
#compaction_period = '{DEFAULT_COMPACTION_PERIOD}'
#compaction_threshold = '{DEFAULT_COMPACTION_THRESHOLD}'
@@ -452,6 +453,13 @@ impl PageServerConf {
Some(parse_toml_u64("checkpoint_distance", checkpoint_distance)?);
}
if let Some(checkpoint_timeout) = item.get("checkpoint_timeout") {
t_conf.checkpoint_timeout = Some(parse_toml_duration(
"checkpoint_timeout",
checkpoint_timeout,
)?);
}
if let Some(compaction_target_size) = item.get("compaction_target_size") {
t_conf.compaction_target_size = Some(parse_toml_u64(
"compaction_target_size",

View File

@@ -7,6 +7,10 @@ use utils::{
zid::{NodeId, ZTenantId, ZTimelineId},
};
// These enums are used in the API response fields.
use crate::repository::LocalTimelineState;
use crate::tenant_mgr::TenantState;
#[serde_as]
#[derive(Serialize, Deserialize)]
pub struct TimelineCreateRequest {
@@ -28,6 +32,7 @@ pub struct TenantCreateRequest {
#[serde_as(as = "Option<DisplayFromStr>")]
pub new_tenant_id: Option<ZTenantId>,
pub checkpoint_distance: Option<u64>,
pub checkpoint_timeout: Option<String>,
pub compaction_target_size: Option<u64>,
pub compaction_period: Option<String>,
pub compaction_threshold: Option<usize>,
@@ -66,6 +71,7 @@ pub struct TenantConfigRequest {
#[serde(default)]
#[serde_as(as = "Option<DisplayFromStr>")]
pub checkpoint_distance: Option<u64>,
pub checkpoint_timeout: Option<String>,
pub compaction_target_size: Option<u64>,
pub compaction_period: Option<String>,
pub compaction_threshold: Option<usize>,
@@ -83,6 +89,7 @@ impl TenantConfigRequest {
TenantConfigRequest {
tenant_id,
checkpoint_distance: None,
checkpoint_timeout: None,
compaction_target_size: None,
compaction_period: None,
compaction_threshold: None,
@@ -97,14 +104,59 @@ impl TenantConfigRequest {
}
}
/// A WAL receiver's data stored inside the global `WAL_RECEIVERS`.
/// We keep one WAL receiver active per timeline.
#[serde_as]
#[derive(Serialize, Deserialize, Clone)]
pub struct TenantInfo {
#[serde_as(as = "DisplayFromStr")]
pub id: ZTenantId,
pub state: Option<TenantState>,
pub current_physical_size: Option<u64>, // physical size is only included in `tenant_status` endpoint
pub has_in_progress_downloads: Option<bool>,
}
#[serde_as]
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct WalReceiverEntry {
pub wal_producer_connstr: Option<String>,
pub struct LocalTimelineInfo {
#[serde_as(as = "Option<DisplayFromStr>")]
pub ancestor_timeline_id: Option<ZTimelineId>,
#[serde_as(as = "Option<DisplayFromStr>")]
pub ancestor_lsn: Option<Lsn>,
#[serde_as(as = "DisplayFromStr")]
pub last_record_lsn: Lsn,
#[serde_as(as = "Option<DisplayFromStr>")]
pub prev_record_lsn: Option<Lsn>,
#[serde_as(as = "DisplayFromStr")]
pub latest_gc_cutoff_lsn: Lsn,
#[serde_as(as = "DisplayFromStr")]
pub disk_consistent_lsn: Lsn,
pub current_logical_size: Option<usize>, // is None when timeline is Unloaded
pub current_physical_size: Option<u64>, // is None when timeline is Unloaded
pub current_logical_size_non_incremental: Option<usize>,
pub current_physical_size_non_incremental: Option<u64>,
pub timeline_state: LocalTimelineState,
pub wal_source_connstr: Option<String>,
#[serde_as(as = "Option<DisplayFromStr>")]
pub last_received_msg_lsn: Option<Lsn>,
/// the timestamp (in microseconds) of the last received message
pub last_received_msg_ts: Option<u128>,
}
#[serde_as]
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct RemoteTimelineInfo {
#[serde_as(as = "DisplayFromStr")]
pub remote_consistent_lsn: Lsn,
pub awaits_download: bool,
}
#[serde_as]
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct TimelineInfo {
#[serde_as(as = "DisplayFromStr")]
pub tenant_id: ZTenantId,
#[serde_as(as = "DisplayFromStr")]
pub timeline_id: ZTimelineId,
pub local: Option<LocalTimelineInfo>,
pub remote: Option<RemoteTimelineInfo>,
}

View File

@@ -78,6 +78,11 @@ paths:
schema:
type: string
description: Controls calculation of current_logical_size_non_incremental
- name: include-non-incremental-physical-size
in: query
schema:
type: string
description: Controls calculation of current_physical_size_non_incremental
get:
description: Get timelines for tenant
responses:
@@ -136,6 +141,11 @@ paths:
schema:
type: string
description: Controls calculation of current_logical_size_non_incremental
- name: include-non-incremental-physical-size
in: query
schema:
type: string
description: Controls calculation of current_physical_size_non_incremental
responses:
"200":
description: TimelineInfo
@@ -197,54 +207,6 @@ paths:
schema:
$ref: "#/components/schemas/Error"
/v1/tenant/{tenant_id}/timeline/{timeline_id}/wal_receiver:
parameters:
- name: tenant_id
in: path
required: true
schema:
type: string
format: hex
- name: timeline_id
in: path
required: true
schema:
type: string
format: hex
get:
description: Get wal receiver's data attached to the timeline
responses:
"200":
description: WalReceiverEntry
content:
application/json:
schema:
$ref: "#/components/schemas/WalReceiverEntry"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"404":
description: Error when no wal receiver is running or found
content:
application/json:
schema:
$ref: "#/components/schemas/NotFoundError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
/v1/tenant/{tenant_id}/attach:
parameters:
- name: tenant_id
@@ -577,6 +539,8 @@ components:
type: string
state:
type: string
current_physical_size:
type: integer
has_in_progress_downloads:
type: boolean
TenantCreateInfo:
@@ -596,6 +560,8 @@ components:
type: string
checkpoint_distance:
type: integer
checkpoint_timeout:
type: string
compaction_period:
type: string
compaction_threshold:
@@ -614,6 +580,8 @@ components:
type: string
checkpoint_distance:
type: integer
checkpoint_timeout:
type: string
compaction_period:
type: string
compaction_threshold:
@@ -671,18 +639,13 @@ components:
format: hex
current_logical_size:
type: integer
current_physical_size:
type: integer
current_logical_size_non_incremental:
type: integer
WalReceiverEntry:
type: object
required:
- thread_id
- wal_producer_connstr
properties:
thread_id:
current_physical_size_non_incremental:
type: integer
wal_producer_connstr:
wal_source_connstr:
type: string
last_received_msg_lsn:
type: string

View File

@@ -6,16 +6,19 @@ use hyper::{Body, Request, Response, Uri};
use remote_storage::GenericRemoteStorage;
use tracing::*;
use super::models::{LocalTimelineInfo, RemoteTimelineInfo, TimelineInfo};
use super::models::{
StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse,
StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo,
TimelineCreateRequest,
};
use crate::repository::Repository;
use crate::layered_repository::metadata::TimelineMetadata;
use crate::pgdatadir_mapping::DatadirTimeline;
use crate::repository::{LocalTimelineState, RepositoryTimeline};
use crate::repository::{Repository, Timeline};
use crate::storage_sync;
use crate::storage_sync::index::{RemoteIndex, RemoteTimeline};
use crate::tenant_config::TenantConfOpt;
use crate::tenant_mgr::TenantInfo;
use crate::timelines::{LocalTimelineInfo, RemoteTimelineInfo, TimelineInfo};
use crate::TimelineImpl;
use crate::{config::PageServerConf, tenant_mgr, timelines};
use utils::{
auth::JwtAuth,
@@ -26,6 +29,7 @@ use utils::{
request::parse_request_param,
RequestExt, RouterBuilder,
},
lsn::Lsn,
zid::{ZTenantId, ZTenantTimelineId, ZTimelineId},
};
@@ -79,6 +83,123 @@ fn get_config(request: &Request<Body>) -> &'static PageServerConf {
get_state(request).conf
}
// Helper functions to construct a LocalTimelineInfo struct for a timeline
fn local_timeline_info_from_loaded_timeline(
timeline: &TimelineImpl,
include_non_incremental_logical_size: bool,
include_non_incremental_physical_size: bool,
) -> anyhow::Result<LocalTimelineInfo> {
let last_record_lsn = timeline.get_last_record_lsn();
let (wal_source_connstr, last_received_msg_lsn, last_received_msg_ts) = {
let guard = timeline.last_received_wal.lock().unwrap();
if let Some(info) = guard.as_ref() {
(
Some(info.wal_source_connstr.clone()),
Some(info.last_received_msg_lsn),
Some(info.last_received_msg_ts),
)
} else {
(None, None, None)
}
};
let info = LocalTimelineInfo {
ancestor_timeline_id: timeline.get_ancestor_timeline_id(),
ancestor_lsn: {
match timeline.get_ancestor_lsn() {
Lsn(0) => None,
lsn @ Lsn(_) => Some(lsn),
}
},
disk_consistent_lsn: timeline.get_disk_consistent_lsn(),
last_record_lsn,
prev_record_lsn: Some(timeline.get_prev_record_lsn()),
latest_gc_cutoff_lsn: *timeline.get_latest_gc_cutoff_lsn(),
timeline_state: LocalTimelineState::Loaded,
current_logical_size: Some(timeline.get_current_logical_size()),
current_physical_size: Some(timeline.get_physical_size()),
current_logical_size_non_incremental: if include_non_incremental_logical_size {
Some(timeline.get_current_logical_size_non_incremental(last_record_lsn)?)
} else {
None
},
current_physical_size_non_incremental: if include_non_incremental_physical_size {
Some(timeline.get_physical_size_non_incremental()?)
} else {
None
},
wal_source_connstr,
last_received_msg_lsn,
last_received_msg_ts,
};
Ok(info)
}
fn local_timeline_info_from_unloaded_timeline(metadata: &TimelineMetadata) -> LocalTimelineInfo {
LocalTimelineInfo {
ancestor_timeline_id: metadata.ancestor_timeline(),
ancestor_lsn: {
match metadata.ancestor_lsn() {
Lsn(0) => None,
lsn @ Lsn(_) => Some(lsn),
}
},
disk_consistent_lsn: metadata.disk_consistent_lsn(),
last_record_lsn: metadata.disk_consistent_lsn(),
prev_record_lsn: metadata.prev_record_lsn(),
latest_gc_cutoff_lsn: metadata.latest_gc_cutoff_lsn(),
timeline_state: LocalTimelineState::Unloaded,
current_logical_size: None,
current_physical_size: None,
current_logical_size_non_incremental: None,
current_physical_size_non_incremental: None,
wal_source_connstr: None,
last_received_msg_lsn: None,
last_received_msg_ts: None,
}
}
fn local_timeline_info_from_repo_timeline(
repo_timeline: &RepositoryTimeline<TimelineImpl>,
include_non_incremental_logical_size: bool,
include_non_incremental_physical_size: bool,
) -> anyhow::Result<LocalTimelineInfo> {
match repo_timeline {
RepositoryTimeline::Loaded(timeline) => local_timeline_info_from_loaded_timeline(
&*timeline,
include_non_incremental_logical_size,
include_non_incremental_physical_size,
),
RepositoryTimeline::Unloaded { metadata } => {
Ok(local_timeline_info_from_unloaded_timeline(metadata))
}
}
}
fn list_local_timelines(
tenant_id: ZTenantId,
include_non_incremental_logical_size: bool,
include_non_incremental_physical_size: bool,
) -> Result<Vec<(ZTimelineId, LocalTimelineInfo)>> {
let repo = tenant_mgr::get_repository_for_tenant(tenant_id)
.with_context(|| format!("Failed to get repo for tenant {}", tenant_id))?;
let repo_timelines = repo.list_timelines();
let mut local_timeline_info = Vec::with_capacity(repo_timelines.len());
for (timeline_id, repository_timeline) in repo_timelines {
local_timeline_info.push((
timeline_id,
local_timeline_info_from_repo_timeline(
&repository_timeline,
include_non_incremental_logical_size,
include_non_incremental_physical_size,
)?,
))
}
Ok(local_timeline_info)
}
// healthcheck handler
async fn status_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let config = get_config(&request);
@@ -93,16 +214,30 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<
let new_timeline_info = tokio::task::spawn_blocking(move || {
let _enter = info_span!("/timeline_create", tenant = %tenant_id, new_timeline = ?request_data.new_timeline_id, lsn=?request_data.ancestor_start_lsn).entered();
timelines::create_timeline(
match timelines::create_timeline(
get_config(&request),
tenant_id,
request_data.new_timeline_id.map(ZTimelineId::from),
request_data.ancestor_timeline_id.map(ZTimelineId::from),
request_data.ancestor_start_lsn,
)
) {
Ok(Some((new_timeline_id, new_timeline))) => {
// Created. Construct a TimelineInfo for it.
let local_info = local_timeline_info_from_loaded_timeline(new_timeline.as_ref(), false, false)?;
Ok(Some(TimelineInfo {
tenant_id,
timeline_id: new_timeline_id,
local: Some(local_info),
remote: None,
}))
}
Ok(None) => Ok(None), // timeline already exists
Err(err) => Err(err),
}
})
.await
.map_err(ApiError::from_err)??;
.map_err(ApiError::from_err)??;
Ok(match new_timeline_info {
Some(info) => json_response(StatusCode::CREATED, info)?,
@@ -113,10 +248,17 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<
async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
let include_non_incremental_logical_size = get_include_non_incremental_logical_size(&request);
let include_non_incremental_logical_size =
query_param_present(&request, "include-non-incremental-logical-size");
let include_non_incremental_physical_size =
query_param_present(&request, "include-non-incremental-physical-size");
let local_timeline_infos = tokio::task::spawn_blocking(move || {
let _enter = info_span!("timeline_list", tenant = %tenant_id).entered();
crate::timelines::get_local_timelines(tenant_id, include_non_incremental_logical_size)
list_local_timelines(
tenant_id,
include_non_incremental_logical_size,
include_non_incremental_physical_size,
)
})
.await
.map_err(ApiError::from_err)??;
@@ -145,17 +287,15 @@ async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>,
json_response(StatusCode::OK, response_data)
}
// Gate non incremental logical size calculation behind a flag
// after pgbench -i -s100 calculation took 28ms so if multiplied by the number of timelines
// and tenants it can take noticeable amount of time. Also the value currently used only in tests
fn get_include_non_incremental_logical_size(request: &Request<Body>) -> bool {
/// Checks if a query param is present in the request's URL
fn query_param_present(request: &Request<Body>, param: &str) -> bool {
request
.uri()
.query()
.map(|v| {
url::form_urlencoded::parse(v.as_bytes())
.into_owned()
.any(|(param, _)| param == "include-non-incremental-logical-size")
.any(|(p, _)| p == param)
})
.unwrap_or(false)
}
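Illustrative only, with a hypothetical URI, showing what the helper above checks:

fn query_param_example() {
    let request: Request<Body> = Request::builder()
        .uri("/v1/tenant/some_tenant/timeline?include-non-incremental-physical-size=true")
        .body(Body::empty())
        .unwrap();
    assert!(query_param_present(&request, "include-non-incremental-physical-size"));
    assert!(!query_param_present(&request, "include-non-incremental-logical-size"));
}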
@@ -165,7 +305,10 @@ async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body
check_permission(&request, Some(tenant_id))?;
let timeline_id: ZTimelineId = parse_request_param(&request, "timeline_id")?;
let include_non_incremental_logical_size = get_include_non_incremental_logical_size(&request);
let include_non_incremental_logical_size =
query_param_present(&request, "include-non-incremental-logical-size");
let include_non_incremental_physical_size =
query_param_present(&request, "include-non-incremental-physical-size");
let (local_timeline_info, remote_timeline_info) = async {
// any error here will render local timeline as None
@@ -176,11 +319,10 @@ async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body
repo.get_timeline(timeline_id)
.as_ref()
.map(|timeline| {
LocalTimelineInfo::from_repo_timeline(
tenant_id,
timeline_id,
local_timeline_info_from_repo_timeline(
timeline,
include_non_incremental_logical_size,
include_non_incremental_physical_size,
)
})
.transpose()?
@@ -225,23 +367,6 @@ async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body
json_response(StatusCode::OK, timeline_info)
}
async fn wal_receiver_get_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
let timeline_id: ZTimelineId = parse_request_param(&request, "timeline_id")?;
let wal_receiver_entry = crate::walreceiver::get_wal_receiver_entry(tenant_id, timeline_id)
.instrument(info_span!("wal_receiver_get", tenant = %tenant_id, timeline = %timeline_id))
.await
.ok_or_else(|| {
ApiError::NotFound(format!(
"WAL receiver data not found for tenant {tenant_id} and timeline {timeline_id}"
))
})?;
json_response(StatusCode::OK, &wal_receiver_entry)
}
// TODO: it makes sense to provide the tenant config right away, the same way it is handled in tenant_create
async fn tenant_attach_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?;
@@ -429,14 +554,36 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
let index_accessor = remote_index.read().await;
let has_in_progress_downloads = index_accessor
.tenant_entry(&tenant_id)
.ok_or_else(|| ApiError::NotFound("Tenant not found in remote index".to_string()))?
.has_in_progress_downloads();
.map(|t| t.has_in_progress_downloads())
.unwrap_or_else(|| {
info!("Tenant {tenant_id} not found in remote index");
false
});
let current_physical_size =
match tokio::task::spawn_blocking(move || list_local_timelines(tenant_id, false, false))
.await
.map_err(ApiError::from_err)?
{
Err(err) => {
// Getting local timelines can fail when no local repo is on disk (e.g., when tenant data is being downloaded).
// In that case, log a warning and operate normally.
warn!("Failed to get local timelines for tenant {tenant_id}: {err}");
None
}
Ok(local_timeline_infos) => Some(
local_timeline_infos
.into_iter()
.fold(0, |acc, x| acc + x.1.current_physical_size.unwrap()),
),
};
json_response(
StatusCode::OK,
TenantInfo {
id: tenant_id,
state: tenant_state,
current_physical_size,
has_in_progress_downloads: Some(has_in_progress_downloads),
},
)
@@ -476,6 +623,11 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
}
tenant_conf.checkpoint_distance = request_data.checkpoint_distance;
if let Some(checkpoint_timeout) = request_data.checkpoint_timeout {
tenant_conf.checkpoint_timeout =
Some(humantime::parse_duration(&checkpoint_timeout).map_err(ApiError::from_err)?);
}
tenant_conf.compaction_target_size = request_data.compaction_target_size;
tenant_conf.compaction_threshold = request_data.compaction_threshold;
@@ -536,6 +688,10 @@ async fn tenant_config_handler(mut request: Request<Body>) -> Result<Response<Bo
}
tenant_conf.checkpoint_distance = request_data.checkpoint_distance;
if let Some(checkpoint_timeout) = request_data.checkpoint_timeout {
tenant_conf.checkpoint_timeout =
Some(humantime::parse_duration(&checkpoint_timeout).map_err(ApiError::from_err)?);
}
tenant_conf.compaction_target_size = request_data.compaction_target_size;
tenant_conf.compaction_threshold = request_data.compaction_threshold;
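A hedged illustration of the accepted format: the string is parsed by humantime, so a value such as "10m" becomes a ten-minute Duration.

fn duration_example() {
    use std::time::Duration;
    assert_eq!(
        humantime::parse_duration("10m").unwrap(),
        Duration::from_secs(600)
    );
}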
@@ -606,9 +762,5 @@ pub fn make_router(
"/v1/tenant/:tenant_id/timeline/:timeline_id/detach",
timeline_delete_handler,
)
.get(
"/v1/tenant/:tenant_id/timeline/:timeline_id/wal_receiver",
wal_receiver_get_handler,
)
.any(handler_404))
}

View File

@@ -13,9 +13,8 @@ use walkdir::WalkDir;
use crate::pgdatadir_mapping::*;
use crate::reltag::{RelTag, SlruKind};
use crate::repository::Repository;
use crate::repository::Timeline;
use crate::walingest::WalIngest;
use crate::walrecord::DecodedWALRecord;
use postgres_ffi::relfile_utils::*;
use postgres_ffi::waldecoder::*;
use postgres_ffi::xlog_utils::*;
@@ -29,9 +28,9 @@ use utils::lsn::Lsn;
/// This is currently only used to import a cluster freshly created by initdb.
/// The code that deals with the checkpoint would not work right if the
/// cluster was not shut down cleanly.
pub fn import_timeline_from_postgres_datadir<R: Repository>(
pub fn import_timeline_from_postgres_datadir<T: DatadirTimeline>(
path: &Path,
tline: &mut DatadirTimeline<R>,
tline: &T,
lsn: Lsn,
) -> Result<()> {
let mut pg_control: Option<ControlFileData> = None;
@@ -89,8 +88,8 @@ pub fn import_timeline_from_postgres_datadir<R: Repository>(
}
// subroutine of import_timeline_from_postgres_datadir(), to load one relation file.
fn import_rel<R: Repository, Reader: Read>(
modification: &mut DatadirModification<R>,
fn import_rel<T: DatadirTimeline, Reader: Read>(
modification: &mut DatadirModification<T>,
path: &Path,
spcoid: Oid,
dboid: Oid,
@@ -169,8 +168,8 @@ fn import_rel<R: Repository, Reader: Read>(
/// Import an SLRU segment file
///
fn import_slru<R: Repository, Reader: Read>(
modification: &mut DatadirModification<R>,
fn import_slru<T: DatadirTimeline, Reader: Read>(
modification: &mut DatadirModification<T>,
slru: SlruKind,
path: &Path,
mut reader: Reader,
@@ -225,9 +224,9 @@ fn import_slru<R: Repository, Reader: Read>(
/// Scan PostgreSQL WAL files in given directory and load all records between
/// 'startpoint' and 'endpoint' into the repository.
fn import_wal<R: Repository>(
fn import_wal<T: DatadirTimeline>(
walpath: &Path,
tline: &mut DatadirTimeline<R>,
tline: &T,
startpoint: Lsn,
endpoint: Lsn,
) -> Result<()> {
@@ -268,9 +267,11 @@ fn import_wal<R: Repository>(
waldecoder.feed_bytes(&buf);
let mut nrecords = 0;
let mut modification = tline.begin_modification(endpoint);
let mut decoded = DecodedWALRecord::default();
while last_lsn <= endpoint {
if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
walingest.ingest_record(tline, recdata, lsn)?;
walingest.ingest_record(recdata, lsn, &mut modification, &mut decoded)?;
last_lsn = lsn;
nrecords += 1;
@@ -294,8 +295,8 @@ fn import_wal<R: Repository>(
Ok(())
}
pub fn import_basebackup_from_tar<R: Repository, Reader: Read>(
tline: &mut DatadirTimeline<R>,
pub fn import_basebackup_from_tar<T: DatadirTimeline, Reader: Read>(
tline: &T,
reader: Reader,
base_lsn: Lsn,
) -> Result<()> {
@@ -336,8 +337,8 @@ pub fn import_basebackup_from_tar<R: Repository, Reader: Read>(
Ok(())
}
pub fn import_wal_from_tar<R: Repository, Reader: Read>(
tline: &mut DatadirTimeline<R>,
pub fn import_wal_from_tar<T: DatadirTimeline, Reader: Read>(
tline: &T,
reader: Reader,
start_lsn: Lsn,
end_lsn: Lsn,
@@ -384,9 +385,11 @@ pub fn import_wal_from_tar<R: Repository, Reader: Read>(
waldecoder.feed_bytes(&bytes[offset..]);
let mut modification = tline.begin_modification(end_lsn);
let mut decoded = DecodedWALRecord::default();
while last_lsn <= end_lsn {
if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
walingest.ingest_record(tline, recdata, lsn)?;
walingest.ingest_record(recdata, lsn, &mut modification, &mut decoded)?;
last_lsn = lsn;
debug!("imported record at {} (end {})", lsn, end_lsn);
@@ -415,8 +418,8 @@ pub fn import_wal_from_tar<R: Repository, Reader: Read>(
Ok(())
}
pub fn import_file<R: Repository, Reader: Read>(
modification: &mut DatadirModification<R>,
pub fn import_file<T: DatadirTimeline, Reader: Read>(
modification: &mut DatadirModification<T>,
file_path: &Path,
reader: Reader,
len: usize,
@@ -535,7 +538,7 @@ pub fn import_file<R: Repository, Reader: Read>(
// zenith.signal is not necessarily the last file, that we handle
// but it is ok to call `finish_write()`, because final `modification.commit()`
// will update lsn once more to the final one.
let writer = modification.tline.tline.writer();
let writer = modification.tline.writer();
writer.finish_write(prev_lsn);
debug!("imported zenith signal {}", prev_lsn);

File diff suppressed because it is too large.

View File

@@ -5,7 +5,7 @@
use crate::page_cache;
use crate::page_cache::{ReadBufResult, PAGE_SZ};
use bytes::Bytes;
use lazy_static::lazy_static;
use once_cell::sync::Lazy;
use std::ops::{Deref, DerefMut};
use std::os::unix::fs::FileExt;
use std::sync::atomic::AtomicU64;
@@ -117,9 +117,7 @@ where
}
}
lazy_static! {
static ref NEXT_ID: AtomicU64 = AtomicU64::new(1);
}
static NEXT_ID: Lazy<AtomicU64> = Lazy::new(|| AtomicU64::new(1));
/// An adapter for reading a (virtual) file using the page cache.
///

View File

@@ -316,6 +316,18 @@ impl Layer for DeltaLayer {
}
}
fn key_iter<'a>(&'a self) -> Box<dyn Iterator<Item = (Key, Lsn, u64)> + 'a> {
let inner = match self.load() {
Ok(inner) => inner,
Err(e) => panic!("Failed to load a delta layer: {e:?}"),
};
match DeltaKeyIter::new(inner) {
Ok(iter) => Box::new(iter),
Err(e) => panic!("Layer index is corrupted: {e:?}"),
}
}
fn delete(&self) -> Result<()> {
// delete underlying file
fs::remove_file(self.path())?;
@@ -660,11 +672,21 @@ impl DeltaLayerWriter {
/// The values must be appended in key, lsn order.
///
pub fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> Result<()> {
self.put_value_bytes(key, lsn, &Value::ser(&val)?, val.will_init())
}
pub fn put_value_bytes(
&mut self,
key: Key,
lsn: Lsn,
val: &[u8],
will_init: bool,
) -> Result<()> {
assert!(self.lsn_range.start <= lsn);
let off = self.blob_writer.write_blob(&Value::ser(&val)?)?;
let off = self.blob_writer.write_blob(val)?;
let blob_ref = BlobRef::new(off, val.will_init());
let blob_ref = BlobRef::new(off, will_init);
let delta_key = DeltaKey::from_key_lsn(&key, lsn);
self.tree.append(&delta_key.0, blob_ref.0)?;
@@ -822,3 +844,75 @@ impl<'a> DeltaValueIter<'a> {
}
}
}
///
/// Iterator over all keys stored in a delta layer
///
/// FIXME: This creates a Vector to hold all keys.
/// That takes up quite a lot of memory. Should do this in a more streaming
/// fashion.
///
struct DeltaKeyIter {
all_keys: Vec<(DeltaKey, u64)>,
next_idx: usize,
}
impl Iterator for DeltaKeyIter {
type Item = (Key, Lsn, u64);
fn next(&mut self) -> Option<Self::Item> {
if self.next_idx < self.all_keys.len() {
let (delta_key, size) = &self.all_keys[self.next_idx];
let key = delta_key.key();
let lsn = delta_key.lsn();
self.next_idx += 1;
Some((key, lsn, *size))
} else {
None
}
}
}
impl<'a> DeltaKeyIter {
fn new(inner: RwLockReadGuard<'a, DeltaLayerInner>) -> Result<Self> {
let file = inner.file.as_ref().unwrap();
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
inner.index_start_blk,
inner.index_root_blk,
file,
);
let mut all_keys: Vec<(DeltaKey, u64)> = Vec::new();
tree_reader.visit(
&[0u8; DELTA_KEY_SIZE],
VisitDirection::Forwards,
|key, value| {
let delta_key = DeltaKey::from_slice(key);
let pos = BlobRef(value).pos();
if let Some(last) = all_keys.last_mut() {
if last.0.key() == delta_key.key() {
return true;
} else {
// Subtract the offset of this key's first blob from the offset of the
// new key's blob to get the total size of the values associated with this key
let first_pos = last.1;
last.1 = pos - first_pos;
}
}
all_keys.push((delta_key, pos));
true
},
)?;
if let Some(last) = all_keys.last_mut() {
// Last key occupies all space till end of layer
last.1 = std::fs::metadata(&file.file.path)?.len() - last.1;
}
let iter = DeltaKeyIter {
all_keys,
next_idx: 0,
};
Ok(iter)
}
}
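The sizes produced by `DeltaKeyIter` come out of consecutive blob offsets: each key keeps the offset of its first blob, and when the next key starts, the difference between the two offsets becomes the previous key's total value size; the last key takes everything up to the end of the layer file. A small stand-alone sketch of that offset arithmetic, with made-up offsets:

/// Given (key, offset-of-first-blob) pairs in file order plus the file length,
/// compute (key, total-bytes-of-values) the same way DeltaKeyIter does.
fn sizes_from_offsets(first_blob_offsets: &[(&str, u64)], file_len: u64) -> Vec<(String, u64)> {
    let mut out = Vec::new();
    for (i, (key, off)) in first_blob_offsets.iter().enumerate() {
        let end = first_blob_offsets
            .get(i + 1)
            .map(|(_, next_off)| *next_off)
            .unwrap_or(file_len); // the last key runs to the end of the layer file
        out.push((key.to_string(), end - off));
    }
    out
}

fn main() {
    // Hypothetical offsets: key "A" starts at 0, "B" at 300, the file is 1000 bytes long.
    let sizes = sizes_from_offsets(&[("A", 0), ("B", 300)], 1000);
    assert_eq!(sizes, vec![("A".to_string(), 300), ("B".to_string(), 700)]);
}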

View File

@@ -8,7 +8,7 @@ use crate::page_cache;
use crate::page_cache::PAGE_SZ;
use crate::page_cache::{ReadBufResult, WriteBufResult};
use crate::virtual_file::VirtualFile;
use lazy_static::lazy_static;
use once_cell::sync::Lazy;
use std::cmp::min;
use std::collections::HashMap;
use std::fs::OpenOptions;
@@ -21,15 +21,15 @@ use utils::zid::{ZTenantId, ZTimelineId};
use std::os::unix::fs::FileExt;
lazy_static! {
///
/// This is the global cache of file descriptors (File objects).
///
static ref EPHEMERAL_FILES: RwLock<EphemeralFiles> = RwLock::new(EphemeralFiles {
///
/// This is the global cache of file descriptors (File objects).
///
static EPHEMERAL_FILES: Lazy<RwLock<EphemeralFiles>> = Lazy::new(|| {
RwLock::new(EphemeralFiles {
next_file_id: 1,
files: HashMap::new(),
});
}
})
});
pub struct EphemeralFiles {
next_file_id: u64,
@@ -43,7 +43,7 @@ pub struct EphemeralFile {
_timelineid: ZTimelineId,
file: Arc<VirtualFile>,
size: u64,
pub size: u64,
}
impl EphemeralFile {

View File

@@ -15,6 +15,7 @@ use crate::layered_repository::storage_layer::{
use crate::repository::{Key, Value};
use crate::walrecord;
use anyhow::{bail, ensure, Result};
use std::cell::RefCell;
use std::collections::HashMap;
use tracing::*;
use utils::{
@@ -30,6 +31,12 @@ use std::ops::Range;
use std::path::PathBuf;
use std::sync::RwLock;
thread_local! {
/// A buffer for serializing objects during [`InMemoryLayer::put_value`].
/// This buffer is reused for each serialization to avoid additional malloc calls.
static SER_BUFFER: RefCell<Vec<u8>> = RefCell::new(Vec::new());
}
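The thread-local `SER_BUFFER` avoids allocating a fresh `Vec` for every value serialized in `put_value`: the per-thread buffer is borrowed, cleared (keeping its capacity), written into, and its bytes handed straight to the blob writer. A minimal sketch of the same reuse pattern, with generic closures standing in for `Value::ser_into` and `file.write_blob`:

use std::cell::RefCell;

thread_local! {
    // One scratch buffer per thread, reused across calls.
    static SCRATCH: RefCell<Vec<u8>> = RefCell::new(Vec::new());
}

/// Serialize `value` with `write` into the thread-local scratch buffer and
/// pass the resulting bytes to `sink`, without allocating a new Vec per call.
fn with_serialized<T>(
    value: &T,
    write: impl Fn(&T, &mut Vec<u8>),
    sink: impl FnOnce(&[u8]) -> u64,
) -> u64 {
    SCRATCH.with(|buf| {
        let mut buf = buf.borrow_mut();
        buf.clear(); // keep the capacity, drop the old contents
        write(value, &mut buf);
        sink(&buf)
    })
}

fn main() {
    let off = with_serialized(
        &42u32,
        |v, buf| buf.extend_from_slice(&v.to_be_bytes()),
        |bytes| bytes.len() as u64, // stand-in for `write_blob` returning an offset
    );
    assert_eq!(off, 4);
}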
pub struct InMemoryLayer {
conf: &'static PageServerConf,
tenantid: ZTenantId,
@@ -233,6 +240,14 @@ impl Layer for InMemoryLayer {
}
impl InMemoryLayer {
///
/// Get the layer size on disk
///
pub fn size(&self) -> Result<u64> {
let inner = self.inner.read().unwrap();
Ok(inner.file.size)
}
///
/// Create a new, empty, in-memory layer
///
@@ -270,10 +285,17 @@ impl InMemoryLayer {
pub fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> Result<()> {
trace!("put_value key {} at {}/{}", key, self.timelineid, lsn);
let mut inner = self.inner.write().unwrap();
inner.assert_writeable();
let off = inner.file.write_blob(&Value::ser(val)?)?;
let off = {
SER_BUFFER.with(|x| -> Result<_> {
let mut buf = x.borrow_mut();
buf.clear();
val.ser_into(&mut (*buf))?;
let off = inner.file.write_blob(&buf)?;
Ok(off)
})?
};
let vec_map = inner.index.entry(key).or_default();
let old = vec_map.append_or_update_last(lsn, off).unwrap().0;
@@ -342,8 +364,8 @@ impl InMemoryLayer {
// Write all page versions
for (lsn, pos) in vec_map.as_slice() {
cursor.read_blob_into_buf(*pos, &mut buf)?;
let val = Value::des(&buf)?;
delta_layer_writer.put_value(key, *lsn, val)?;
let will_init = Value::des(&buf)?.will_init();
delta_layer_writer.put_value_bytes(key, *lsn, &buf, will_init)?;
}
}

View File

@@ -10,24 +10,23 @@
//! corresponding files are written to disk.
//!
use crate::layered_repository::inmemory_layer::InMemoryLayer;
use crate::layered_repository::storage_layer::Layer;
use crate::layered_repository::storage_layer::{range_eq, range_overlaps};
use crate::layered_repository::InMemoryLayer;
use crate::repository::Key;
use anyhow::Result;
use lazy_static::lazy_static;
use metrics::{register_int_gauge, IntGauge};
use once_cell::sync::Lazy;
use std::collections::VecDeque;
use std::ops::Range;
use std::sync::Arc;
use tracing::*;
use utils::lsn::Lsn;
lazy_static! {
static ref NUM_ONDISK_LAYERS: IntGauge =
register_int_gauge!("pageserver_ondisk_layers", "Number of layers on-disk")
.expect("failed to define a metric");
}
static NUM_ONDISK_LAYERS: Lazy<IntGauge> = Lazy::new(|| {
register_int_gauge!("pageserver_ondisk_layers", "Number of layers on-disk")
.expect("failed to define a metric")
});
///
/// LayerMap tracks what layers exist on a timeline.

View File

@@ -139,6 +139,12 @@ pub trait Layer: Send + Sync {
/// Iterate through all keys and values stored in the layer
fn iter(&self) -> Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>> + '_>;
/// Iterate through all keys stored in the layer. Returns key, lsn and value size
/// It is used only for compaction and so is currently implemented only for DeltaLayer
fn key_iter(&self) -> Box<dyn Iterator<Item = (Key, Lsn, u64)> + '_> {
panic!("Not implemented")
}
/// Permanently remove this layer from disk.
fn delete(&self) -> Result<()>;

File diff suppressed because it is too large

View File

@@ -22,7 +22,7 @@ pub mod walreceiver;
pub mod walrecord;
pub mod walredo;
use lazy_static::lazy_static;
use once_cell::sync::Lazy;
use tracing::info;
use crate::thread_mgr::ThreadKind;
@@ -42,14 +42,14 @@ pub const STORAGE_FORMAT_VERSION: u16 = 3;
pub const IMAGE_FILE_MAGIC: u16 = 0x5A60;
pub const DELTA_FILE_MAGIC: u16 = 0x5A61;
lazy_static! {
static ref LIVE_CONNECTIONS_COUNT: IntGaugeVec = register_int_gauge_vec!(
static LIVE_CONNECTIONS_COUNT: Lazy<IntGaugeVec> = Lazy::new(|| {
register_int_gauge_vec!(
"pageserver_live_connections",
"Number of live network connections",
&["pageserver_connection_kind"]
)
.expect("failed to define a metric");
}
.expect("failed to define a metric")
});
pub const LOG_FILE_NAME: &str = "pageserver.log";
@@ -63,8 +63,7 @@ pub enum CheckpointConfig {
}
pub type RepositoryImpl = LayeredRepository;
pub type DatadirTimelineImpl = DatadirTimeline<RepositoryImpl>;
pub type TimelineImpl = <LayeredRepository as repository::Repository>::Timeline;
pub fn shutdown_pageserver(exit_code: i32) {
// Shut down the libpq endpoint thread. This prevents new connections from
@@ -94,3 +93,56 @@ pub fn shutdown_pageserver(exit_code: i32) {
info!("Shut down successfully completed");
std::process::exit(exit_code);
}
const DEFAULT_BASE_BACKOFF_SECONDS: f64 = 0.1;
const DEFAULT_MAX_BACKOFF_SECONDS: f64 = 3.0;
async fn exponential_backoff(n: u32, base_increment: f64, max_seconds: f64) {
let backoff_duration_seconds =
exponential_backoff_duration_seconds(n, base_increment, max_seconds);
if backoff_duration_seconds > 0.0 {
info!(
"Backoff: waiting {backoff_duration_seconds} seconds before processing with the task",
);
tokio::time::sleep(std::time::Duration::from_secs_f64(backoff_duration_seconds)).await;
}
}
fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds: f64) -> f64 {
if n == 0 {
0.0
} else {
(1.0 + base_increment).powf(f64::from(n)).min(max_seconds)
}
}
#[cfg(test)]
mod backoff_defaults_tests {
use super::*;
#[test]
fn backoff_defaults_produce_growing_backoff_sequence() {
let mut current_backoff_value = None;
for i in 0..10_000 {
let new_backoff_value = exponential_backoff_duration_seconds(
i,
DEFAULT_BASE_BACKOFF_SECONDS,
DEFAULT_MAX_BACKOFF_SECONDS,
);
if let Some(old_backoff_value) = current_backoff_value.replace(new_backoff_value) {
assert!(
old_backoff_value <= new_backoff_value,
"{i}th backoff value {new_backoff_value} is smaller than the previous one {old_backoff_value}"
)
}
}
assert_eq!(
current_backoff_value.expect("Should have produced backoff values to compare"),
DEFAULT_MAX_BACKOFF_SECONDS,
"Given big enough of retries, backoff should reach its allowed max value"
);
}
}
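With the shared helper, the wait grows as `(1 + base_increment)^n`, capped at `max_seconds`, and the very first attempt (`n == 0`) does not wait at all. A quick check of the values the defaults (0.1 increment, 3.0 s cap) produce, using the same formula as above:

fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds: f64) -> f64 {
    if n == 0 {
        0.0
    } else {
        (1.0 + base_increment).powf(f64::from(n)).min(max_seconds)
    }
}

fn main() {
    // n = 0 -> 0.0, n = 1 -> 1.1, n = 2 -> 1.21, n = 3 -> ~1.331, ...
    // 1.1^12 ~= 3.138, so from the 12th retry onwards the wait is capped at 3.0 s.
    for n in [0, 1, 2, 3, 12] {
        println!("attempt {n}: {:.3} s", exponential_backoff_duration_seconds(n, 0.1, 3.0));
    }
}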

View File

@@ -11,7 +11,7 @@
use anyhow::{bail, ensure, Context, Result};
use bytes::{Buf, BufMut, Bytes, BytesMut};
use lazy_static::lazy_static;
use once_cell::sync::Lazy;
use regex::Regex;
use std::io::{self, Read};
use std::net::TcpListener;
@@ -30,7 +30,6 @@ use utils::{
use crate::basebackup;
use crate::config::{PageServerConf, ProfilingConfig};
use crate::import_datadir::{import_basebackup_from_tar, import_wal_from_tar};
use crate::layered_repository::LayeredRepository;
use crate::pgdatadir_mapping::{DatadirTimeline, LsnForTimestamp};
use crate::profiling::profpoint_start;
use crate::reltag::RelTag;
@@ -435,15 +434,15 @@ const TIME_BUCKETS: &[f64] = &[
0.1, // 1/10 s
];
lazy_static! {
static ref SMGR_QUERY_TIME: HistogramVec = register_histogram_vec!(
static SMGR_QUERY_TIME: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
"pageserver_smgr_query_seconds",
"Time spent on smgr query handling",
&["smgr_query_type", "tenant_id", "timeline_id"],
TIME_BUCKETS.into()
)
.expect("failed to define a metric");
}
.expect("failed to define a metric")
});
impl PageServerHandler {
pub fn new(conf: &'static PageServerConf, auth: Option<Arc<JwtAuth>>) -> Self {
@@ -555,9 +554,6 @@ impl PageServerHandler {
info!("creating new timeline");
let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?;
let timeline = repo.create_empty_timeline(timeline_id, base_lsn)?;
let repartition_distance = repo.get_checkpoint_distance();
let mut datadir_timeline =
DatadirTimeline::<LayeredRepository>::new(timeline, repartition_distance);
// TODO mark timeline as not ready until it reaches end_lsn.
// We might have some wal to import as well, and we should prevent compute
@@ -573,7 +569,7 @@ impl PageServerHandler {
info!("importing basebackup");
pgb.write_message(&BeMessage::CopyInResponse)?;
let reader = CopyInReader::new(pgb);
import_basebackup_from_tar(&mut datadir_timeline, reader, base_lsn)?;
import_basebackup_from_tar(&*timeline, reader, base_lsn)?;
// TODO check checksum
// Meanwhile you can verify client-side by taking fullbackup
@@ -583,7 +579,7 @@ impl PageServerHandler {
// Flush data to disk, then upload to s3
info!("flushing layers");
datadir_timeline.tline.checkpoint(CheckpointConfig::Flush)?;
timeline.checkpoint(CheckpointConfig::Flush)?;
info!("done");
Ok(())
@@ -605,10 +601,6 @@ impl PageServerHandler {
let timeline = repo.get_timeline_load(timeline_id)?;
ensure!(timeline.get_last_record_lsn() == start_lsn);
let repartition_distance = repo.get_checkpoint_distance();
let mut datadir_timeline =
DatadirTimeline::<LayeredRepository>::new(timeline, repartition_distance);
// TODO leave clean state on error. For now you can use detach to clean
// up broken state from a failed import.
@@ -616,16 +608,16 @@ impl PageServerHandler {
info!("importing wal");
pgb.write_message(&BeMessage::CopyInResponse)?;
let reader = CopyInReader::new(pgb);
import_wal_from_tar(&mut datadir_timeline, reader, start_lsn, end_lsn)?;
import_wal_from_tar(&*timeline, reader, start_lsn, end_lsn)?;
// TODO Does it make sense to overshoot?
ensure!(datadir_timeline.tline.get_last_record_lsn() >= end_lsn);
ensure!(timeline.get_last_record_lsn() >= end_lsn);
// Flush data to disk, then upload to s3. No need for a forced checkpoint.
// We only want to persist the data, and it doesn't matter if it's in the
// shape of deltas or images.
info!("flushing layers");
datadir_timeline.tline.checkpoint(CheckpointConfig::Flush)?;
timeline.checkpoint(CheckpointConfig::Flush)?;
info!("done");
Ok(())
@@ -643,8 +635,8 @@ impl PageServerHandler {
/// In either case, if the page server hasn't received the WAL up to the
/// requested LSN yet, we will wait for it to arrive. The return value is
/// the LSN that should be used to look up the page versions.
fn wait_or_get_last_lsn<R: Repository>(
timeline: &DatadirTimeline<R>,
fn wait_or_get_last_lsn<T: DatadirTimeline>(
timeline: &T,
mut lsn: Lsn,
latest: bool,
latest_gc_cutoff_lsn: &RwLockReadGuard<Lsn>,
@@ -671,7 +663,7 @@ impl PageServerHandler {
if lsn <= last_record_lsn {
lsn = last_record_lsn;
} else {
timeline.tline.wait_lsn(lsn)?;
timeline.wait_lsn(lsn)?;
// Since we waited for 'lsn' to arrive, that is now the last
// record LSN. (Or close enough for our purposes; the
// last-record LSN can advance immediately after we return
@@ -681,7 +673,7 @@ impl PageServerHandler {
if lsn == Lsn(0) {
bail!("invalid LSN(0) in request");
}
timeline.tline.wait_lsn(lsn)?;
timeline.wait_lsn(lsn)?;
}
ensure!(
lsn >= **latest_gc_cutoff_lsn,
@@ -691,14 +683,14 @@ impl PageServerHandler {
Ok(lsn)
}
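Piecing the hunks together, the request-LSN handling is: with `latest == true`, a request at or below the last-record LSN is served at the last-record LSN, otherwise we wait for the requested LSN to arrive; with `latest == false`, LSN 0 is rejected and we wait for the exact LSN; either way the result must not be older than the GC cutoff. A simplified sketch of that decision, with a plain closure standing in for `wait_lsn` and toy types instead of the real ones:

#[derive(Clone, Copy, PartialEq, PartialOrd, Debug)]
struct Lsn(u64);

/// Simplified stand-in for wait_or_get_last_lsn; `wait` models Timeline::wait_lsn.
fn resolve_request_lsn(
    mut lsn: Lsn,
    latest: bool,
    last_record_lsn: Lsn,
    gc_cutoff: Lsn,
    wait: impl Fn(Lsn),
) -> Result<Lsn, String> {
    if latest {
        if lsn <= last_record_lsn {
            // Serve "latest" requests at the last record LSN we already have.
            lsn = last_record_lsn;
        } else {
            // The compute is ahead of us; wait for the WAL to arrive.
            wait(lsn);
        }
    } else {
        if lsn == Lsn(0) {
            return Err("invalid LSN(0) in request".into());
        }
        wait(lsn);
    }
    if lsn < gc_cutoff {
        return Err(format!("requested {lsn:?} is older than GC cutoff {gc_cutoff:?}"));
    }
    Ok(lsn)
}

fn main() {
    let got = resolve_request_lsn(Lsn(100), true, Lsn(500), Lsn(50), |_| {}).unwrap();
    assert_eq!(got, Lsn(500));
}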
fn handle_get_rel_exists_request<R: Repository>(
fn handle_get_rel_exists_request<T: DatadirTimeline>(
&self,
timeline: &DatadirTimeline<R>,
timeline: &T,
req: &PagestreamExistsRequest,
) -> Result<PagestreamBeMessage> {
let _enter = info_span!("get_rel_exists", rel = %req.rel, req_lsn = %req.lsn).entered();
let latest_gc_cutoff_lsn = timeline.tline.get_latest_gc_cutoff_lsn();
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?;
let exists = timeline.get_rel_exists(req.rel, lsn)?;
@@ -708,13 +700,13 @@ impl PageServerHandler {
}))
}
fn handle_get_nblocks_request<R: Repository>(
fn handle_get_nblocks_request<T: DatadirTimeline>(
&self,
timeline: &DatadirTimeline<R>,
timeline: &T,
req: &PagestreamNblocksRequest,
) -> Result<PagestreamBeMessage> {
let _enter = info_span!("get_nblocks", rel = %req.rel, req_lsn = %req.lsn).entered();
let latest_gc_cutoff_lsn = timeline.tline.get_latest_gc_cutoff_lsn();
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?;
let n_blocks = timeline.get_rel_size(req.rel, lsn)?;
@@ -724,13 +716,13 @@ impl PageServerHandler {
}))
}
fn handle_db_size_request<R: Repository>(
fn handle_db_size_request<T: DatadirTimeline>(
&self,
timeline: &DatadirTimeline<R>,
timeline: &T,
req: &PagestreamDbSizeRequest,
) -> Result<PagestreamBeMessage> {
let _enter = info_span!("get_db_size", dbnode = %req.dbnode, req_lsn = %req.lsn).entered();
let latest_gc_cutoff_lsn = timeline.tline.get_latest_gc_cutoff_lsn();
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?;
let total_blocks =
@@ -743,14 +735,14 @@ impl PageServerHandler {
}))
}
fn handle_get_page_at_lsn_request<R: Repository>(
fn handle_get_page_at_lsn_request<T: DatadirTimeline>(
&self,
timeline: &DatadirTimeline<R>,
timeline: &T,
req: &PagestreamGetPageRequest,
) -> Result<PagestreamBeMessage> {
let _enter = info_span!("get_page", rel = %req.rel, blkno = &req.blkno, req_lsn = %req.lsn)
.entered();
let latest_gc_cutoff_lsn = timeline.tline.get_latest_gc_cutoff_lsn();
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?;
/*
// Add a 1s delay to some requests. The delayed causes the requests to
@@ -783,7 +775,7 @@ impl PageServerHandler {
// check that the timeline exists
let timeline = tenant_mgr::get_local_timeline_with_load(tenantid, timelineid)
.context("Cannot load local timeline")?;
let latest_gc_cutoff_lsn = timeline.tline.get_latest_gc_cutoff_lsn();
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
if let Some(lsn) = lsn {
timeline
.check_lsn_is_in_scope(lsn, &latest_gc_cutoff_lsn)
@@ -921,7 +913,7 @@ impl postgres_backend::Handler for PageServerHandler {
let timeline = tenant_mgr::get_local_timeline_with_load(tenantid, timelineid)
.context("Cannot load local timeline")?;
let end_of_timeline = timeline.tline.get_last_record_rlsn();
let end_of_timeline = timeline.get_last_record_rlsn();
pgb.write_message_noflush(&BeMessage::RowDescription(&[
RowDescriptor::text_col(b"prev_lsn"),
@@ -1052,6 +1044,7 @@ impl postgres_backend::Handler for PageServerHandler {
let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
pgb.write_message_noflush(&BeMessage::RowDescription(&[
RowDescriptor::int8_col(b"checkpoint_distance"),
RowDescriptor::int8_col(b"checkpoint_timeout"),
RowDescriptor::int8_col(b"compaction_target_size"),
RowDescriptor::int8_col(b"compaction_period"),
RowDescriptor::int8_col(b"compaction_threshold"),
@@ -1062,6 +1055,12 @@ impl postgres_backend::Handler for PageServerHandler {
]))?
.write_message_noflush(&BeMessage::DataRow(&[
Some(repo.get_checkpoint_distance().to_string().as_bytes()),
Some(
repo.get_checkpoint_timeout()
.as_secs()
.to_string()
.as_bytes(),
),
Some(repo.get_compaction_target_size().to_string().as_bytes()),
Some(
repo.get_compaction_period()
@@ -1139,7 +1138,7 @@ impl postgres_backend::Handler for PageServerHandler {
let timelineid = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?;
let timeline = tenant_mgr::get_local_timeline_with_load(tenantid, timelineid)
.context("Couldn't load timeline")?;
timeline.tline.compact()?;
timeline.compact()?;
pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?
.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
@@ -1159,13 +1158,8 @@ impl postgres_backend::Handler for PageServerHandler {
let timeline = tenant_mgr::get_local_timeline_with_load(tenantid, timelineid)
.context("Cannot load local timeline")?;
timeline.tline.checkpoint(CheckpointConfig::Forced)?;
// Also compact it.
//
// FIXME: This probably shouldn't be part of a "checkpoint" command, but a
// separate operation. Update the tests if you change this.
timeline.tline.compact()?;
// Checkpoint the timeline and also compact it (due to `CheckpointConfig::Forced`).
timeline.checkpoint(CheckpointConfig::Forced)?;
pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?
.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;

View File

@@ -6,10 +6,10 @@
//! walingest.rs handles a few things like implicit relation creation and extension.
//! Clarify that)
//!
use crate::keyspace::{KeyPartitioning, KeySpace, KeySpaceAccum};
use crate::keyspace::{KeySpace, KeySpaceAccum};
use crate::reltag::{RelTag, SlruKind};
use crate::repository::Timeline;
use crate::repository::*;
use crate::repository::{Repository, Timeline};
use crate::walrecord::ZenithWalRecord;
use anyhow::{bail, ensure, Result};
use bytes::{Buf, Bytes};
@@ -18,34 +18,12 @@ use postgres_ffi::{pg_constants, Oid, TransactionId};
use serde::{Deserialize, Serialize};
use std::collections::{HashMap, HashSet};
use std::ops::Range;
use std::sync::atomic::{AtomicIsize, Ordering};
use std::sync::{Arc, Mutex, RwLockReadGuard};
use tracing::{debug, error, trace, warn};
use tracing::{debug, trace, warn};
use utils::{bin_ser::BeSer, lsn::Lsn};
/// Block number within a relation or SLRU. This matches PostgreSQL's BlockNumber type.
pub type BlockNumber = u32;
pub struct DatadirTimeline<R>
where
R: Repository,
{
/// The underlying key-value store. Callers should not read or modify the
/// data in the underlying store directly. However, it is exposed to have
/// access to information like last-LSN, ancestor, and operations like
/// compaction.
pub tline: Arc<R::Timeline>,
/// When did we last calculate the partitioning?
partitioning: Mutex<(KeyPartitioning, Lsn)>,
/// Configuration: how often should the partitioning be recalculated.
repartition_threshold: u64,
/// Current logical size of the "datadir", at the last LSN.
current_logical_size: AtomicIsize,
}
#[derive(Debug)]
pub enum LsnForTimestamp {
Present(Lsn),
@@ -54,33 +32,29 @@ pub enum LsnForTimestamp {
NoData(Lsn),
}
impl<R: Repository> DatadirTimeline<R> {
pub fn new(tline: Arc<R::Timeline>, repartition_threshold: u64) -> Self {
DatadirTimeline {
tline,
partitioning: Mutex::new((KeyPartitioning::new(), Lsn(0))),
current_logical_size: AtomicIsize::new(0),
repartition_threshold,
}
}
/// (Re-)calculate the logical size of the database at the latest LSN.
///
/// This can be a slow operation.
pub fn init_logical_size(&self) -> Result<()> {
let last_lsn = self.tline.get_last_record_lsn();
self.current_logical_size.store(
self.get_current_logical_size_non_incremental(last_lsn)? as isize,
Ordering::SeqCst,
);
Ok(())
}
///
/// This trait provides all the functionality to store PostgreSQL relations, SLRUs,
/// and other special kinds of files, in a versioned key-value store. The
/// Timeline trait provides the key-value store.
///
/// This is a trait, so that we can easily include all these functions in a Timeline
/// implementation. You're not expected to have different implementations of this trait,
/// rather, this provides an interface and implementation, over Timeline.
///
/// If you wanted to store other kinds of data in the Neon repository, e.g.
/// flat files or MySQL, you would create a new trait like this, with all the
/// functions that make sense for the kind of data you're storing. For flat files,
/// for example, you might have a function like "fn read(path, offset, size)".
/// We might also have that situation in the future, to support multiple PostgreSQL
/// versions, if there are big changes in how the data is organized in the data
/// directory, or if new special files are introduced.
///
pub trait DatadirTimeline: Timeline {
/// Start ingesting a WAL record, or other atomic modification of
/// the timeline.
///
/// This provides a transaction-like interface to perform a bunch
/// of modifications atomically, all stamped with one LSN.
/// of modifications atomically.
///
/// To ingest a WAL record, call begin_modification(lsn) to get a
/// DatadirModification object. Use the functions in the object to
@@ -88,18 +62,27 @@ impl<R: Repository> DatadirTimeline<R> {
/// that the WAL record affects. When you're done, call commit() to
/// commit the changes.
///
/// The LSN stored in the modification is advanced by `ingest_record` and
/// is used by `commit()` to update `last_record_lsn`.
///
/// Calling commit() will flush all the changes and reset the state,
/// so the `DatadirModification` struct can be reused to perform the next modification.
///
/// Note that any pending modifications you make through the
/// modification object won't be visible to calls to the 'get' and list
/// functions of the timeline until you finish! And if you update the
/// same page twice, the last update wins.
///
pub fn begin_modification(&self, lsn: Lsn) -> DatadirModification<R> {
fn begin_modification(&self, lsn: Lsn) -> DatadirModification<Self>
where
Self: Sized,
{
DatadirModification {
tline: self,
lsn,
pending_updates: HashMap::new(),
pending_deletions: Vec::new(),
pending_nblocks: 0,
lsn,
}
}
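Putting the new trait shape together: a caller obtains a `DatadirModification` from `begin_modification`, buffers its puts and deletes in it, and `commit()` writes everything out stamped with one LSN and advances `last_record_lsn`. A toy, self-contained sketch of that buffer-then-commit flow (the real modification borrows the timeline and writes through a `TimelineWriter`; here a plain store applies it, purely to show the shape):

use std::collections::HashMap;

#[derive(Clone, Copy, PartialEq, Debug)]
struct Lsn(u64);

/// Toy stand-in for DatadirModification: buffer writes, apply them atomically on commit.
struct Modification {
    lsn: Lsn,
    pending: HashMap<&'static str, Vec<u8>>,
}

struct Store {
    committed: HashMap<&'static str, (Lsn, Vec<u8>)>,
    last_record_lsn: Lsn,
}

impl Store {
    fn begin_modification(&self, lsn: Lsn) -> Modification {
        Modification { lsn, pending: HashMap::new() }
    }
    fn commit(&mut self, m: &mut Modification) {
        // Everything buffered in this modification lands at one LSN.
        for (key, value) in m.pending.drain() {
            self.committed.insert(key, (m.lsn, value));
        }
        self.last_record_lsn = m.lsn; // finish_write()
    }
}

fn main() {
    let mut store = Store { committed: HashMap::new(), last_record_lsn: Lsn(0) };
    let mut m = store.begin_modification(Lsn(8));
    m.pending.insert("rel_size", vec![0, 0, 0, 1]);
    m.pending.insert("rel_block_0", vec![0u8; 8]);
    store.commit(&mut m); // both keys become visible at Lsn(8)
    assert_eq!(store.last_record_lsn, Lsn(8));
    assert_eq!(store.committed["rel_size"].0, Lsn(8));
    // The (drained) modification can now be reused for the next record at a later LSN.
}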
@@ -108,7 +91,7 @@ impl<R: Repository> DatadirTimeline<R> {
//------------------------------------------------------------------------------
/// Look up given page version.
pub fn get_rel_page_at_lsn(&self, tag: RelTag, blknum: BlockNumber, lsn: Lsn) -> Result<Bytes> {
fn get_rel_page_at_lsn(&self, tag: RelTag, blknum: BlockNumber, lsn: Lsn) -> Result<Bytes> {
ensure!(tag.relnode != 0, "invalid relnode");
let nblocks = self.get_rel_size(tag, lsn)?;
@@ -121,11 +104,11 @@ impl<R: Repository> DatadirTimeline<R> {
}
let key = rel_block_to_key(tag, blknum);
self.tline.get(key, lsn)
self.get(key, lsn)
}
// Get size of a database in blocks
pub fn get_db_size(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result<usize> {
fn get_db_size(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result<usize> {
let mut total_blocks = 0;
let rels = self.list_rels(spcnode, dbnode, lsn)?;
@@ -138,9 +121,13 @@ impl<R: Repository> DatadirTimeline<R> {
}
/// Get size of a relation file
pub fn get_rel_size(&self, tag: RelTag, lsn: Lsn) -> Result<BlockNumber> {
fn get_rel_size(&self, tag: RelTag, lsn: Lsn) -> Result<BlockNumber> {
ensure!(tag.relnode != 0, "invalid relnode");
if let Some(nblocks) = self.get_cached_rel_size(&tag, lsn) {
return Ok(nblocks);
}
if (tag.forknum == pg_constants::FSM_FORKNUM
|| tag.forknum == pg_constants::VISIBILITYMAP_FORKNUM)
&& !self.get_rel_exists(tag, lsn)?
@@ -153,17 +140,25 @@ impl<R: Repository> DatadirTimeline<R> {
}
let key = rel_size_to_key(tag);
let mut buf = self.tline.get(key, lsn)?;
Ok(buf.get_u32_le())
let mut buf = self.get(key, lsn)?;
let nblocks = buf.get_u32_le();
// Update relation size cache
self.update_cached_rel_size(tag, lsn, nblocks);
Ok(nblocks)
}
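The new size-cache path in `get_rel_size` is the classic check-cache / read-store / populate-cache sequence, keyed by relation tag (the real cache also records the LSN the size was valid at). The same shape in isolation, with a plain `HashMap` standing in for the cache:

use std::collections::HashMap;

#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
struct RelTag(u32);

struct SizeCache {
    // The real cache also tracks the LSN the size was cached at; elided here.
    cached: HashMap<RelTag, u32>,
}

impl SizeCache {
    fn get_rel_size(&mut self, tag: RelTag, read_from_store: impl FnOnce() -> u32) -> u32 {
        if let Some(&nblocks) = self.cached.get(&tag) {
            return nblocks; // cache hit: skip the key-value store entirely
        }
        let nblocks = read_from_store();
        self.cached.insert(tag, nblocks); // update_cached_rel_size
        nblocks
    }
}

fn main() {
    let mut cache = SizeCache { cached: HashMap::new() };
    assert_eq!(cache.get_rel_size(RelTag(1), || 8), 8); // miss: reads the store
    assert_eq!(cache.get_rel_size(RelTag(1), || unreachable!()), 8); // hit
}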
/// Does relation exist?
pub fn get_rel_exists(&self, tag: RelTag, lsn: Lsn) -> Result<bool> {
fn get_rel_exists(&self, tag: RelTag, lsn: Lsn) -> Result<bool> {
ensure!(tag.relnode != 0, "invalid relnode");
// first try to lookup relation in cache
if let Some(_nblocks) = self.get_cached_rel_size(&tag, lsn) {
return Ok(true);
}
// fetch directory listing
let key = rel_dir_to_key(tag.spcnode, tag.dbnode);
let buf = self.tline.get(key, lsn)?;
let buf = self.get(key, lsn)?;
let dir = RelDirectory::des(&buf)?;
let exists = dir.rels.get(&(tag.relnode, tag.forknum)).is_some();
@@ -172,10 +167,10 @@ impl<R: Repository> DatadirTimeline<R> {
}
/// Get a list of all existing relations in given tablespace and database.
pub fn list_rels(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result<HashSet<RelTag>> {
fn list_rels(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result<HashSet<RelTag>> {
// fetch directory listing
let key = rel_dir_to_key(spcnode, dbnode);
let buf = self.tline.get(key, lsn)?;
let buf = self.get(key, lsn)?;
let dir = RelDirectory::des(&buf)?;
let rels: HashSet<RelTag> =
@@ -190,7 +185,7 @@ impl<R: Repository> DatadirTimeline<R> {
}
/// Look up given SLRU page version.
pub fn get_slru_page_at_lsn(
fn get_slru_page_at_lsn(
&self,
kind: SlruKind,
segno: u32,
@@ -198,26 +193,21 @@ impl<R: Repository> DatadirTimeline<R> {
lsn: Lsn,
) -> Result<Bytes> {
let key = slru_block_to_key(kind, segno, blknum);
self.tline.get(key, lsn)
self.get(key, lsn)
}
/// Get size of an SLRU segment
pub fn get_slru_segment_size(
&self,
kind: SlruKind,
segno: u32,
lsn: Lsn,
) -> Result<BlockNumber> {
fn get_slru_segment_size(&self, kind: SlruKind, segno: u32, lsn: Lsn) -> Result<BlockNumber> {
let key = slru_segment_size_to_key(kind, segno);
let mut buf = self.tline.get(key, lsn)?;
let mut buf = self.get(key, lsn)?;
Ok(buf.get_u32_le())
}
/// Does the SLRU segment exist?
pub fn get_slru_segment_exists(&self, kind: SlruKind, segno: u32, lsn: Lsn) -> Result<bool> {
fn get_slru_segment_exists(&self, kind: SlruKind, segno: u32, lsn: Lsn) -> Result<bool> {
// fetch directory listing
let key = slru_dir_to_key(kind);
let buf = self.tline.get(key, lsn)?;
let buf = self.get(key, lsn)?;
let dir = SlruSegmentDirectory::des(&buf)?;
let exists = dir.segments.get(&segno).is_some();
@@ -231,10 +221,10 @@ impl<R: Repository> DatadirTimeline<R> {
/// so it's not well defined which LSN you get if there were multiple commits
/// "in flight" at that point in time.
///
pub fn find_lsn_for_timestamp(&self, search_timestamp: TimestampTz) -> Result<LsnForTimestamp> {
let gc_cutoff_lsn_guard = self.tline.get_latest_gc_cutoff_lsn();
fn find_lsn_for_timestamp(&self, search_timestamp: TimestampTz) -> Result<LsnForTimestamp> {
let gc_cutoff_lsn_guard = self.get_latest_gc_cutoff_lsn();
let min_lsn = *gc_cutoff_lsn_guard;
let max_lsn = self.tline.get_last_record_lsn();
let max_lsn = self.get_last_record_lsn();
// LSNs are always 8-byte aligned. low/mid/high represent the
// LSN divided by 8.
@@ -325,88 +315,51 @@ impl<R: Repository> DatadirTimeline<R> {
}
/// Get a list of SLRU segments
pub fn list_slru_segments(&self, kind: SlruKind, lsn: Lsn) -> Result<HashSet<u32>> {
fn list_slru_segments(&self, kind: SlruKind, lsn: Lsn) -> Result<HashSet<u32>> {
// fetch directory entry
let key = slru_dir_to_key(kind);
let buf = self.tline.get(key, lsn)?;
let buf = self.get(key, lsn)?;
let dir = SlruSegmentDirectory::des(&buf)?;
Ok(dir.segments)
}
pub fn get_relmap_file(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result<Bytes> {
fn get_relmap_file(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result<Bytes> {
let key = relmap_file_key(spcnode, dbnode);
let buf = self.tline.get(key, lsn)?;
let buf = self.get(key, lsn)?;
Ok(buf)
}
pub fn list_dbdirs(&self, lsn: Lsn) -> Result<HashMap<(Oid, Oid), bool>> {
fn list_dbdirs(&self, lsn: Lsn) -> Result<HashMap<(Oid, Oid), bool>> {
// fetch directory entry
let buf = self.tline.get(DBDIR_KEY, lsn)?;
let buf = self.get(DBDIR_KEY, lsn)?;
let dir = DbDirectory::des(&buf)?;
Ok(dir.dbdirs)
}
pub fn get_twophase_file(&self, xid: TransactionId, lsn: Lsn) -> Result<Bytes> {
fn get_twophase_file(&self, xid: TransactionId, lsn: Lsn) -> Result<Bytes> {
let key = twophase_file_key(xid);
let buf = self.tline.get(key, lsn)?;
let buf = self.get(key, lsn)?;
Ok(buf)
}
pub fn list_twophase_files(&self, lsn: Lsn) -> Result<HashSet<TransactionId>> {
fn list_twophase_files(&self, lsn: Lsn) -> Result<HashSet<TransactionId>> {
// fetch directory entry
let buf = self.tline.get(TWOPHASEDIR_KEY, lsn)?;
let buf = self.get(TWOPHASEDIR_KEY, lsn)?;
let dir = TwoPhaseDirectory::des(&buf)?;
Ok(dir.xids)
}
pub fn get_control_file(&self, lsn: Lsn) -> Result<Bytes> {
self.tline.get(CONTROLFILE_KEY, lsn)
fn get_control_file(&self, lsn: Lsn) -> Result<Bytes> {
self.get(CONTROLFILE_KEY, lsn)
}
pub fn get_checkpoint(&self, lsn: Lsn) -> Result<Bytes> {
self.tline.get(CHECKPOINT_KEY, lsn)
}
/// Get the LSN of the last ingested WAL record.
///
/// This is just a convenience wrapper that calls through to the underlying
/// repository.
pub fn get_last_record_lsn(&self) -> Lsn {
self.tline.get_last_record_lsn()
}
/// Check that it is valid to request operations with that lsn.
///
/// This is just a convenience wrapper that calls through to the underlying
/// repository.
pub fn check_lsn_is_in_scope(
&self,
lsn: Lsn,
latest_gc_cutoff_lsn: &RwLockReadGuard<Lsn>,
) -> Result<()> {
self.tline.check_lsn_is_in_scope(lsn, latest_gc_cutoff_lsn)
}
/// Retrieve current logical size of the timeline
///
/// NOTE: counted incrementally, includes ancestors,
pub fn get_current_logical_size(&self) -> usize {
let current_logical_size = self.current_logical_size.load(Ordering::Acquire);
match usize::try_from(current_logical_size) {
Ok(sz) => sz,
Err(_) => {
error!(
"current_logical_size is out of range: {}",
current_logical_size
);
0
}
}
fn get_checkpoint(&self, lsn: Lsn) -> Result<Bytes> {
self.get(CHECKPOINT_KEY, lsn)
}
/// Does the same as get_current_logical_size but counted on demand.
@@ -414,16 +367,16 @@ impl<R: Repository> DatadirTimeline<R> {
///
/// Only relation blocks are counted currently. That excludes metadata,
/// SLRUs, twophase files etc.
pub fn get_current_logical_size_non_incremental(&self, lsn: Lsn) -> Result<usize> {
fn get_current_logical_size_non_incremental(&self, lsn: Lsn) -> Result<usize> {
// Fetch list of database dirs and iterate them
let buf = self.tline.get(DBDIR_KEY, lsn)?;
let buf = self.get(DBDIR_KEY, lsn)?;
let dbdir = DbDirectory::des(&buf)?;
let mut total_size: usize = 0;
for (spcnode, dbnode) in dbdir.dbdirs.keys() {
for rel in self.list_rels(*spcnode, *dbnode, lsn)? {
let relsize_key = rel_size_to_key(rel);
let mut buf = self.tline.get(relsize_key, lsn)?;
let mut buf = self.get(relsize_key, lsn)?;
let relsize = buf.get_u32_le();
total_size += relsize as usize;
@@ -444,7 +397,7 @@ impl<R: Repository> DatadirTimeline<R> {
result.add_key(DBDIR_KEY);
// Fetch list of database dirs and iterate them
let buf = self.tline.get(DBDIR_KEY, lsn)?;
let buf = self.get(DBDIR_KEY, lsn)?;
let dbdir = DbDirectory::des(&buf)?;
let mut dbs: Vec<(Oid, Oid)> = dbdir.dbdirs.keys().cloned().collect();
@@ -461,7 +414,7 @@ impl<R: Repository> DatadirTimeline<R> {
rels.sort_unstable();
for rel in rels {
let relsize_key = rel_size_to_key(rel);
let mut buf = self.tline.get(relsize_key, lsn)?;
let mut buf = self.get(relsize_key, lsn)?;
let relsize = buf.get_u32_le();
result.add_range(rel_block_to_key(rel, 0)..rel_block_to_key(rel, relsize));
@@ -477,13 +430,13 @@ impl<R: Repository> DatadirTimeline<R> {
] {
let slrudir_key = slru_dir_to_key(kind);
result.add_key(slrudir_key);
let buf = self.tline.get(slrudir_key, lsn)?;
let buf = self.get(slrudir_key, lsn)?;
let dir = SlruSegmentDirectory::des(&buf)?;
let mut segments: Vec<u32> = dir.segments.iter().cloned().collect();
segments.sort_unstable();
for segno in segments {
let segsize_key = slru_segment_size_to_key(kind, segno);
let mut buf = self.tline.get(segsize_key, lsn)?;
let mut buf = self.get(segsize_key, lsn)?;
let segsize = buf.get_u32_le();
result.add_range(
@@ -495,7 +448,7 @@ impl<R: Repository> DatadirTimeline<R> {
// Then pg_twophase
result.add_key(TWOPHASEDIR_KEY);
let buf = self.tline.get(TWOPHASEDIR_KEY, lsn)?;
let buf = self.get(TWOPHASEDIR_KEY, lsn)?;
let twophase_dir = TwoPhaseDirectory::des(&buf)?;
let mut xids: Vec<TransactionId> = twophase_dir.xids.iter().cloned().collect();
xids.sort_unstable();
@@ -509,31 +462,31 @@ impl<R: Repository> DatadirTimeline<R> {
Ok(result.to_keyspace())
}
pub fn repartition(&self, lsn: Lsn, partition_size: u64) -> Result<(KeyPartitioning, Lsn)> {
let mut partitioning_guard = self.partitioning.lock().unwrap();
if partitioning_guard.1 == Lsn(0)
|| lsn.0 - partitioning_guard.1 .0 > self.repartition_threshold
{
let keyspace = self.collect_keyspace(lsn)?;
let partitioning = keyspace.partition(partition_size);
*partitioning_guard = (partitioning, lsn);
return Ok((partitioning_guard.0.clone(), lsn));
}
Ok((partitioning_guard.0.clone(), partitioning_guard.1))
}
/// Get the cached size of a relation, if it was not updated after the specified LSN
fn get_cached_rel_size(&self, tag: &RelTag, lsn: Lsn) -> Option<BlockNumber>;
/// Update cached relation size if there is no more recent update
fn update_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber);
/// Store cached relation size
fn set_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber);
/// Remove cached relation size
fn remove_cached_rel_size(&self, tag: &RelTag);
}
/// DatadirModification represents an operation to ingest an atomic set of
/// updates to the repository. It is created by the 'begin_modification'
/// function. It is called for each WAL record, so that all the modifications
/// by one WAL record appear atomic.
pub struct DatadirModification<'a, R: Repository> {
pub struct DatadirModification<'a, T: DatadirTimeline> {
/// The timeline this modification applies to. You can access this to
/// read the state, but note that any pending updates are *not* reflected
/// in the state in 'tline' yet.
pub tline: &'a DatadirTimeline<R>,
pub tline: &'a T,
lsn: Lsn,
/// Lsn assigned by begin_modification
pub lsn: Lsn,
// The modifications are not applied directly to the underlying key-value store.
// The put-functions add the modifications here, and they are flushed to the
@@ -543,7 +496,7 @@ pub struct DatadirModification<'a, R: Repository> {
pending_nblocks: isize,
}
impl<'a, R: Repository> DatadirModification<'a, R> {
impl<'a, T: DatadirTimeline> DatadirModification<'a, T> {
/// Initialize a completely new repository.
///
/// This inserts the directory metadata entries that are assumed to
@@ -744,26 +697,36 @@ impl<'a, R: Repository> DatadirModification<'a, R> {
self.pending_nblocks += nblocks as isize;
// Update relation size cache
self.tline.set_cached_rel_size(rel, self.lsn, nblocks);
// Even if nblocks > 0, we don't insert any actual blocks here. That's up to the
// caller.
Ok(())
}
/// Truncate relation
pub fn put_rel_truncation(&mut self, rel: RelTag, nblocks: BlockNumber) -> Result<()> {
ensure!(rel.relnode != 0, "invalid relnode");
let size_key = rel_size_to_key(rel);
let last_lsn = self.tline.get_last_record_lsn();
if self.tline.get_rel_exists(rel, last_lsn)? {
let size_key = rel_size_to_key(rel);
// Fetch the old size first
let old_size = self.get(size_key)?.get_u32_le();
// Fetch the old size first
let old_size = self.get(size_key)?.get_u32_le();
// Update the entry with the new size.
let buf = nblocks.to_le_bytes();
self.put(size_key, Value::Image(Bytes::from(buf.to_vec())));
// Update the entry with the new size.
let buf = nblocks.to_le_bytes();
self.put(size_key, Value::Image(Bytes::from(buf.to_vec())));
// Update relation size cache
self.tline.set_cached_rel_size(rel, self.lsn, nblocks);
// Update logical database size.
self.pending_nblocks -= old_size as isize - nblocks as isize;
// Update relation size cache
self.tline.set_cached_rel_size(rel, self.lsn, nblocks);
// Update logical database size.
self.pending_nblocks -= old_size as isize - nblocks as isize;
}
Ok(())
}
@@ -781,6 +744,9 @@ impl<'a, R: Repository> DatadirModification<'a, R> {
let buf = nblocks.to_le_bytes();
self.put(size_key, Value::Image(Bytes::from(buf.to_vec())));
// Update relation size cache
self.tline.set_cached_rel_size(rel, self.lsn, nblocks);
self.pending_nblocks += nblocks as isize - old_size as isize;
}
Ok(())
@@ -806,6 +772,9 @@ impl<'a, R: Repository> DatadirModification<'a, R> {
let old_size = self.get(size_key)?.get_u32_le();
self.pending_nblocks -= old_size as isize;
// Remove entry from relation size cache
self.tline.remove_cached_rel_size(&rel);
// Delete size entry, as well as all blocks
self.delete(rel_key_range(rel));
@@ -928,7 +897,7 @@ impl<'a, R: Repository> DatadirModification<'a, R> {
return Ok(());
}
let writer = self.tline.tline.writer();
let writer = self.tline.writer();
// Flush relation and SLRU data blocks, keep metadata.
let mut result: Result<()> = Ok(());
@@ -943,10 +912,7 @@ impl<'a, R: Repository> DatadirModification<'a, R> {
result?;
if pending_nblocks != 0 {
self.tline.current_logical_size.fetch_add(
pending_nblocks * pg_constants::BLCKSZ as isize,
Ordering::SeqCst,
);
writer.update_current_logical_size(pending_nblocks * pg_constants::BLCKSZ as isize);
self.pending_nblocks = 0;
}
@@ -956,26 +922,25 @@ impl<'a, R: Repository> DatadirModification<'a, R> {
///
/// Finish this atomic update, writing all the updated keys to the
/// underlying timeline.
/// All the modifications in this atomic update are stamped by the specified LSN.
///
pub fn commit(self) -> Result<()> {
let writer = self.tline.tline.writer();
pub fn commit(&mut self) -> Result<()> {
let writer = self.tline.writer();
let lsn = self.lsn;
let pending_nblocks = self.pending_nblocks;
self.pending_nblocks = 0;
for (key, value) in self.pending_updates {
writer.put(key, self.lsn, &value)?;
for (key, value) in self.pending_updates.drain() {
writer.put(key, lsn, &value)?;
}
for key_range in self.pending_deletions {
writer.delete(key_range.clone(), self.lsn)?;
for key_range in self.pending_deletions.drain(..) {
writer.delete(key_range, lsn)?;
}
writer.finish_write(self.lsn);
writer.finish_write(lsn);
if pending_nblocks != 0 {
self.tline.current_logical_size.fetch_add(
pending_nblocks * pg_constants::BLCKSZ as isize,
Ordering::SeqCst,
);
writer.update_current_logical_size(pending_nblocks * pg_constants::BLCKSZ as isize);
}
Ok(())
@@ -1001,8 +966,8 @@ impl<'a, R: Repository> DatadirModification<'a, R> {
bail!("unexpected pending WAL record");
}
} else {
let last_lsn = self.tline.get_last_record_lsn();
self.tline.tline.get(key, last_lsn)
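// Use the later of last_record_lsn and this modification's own LSN, so that
// values this modification has already flushed at its (possibly higher) LSN
// remain visible to this lookup.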
let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn);
self.tline.get(key, lsn)
}
}
@@ -1404,13 +1369,12 @@ fn is_slru_block_key(key: Key) -> bool {
pub fn create_test_timeline<R: Repository>(
repo: R,
timeline_id: utils::zid::ZTimelineId,
) -> Result<Arc<crate::DatadirTimeline<R>>> {
) -> Result<std::sync::Arc<R::Timeline>> {
let tline = repo.create_empty_timeline(timeline_id, Lsn(8))?;
let tline = DatadirTimeline::new(tline, 256 * 1024);
let mut m = tline.begin_modification(Lsn(8));
m.init_empty()?;
m.commit()?;
Ok(Arc::new(tline))
Ok(tline)
}
#[allow(clippy::bool_assert_comparison)]
@@ -1483,7 +1447,7 @@ mod tests {
.contains(&TESTREL_A));
// Run checkpoint and garbage collection and check that it's still not visible
newtline.tline.checkpoint(CheckpointConfig::Forced)?;
newtline.checkpoint(CheckpointConfig::Forced)?;
repo.gc_iteration(Some(NEW_TIMELINE_ID), 0, true)?;
assert!(!newtline

View File

@@ -185,7 +185,7 @@ impl Value {
/// A repository corresponds to one .neon directory. One repository holds multiple
/// timelines, forked off from the same initial call to 'initdb'.
pub trait Repository: Send + Sync {
type Timeline: Timeline;
type Timeline: crate::DatadirTimeline;
/// Updates timeline based on the `TimelineSyncStatusUpdate`, received from the remote storage synchronization.
/// See [`crate::remote_storage`] for more details about the synchronization.
@@ -277,15 +277,6 @@ pub enum LocalTimelineState {
Unloaded,
}
impl<'a, T> From<&'a RepositoryTimeline<T>> for LocalTimelineState {
fn from(local_timeline_entry: &'a RepositoryTimeline<T>) -> Self {
match local_timeline_entry {
RepositoryTimeline::Loaded(_) => LocalTimelineState::Loaded,
RepositoryTimeline::Unloaded { .. } => LocalTimelineState::Unloaded,
}
}
}
///
/// Result of performing GC
///
@@ -382,6 +373,11 @@ pub trait Timeline: Send + Sync {
lsn: Lsn,
latest_gc_cutoff_lsn: &RwLockReadGuard<Lsn>,
) -> Result<()>;
/// Get the physical size of the timeline at the latest LSN
fn get_physical_size(&self) -> u64;
/// Get the physical size of the timeline at the latest LSN, non-incrementally
fn get_physical_size_non_incremental(&self) -> Result<u64>;
}
/// Various functions to mutate the timeline.
@@ -405,12 +401,14 @@ pub trait TimelineWriter<'a> {
/// the 'lsn' or anything older. The previous last record LSN is stored alongside
/// the latest and can be read.
fn finish_write(&self, lsn: Lsn);
fn update_current_logical_size(&self, delta: isize);
}
#[cfg(test)]
pub mod repo_harness {
use bytes::BytesMut;
use lazy_static::lazy_static;
use once_cell::sync::Lazy;
use std::sync::{Arc, RwLock, RwLockReadGuard, RwLockWriteGuard};
use std::{fs, path::PathBuf};
@@ -441,14 +439,13 @@ pub mod repo_harness {
buf.freeze()
}
lazy_static! {
static ref LOCK: RwLock<()> = RwLock::new(());
}
static LOCK: Lazy<RwLock<()>> = Lazy::new(|| RwLock::new(()));
impl From<TenantConf> for TenantConfOpt {
fn from(tenant_conf: TenantConf) -> Self {
Self {
checkpoint_distance: Some(tenant_conf.checkpoint_distance),
checkpoint_timeout: Some(tenant_conf.checkpoint_timeout),
compaction_target_size: Some(tenant_conf.compaction_target_size),
compaction_period: Some(tenant_conf.compaction_period),
compaction_threshold: Some(tenant_conf.compaction_threshold),
@@ -591,11 +588,10 @@ mod tests {
//use std::sync::Arc;
use bytes::BytesMut;
use hex_literal::hex;
use lazy_static::lazy_static;
use once_cell::sync::Lazy;
lazy_static! {
static ref TEST_KEY: Key = Key::from_slice(&hex!("112222222233333333444444445500000001"));
}
static TEST_KEY: Lazy<Key> =
Lazy::new(|| Key::from_slice(&hex!("112222222233333333444444445500000001")));
#[test]
fn test_basic() -> Result<()> {

View File

@@ -155,8 +155,7 @@ use std::{
use anyhow::{anyhow, bail, Context};
use futures::stream::{FuturesUnordered, StreamExt};
use lazy_static::lazy_static;
use once_cell::sync::OnceCell;
use once_cell::sync::{Lazy, OnceCell};
use remote_storage::{GenericRemoteStorage, RemoteStorage};
use tokio::{
fs,
@@ -173,10 +172,10 @@ use self::{
};
use crate::{
config::PageServerConf,
exponential_backoff,
layered_repository::{
ephemeral_file::is_ephemeral_file,
metadata::{metadata_path, TimelineMetadata, METADATA_FILE_NAME},
LayeredRepository,
},
storage_sync::{self, index::RemoteIndex},
tenant_mgr::attach_downloaded_tenants,
@@ -185,8 +184,8 @@ use crate::{
};
use metrics::{
register_histogram_vec, register_int_counter, register_int_counter_vec, register_int_gauge,
HistogramVec, IntCounter, IntCounterVec, IntGauge,
register_histogram_vec, register_int_counter_vec, register_int_gauge, HistogramVec,
IntCounterVec, IntGauge,
};
use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId};
@@ -194,32 +193,33 @@ use self::download::download_index_parts;
pub use self::download::gather_tenant_timelines_index_parts;
pub use self::download::TEMP_DOWNLOAD_EXTENSION;
lazy_static! {
static ref REMAINING_SYNC_ITEMS: IntGauge = register_int_gauge!(
static REMAINING_SYNC_ITEMS: Lazy<IntGauge> = Lazy::new(|| {
register_int_gauge!(
"pageserver_remote_storage_remaining_sync_items",
"Number of storage sync items left in the queue"
)
.expect("failed to register pageserver remote storage remaining sync items int gauge");
static ref FATAL_TASK_FAILURES: IntCounter = register_int_counter!(
"pageserver_remote_storage_fatal_task_failures_total",
"Number of critically failed tasks"
)
.expect("failed to register pageserver remote storage remaining sync items int gauge");
static ref IMAGE_SYNC_TIME: HistogramVec = register_histogram_vec!(
.expect("failed to register pageserver remote storage remaining sync items int gauge")
});
static IMAGE_SYNC_TIME: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
"pageserver_remote_storage_image_sync_seconds",
"Time took to synchronize (download or upload) a whole pageserver image. \
Grouped by tenant and timeline ids, `operation_kind` (upload|download) and `status` (success|failure)",
&["tenant_id", "timeline_id", "operation_kind", "status"],
vec![0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 3.0, 10.0, 20.0]
)
.expect("failed to register pageserver image sync time histogram vec");
static ref REMOTE_INDEX_UPLOAD: IntCounterVec = register_int_counter_vec!(
.expect("failed to register pageserver image sync time histogram vec")
});
static REMOTE_INDEX_UPLOAD: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_remote_storage_remote_index_uploads_total",
"Number of remote index uploads",
&["tenant_id", "timeline_id"],
)
.expect("failed to register pageserver remote index upload vec");
}
.expect("failed to register pageserver remote index upload vec")
});
static SYNC_QUEUE: OnceCell<SyncQueue> = OnceCell::new();
@@ -970,14 +970,19 @@ fn storage_sync_loop<P, S>(
}
}
// needed to check whether the download happened
// more informative than just a bool
#[derive(Debug)]
enum DownloadMarker {
enum DownloadStatus {
Downloaded,
Nothing,
}
#[derive(Debug)]
enum UploadStatus {
Uploaded,
Failed,
Nothing,
}
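The switch from the old `Option<()>`-style result to `UploadStatus` lets the delete step tell "nothing to upload" apart from "upload failed": deletes run after a successful upload or when there was no upload at all, and are re-enqueued when the upload failed. The gating in isolation, with hypothetical `run_delete`/`reenqueue` closures standing in for the real calls:

// Mirrors the UploadStatus enum introduced above.
enum UploadStatus {
    Uploaded,
    Failed,
    Nothing,
}

fn handle_delete(upload_status: UploadStatus, run_delete: impl FnOnce(), reenqueue: impl FnOnce()) {
    match upload_status {
        // Safe to remove remote layers: either they were just uploaded,
        // or this batch had no upload to begin with.
        UploadStatus::Uploaded | UploadStatus::Nothing => run_delete(),
        // Don't delete anything a failed upload may still need; retry later.
        UploadStatus::Failed => reenqueue(),
    }
}

fn main() {
    handle_delete(UploadStatus::Uploaded, || println!("deleting"), || unreachable!());
    handle_delete(UploadStatus::Nothing, || println!("deleting"), || unreachable!());
    handle_delete(UploadStatus::Failed, || unreachable!(), || println!("re-enqueued"));
}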
async fn process_batches<P, S>(
conf: &'static PageServerConf,
max_sync_errors: NonZeroU32,
@@ -1017,7 +1022,7 @@ where
"Finished storage sync task for sync id {sync_id} download marker {:?}",
download_marker
);
if matches!(download_marker, DownloadMarker::Downloaded) {
if matches!(download_marker, DownloadStatus::Downloaded) {
downloaded_timelines.insert(sync_id.tenant_id);
}
}
@@ -1031,7 +1036,7 @@ async fn process_sync_task_batch<P, S>(
max_sync_errors: NonZeroU32,
sync_id: ZTenantTimelineId,
batch: SyncTaskBatch,
) -> DownloadMarker
) -> DownloadStatus
where
P: Debug + Send + Sync + 'static,
S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
@@ -1048,7 +1053,7 @@ where
// As long as tasks do not fail more often than the error threshold allows,
// the batching and task processing code aims to update the layer set and metadata files (remote and local)
// without "losing" such layer files.
let (upload_result, status_update) = tokio::join!(
let (upload_status, download_status) = tokio::join!(
async {
if let Some(upload_data) = upload_data {
match validate_task_retries(upload_data, max_sync_errors)
@@ -1066,7 +1071,7 @@ where
"upload",
)
.await;
return Some(());
UploadStatus::Uploaded
}
ControlFlow::Break(failed_upload_data) => {
if let Err(e) = update_remote_data(
@@ -1083,10 +1088,13 @@ where
{
error!("Failed to update remote timeline {sync_id}: {e:?}");
}
UploadStatus::Failed
}
}
} else {
UploadStatus::Nothing
}
None
}
.instrument(info_span!("upload_timeline_data")),
async {
@@ -1116,50 +1124,53 @@ where
}
}
}
DownloadMarker::Nothing
DownloadStatus::Nothing
}
.instrument(info_span!("download_timeline_data")),
);
if let Some(delete_data) = batch.delete {
if upload_result.is_some() {
match validate_task_retries(delete_data, max_sync_errors)
.instrument(info_span!("retries_validation"))
.await
{
ControlFlow::Continue(new_delete_data) => {
delete_timeline_data(
conf,
(storage.as_ref(), &index, sync_queue),
sync_id,
new_delete_data,
sync_start,
"delete",
)
.instrument(info_span!("delete_timeline_data"))
.await;
}
ControlFlow::Break(failed_delete_data) => {
if let Err(e) = update_remote_data(
conf,
storage.as_ref(),
&index,
sync_id,
RemoteDataUpdate::Delete(&failed_delete_data.data.deleted_layers),
)
match upload_status {
UploadStatus::Uploaded | UploadStatus::Nothing => {
match validate_task_retries(delete_data, max_sync_errors)
.instrument(info_span!("retries_validation"))
.await
{
error!("Failed to update remote timeline {sync_id}: {e:?}");
{
ControlFlow::Continue(new_delete_data) => {
delete_timeline_data(
conf,
(storage.as_ref(), &index, sync_queue),
sync_id,
new_delete_data,
sync_start,
"delete",
)
.instrument(info_span!("delete_timeline_data"))
.await;
}
ControlFlow::Break(failed_delete_data) => {
if let Err(e) = update_remote_data(
conf,
storage.as_ref(),
&index,
sync_id,
RemoteDataUpdate::Delete(&failed_delete_data.data.deleted_layers),
)
.await
{
error!("Failed to update remote timeline {sync_id}: {e:?}");
}
}
}
}
} else {
sync_queue.push(sync_id, SyncTask::Delete(delete_data));
warn!("Skipping delete task due to failed upload tasks, reenqueuing");
UploadStatus::Failed => {
warn!("Skipping delete task due to failed upload tasks, reenqueuing");
sync_queue.push(sync_id, SyncTask::Delete(delete_data));
}
}
}
status_update
download_status
}
async fn download_timeline_data<P, S>(
@@ -1170,7 +1181,7 @@ async fn download_timeline_data<P, S>(
new_download_data: SyncData<LayersDownload>,
sync_start: Instant,
task_name: &str,
) -> DownloadMarker
) -> DownloadStatus
where
P: Debug + Send + Sync + 'static,
S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
@@ -1199,7 +1210,7 @@ where
Ok(()) => match index.write().await.set_awaits_download(&sync_id, false) {
Ok(()) => {
register_sync_status(sync_id, sync_start, task_name, Some(true));
return DownloadMarker::Downloaded;
return DownloadStatus::Downloaded;
}
Err(e) => {
error!("Timeline {sync_id} was expected to be in the remote index after a successful download, but it's absent: {e:?}");
@@ -1215,7 +1226,7 @@ where
}
}
DownloadMarker::Nothing
DownloadStatus::Nothing
}
async fn update_local_metadata(
@@ -1257,7 +1268,13 @@ async fn update_local_metadata(
timeline_id,
} = sync_id;
tokio::task::spawn_blocking(move || {
LayeredRepository::save_metadata(conf, timeline_id, tenant_id, &cloned_metadata, true)
crate::layered_repository::save_metadata(
conf,
timeline_id,
tenant_id,
&cloned_metadata,
true,
)
})
.await
.with_context(|| {
@@ -1487,11 +1504,7 @@ async fn validate_task_retries<T>(
return ControlFlow::Break(sync_data);
}
if current_attempt > 0 {
let seconds_to_wait = 2.0_f64.powf(current_attempt as f64 - 1.0).min(30.0);
info!("Waiting {seconds_to_wait} seconds before starting the task");
tokio::time::sleep(Duration::from_secs_f64(seconds_to_wait)).await;
}
exponential_backoff(current_attempt, 1.0, 30.0).await;
ControlFlow::Continue(sync_data)
}

View File

@@ -130,6 +130,7 @@ where
tenant_path.display()
)
})?;
let timelines = storage
.list_prefixes(Some(tenant_storage_path))
.await
@@ -140,6 +141,13 @@ where
)
})?;
if timelines.is_empty() {
anyhow::bail!(
"no timelines found on the remote storage for tenant {}",
tenant_id
)
}
let mut sync_ids = HashSet::new();
for timeline_remote_storage_key in timelines {

View File

@@ -4,7 +4,7 @@ use std::{fmt::Debug, path::PathBuf};
use anyhow::Context;
use futures::stream::{FuturesUnordered, StreamExt};
use lazy_static::lazy_static;
use once_cell::sync::Lazy;
use remote_storage::RemoteStorage;
use tokio::fs;
use tracing::{debug, error, info, warn};
@@ -20,14 +20,14 @@ use crate::{
};
use metrics::{register_int_counter_vec, IntCounterVec};
lazy_static! {
static ref NO_LAYERS_UPLOAD: IntCounterVec = register_int_counter_vec!(
static NO_LAYERS_UPLOAD: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_remote_storage_no_layers_uploads_total",
"Number of skipped uploads due to no layers",
&["tenant_id", "timeline_id"],
)
.expect("failed to register pageserver no layers upload vec");
}
.expect("failed to register pageserver no layers upload vec")
});
/// Serializes and uploads the given index part data to the remote storage.
pub(super) async fn upload_index_part<P, S>(

View File

@@ -23,6 +23,7 @@ pub mod defaults {
// which is good for now to trigger bugs.
// This parameter actually determines L0 layer file size.
pub const DEFAULT_CHECKPOINT_DISTANCE: u64 = 256 * 1024 * 1024;
pub const DEFAULT_CHECKPOINT_TIMEOUT: &str = "10 m";
// Target file size, when creating image and delta layers.
// This parameter determines L1 layer file size.
@@ -48,6 +49,9 @@ pub struct TenantConf {
// page server crashes.
// This parameter actually determines L0 layer file size.
pub checkpoint_distance: u64,
// The in-memory layer is also flushed at least once per checkpoint_timeout, so that
// WAL is eventually uploaded after activity has stopped.
pub checkpoint_timeout: Duration,
// Target file size, when creating image and delta layers.
// This parameter determines L1 layer file size.
pub compaction_target_size: u64,
@@ -90,6 +94,7 @@ pub struct TenantConf {
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
pub struct TenantConfOpt {
pub checkpoint_distance: Option<u64>,
pub checkpoint_timeout: Option<Duration>,
pub compaction_target_size: Option<u64>,
#[serde(with = "humantime_serde")]
pub compaction_period: Option<Duration>,
@@ -113,6 +118,9 @@ impl TenantConfOpt {
checkpoint_distance: self
.checkpoint_distance
.unwrap_or(global_conf.checkpoint_distance),
checkpoint_timeout: self
.checkpoint_timeout
.unwrap_or(global_conf.checkpoint_timeout),
compaction_target_size: self
.compaction_target_size
.unwrap_or(global_conf.compaction_target_size),
@@ -142,6 +150,9 @@ impl TenantConfOpt {
if let Some(checkpoint_distance) = other.checkpoint_distance {
self.checkpoint_distance = Some(checkpoint_distance);
}
if let Some(checkpoint_timeout) = other.checkpoint_timeout {
self.checkpoint_timeout = Some(checkpoint_timeout);
}
if let Some(compaction_target_size) = other.compaction_target_size {
self.compaction_target_size = Some(compaction_target_size);
}
@@ -181,6 +192,8 @@ impl TenantConf {
TenantConf {
checkpoint_distance: DEFAULT_CHECKPOINT_DISTANCE,
checkpoint_timeout: humantime::parse_duration(DEFAULT_CHECKPOINT_TIMEOUT)
.expect("cannot parse default checkpoint timeout"),
compaction_target_size: DEFAULT_COMPACTION_TARGET_SIZE,
compaction_period: humantime::parse_duration(DEFAULT_COMPACTION_PERIOD)
.expect("cannot parse default compaction period"),
@@ -212,6 +225,7 @@ impl TenantConf {
pub fn dummy_conf() -> Self {
TenantConf {
checkpoint_distance: defaults::DEFAULT_CHECKPOINT_DISTANCE,
checkpoint_timeout: Duration::from_secs(600),
compaction_target_size: 4 * 1024 * 1024,
compaction_period: Duration::from_secs(10),
compaction_threshold: defaults::DEFAULT_COMPACTION_THRESHOLD,
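The "10 m" default goes through `humantime::parse_duration` and yields the same 600-second `Duration` that `dummy_conf()` hard-codes; a per-tenant `checkpoint_timeout` in `TenantConfOpt`, when present, overrides the global value. A quick check of the parsing:

use std::time::Duration;

fn main() {
    // "10 m" -> 10 minutes; humantime accepts a space between value and unit.
    let timeout = humantime::parse_duration("10 m").expect("cannot parse default checkpoint timeout");
    assert_eq!(timeout, Duration::from_secs(600));
}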

View File

@@ -2,8 +2,8 @@
//! page server.
use crate::config::PageServerConf;
use crate::http::models::TenantInfo;
use crate::layered_repository::{load_metadata, LayeredRepository};
use crate::pgdatadir_mapping::DatadirTimeline;
use crate::repository::Repository;
use crate::storage_sync::index::{RemoteIndex, RemoteTimelineIndex};
use crate::storage_sync::{self, LocalTimelineInitStatus, SyncStartupData};
@@ -12,10 +12,9 @@ use crate::thread_mgr::ThreadKind;
use crate::timelines::CreateRepo;
use crate::walredo::PostgresRedoManager;
use crate::{thread_mgr, timelines, walreceiver};
use crate::{DatadirTimelineImpl, RepositoryImpl};
use crate::{RepositoryImpl, TimelineImpl};
use anyhow::Context;
use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr};
use std::collections::hash_map::Entry;
use std::collections::{HashMap, HashSet};
use std::fmt;
@@ -28,23 +27,25 @@ use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId};
mod tenants_state {
use anyhow::ensure;
use once_cell::sync::Lazy;
use std::{
collections::HashMap,
sync::{RwLock, RwLockReadGuard, RwLockWriteGuard},
};
use tokio::sync::mpsc;
use tracing::{debug, error};
use utils::zid::ZTenantId;
use crate::tenant_mgr::{LocalTimelineUpdate, Tenant};
lazy_static::lazy_static! {
static ref TENANTS: RwLock<HashMap<ZTenantId, Tenant>> = RwLock::new(HashMap::new());
/// Sends updates to the local timelines (creation and deletion) to the WAL receiver,
/// so that it can enable/disable corresponding processes.
static ref TIMELINE_UPDATE_SENDER: RwLock<Option<mpsc::UnboundedSender<LocalTimelineUpdate>>> = RwLock::new(None);
}
static TENANTS: Lazy<RwLock<HashMap<ZTenantId, Tenant>>> =
Lazy::new(|| RwLock::new(HashMap::new()));
/// Sends updates to the local timelines (creation and deletion) to the WAL receiver,
/// so that it can enable/disable corresponding processes.
static TIMELINE_UPDATE_SENDER: Lazy<
RwLock<Option<mpsc::UnboundedSender<LocalTimelineUpdate>>>,
> = Lazy::new(|| RwLock::new(None));
pub(super) fn read_tenants() -> RwLockReadGuard<'static, HashMap<ZTenantId, Tenant>> {
TENANTS
@@ -101,7 +102,7 @@ struct Tenant {
///
/// Local timelines have more metadata that's loaded into memory,
/// that is located in the `repo.timelines` field, [`crate::layered_repository::LayeredTimelineEntry`].
local_timelines: HashMap<ZTimelineId, Arc<DatadirTimelineImpl>>,
local_timelines: HashMap<ZTimelineId, Arc<<RepositoryImpl as Repository>::Timeline>>,
}
#[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq, Eq)]
@@ -178,7 +179,7 @@ pub enum LocalTimelineUpdate {
},
Attach {
id: ZTenantTimelineId,
datadir: Arc<DatadirTimelineImpl>,
datadir: Arc<<RepositoryImpl as Repository>::Timeline>,
},
}
@@ -382,7 +383,7 @@ pub fn get_repository_for_tenant(tenant_id: ZTenantId) -> anyhow::Result<Arc<Rep
pub fn get_local_timeline_with_load(
tenant_id: ZTenantId,
timeline_id: ZTimelineId,
) -> anyhow::Result<Arc<DatadirTimelineImpl>> {
) -> anyhow::Result<Arc<TimelineImpl>> {
let mut m = tenants_state::write_tenants();
let tenant = m
.get_mut(&tenant_id)
@@ -489,34 +490,23 @@ pub fn detach_tenant(conf: &'static PageServerConf, tenant_id: ZTenantId) -> any
fn load_local_timeline(
repo: &RepositoryImpl,
timeline_id: ZTimelineId,
) -> anyhow::Result<Arc<DatadirTimeline<LayeredRepository>>> {
) -> anyhow::Result<Arc<TimelineImpl>> {
let inmem_timeline = repo.get_timeline_load(timeline_id).with_context(|| {
format!("Inmem timeline {timeline_id} not found in tenant's repository")
})?;
let repartition_distance = repo.get_checkpoint_distance() / 10;
let page_tline = Arc::new(DatadirTimelineImpl::new(
inmem_timeline,
repartition_distance,
));
page_tline.init_logical_size()?;
inmem_timeline.init_logical_size()?;
tenants_state::try_send_timeline_update(LocalTimelineUpdate::Attach {
id: ZTenantTimelineId::new(repo.tenant_id(), timeline_id),
datadir: Arc::clone(&page_tline),
datadir: Arc::clone(&inmem_timeline),
});
Ok(page_tline)
}
#[serde_as]
#[derive(Serialize, Deserialize, Clone)]
pub struct TenantInfo {
#[serde_as(as = "DisplayFromStr")]
pub id: ZTenantId,
pub state: Option<TenantState>,
pub has_in_progress_downloads: Option<bool>,
Ok(inmem_timeline)
}
///
/// Get list of tenants, for the mgmt API
///
pub fn list_tenants(remote_index: &RemoteTimelineIndex) -> Vec<TenantInfo> {
tenants_state::read_tenants()
.iter()
@@ -532,6 +522,7 @@ pub fn list_tenants(remote_index: &RemoteTimelineIndex) -> Vec<TenantInfo> {
TenantInfo {
id: *id,
state: Some(tenant.state),
current_physical_size: None,
has_in_progress_downloads,
}
})

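The TIMELINE_UPDATE_SENDER comment above describes the mechanism: tenant_mgr pushes timeline attach/detach updates over an unbounded channel, and the WAL receiver reacts by starting or stopping per-timeline work. A minimal sketch of that channel pattern, with a simplified update enum standing in for LocalTimelineUpdate:

use tokio::sync::mpsc;

// Simplified stand-in: the real LocalTimelineUpdate carries a ZTenantTimelineId
// and an Arc of the timeline, as the hunks above show.
#[derive(Debug)]
enum TimelineUpdate {
    Attach(u64),
    Detach(u64),
}

#[tokio::main]
async fn main() {
    // In the pageserver the sender lives behind a global RwLock in tenants_state;
    // here it is just a local variable.
    let (tx, mut rx) = mpsc::unbounded_channel::<TimelineUpdate>();

    // tenant_mgr side: announce that a timeline became available locally, then went away.
    tx.send(TimelineUpdate::Attach(1)).unwrap();
    tx.send(TimelineUpdate::Detach(1)).unwrap();
    drop(tx);

    // WAL receiver side: enable or disable per-timeline processing accordingly.
    while let Some(update) = rx.recv().await {
        match update {
            TimelineUpdate::Attach(id) => println!("start WAL streaming for timeline {id}"),
            TimelineUpdate::Detach(id) => println!("stop WAL streaming for timeline {id}"),
        }
    }
}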

@@ -120,6 +120,10 @@ pub fn init_tenant_task_pool() -> anyhow::Result<()> {
let runtime = tokio::runtime::Builder::new_multi_thread()
.thread_name("tenant-task-worker")
.enable_all()
.on_thread_start(|| {
thread_mgr::register(ThreadKind::TenantTaskWorker, "tenant-task-worker")
})
.on_thread_stop(thread_mgr::deregister)
.build()?;
let (gc_send, mut gc_recv) = mpsc::channel::<ZTenantId>(100);


@@ -45,21 +45,20 @@ use tokio::sync::watch;
use tracing::{debug, error, info, warn};
use lazy_static::lazy_static;
use once_cell::sync::Lazy;
use utils::zid::{ZTenantId, ZTimelineId};
use crate::shutdown_pageserver;
lazy_static! {
/// Each thread that we track is associated with a "thread ID". It's just
/// an increasing number that we assign, not related to any system thread
/// id.
static ref NEXT_THREAD_ID: AtomicU64 = AtomicU64::new(1);
/// Each thread that we track is associated with a "thread ID". It's just
/// an increasing number that we assign, not related to any system thread
/// id.
static NEXT_THREAD_ID: Lazy<AtomicU64> = Lazy::new(|| AtomicU64::new(1));
/// Global registry of threads
static ref THREADS: Mutex<HashMap<u64, Arc<PageServerThread>>> = Mutex::new(HashMap::new());
}
/// Global registry of threads
static THREADS: Lazy<Mutex<HashMap<u64, Arc<PageServerThread>>>> =
Lazy::new(|| Mutex::new(HashMap::new()));
// There is a Tokio watch channel for each thread, which can be used to signal the
// thread that it needs to shut down. This thread local variable holds the receiving
@@ -97,6 +96,9 @@ pub enum ThreadKind {
// Thread that schedules new compaction and gc jobs
TenantTaskManager,
// Worker thread for tenant tasks thread pool
TenantTaskWorker,
// Thread that flushes frozen in-memory layers to disk
LayerFlushThread,
@@ -105,18 +107,20 @@ pub enum ThreadKind {
StorageSync,
}
#[derive(Default)]
struct MutableThreadState {
/// Tenant and timeline that this thread is associated with.
tenant_id: Option<ZTenantId>,
timeline_id: Option<ZTimelineId>,
/// Handle for waiting for the thread to exit. It can be None, if the
/// the thread has already exited.
/// the thread has already exited. OR if this thread is managed externally
/// and was not spawned through thread_mgr.rs::spawn function.
join_handle: Option<JoinHandle<()>>,
}
struct PageServerThread {
_thread_id: u64,
thread_id: u64,
kind: ThreadKind,
@@ -147,7 +151,7 @@ where
let (shutdown_tx, shutdown_rx) = watch::channel(());
let thread_id = NEXT_THREAD_ID.fetch_add(1, Ordering::Relaxed);
let thread = Arc::new(PageServerThread {
_thread_id: thread_id,
thread_id,
kind,
name: name.to_string(),
shutdown_requested: AtomicBool::new(false),
@@ -315,8 +319,10 @@ pub fn shutdown_threads(
drop(thread_mut);
let _ = join_handle.join();
} else {
// The thread had not even fully started yet. Or it was shut down
// concurrently and already exited
// Possibly one of:
// * The thread had not even fully started yet.
// * It was shut down concurrently and already exited
// * Is managed through `register`/`deregister` fns without providing a join handle
}
}
}
@@ -348,3 +354,56 @@ pub fn is_shutdown_requested() -> bool {
}
})
}
/// Needed to register threads that were not spawned through spawn function.
/// For example tokio blocking threads. This function is expected to be used
/// in tandem with `deregister`.
/// NOTE: threads registered through this function cannot be joined
pub fn register(kind: ThreadKind, name: &str) {
CURRENT_THREAD.with(|ct| {
let mut borrowed = ct.borrow_mut();
if borrowed.is_some() {
panic!("thread already registered")
};
let (shutdown_tx, shutdown_rx) = watch::channel(());
let thread_id = NEXT_THREAD_ID.fetch_add(1, Ordering::Relaxed);
let thread = Arc::new(PageServerThread {
thread_id,
kind,
name: name.to_owned(),
shutdown_requested: AtomicBool::new(false),
shutdown_tx,
mutable: Mutex::new(MutableThreadState {
tenant_id: None,
timeline_id: None,
join_handle: None,
}),
});
*borrowed = Some(Arc::clone(&thread));
SHUTDOWN_RX.with(|rx| {
*rx.borrow_mut() = Some(shutdown_rx);
});
THREADS.lock().unwrap().insert(thread_id, thread);
});
}
// Expected to be used in tandem with `register`. See the doc for `register` for more details
pub fn deregister() {
CURRENT_THREAD.with(|ct| {
let mut borrowed = ct.borrow_mut();
let thread = match borrowed.take() {
Some(thread) => thread,
None => panic!("calling deregister on unregistered thread"),
};
SHUTDOWN_RX.with(|rx| {
*rx.borrow_mut() = None;
});
THREADS.lock().unwrap().remove(&thread.thread_id)
});
}
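The register/deregister pair added above is meant to be installed as tokio runtime thread hooks, matching the tenant-task-pool hunk earlier in this diff. A short sketch of that wiring inside the pageserver crate (illustrative only; the function name and anyhow return type are assumptions):

// Assumes the thread_mgr and ThreadKind items shown above.
use crate::thread_mgr::{self, ThreadKind};

fn build_tenant_task_runtime() -> anyhow::Result<tokio::runtime::Runtime> {
    let runtime = tokio::runtime::Builder::new_multi_thread()
        .thread_name("tenant-task-worker")
        .enable_all()
        // Register each pool thread so shutdown_threads() can still signal it,
        // even though it was not spawned through thread_mgr::spawn.
        .on_thread_start(|| thread_mgr::register(ThreadKind::TenantTaskWorker, "tenant-task-worker"))
        // Deregister when the runtime tears the worker thread down.
        .on_thread_stop(thread_mgr::deregister)
        .build()?;
    Ok(runtime)
}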


@@ -4,8 +4,6 @@
use anyhow::{bail, ensure, Context, Result};
use postgres_ffi::ControlFileData;
use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr};
use std::{
fs,
path::Path,
@@ -20,123 +18,15 @@ use utils::{
zid::{ZTenantId, ZTimelineId},
};
use crate::tenant_mgr;
use crate::{
config::PageServerConf,
layered_repository::metadata::TimelineMetadata,
repository::{LocalTimelineState, Repository},
storage_sync::index::RemoteIndex,
tenant_config::TenantConfOpt,
DatadirTimeline, RepositoryImpl,
config::PageServerConf, repository::Repository, storage_sync::index::RemoteIndex,
tenant_config::TenantConfOpt, RepositoryImpl, TimelineImpl,
};
use crate::{import_datadir, LOG_FILE_NAME};
use crate::{layered_repository::LayeredRepository, walredo::WalRedoManager};
use crate::{repository::RepositoryTimeline, tenant_mgr};
use crate::{repository::Timeline, CheckpointConfig};
#[serde_as]
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct LocalTimelineInfo {
#[serde_as(as = "Option<DisplayFromStr>")]
pub ancestor_timeline_id: Option<ZTimelineId>,
#[serde_as(as = "Option<DisplayFromStr>")]
pub ancestor_lsn: Option<Lsn>,
#[serde_as(as = "DisplayFromStr")]
pub last_record_lsn: Lsn,
#[serde_as(as = "Option<DisplayFromStr>")]
pub prev_record_lsn: Option<Lsn>,
#[serde_as(as = "DisplayFromStr")]
pub latest_gc_cutoff_lsn: Lsn,
#[serde_as(as = "DisplayFromStr")]
pub disk_consistent_lsn: Lsn,
pub current_logical_size: Option<usize>, // is None when timeline is Unloaded
pub current_logical_size_non_incremental: Option<usize>,
pub timeline_state: LocalTimelineState,
}
impl LocalTimelineInfo {
pub fn from_loaded_timeline<R: Repository>(
datadir_tline: &DatadirTimeline<R>,
include_non_incremental_logical_size: bool,
) -> anyhow::Result<Self> {
let last_record_lsn = datadir_tline.tline.get_last_record_lsn();
let info = LocalTimelineInfo {
ancestor_timeline_id: datadir_tline.tline.get_ancestor_timeline_id(),
ancestor_lsn: {
match datadir_tline.tline.get_ancestor_lsn() {
Lsn(0) => None,
lsn @ Lsn(_) => Some(lsn),
}
},
disk_consistent_lsn: datadir_tline.tline.get_disk_consistent_lsn(),
last_record_lsn,
prev_record_lsn: Some(datadir_tline.tline.get_prev_record_lsn()),
latest_gc_cutoff_lsn: *datadir_tline.tline.get_latest_gc_cutoff_lsn(),
timeline_state: LocalTimelineState::Loaded,
current_logical_size: Some(datadir_tline.get_current_logical_size()),
current_logical_size_non_incremental: if include_non_incremental_logical_size {
Some(datadir_tline.get_current_logical_size_non_incremental(last_record_lsn)?)
} else {
None
},
};
Ok(info)
}
pub fn from_unloaded_timeline(metadata: &TimelineMetadata) -> Self {
LocalTimelineInfo {
ancestor_timeline_id: metadata.ancestor_timeline(),
ancestor_lsn: {
match metadata.ancestor_lsn() {
Lsn(0) => None,
lsn @ Lsn(_) => Some(lsn),
}
},
disk_consistent_lsn: metadata.disk_consistent_lsn(),
last_record_lsn: metadata.disk_consistent_lsn(),
prev_record_lsn: metadata.prev_record_lsn(),
latest_gc_cutoff_lsn: metadata.latest_gc_cutoff_lsn(),
timeline_state: LocalTimelineState::Unloaded,
current_logical_size: None,
current_logical_size_non_incremental: None,
}
}
pub fn from_repo_timeline<T>(
tenant_id: ZTenantId,
timeline_id: ZTimelineId,
repo_timeline: &RepositoryTimeline<T>,
include_non_incremental_logical_size: bool,
) -> anyhow::Result<Self> {
match repo_timeline {
RepositoryTimeline::Loaded(_) => {
let datadir_tline =
tenant_mgr::get_local_timeline_with_load(tenant_id, timeline_id)?;
Self::from_loaded_timeline(&datadir_tline, include_non_incremental_logical_size)
}
RepositoryTimeline::Unloaded { metadata } => Ok(Self::from_unloaded_timeline(metadata)),
}
}
}
#[serde_as]
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct RemoteTimelineInfo {
#[serde_as(as = "DisplayFromStr")]
pub remote_consistent_lsn: Lsn,
pub awaits_download: bool,
}
#[serde_as]
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct TimelineInfo {
#[serde_as(as = "DisplayFromStr")]
pub tenant_id: ZTenantId,
#[serde_as(as = "DisplayFromStr")]
pub timeline_id: ZTimelineId,
pub local: Option<LocalTimelineInfo>,
pub remote: Option<RemoteTimelineInfo>,
}
#[derive(Debug, Clone, Copy)]
pub struct PointInTime {
pub timeline_id: ZTimelineId,
@@ -298,19 +188,18 @@ fn bootstrap_timeline<R: Repository>(
// Initdb lsn will be equal to last_record_lsn which will be set after import.
// Because we know it upfront avoid having an option or dummy zero value by passing it to create_empty_timeline.
let timeline = repo.create_empty_timeline(tli, lsn)?;
let mut page_tline: DatadirTimeline<R> = DatadirTimeline::new(timeline, u64::MAX);
import_datadir::import_timeline_from_postgres_datadir(&pgdata_path, &mut page_tline, lsn)?;
import_datadir::import_timeline_from_postgres_datadir(&pgdata_path, &*timeline, lsn)?;
fail::fail_point!("before-checkpoint-new-timeline", |_| {
bail!("failpoint before-checkpoint-new-timeline");
});
page_tline.tline.checkpoint(CheckpointConfig::Forced)?;
timeline.checkpoint(CheckpointConfig::Forced)?;
info!(
"created root timeline {} timeline.lsn {}",
tli,
page_tline.tline.get_last_record_lsn()
timeline.get_last_record_lsn()
);
// Remove temp dir. We don't need it anymore
@@ -319,36 +208,22 @@ fn bootstrap_timeline<R: Repository>(
Ok(())
}
pub(crate) fn get_local_timelines(
tenant_id: ZTenantId,
include_non_incremental_logical_size: bool,
) -> Result<Vec<(ZTimelineId, LocalTimelineInfo)>> {
let repo = tenant_mgr::get_repository_for_tenant(tenant_id)
.with_context(|| format!("Failed to get repo for tenant {}", tenant_id))?;
let repo_timelines = repo.list_timelines();
let mut local_timeline_info = Vec::with_capacity(repo_timelines.len());
for (timeline_id, repository_timeline) in repo_timelines {
local_timeline_info.push((
timeline_id,
LocalTimelineInfo::from_repo_timeline(
tenant_id,
timeline_id,
&repository_timeline,
include_non_incremental_logical_size,
)?,
))
}
Ok(local_timeline_info)
}
///
/// Create a new timeline.
///
/// Returns the new timeline ID and reference to its Timeline object.
///
/// If the caller specified the timeline ID to use (`new_timeline_id`), and timeline with
/// the same timeline ID already exists, returns None. If `new_timeline_id` is not given,
/// a new unique ID is generated.
///
pub(crate) fn create_timeline(
conf: &'static PageServerConf,
tenant_id: ZTenantId,
new_timeline_id: Option<ZTimelineId>,
ancestor_timeline_id: Option<ZTimelineId>,
mut ancestor_start_lsn: Option<Lsn>,
) -> Result<Option<TimelineInfo>> {
) -> Result<Option<(ZTimelineId, Arc<TimelineImpl>)>> {
let new_timeline_id = new_timeline_id.unwrap_or_else(ZTimelineId::generate);
let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?;
@@ -357,7 +232,7 @@ pub(crate) fn create_timeline(
return Ok(None);
}
let new_timeline_info = match ancestor_timeline_id {
match ancestor_timeline_id {
Some(ancestor_timeline_id) => {
let ancestor_timeline = repo
.get_timeline_load(ancestor_timeline_id)
@@ -385,26 +260,13 @@ pub(crate) fn create_timeline(
}
}
repo.branch_timeline(ancestor_timeline_id, new_timeline_id, ancestor_start_lsn)?;
// load the timeline into memory
let loaded_timeline =
tenant_mgr::get_local_timeline_with_load(tenant_id, new_timeline_id)?;
LocalTimelineInfo::from_loaded_timeline(&loaded_timeline, false)
.context("cannot fill timeline info")?
}
None => {
bootstrap_timeline(conf, tenant_id, new_timeline_id, repo.as_ref())?;
// load the timeline into memory
let new_timeline =
tenant_mgr::get_local_timeline_with_load(tenant_id, new_timeline_id)?;
LocalTimelineInfo::from_loaded_timeline(&new_timeline, false)
.context("cannot fill timeline info")?
repo.branch_timeline(ancestor_timeline_id, new_timeline_id, ancestor_start_lsn)?
}
None => bootstrap_timeline(conf, tenant_id, new_timeline_id, repo.as_ref())?,
};
Ok(Some(TimelineInfo {
tenant_id,
timeline_id: new_timeline_id,
local: Some(new_timeline_info),
remote: None,
}))
// load the timeline into memory
let loaded_timeline = tenant_mgr::get_local_timeline_with_load(tenant_id, new_timeline_id)?;
Ok(Some((new_timeline_id, loaded_timeline)))
}


@@ -10,7 +10,7 @@
//! This is similar to PostgreSQL's virtual file descriptor facility in
//! src/backend/storage/file/fd.c
//!
use lazy_static::lazy_static;
use once_cell::sync::Lazy;
use once_cell::sync::OnceCell;
use std::fs::{File, OpenOptions};
use std::io::{Error, ErrorKind, Read, Seek, SeekFrom, Write};
@@ -32,23 +32,24 @@ const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
1.0, // 1 sec
];
lazy_static! {
static ref STORAGE_IO_TIME: HistogramVec = register_histogram_vec!(
static STORAGE_IO_TIME: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
"pageserver_io_operations_seconds",
"Time spent in IO operations",
&["operation", "tenant_id", "timeline_id"],
STORAGE_IO_TIME_BUCKETS.into()
)
.expect("failed to define a metric");
}
lazy_static! {
static ref STORAGE_IO_SIZE: IntGaugeVec = register_int_gauge_vec!(
.expect("failed to define a metric")
});
static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
register_int_gauge_vec!(
"pageserver_io_operations_bytes_total",
"Total amount of bytes read/written in IO operations",
&["operation", "tenant_id", "timeline_id"]
)
.expect("failed to define a metric");
}
.expect("failed to define a metric")
});
///
/// A virtual file descriptor. You can use this just like std::fs::File, but internally

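Several hunks in this diff perform the same lazy_static to once_cell migration. The general shape of that change, as a self-contained sketch with a plain global map instead of the prometheus metrics used above:

use once_cell::sync::Lazy;
use std::collections::HashMap;
use std::sync::Mutex;

// Before (conceptually): lazy_static! { static ref REGISTRY: Mutex<HashMap<String, u64>> = Mutex::new(HashMap::new()); }
// After: the same lazily initialized global, expressed as a plain `static` with once_cell's Lazy.
static REGISTRY: Lazy<Mutex<HashMap<String, u64>>> = Lazy::new(|| Mutex::new(HashMap::new()));

fn main() {
    // The first dereference runs the initializer closure; later uses reuse the value.
    REGISTRY
        .lock()
        .unwrap()
        .insert("pageserver_io_operations_bytes_total".to_string(), 0);
    println!("{} metric(s) registered", REGISTRY.lock().unwrap().len());
}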

@@ -30,11 +30,8 @@ use anyhow::Result;
use bytes::{Buf, Bytes, BytesMut};
use tracing::*;
use std::collections::HashMap;
use crate::pgdatadir_mapping::*;
use crate::reltag::{RelTag, SlruKind};
use crate::repository::Repository;
use crate::walrecord::*;
use postgres_ffi::nonrelfile_utils::mx_offset_to_member_segment;
use postgres_ffi::xlog_utils::*;
@@ -44,17 +41,15 @@ use utils::lsn::Lsn;
static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]);
pub struct WalIngest<'a, R: Repository> {
timeline: &'a DatadirTimeline<R>,
pub struct WalIngest<'a, T: DatadirTimeline> {
timeline: &'a T,
checkpoint: CheckPoint,
checkpoint_modified: bool,
relsize_cache: HashMap<RelTag, BlockNumber>,
}
impl<'a, R: Repository> WalIngest<'a, R> {
pub fn new(timeline: &DatadirTimeline<R>, startpoint: Lsn) -> Result<WalIngest<R>> {
impl<'a, T: DatadirTimeline> WalIngest<'a, T> {
pub fn new(timeline: &T, startpoint: Lsn) -> Result<WalIngest<T>> {
// Fetch the latest checkpoint into memory, so that we can compare with it
// quickly in `ingest_record` and update it when it changes.
let checkpoint_bytes = timeline.get_checkpoint(startpoint)?;
@@ -65,26 +60,27 @@ impl<'a, R: Repository> WalIngest<'a, R> {
timeline,
checkpoint,
checkpoint_modified: false,
relsize_cache: HashMap::new(),
})
}
///
/// Decode a PostgreSQL WAL record and store it in the repository, in the given timeline.
///
/// This function updates `lsn` field of `DatadirModification`
///
/// Helper function to parse a WAL record and call the Timeline's PUT functions for all the
/// relations/pages that the record affects.
///
pub fn ingest_record(
&mut self,
timeline: &DatadirTimeline<R>,
recdata: Bytes,
lsn: Lsn,
modification: &mut DatadirModification<T>,
decoded: &mut DecodedWALRecord,
) -> Result<()> {
let mut modification = timeline.begin_modification(lsn);
modification.lsn = lsn;
decode_wal_record(recdata, decoded).context("failed decoding wal record")?;
let mut decoded = decode_wal_record(recdata).context("failed decoding wal record")?;
let mut buf = decoded.record.clone();
buf.advance(decoded.main_data_offset);
@@ -98,7 +94,7 @@ impl<'a, R: Repository> WalIngest<'a, R> {
if decoded.xl_rmid == pg_constants::RM_HEAP_ID
|| decoded.xl_rmid == pg_constants::RM_HEAP2_ID
{
self.ingest_heapam_record(&mut buf, &mut modification, &mut decoded)?;
self.ingest_heapam_record(&mut buf, modification, decoded)?;
}
// Handle other special record types
if decoded.xl_rmid == pg_constants::RM_SMGR_ID
@@ -106,19 +102,19 @@ impl<'a, R: Repository> WalIngest<'a, R> {
== pg_constants::XLOG_SMGR_CREATE
{
let create = XlSmgrCreate::decode(&mut buf);
self.ingest_xlog_smgr_create(&mut modification, &create)?;
self.ingest_xlog_smgr_create(modification, &create)?;
} else if decoded.xl_rmid == pg_constants::RM_SMGR_ID
&& (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
== pg_constants::XLOG_SMGR_TRUNCATE
{
let truncate = XlSmgrTruncate::decode(&mut buf);
self.ingest_xlog_smgr_truncate(&mut modification, &truncate)?;
self.ingest_xlog_smgr_truncate(modification, &truncate)?;
} else if decoded.xl_rmid == pg_constants::RM_DBASE_ID {
if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
== pg_constants::XLOG_DBASE_CREATE
{
let createdb = XlCreateDatabase::decode(&mut buf);
self.ingest_xlog_dbase_create(&mut modification, &createdb)?;
self.ingest_xlog_dbase_create(modification, &createdb)?;
} else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
== pg_constants::XLOG_DBASE_DROP
{
@@ -137,7 +133,7 @@ impl<'a, R: Repository> WalIngest<'a, R> {
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
self.put_slru_page_image(
&mut modification,
modification,
SlruKind::Clog,
segno,
rpageno,
@@ -146,7 +142,7 @@ impl<'a, R: Repository> WalIngest<'a, R> {
} else {
assert!(info == pg_constants::CLOG_TRUNCATE);
let xlrec = XlClogTruncate::decode(&mut buf);
self.ingest_clog_truncate_record(&mut modification, &xlrec)?;
self.ingest_clog_truncate_record(modification, &xlrec)?;
}
} else if decoded.xl_rmid == pg_constants::RM_XACT_ID {
let info = decoded.xl_info & pg_constants::XLOG_XACT_OPMASK;
@@ -154,7 +150,7 @@ impl<'a, R: Repository> WalIngest<'a, R> {
let parsed_xact =
XlXactParsedRecord::decode(&mut buf, decoded.xl_xid, decoded.xl_info);
self.ingest_xact_record(
&mut modification,
modification,
&parsed_xact,
info == pg_constants::XLOG_XACT_COMMIT,
)?;
@@ -164,7 +160,7 @@ impl<'a, R: Repository> WalIngest<'a, R> {
let parsed_xact =
XlXactParsedRecord::decode(&mut buf, decoded.xl_xid, decoded.xl_info);
self.ingest_xact_record(
&mut modification,
modification,
&parsed_xact,
info == pg_constants::XLOG_XACT_COMMIT_PREPARED,
)?;
@@ -187,7 +183,7 @@ impl<'a, R: Repository> WalIngest<'a, R> {
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
self.put_slru_page_image(
&mut modification,
modification,
SlruKind::MultiXactOffsets,
segno,
rpageno,
@@ -198,7 +194,7 @@ impl<'a, R: Repository> WalIngest<'a, R> {
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
self.put_slru_page_image(
&mut modification,
modification,
SlruKind::MultiXactMembers,
segno,
rpageno,
@@ -206,14 +202,14 @@ impl<'a, R: Repository> WalIngest<'a, R> {
)?;
} else if info == pg_constants::XLOG_MULTIXACT_CREATE_ID {
let xlrec = XlMultiXactCreate::decode(&mut buf);
self.ingest_multixact_create_record(&mut modification, &xlrec)?;
self.ingest_multixact_create_record(modification, &xlrec)?;
} else if info == pg_constants::XLOG_MULTIXACT_TRUNCATE_ID {
let xlrec = XlMultiXactTruncate::decode(&mut buf);
self.ingest_multixact_truncate_record(&mut modification, &xlrec)?;
self.ingest_multixact_truncate_record(modification, &xlrec)?;
}
} else if decoded.xl_rmid == pg_constants::RM_RELMAP_ID {
let xlrec = XlRelmapUpdate::decode(&mut buf);
self.ingest_relmap_page(&mut modification, &xlrec, &decoded)?;
self.ingest_relmap_page(modification, &xlrec, decoded)?;
} else if decoded.xl_rmid == pg_constants::RM_XLOG_ID {
let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
if info == pg_constants::XLOG_NEXTOID {
@@ -248,7 +244,7 @@ impl<'a, R: Repository> WalIngest<'a, R> {
// Iterate through all the blocks that the record modifies, and
// "put" a separate copy of the record for each block.
for blk in decoded.blocks.iter() {
self.ingest_decoded_block(&mut modification, lsn, &decoded, blk)?;
self.ingest_decoded_block(modification, lsn, decoded, blk)?;
}
// If checkpoint data was updated, store the new version in the repository
@@ -268,7 +264,7 @@ impl<'a, R: Repository> WalIngest<'a, R> {
fn ingest_decoded_block(
&mut self,
modification: &mut DatadirModification<R>,
modification: &mut DatadirModification<T>,
lsn: Lsn,
decoded: &DecodedWALRecord,
blk: &DecodedBkpBlock,
@@ -328,7 +324,7 @@ impl<'a, R: Repository> WalIngest<'a, R> {
fn ingest_heapam_record(
&mut self,
buf: &mut Bytes,
modification: &mut DatadirModification<R>,
modification: &mut DatadirModification<T>,
decoded: &mut DecodedWALRecord,
) -> Result<()> {
// Handle VM bit updates that are implicitly part of heap records.
@@ -409,7 +405,7 @@ impl<'a, R: Repository> WalIngest<'a, R> {
// replaying it would fail to find the previous image of the page, because
// it doesn't exist. So check if the VM page(s) exist, and skip the WAL
// record if it doesn't.
let vm_size = self.get_relsize(vm_rel)?;
let vm_size = self.get_relsize(vm_rel, modification.lsn)?;
if let Some(blknum) = new_vm_blk {
if blknum >= vm_size {
new_vm_blk = None;
@@ -472,7 +468,7 @@ impl<'a, R: Repository> WalIngest<'a, R> {
/// Subroutine of ingest_record(), to handle an XLOG_DBASE_CREATE record.
fn ingest_xlog_dbase_create(
&mut self,
modification: &mut DatadirModification<R>,
modification: &mut DatadirModification<T>,
rec: &XlCreateDatabase,
) -> Result<()> {
let db_id = rec.db_id;
@@ -539,7 +535,7 @@ impl<'a, R: Repository> WalIngest<'a, R> {
fn ingest_xlog_smgr_create(
&mut self,
modification: &mut DatadirModification<R>,
modification: &mut DatadirModification<T>,
rec: &XlSmgrCreate,
) -> Result<()> {
let rel = RelTag {
@@ -557,7 +553,7 @@ impl<'a, R: Repository> WalIngest<'a, R> {
/// This is the same logic as in PostgreSQL's smgr_redo() function.
fn ingest_xlog_smgr_truncate(
&mut self,
modification: &mut DatadirModification<R>,
modification: &mut DatadirModification<T>,
rec: &XlSmgrTruncate,
) -> Result<()> {
let spcnode = rec.rnode.spcnode;
@@ -622,7 +618,7 @@ impl<'a, R: Repository> WalIngest<'a, R> {
///
fn ingest_xact_record(
&mut self,
modification: &mut DatadirModification<R>,
modification: &mut DatadirModification<T>,
parsed: &XlXactParsedRecord,
is_commit: bool,
) -> Result<()> {
@@ -691,7 +687,7 @@ impl<'a, R: Repository> WalIngest<'a, R> {
fn ingest_clog_truncate_record(
&mut self,
modification: &mut DatadirModification<R>,
modification: &mut DatadirModification<T>,
xlrec: &XlClogTruncate,
) -> Result<()> {
info!(
@@ -749,7 +745,7 @@ impl<'a, R: Repository> WalIngest<'a, R> {
fn ingest_multixact_create_record(
&mut self,
modification: &mut DatadirModification<R>,
modification: &mut DatadirModification<T>,
xlrec: &XlMultiXactCreate,
) -> Result<()> {
// Create WAL record for updating the multixact-offsets page
@@ -828,7 +824,7 @@ impl<'a, R: Repository> WalIngest<'a, R> {
fn ingest_multixact_truncate_record(
&mut self,
modification: &mut DatadirModification<R>,
modification: &mut DatadirModification<T>,
xlrec: &XlMultiXactTruncate,
) -> Result<()> {
self.checkpoint.oldestMulti = xlrec.end_trunc_off;
@@ -862,7 +858,7 @@ impl<'a, R: Repository> WalIngest<'a, R> {
fn ingest_relmap_page(
&mut self,
modification: &mut DatadirModification<R>,
modification: &mut DatadirModification<T>,
xlrec: &XlRelmapUpdate,
decoded: &DecodedWALRecord,
) -> Result<()> {
@@ -878,17 +874,16 @@ impl<'a, R: Repository> WalIngest<'a, R> {
fn put_rel_creation(
&mut self,
modification: &mut DatadirModification<R>,
modification: &mut DatadirModification<T>,
rel: RelTag,
) -> Result<()> {
self.relsize_cache.insert(rel, 0);
modification.put_rel_creation(rel, 0)?;
Ok(())
}
fn put_rel_page_image(
&mut self,
modification: &mut DatadirModification<R>,
modification: &mut DatadirModification<T>,
rel: RelTag,
blknum: BlockNumber,
img: Bytes,
@@ -900,7 +895,7 @@ impl<'a, R: Repository> WalIngest<'a, R> {
fn put_rel_wal_record(
&mut self,
modification: &mut DatadirModification<R>,
modification: &mut DatadirModification<T>,
rel: RelTag,
blknum: BlockNumber,
rec: ZenithWalRecord,
@@ -912,63 +907,49 @@ impl<'a, R: Repository> WalIngest<'a, R> {
fn put_rel_truncation(
&mut self,
modification: &mut DatadirModification<R>,
modification: &mut DatadirModification<T>,
rel: RelTag,
nblocks: BlockNumber,
) -> Result<()> {
modification.put_rel_truncation(rel, nblocks)?;
self.relsize_cache.insert(rel, nblocks);
Ok(())
}
fn put_rel_drop(
&mut self,
modification: &mut DatadirModification<R>,
modification: &mut DatadirModification<T>,
rel: RelTag,
) -> Result<()> {
modification.put_rel_drop(rel)?;
self.relsize_cache.remove(&rel);
Ok(())
}
fn get_relsize(&mut self, rel: RelTag) -> Result<BlockNumber> {
if let Some(nblocks) = self.relsize_cache.get(&rel) {
Ok(*nblocks)
fn get_relsize(&mut self, rel: RelTag, lsn: Lsn) -> Result<BlockNumber> {
let nblocks = if !self.timeline.get_rel_exists(rel, lsn)? {
0
} else {
let last_lsn = self.timeline.get_last_record_lsn();
let nblocks = if !self.timeline.get_rel_exists(rel, last_lsn)? {
0
} else {
self.timeline.get_rel_size(rel, last_lsn)?
};
self.relsize_cache.insert(rel, nblocks);
Ok(nblocks)
}
self.timeline.get_rel_size(rel, lsn)?
};
Ok(nblocks)
}
fn handle_rel_extend(
&mut self,
modification: &mut DatadirModification<R>,
modification: &mut DatadirModification<T>,
rel: RelTag,
blknum: BlockNumber,
) -> Result<()> {
let new_nblocks = blknum + 1;
let old_nblocks = if let Some(nblocks) = self.relsize_cache.get(&rel) {
*nblocks
// Check if the relation exists. We implicitly create relations on first
// record.
// TODO: would be nice if to be more explicit about it
let last_lsn = modification.lsn;
let old_nblocks = if !self.timeline.get_rel_exists(rel, last_lsn)? {
// create it with 0 size initially, the logic below will extend it
modification.put_rel_creation(rel, 0)?;
0
} else {
// Check if the relation exists. We implicitly create relations on first
// record.
// TODO: would be nice if to be more explicit about it
let last_lsn = self.timeline.get_last_record_lsn();
let nblocks = if !self.timeline.get_rel_exists(rel, last_lsn)? {
// create it with 0 size initially, the logic below will extend it
modification.put_rel_creation(rel, 0)?;
0
} else {
self.timeline.get_rel_size(rel, last_lsn)?
};
self.relsize_cache.insert(rel, nblocks);
nblocks
self.timeline.get_rel_size(rel, last_lsn)?
};
if new_nblocks > old_nblocks {
@@ -979,14 +960,13 @@ impl<'a, R: Repository> WalIngest<'a, R> {
for gap_blknum in old_nblocks..blknum {
modification.put_rel_page_image(rel, gap_blknum, ZERO_PAGE.clone())?;
}
self.relsize_cache.insert(rel, new_nblocks);
}
Ok(())
}
fn put_slru_page_image(
&mut self,
modification: &mut DatadirModification<R>,
modification: &mut DatadirModification<T>,
kind: SlruKind,
segno: u32,
blknum: BlockNumber,
@@ -999,7 +979,7 @@ impl<'a, R: Repository> WalIngest<'a, R> {
fn handle_slru_extend(
&mut self,
modification: &mut DatadirModification<R>,
modification: &mut DatadirModification<T>,
kind: SlruKind,
segno: u32,
blknum: BlockNumber,
@@ -1052,6 +1032,7 @@ mod tests {
use super::*;
use crate::pgdatadir_mapping::create_test_timeline;
use crate::repository::repo_harness::*;
use crate::repository::Timeline;
use postgres_ffi::pg_constants;
/// Arbitrary relation tag, for testing.
@@ -1062,13 +1043,13 @@ mod tests {
forknum: 0,
};
fn assert_current_logical_size<R: Repository>(_timeline: &DatadirTimeline<R>, _lsn: Lsn) {
fn assert_current_logical_size<T: Timeline>(_timeline: &T, _lsn: Lsn) {
// TODO
}
static ZERO_CHECKPOINT: Bytes = Bytes::from_static(&[0u8; SIZEOF_CHECKPOINT]);
fn init_walingest_test<R: Repository>(tline: &DatadirTimeline<R>) -> Result<WalIngest<R>> {
fn init_walingest_test<T: DatadirTimeline>(tline: &T) -> Result<WalIngest<T>> {
let mut m = tline.begin_modification(Lsn(0x10));
m.put_checkpoint(ZERO_CHECKPOINT.clone())?;
m.put_relmap_file(0, 111, Bytes::from(""))?; // dummy relmapper file
@@ -1082,7 +1063,7 @@ mod tests {
fn test_relsize() -> Result<()> {
let repo = RepoHarness::create("test_relsize")?.load();
let tline = create_test_timeline(repo, TIMELINE_ID)?;
let mut walingest = init_walingest_test(&tline)?;
let mut walingest = init_walingest_test(&*tline)?;
let mut m = tline.begin_modification(Lsn(0x20));
walingest.put_rel_creation(&mut m, TESTREL_A)?;
@@ -1098,7 +1079,7 @@ mod tests {
walingest.put_rel_page_image(&mut m, TESTREL_A, 2, TEST_IMG("foo blk 2 at 5"))?;
m.commit()?;
assert_current_logical_size(&tline, Lsn(0x50));
assert_current_logical_size(&*tline, Lsn(0x50));
// The relation was created at LSN 2, not visible at LSN 1 yet.
assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x10))?, false);
@@ -1145,7 +1126,7 @@ mod tests {
let mut m = tline.begin_modification(Lsn(0x60));
walingest.put_rel_truncation(&mut m, TESTREL_A, 2)?;
m.commit()?;
assert_current_logical_size(&tline, Lsn(0x60));
assert_current_logical_size(&*tline, Lsn(0x60));
// Check reported size and contents after truncation
assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x60))?, 2);
@@ -1210,7 +1191,7 @@ mod tests {
fn test_drop_extend() -> Result<()> {
let repo = RepoHarness::create("test_drop_extend")?.load();
let tline = create_test_timeline(repo, TIMELINE_ID)?;
let mut walingest = init_walingest_test(&tline)?;
let mut walingest = init_walingest_test(&*tline)?;
let mut m = tline.begin_modification(Lsn(0x20));
walingest.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"))?;
@@ -1250,7 +1231,7 @@ mod tests {
fn test_truncate_extend() -> Result<()> {
let repo = RepoHarness::create("test_truncate_extend")?.load();
let tline = create_test_timeline(repo, TIMELINE_ID)?;
let mut walingest = init_walingest_test(&tline)?;
let mut walingest = init_walingest_test(&*tline)?;
// Create a 20 MB relation (the size is arbitrary)
let relsize = 20 * 1024 * 1024 / 8192;
@@ -1338,7 +1319,7 @@ mod tests {
fn test_large_rel() -> Result<()> {
let repo = RepoHarness::create("test_large_rel")?.load();
let tline = create_test_timeline(repo, TIMELINE_ID)?;
let mut walingest = init_walingest_test(&tline)?;
let mut walingest = init_walingest_test(&*tline)?;
let mut lsn = 0x10;
for blknum in 0..pg_constants::RELSEG_SIZE + 1 {
@@ -1349,7 +1330,7 @@ mod tests {
m.commit()?;
}
assert_current_logical_size(&tline, Lsn(lsn));
assert_current_logical_size(&*tline, Lsn(lsn));
assert_eq!(
tline.get_rel_size(TESTREL_A, Lsn(lsn))?,
@@ -1365,7 +1346,7 @@ mod tests {
tline.get_rel_size(TESTREL_A, Lsn(lsn))?,
pg_constants::RELSEG_SIZE
);
assert_current_logical_size(&tline, Lsn(lsn));
assert_current_logical_size(&*tline, Lsn(lsn));
// Truncate another block
lsn += 0x10;
@@ -1376,7 +1357,7 @@ mod tests {
tline.get_rel_size(TESTREL_A, Lsn(lsn))?,
pg_constants::RELSEG_SIZE - 1
);
assert_current_logical_size(&tline, Lsn(lsn));
assert_current_logical_size(&*tline, Lsn(lsn));
// Truncate to 1500, and then truncate all the way down to 0, one block at a time
// This tests the behavior at segment boundaries
@@ -1393,7 +1374,7 @@ mod tests {
size -= 1;
}
assert_current_logical_size(&tline, Lsn(lsn));
assert_current_logical_size(&*tline, Lsn(lsn));
Ok(())
}


@@ -26,7 +26,6 @@ mod walreceiver_connection;
use anyhow::{ensure, Context};
use etcd_broker::Client;
use itertools::Itertools;
use once_cell::sync::Lazy;
use std::cell::Cell;
use std::collections::{hash_map, HashMap, HashSet};
use std::future::Future;
@@ -36,14 +35,13 @@ use std::thread_local;
use std::time::Duration;
use tokio::{
select,
sync::{mpsc, watch, RwLock},
sync::{mpsc, watch},
task::JoinHandle,
};
use tracing::*;
use url::Url;
use crate::config::PageServerConf;
use crate::http::models::WalReceiverEntry;
use crate::tenant_mgr::{self, LocalTimelineUpdate, TenantState};
use crate::thread_mgr::{self, ThreadKind};
use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId};
@@ -55,23 +53,6 @@ thread_local! {
pub(crate) static IS_WAL_RECEIVER: Cell<bool> = Cell::new(false);
}
/// WAL receiver state for sharing with the outside world.
/// Only entries for timelines currently available in pageserver are stored.
static WAL_RECEIVER_ENTRIES: Lazy<RwLock<HashMap<ZTenantTimelineId, WalReceiverEntry>>> =
Lazy::new(|| RwLock::new(HashMap::new()));
/// Gets the public WAL streaming entry for a certain timeline.
pub async fn get_wal_receiver_entry(
tenant_id: ZTenantId,
timeline_id: ZTimelineId,
) -> Option<WalReceiverEntry> {
WAL_RECEIVER_ENTRIES
.read()
.await
.get(&ZTenantTimelineId::new(tenant_id, timeline_id))
.cloned()
}
/// Sets up the main WAL receiver thread that manages the rest of the subtasks inside of it, per timeline.
/// See comments in [`wal_receiver_main_thread_loop_step`] for more details on per timeline activities.
pub fn init_wal_receiver_main_thread(
@@ -85,7 +66,7 @@ pub fn init_wal_receiver_main_thread(
);
let broker_prefix = &conf.broker_etcd_prefix;
info!(
"Starting wal receiver main thread, etdc endpoints: {}",
"Starting wal receiver main thread, etcd endpoints: {}",
etcd_endpoints.iter().map(Url::to_string).join(", ")
);
@@ -281,13 +262,10 @@ async fn wal_receiver_main_thread_loop_step<'a>(
}
None => warn!("Timeline {id} does not have a tenant entry in wal receiver main thread"),
};
{
WAL_RECEIVER_ENTRIES.write().await.remove(&id);
if let Err(e) = join_confirmation_sender.send(()) {
warn!("cannot send wal_receiver shutdown confirmation {e}")
} else {
info!("confirm walreceiver shutdown for {id}");
}
if let Err(e) = join_confirmation_sender.send(()) {
warn!("cannot send wal_receiver shutdown confirmation {e}")
} else {
info!("confirm walreceiver shutdown for {id}");
}
}
// Timeline got attached, retrieve all necessary information to start its broker loop and maintain this loop endlessly.
@@ -322,17 +300,6 @@ async fn wal_receiver_main_thread_loop_step<'a>(
}
};
{
WAL_RECEIVER_ENTRIES.write().await.insert(
id,
WalReceiverEntry {
wal_producer_connstr: None,
last_received_msg_lsn: None,
last_received_msg_ts: None,
},
);
}
vacant_connection_manager_entry.insert(
connection_manager::spawn_connection_manager_task(
id,


@@ -25,7 +25,12 @@ use etcd_broker::{
use tokio::select;
use tracing::*;
use crate::DatadirTimelineImpl;
use crate::{
exponential_backoff,
repository::{Repository, Timeline},
DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS,
};
use crate::{RepositoryImpl, TimelineImpl};
use utils::{
lsn::Lsn,
pq_proto::ReplicationFeedback,
@@ -39,7 +44,7 @@ pub(super) fn spawn_connection_manager_task(
id: ZTenantTimelineId,
broker_loop_prefix: String,
mut client: Client,
local_timeline: Arc<DatadirTimelineImpl>,
local_timeline: Arc<TimelineImpl>,
wal_connect_timeout: Duration,
lagging_wal_timeout: Duration,
max_lsn_wal_lag: NonZeroU64,
@@ -167,7 +172,7 @@ async fn connection_manager_loop_step(
walreceiver_state
.change_connection(
new_candidate.safekeeper_id,
new_candidate.wal_producer_connstr,
new_candidate.wal_source_connstr,
)
.await
}
@@ -229,23 +234,11 @@ async fn subscribe_for_timeline_updates(
}
}
const DEFAULT_BASE_BACKOFF_SECONDS: f64 = 2.0;
const DEFAULT_MAX_BACKOFF_SECONDS: f64 = 60.0;
async fn exponential_backoff(n: u32, base: f64, max_seconds: f64) {
if n == 0 {
return;
}
let seconds_to_wait = base.powf(f64::from(n) - 1.0).min(max_seconds);
info!("Backoff: waiting {seconds_to_wait} seconds before proceeding with the task");
tokio::time::sleep(Duration::from_secs_f64(seconds_to_wait)).await;
}
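For reference, the removed helper waits base^(n - 1) seconds capped at max_seconds, and skips the wait entirely for n == 0; with the defaults shown here (base 2.0, max 60.0) that gives 1 s, 2 s, 4 s, 8 s, 16 s, 32 s, and then 60 s for every attempt from the seventh onward.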
/// All data that's needed to run endless broker loop and keep the WAL streaming connection alive, if possible.
struct WalreceiverState {
id: ZTenantTimelineId,
/// Use pageserver data about the timeline to filter out some of the safekeepers.
local_timeline: Arc<DatadirTimelineImpl>,
local_timeline: Arc<TimelineImpl>,
/// The timeout on the connection to safekeeper for WAL streaming.
wal_connect_timeout: Duration,
/// The timeout to use to determine when the current connection is "stale" and reconnect to the other one.
@@ -283,7 +276,7 @@ struct EtcdSkTimeline {
impl WalreceiverState {
fn new(
id: ZTenantTimelineId,
local_timeline: Arc<DatadirTimelineImpl>,
local_timeline: Arc<<RepositoryImpl as Repository>::Timeline>,
wal_connect_timeout: Duration,
lagging_wal_timeout: Duration,
max_lsn_wal_lag: NonZeroU64,
@@ -301,7 +294,7 @@ impl WalreceiverState {
}
/// Shuts down the current connection (if any) and immediately starts another one with the given connection string.
async fn change_connection(&mut self, new_sk_id: NodeId, new_wal_producer_connstr: String) {
async fn change_connection(&mut self, new_sk_id: NodeId, new_wal_source_connstr: String) {
if let Some(old_connection) = self.wal_connection.take() {
old_connection.connection_task.shutdown().await
}
@@ -323,7 +316,7 @@ impl WalreceiverState {
.await;
super::walreceiver_connection::handle_walreceiver_connection(
id,
&new_wal_producer_connstr,
&new_wal_source_connstr,
events_sender.as_ref(),
cancellation,
connect_timeout,
@@ -386,7 +379,7 @@ impl WalreceiverState {
Some(existing_wal_connection) => {
let connected_sk_node = existing_wal_connection.sk_id;
let (new_sk_id, new_safekeeper_etcd_data, new_wal_producer_connstr) =
let (new_sk_id, new_safekeeper_etcd_data, new_wal_source_connstr) =
self.select_connection_candidate(Some(connected_sk_node))?;
let now = Utc::now().naive_utc();
@@ -396,7 +389,7 @@ impl WalreceiverState {
if latest_interaciton > self.lagging_wal_timeout {
return Some(NewWalConnectionCandidate {
safekeeper_id: new_sk_id,
wal_producer_connstr: new_wal_producer_connstr,
wal_source_connstr: new_wal_source_connstr,
reason: ReconnectReason::NoWalTimeout {
last_wal_interaction: Some(
existing_wal_connection.latest_connection_update,
@@ -422,7 +415,7 @@ impl WalreceiverState {
return Some(
NewWalConnectionCandidate {
safekeeper_id: new_sk_id,
wal_producer_connstr: new_wal_producer_connstr,
wal_source_connstr: new_wal_source_connstr,
reason: ReconnectReason::LaggingWal { current_lsn, new_lsn, threshold: self.max_lsn_wal_lag },
});
}
@@ -433,18 +426,18 @@ impl WalreceiverState {
None => {
return Some(NewWalConnectionCandidate {
safekeeper_id: new_sk_id,
wal_producer_connstr: new_wal_producer_connstr,
wal_source_connstr: new_wal_source_connstr,
reason: ReconnectReason::NoEtcdDataForExistingConnection,
})
}
}
}
None => {
let (new_sk_id, _, new_wal_producer_connstr) =
let (new_sk_id, _, new_wal_source_connstr) =
self.select_connection_candidate(None)?;
return Some(NewWalConnectionCandidate {
safekeeper_id: new_sk_id,
wal_producer_connstr: new_wal_producer_connstr,
wal_source_connstr: new_wal_source_connstr,
reason: ReconnectReason::NoExistingConnection,
});
}
@@ -545,7 +538,7 @@ impl WalreceiverState {
#[derive(Debug, PartialEq, Eq)]
struct NewWalConnectionCandidate {
safekeeper_id: NodeId,
wal_producer_connstr: String,
wal_source_connstr: String,
reason: ReconnectReason,
}
@@ -802,7 +795,7 @@ mod tests {
"Should select new safekeeper due to missing connection, even if there's also a lag in the wal over the threshold"
);
assert!(only_candidate
.wal_producer_connstr
.wal_source_connstr
.contains(DUMMY_SAFEKEEPER_CONNSTR));
let selected_lsn = 100_000;
@@ -867,7 +860,7 @@ mod tests {
"Should select new safekeeper due to missing connection, even if there's also a lag in the wal over the threshold"
);
assert!(biggest_wal_candidate
.wal_producer_connstr
.wal_source_connstr
.contains(DUMMY_SAFEKEEPER_CONNSTR));
Ok(())
@@ -984,7 +977,7 @@ mod tests {
"Should select new safekeeper due to missing etcd data, even if there's an existing connection with this safekeeper"
);
assert!(only_candidate
.wal_producer_connstr
.wal_source_connstr
.contains(DUMMY_SAFEKEEPER_CONNSTR));
Ok(())
@@ -1066,7 +1059,7 @@ mod tests {
"Should select bigger WAL safekeeper if it starts to lag enough"
);
assert!(over_threshcurrent_candidate
.wal_producer_connstr
.wal_source_connstr
.contains("advanced by Lsn safekeeper"));
Ok(())
@@ -1133,7 +1126,7 @@ mod tests {
unexpected => panic!("Unexpected reason: {unexpected:?}"),
}
assert!(over_threshcurrent_candidate
.wal_producer_connstr
.wal_source_connstr
.contains(DUMMY_SAFEKEEPER_CONNSTR));
Ok(())
@@ -1189,7 +1182,7 @@ mod tests {
unexpected => panic!("Unexpected reason: {unexpected:?}"),
}
assert!(over_threshcurrent_candidate
.wal_producer_connstr
.wal_source_connstr
.contains(DUMMY_SAFEKEEPER_CONNSTR));
Ok(())
@@ -1203,13 +1196,10 @@ mod tests {
tenant_id: harness.tenant_id,
timeline_id: TIMELINE_ID,
},
local_timeline: Arc::new(DatadirTimelineImpl::new(
harness
.load()
.create_empty_timeline(TIMELINE_ID, Lsn(0))
.expect("Failed to create an empty timeline for dummy wal connection manager"),
10_000,
)),
local_timeline: harness
.load()
.create_empty_timeline(TIMELINE_ID, Lsn(0))
.expect("Failed to create an empty timeline for dummy wal connection manager"),
wal_connect_timeout: Duration::from_secs(1),
lagging_wal_timeout: Duration::from_secs(1),
max_lsn_wal_lag: NonZeroU64::new(1).unwrap(),


@@ -9,36 +9,38 @@ use std::{
use anyhow::{bail, ensure, Context};
use bytes::BytesMut;
use fail::fail_point;
use futures::StreamExt;
use postgres::{SimpleQueryMessage, SimpleQueryRow};
use postgres_protocol::message::backend::ReplicationMessage;
use postgres_types::PgLsn;
use tokio::{pin, select, sync::watch, time};
use tokio_postgres::{replication::ReplicationStream, Client};
use tokio_stream::StreamExt;
use tracing::{debug, error, info, info_span, trace, warn, Instrument};
use super::TaskEvent;
use crate::{
http::models::WalReceiverEntry,
layered_repository::WalReceiverInfo,
pgdatadir_mapping::DatadirTimeline,
repository::{Repository, Timeline},
tenant_mgr,
walingest::WalIngest,
walrecord::DecodedWALRecord,
};
use postgres_ffi::waldecoder::WalStreamDecoder;
use utils::{lsn::Lsn, pq_proto::ReplicationFeedback, zid::ZTenantTimelineId};
/// Opens a conneciton to the given wal producer and streams the WAL, sending progress messages during streaming.
/// Open a connection to the given safekeeper and receive WAL, sending back progress
/// messages as we go.
pub async fn handle_walreceiver_connection(
id: ZTenantTimelineId,
wal_producer_connstr: &str,
wal_source_connstr: &str,
events_sender: &watch::Sender<TaskEvent<ReplicationFeedback>>,
mut cancellation: watch::Receiver<()>,
connect_timeout: Duration,
) -> anyhow::Result<()> {
// Connect to the database in replication mode.
info!("connecting to {wal_producer_connstr}");
let connect_cfg =
format!("{wal_producer_connstr} application_name=pageserver replication=true");
info!("connecting to {wal_source_connstr}");
let connect_cfg = format!("{wal_source_connstr} application_name=pageserver replication=true");
let (mut replication_client, connection) = time::timeout(
connect_timeout,
@@ -150,19 +152,25 @@ pub async fn handle_walreceiver_connection(
waldecoder.feed_bytes(data);
while let Some((lsn, recdata)) = waldecoder.poll_decode()? {
let _enter = info_span!("processing record", lsn = %lsn).entered();
{
let mut decoded = DecodedWALRecord::default();
let mut modification = timeline.begin_modification(endlsn);
while let Some((lsn, recdata)) = waldecoder.poll_decode()? {
// let _enter = info_span!("processing record", lsn = %lsn).entered();
// It is important to deal with the aligned records as lsn in getPage@LSN is
// aligned and can be several bytes bigger. Without this alignment we are
// at risk of hitting a deadlock.
ensure!(lsn.is_aligned());
// It is important to deal with the aligned records as lsn in getPage@LSN is
// aligned and can be several bytes bigger. Without this alignment we are
// at risk of hitting a deadlock.
ensure!(lsn.is_aligned());
walingest.ingest_record(&timeline, recdata, lsn)?;
walingest
.ingest_record(recdata, lsn, &mut modification, &mut decoded)
.context("could not ingest record at {lsn}")?;
fail_point!("walreceiver-after-ingest");
fail_point!("walreceiver-after-ingest");
last_rec_lsn = lsn;
last_rec_lsn = lsn;
}
}
if !caught_up && endlsn >= end_of_wal {
@@ -170,16 +178,6 @@ pub async fn handle_walreceiver_connection(
caught_up = true;
}
let timeline_to_check = Arc::clone(&timeline.tline);
tokio::task::spawn_blocking(move || timeline_to_check.check_checkpoint_distance())
.await
.with_context(|| {
format!("Spawned checkpoint check task panicked for timeline {id}")
})?
.with_context(|| {
format!("Failed to check checkpoint distance for timeline {id}")
})?;
Some(endlsn)
}
@@ -200,6 +198,12 @@ pub async fn handle_walreceiver_connection(
_ => None,
};
let timeline_to_check = Arc::clone(&timeline);
tokio::task::spawn_blocking(move || timeline_to_check.check_checkpoint_distance())
.await
.with_context(|| format!("Spawned checkpoint check task panicked for timeline {id}"))?
.with_context(|| format!("Failed to check checkpoint distance for timeline {id}"))?;
if let Some(last_lsn) = status_update {
let remote_index = repo.get_remote_index();
let timeline_remote_consistent_lsn = remote_index
@@ -218,27 +222,22 @@ pub async fn handle_walreceiver_connection(
// The last LSN we processed. It is not guaranteed to survive pageserver crash.
let write_lsn = u64::from(last_lsn);
// `disk_consistent_lsn` is the LSN at which page server guarantees local persistence of all received data
let flush_lsn = u64::from(timeline.tline.get_disk_consistent_lsn());
let flush_lsn = u64::from(timeline.get_disk_consistent_lsn());
// The last LSN that is synced to remote storage and is guaranteed to survive pageserver crash
// Used by safekeepers to remove WAL preceding `remote_consistent_lsn`.
let apply_lsn = u64::from(timeline_remote_consistent_lsn);
let ts = SystemTime::now();
// Update the current WAL receiver's data stored inside the global hash table `WAL_RECEIVERS`
{
super::WAL_RECEIVER_ENTRIES.write().await.insert(
id,
WalReceiverEntry {
wal_producer_connstr: Some(wal_producer_connstr.to_owned()),
last_received_msg_lsn: Some(last_lsn),
last_received_msg_ts: Some(
ts.duration_since(SystemTime::UNIX_EPOCH)
.expect("Received message time should be before UNIX EPOCH!")
.as_micros(),
),
},
);
}
// Update the status about what we just received. This is shown in the mgmt API.
let last_received_wal = WalReceiverInfo {
wal_source_connstr: wal_source_connstr.to_owned(),
last_received_msg_lsn: last_lsn,
last_received_msg_ts: ts
.duration_since(SystemTime::UNIX_EPOCH)
.expect("Received message time should be before UNIX EPOCH!")
.as_micros(),
};
*timeline.last_received_wal.lock().unwrap() = Some(last_received_wal);
// Send zenith feedback message.
// Regular standby_status_update fields are put into this message.


@@ -96,6 +96,7 @@ impl DecodedBkpBlock {
}
}
#[derive(Default)]
pub struct DecodedWALRecord {
pub xl_xid: TransactionId,
pub xl_info: u8,
@@ -505,7 +506,17 @@ impl XlMultiXactTruncate {
// block data
// ...
// main data
pub fn decode_wal_record(record: Bytes) -> Result<DecodedWALRecord, DeserializeError> {
//
//
// For performance reasons, the caller provides the DecodedWALRecord struct and the function just fills it in.
// It would be more natural for this function to return a DecodedWALRecord as return value,
// but reusing the caller-supplied struct avoids an allocation.
// This code is in the hot path for digesting incoming WAL, and is very performance sensitive.
//
pub fn decode_wal_record(
record: Bytes,
decoded: &mut DecodedWALRecord,
) -> Result<(), DeserializeError> {
let mut rnode_spcnode: u32 = 0;
let mut rnode_dbnode: u32 = 0;
let mut rnode_relnode: u32 = 0;
@@ -534,7 +545,7 @@ pub fn decode_wal_record(record: Bytes) -> Result<DecodedWALRecord, DeserializeE
let mut blocks_total_len: u32 = 0;
let mut main_data_len = 0;
let mut datatotal: u32 = 0;
let mut blocks: Vec<DecodedBkpBlock> = Vec::new();
decoded.blocks.clear();
// 2. Decode the headers.
// XLogRecordBlockHeaders if any,
@@ -713,7 +724,7 @@ pub fn decode_wal_record(record: Bytes) -> Result<DecodedWALRecord, DeserializeE
blk.blkno
);
blocks.push(blk);
decoded.blocks.push(blk);
}
_ => {
@@ -724,7 +735,7 @@ pub fn decode_wal_record(record: Bytes) -> Result<DecodedWALRecord, DeserializeE
// 3. Decode blocks.
let mut ptr = record.len() - buf.remaining();
for blk in blocks.iter_mut() {
for blk in decoded.blocks.iter_mut() {
if blk.has_image {
blk.bimg_offset = ptr as u32;
ptr += blk.bimg_len as usize;
@@ -744,14 +755,13 @@ pub fn decode_wal_record(record: Bytes) -> Result<DecodedWALRecord, DeserializeE
assert_eq!(buf.remaining(), main_data_len as usize);
}
Ok(DecodedWALRecord {
xl_xid: xlogrec.xl_xid,
xl_info: xlogrec.xl_info,
xl_rmid: xlogrec.xl_rmid,
record,
blocks,
main_data_offset,
})
decoded.xl_xid = xlogrec.xl_xid;
decoded.xl_info = xlogrec.xl_info;
decoded.xl_rmid = xlogrec.xl_rmid;
decoded.record = record;
decoded.main_data_offset = main_data_offset;
Ok(())
}
///

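The comment above explains why decode_wal_record now fills in a caller-supplied struct instead of returning a new one. A sketch of the intended calling pattern, assuming the crate items shown elsewhere in this diff (WalStreamDecoder, DecodedWALRecord, decode_wal_record); this is a fragment, not a standalone program:

// Reuse one DecodedWALRecord for every record coming out of the stream decoder.
fn drain_decoder(waldecoder: &mut WalStreamDecoder) -> anyhow::Result<()> {
    // Allocated once, outside the loop.
    let mut decoded = DecodedWALRecord::default();
    while let Some((lsn, recdata)) = waldecoder.poll_decode()? {
        // decode_wal_record clears and refills `decoded` rather than allocating
        // a fresh DecodedWALRecord per record.
        decode_wal_record(recdata, &mut decoded)?;
        // ... hand `decoded` and `lsn` to WAL ingestion here ...
        let _ = lsn;
    }
    Ok(())
}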

@@ -20,8 +20,8 @@
//!
use byteorder::{ByteOrder, LittleEndian};
use bytes::{BufMut, Bytes, BytesMut};
use lazy_static::lazy_static;
use nix::poll::*;
use once_cell::sync::Lazy;
use serde::Serialize;
use std::fs;
use std::fs::OpenOptions;
@@ -105,21 +105,27 @@ impl crate::walredo::WalRedoManager for DummyRedoManager {
// We collect the time spent in actual WAL redo ('redo'), and time waiting
// for access to the postgres process ('wait') since there is only one for
// each tenant.
lazy_static! {
static ref WAL_REDO_TIME: Histogram =
register_histogram!("pageserver_wal_redo_seconds", "Time spent on WAL redo")
.expect("failed to define a metric");
static ref WAL_REDO_WAIT_TIME: Histogram = register_histogram!(
static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
register_histogram!("pageserver_wal_redo_seconds", "Time spent on WAL redo")
.expect("failed to define a metric")
});
static WAL_REDO_WAIT_TIME: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_wal_redo_wait_seconds",
"Time spent waiting for access to the WAL redo process"
)
.expect("failed to define a metric");
static ref WAL_REDO_RECORD_COUNTER: IntCounter = register_int_counter!(
.expect("failed to define a metric")
});
static WAL_REDO_RECORD_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_replayed_wal_records_total",
"Number of WAL records replayed in WAL redo process"
)
.unwrap();
}
.unwrap()
});
///
/// This is the real implementation that uses a Postgres process to

poetry.lock (generated, 1565 changed lines): file diff suppressed because one or more lines are too long


@@ -14,7 +14,7 @@ hashbrown = "0.11.2"
hex = "0.4.3"
hmac = "0.12.1"
hyper = "0.14"
lazy_static = "1.4.0"
once_cell = "1.13.0"
md5 = "0.7.0"
parking_lot = "0.12"
pin-project-lite = "0.2.7"


@@ -1,11 +1,14 @@
//! Client authentication mechanisms.
pub mod backend;
pub use backend::DatabaseInfo;
pub use backend::{BackendType, DatabaseInfo};
mod credentials;
pub use credentials::ClientCredentials;
mod password_hack;
use password_hack::PasswordHackPayload;
mod flow;
pub use flow::*;
@@ -29,9 +32,8 @@ pub enum AuthErrorImpl {
#[error(transparent)]
Sasl(#[from] crate::sasl::Error),
/// For passwords that couldn't be processed by [`backend::legacy_console::parse_password`].
#[error("Malformed password message")]
MalformedPassword,
#[error("Malformed password message: {0}")]
MalformedPassword(&'static str),
/// Errors produced by [`crate::stream::PqStream`].
#[error(transparent)]
@@ -76,7 +78,7 @@ impl UserFacingError for AuthError {
Console(e) => e.to_string_client(),
GetAuthInfo(e) => e.to_string_client(),
Sasl(e) => e.to_string_client(),
MalformedPassword => self.to_string(),
MalformedPassword(_) => self.to_string(),
_ => "Internal error".to_string(),
}
}


@@ -1,26 +1,23 @@
mod legacy_console;
mod link;
mod postgres;
pub mod console;
mod legacy_console;
pub use legacy_console::{AuthError, AuthErrorImpl};
use super::ClientCredentials;
use crate::{
compute,
config::{AuthBackendType, ProxyConfig},
mgmt,
auth::{self, AuthFlow, ClientCredentials},
compute, config, mgmt,
stream::PqStream,
waiters::{self, Waiter, Waiters},
};
use lazy_static::lazy_static;
use once_cell::sync::Lazy;
use serde::{Deserialize, Serialize};
use tokio::io::{AsyncRead, AsyncWrite};
lazy_static! {
static ref CPLANE_WAITERS: Waiters<mgmt::ComputeReady> = Default::default();
}
static CPLANE_WAITERS: Lazy<Waiters<mgmt::ComputeReady>> = Lazy::new(Default::default);
/// Give caller an opportunity to wait for the cloud's reply.
pub async fn with_waiter<R, T, E>(
@@ -78,32 +75,158 @@ impl From<DatabaseInfo> for tokio_postgres::Config {
}
}
pub(super) async fn handle_user(
config: &ProxyConfig,
client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin + Send>,
creds: ClientCredentials,
) -> super::Result<compute::NodeInfo> {
use AuthBackendType::*;
match config.auth_backend {
LegacyConsole => {
legacy_console::handle_user(
&config.auth_endpoint,
&config.auth_link_uri,
client,
&creds,
)
.await
/// This type serves two purposes:
///
/// * When `T` is `()`, it's just a regular auth backend selector
/// which we use in [`crate::config::ProxyConfig`].
///
/// * However, when we substitute `T` with [`ClientCredentials`],
/// this helps us provide the credentials only to those auth
/// backends which require them for the authentication process.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum BackendType<T> {
/// Legacy Cloud API (V1) + link auth.
LegacyConsole(T),
/// Current Cloud API (V2).
Console(T),
/// Local mock of Cloud API (V2).
Postgres(T),
/// Authentication via a web browser.
Link,
}
impl<T> BackendType<T> {
/// Very similar to [`std::option::Option::map`].
/// Maps [`BackendType<T>`] to [`BackendType<R>`] by applying
/// a function to a contained value.
pub fn map<R>(self, f: impl FnOnce(T) -> R) -> BackendType<R> {
use BackendType::*;
match self {
LegacyConsole(x) => LegacyConsole(f(x)),
Console(x) => Console(f(x)),
Postgres(x) => Postgres(f(x)),
Link => Link,
}
}
}
impl<T, E> BackendType<Result<T, E>> {
/// Very similar to [`std::option::Option::transpose`].
/// This is most useful for error handling.
pub fn transpose(self) -> Result<BackendType<T>, E> {
use BackendType::*;
match self {
LegacyConsole(x) => x.map(LegacyConsole),
Console(x) => x.map(Console),
Postgres(x) => x.map(Postgres),
Link => Ok(Link),
}
}
}
impl BackendType<ClientCredentials> {
/// Authenticate the client via the requested backend, possibly using credentials.
pub async fn authenticate(
mut self,
urls: &config::AuthUrls,
client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin + Send>,
) -> super::Result<compute::NodeInfo> {
use BackendType::*;
if let Console(creds) | Postgres(creds) = &mut self {
// If there's no project name yet, the client doesn't support SNI
// or any other means of passing the project name.
// In that case we expect a very specific payload in place of the password.
if creds.project().is_none() {
let payload = AuthFlow::new(client)
.begin(auth::PasswordHack)
.await?
.authenticate()
.await?;
// Finally we may finish the initialization of `creds`.
// TODO: add missing type safety to ClientCredentials.
creds.project = Some(payload.project);
let mut config = match &self {
Console(creds) => {
console::Api::new(&urls.auth_endpoint, creds)
.wake_compute()
.await?
}
Postgres(creds) => {
postgres::Api::new(&urls.auth_endpoint, creds)
.wake_compute()
.await?
}
_ => unreachable!("see the patterns above"),
};
// We should use a password from payload as well.
config.password(payload.password);
return Ok(compute::NodeInfo {
reported_auth_ok: false,
config,
});
}
}
match self {
LegacyConsole(creds) => {
legacy_console::handle_user(
&urls.auth_endpoint,
&urls.auth_link_uri,
&creds,
client,
)
.await
}
Console(creds) => {
console::Api::new(&urls.auth_endpoint, &creds)
.handle_user(client)
.await
}
Postgres(creds) => {
postgres::Api::new(&urls.auth_endpoint, &creds)
.handle_user(client)
.await
}
// NOTE: this auth backend doesn't use client credentials.
Link => link::handle_user(&urls.auth_link_uri, client).await,
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_backend_type_map() {
let values = [
BackendType::LegacyConsole(0),
BackendType::Console(0),
BackendType::Postgres(0),
BackendType::Link,
];
for value in values {
assert_eq!(value.map(|x| x), value);
}
}
#[test]
fn test_backend_type_transpose() {
let values = [
BackendType::LegacyConsole(Ok::<_, ()>(0)),
BackendType::Console(Ok(0)),
BackendType::Postgres(Ok(0)),
BackendType::Link,
];
for value in values {
assert_eq!(value.map(Result::unwrap), value.transpose().unwrap());
}
Console => {
console::Api::new(&config.auth_endpoint, &creds)?
.handle_user(client)
.await
}
Postgres => {
postgres::Api::new(&config.auth_endpoint, &creds)?
.handle_user(client)
.await
}
Link => link::handle_user(&config.auth_link_uri, client).await,
}
}
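For readers skimming this hunk, here is a minimal standalone sketch (not part of the diff) of the `map`/`transpose` pattern that `BackendType<T>` introduces above: a config-level selector carrying no payload is later paired with parsed client credentials, and any parse error is propagated outwards. The simplified two-variant `Backend` enum and the dummy credential value below are illustrative only.

// Simplified stand-in for the diff's `BackendType<T>` (two variants instead of four).
#[derive(Debug, PartialEq)]
enum Backend<T> {
    Console(T),
    Link,
}

impl<T> Backend<T> {
    // Mirrors `BackendType::map`: apply `f` to the contained value, if any.
    fn map<R>(self, f: impl FnOnce(T) -> R) -> Backend<R> {
        match self {
            Backend::Console(x) => Backend::Console(f(x)),
            Backend::Link => Backend::Link,
        }
    }
}

impl<T, E> Backend<Result<T, E>> {
    // Mirrors `BackendType::transpose`: pull an inner `Result` outwards.
    fn transpose(self) -> Result<Backend<T>, E> {
        match self {
            Backend::Console(x) => x.map(Backend::Console),
            Backend::Link => Ok(Backend::Link),
        }
    }
}

fn main() {
    // A config-level selector carries no payload yet...
    let selector: Backend<()> = Backend::Console(());
    // ...and is later combined with a fallible credential-parsing step.
    let with_creds = selector
        .map(|()| "42".parse::<u32>())
        .transpose()
        .expect("credentials should parse");
    assert_eq!(with_creds, Backend::Console(42));
}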


@@ -1,18 +1,17 @@
//! Cloud API V2.
use crate::{
auth::{self, AuthFlow, ClientCredentials, DatabaseInfo},
compute,
error::UserFacingError,
auth::{self, AuthFlow, ClientCredentials},
compute::{self, ComputeConnCfg},
error::{io_error, UserFacingError},
scram,
stream::PqStream,
url::ApiUrl,
};
use serde::{Deserialize, Serialize};
use std::{future::Future, io};
use std::future::Future;
use thiserror::Error;
use tokio::io::{AsyncRead, AsyncWrite};
use utils::pq_proto::{BeMessage as Be, BeParameterStatusMessage};
pub type Result<T> = std::result::Result<T, ConsoleAuthError>;
@@ -84,8 +83,8 @@ pub(super) struct Api<'a> {
impl<'a> Api<'a> {
/// Construct an API object containing the auth parameters.
pub(super) fn new(endpoint: &'a ApiUrl, creds: &'a ClientCredentials) -> Result<Self> {
Ok(Self { endpoint, creds })
pub(super) fn new(endpoint: &'a ApiUrl, creds: &'a ClientCredentials) -> Self {
Self { endpoint, creds }
}
/// Authenticate the existing user or throw an error.
@@ -100,7 +99,7 @@ impl<'a> Api<'a> {
let mut url = self.endpoint.clone();
url.path_segments_mut().push("proxy_get_role_secret");
url.query_pairs_mut()
.append_pair("project", self.creds.project_name.as_ref()?)
.append_pair("project", self.creds.project().expect("impossible"))
.append_pair("role", &self.creds.user);
// TODO: use a proper logger
@@ -120,11 +119,11 @@ impl<'a> Api<'a> {
}
/// Wake up the compute node and return the corresponding connection info.
async fn wake_compute(&self) -> Result<DatabaseInfo> {
pub(super) async fn wake_compute(&self) -> Result<ComputeConnCfg> {
let mut url = self.endpoint.clone();
url.path_segments_mut().push("proxy_wake_compute");
let project_name = self.creds.project_name.as_ref()?;
url.query_pairs_mut().append_pair("project", project_name);
url.query_pairs_mut()
.append_pair("project", self.creds.project().expect("impossible"));
// TODO: use a proper logger
println!("cplane request: {url}");
@@ -137,16 +136,20 @@ impl<'a> Api<'a> {
let response: GetWakeComputeResponse =
serde_json::from_str(&resp.text().await.map_err(io_error)?)?;
let (host, port) = parse_host_port(&response.address)
.ok_or(ConsoleAuthError::BadComputeAddress(response.address))?;
// Unfortunately, ownership won't let us use `Option::ok_or` here.
let (host, port) = match parse_host_port(&response.address) {
None => return Err(ConsoleAuthError::BadComputeAddress(response.address)),
Some(x) => x,
};
Ok(DatabaseInfo {
host,
port,
dbname: self.creds.dbname.to_owned(),
user: self.creds.user.to_owned(),
password: None,
})
let mut config = ComputeConnCfg::new();
config
.host(host)
.port(port)
.dbname(&self.creds.dbname)
.user(&self.creds.user);
Ok(config)
}
}
@@ -160,7 +163,7 @@ pub(super) async fn handle_user<'a, Endpoint, GetAuthInfo, WakeCompute>(
) -> auth::Result<compute::NodeInfo>
where
GetAuthInfo: Future<Output = Result<AuthInfo>>,
WakeCompute: Future<Output = Result<DatabaseInfo>>,
WakeCompute: Future<Output = Result<ComputeConnCfg>>,
{
let auth_info = get_auth_info(endpoint).await?;
@@ -179,48 +182,18 @@ where
}
};
client
.write_message_noflush(&Be::AuthenticationOk)?
.write_message_noflush(&BeParameterStatusMessage::encoding())?;
let mut config = wake_compute(endpoint).await?;
if let Some(keys) = scram_keys {
config.auth_keys(tokio_postgres::config::AuthKeys::ScramSha256(keys));
}
Ok(compute::NodeInfo {
db_info: wake_compute(endpoint).await?,
scram_keys,
reported_auth_ok: false,
config,
})
}
/// Upcast (almost) any error into an opaque [`io::Error`].
pub(super) fn io_error(e: impl Into<Box<dyn std::error::Error + Send + Sync>>) -> io::Error {
io::Error::new(io::ErrorKind::Other, e)
}
fn parse_host_port(input: &str) -> Option<(String, u16)> {
fn parse_host_port(input: &str) -> Option<(&str, u16)> {
let (host, port) = input.split_once(':')?;
Some((host.to_owned(), port.parse().ok()?))
}
#[cfg(test)]
mod tests {
use super::*;
use serde_json::json;
#[test]
fn parse_db_info() -> anyhow::Result<()> {
let _: DatabaseInfo = serde_json::from_value(json!({
"host": "localhost",
"port": 5432,
"dbname": "postgres",
"user": "john_doe",
"password": "password",
}))?;
let _: DatabaseInfo = serde_json::from_value(json!({
"host": "localhost",
"port": 5432,
"dbname": "postgres",
"user": "john_doe",
}))?;
Ok(())
}
Some((host, port.parse().ok()?))
}
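As a quick illustration of the `parse_host_port` change above (the helper now borrows the host instead of allocating a `String`), here is a self-contained copy of it with a few example inputs; the hostnames are made up.

// Copy of the diff's `parse_host_port`: split a "host:port" string, borrowing the host.
fn parse_host_port(input: &str) -> Option<(&str, u16)> {
    let (host, port) = input.split_once(':')?;
    Some((host, port.parse().ok()?))
}

fn main() {
    assert_eq!(parse_host_port("compute-1.local:5432"), Some(("compute-1.local", 5432)));
    assert_eq!(parse_host_port("no-port-here"), None);    // no ':' separator
    assert_eq!(parse_host_port("host:not-a-port"), None); // port fails to parse as u16
}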


@@ -11,7 +11,7 @@ use crate::{
use serde::{Deserialize, Serialize};
use thiserror::Error;
use tokio::io::{AsyncRead, AsyncWrite};
use utils::pq_proto::{BeMessage as Be, BeParameterStatusMessage};
use utils::pq_proto::BeMessage as Be;
#[derive(Debug, Error)]
pub enum AuthErrorImpl {
@@ -76,6 +76,12 @@ enum ProxyAuthResponse {
NotReady { ready: bool }, // TODO: get rid of `ready`
}
impl ClientCredentials {
fn is_existing_user(&self) -> bool {
self.user.ends_with("@zenith")
}
}
async fn authenticate_proxy_client(
auth_endpoint: &reqwest::Url,
creds: &ClientCredentials,
@@ -100,7 +106,7 @@ async fn authenticate_proxy_client(
}
let auth_info: ProxyAuthResponse = serde_json::from_str(resp.text().await?.as_str())?;
println!("got auth info: #{:?}", auth_info);
println!("got auth info: {:?}", auth_info);
use ProxyAuthResponse::*;
let db_info = match auth_info {
@@ -128,7 +134,9 @@ async fn handle_existing_user(
// Read client's password hash
let msg = client.read_password_message().await?;
let md5_response = parse_password(&msg).ok_or(auth::AuthErrorImpl::MalformedPassword)?;
let md5_response = parse_password(&msg).ok_or(auth::AuthErrorImpl::MalformedPassword(
"the password should be a valid null-terminated utf-8 string",
))?;
let db_info = authenticate_proxy_client(
auth_endpoint,
@@ -139,21 +147,17 @@ async fn handle_existing_user(
)
.await?;
client
.write_message_noflush(&Be::AuthenticationOk)?
.write_message_noflush(&BeParameterStatusMessage::encoding())?;
Ok(compute::NodeInfo {
db_info,
scram_keys: None,
reported_auth_ok: false,
config: db_info.into(),
})
}
pub async fn handle_user(
auth_endpoint: &reqwest::Url,
auth_link_uri: &reqwest::Url,
client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin + Send>,
creds: &ClientCredentials,
client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin + Send>,
) -> auth::Result<compute::NodeInfo> {
if creds.is_existing_user() {
handle_existing_user(auth_endpoint, client, creds).await
@@ -201,4 +205,24 @@ mod tests {
.unwrap();
assert!(matches!(auth, ProxyAuthResponse::NotReady { .. }));
}
#[test]
fn parse_db_info() -> anyhow::Result<()> {
let _: DatabaseInfo = serde_json::from_value(json!({
"host": "localhost",
"port": 5432,
"dbname": "postgres",
"user": "john_doe",
"password": "password",
}))?;
let _: DatabaseInfo = serde_json::from_value(json!({
"host": "localhost",
"port": 5432,
"dbname": "postgres",
"user": "john_doe",
}))?;
Ok(())
}
}


@@ -41,7 +41,7 @@ pub async fn handle_user(
client.write_message_noflush(&Be::NoticeResponse("Connecting to database."))?;
Ok(compute::NodeInfo {
db_info,
scram_keys: None,
reported_auth_ok: true,
config: db_info.into(),
})
}


@@ -3,10 +3,12 @@
use crate::{
auth::{
self,
backend::console::{self, io_error, AuthInfo, Result},
ClientCredentials, DatabaseInfo,
backend::console::{self, AuthInfo, Result},
ClientCredentials,
},
compute, scram,
compute::{self, ComputeConnCfg},
error::io_error,
scram,
stream::PqStream,
url::ApiUrl,
};
@@ -20,8 +22,8 @@ pub(super) struct Api<'a> {
impl<'a> Api<'a> {
/// Construct an API object containing the auth parameters.
pub(super) fn new(endpoint: &'a ApiUrl, creds: &'a ClientCredentials) -> Result<Self> {
Ok(Self { endpoint, creds })
pub(super) fn new(endpoint: &'a ApiUrl, creds: &'a ClientCredentials) -> Self {
Self { endpoint, creds }
}
/// Authenticate the existing user or throw an error.
@@ -56,7 +58,10 @@ impl<'a> Api<'a> {
// We shouldn't get more than one row anyway.
[row, ..] => {
let entry = row.try_get(0).map_err(io_error)?;
let entry = row
.try_get("rolpassword")
.map_err(|e| io_error(format!("failed to read user's password: {e}")))?;
scram::ServerSecret::parse(entry)
.map(AuthInfo::Scram)
.or_else(|| {
@@ -75,14 +80,14 @@ impl<'a> Api<'a> {
}
/// We don't need to wake anything locally, so we just return the connection info.
async fn wake_compute(&self) -> Result<DatabaseInfo> {
Ok(DatabaseInfo {
// TODO: handle that near CLI params parsing
host: self.endpoint.host_str().unwrap_or("localhost").to_owned(),
port: self.endpoint.port().unwrap_or(5432),
dbname: self.creds.dbname.to_owned(),
user: self.creds.user.to_owned(),
password: None,
})
pub(super) async fn wake_compute(&self) -> Result<ComputeConnCfg> {
let mut config = ComputeConnCfg::new();
config
.host(self.endpoint.host_str().unwrap_or("localhost"))
.port(self.endpoint.port().unwrap_or(5432))
.dbname(&self.creds.dbname)
.user(&self.creds.user);
Ok(config)
}
}


@@ -1,39 +1,25 @@
//! User credentials used in authentication.
use crate::compute;
use crate::config::ProxyConfig;
use crate::error::UserFacingError;
use crate::stream::PqStream;
use std::collections::HashMap;
use thiserror::Error;
use tokio::io::{AsyncRead, AsyncWrite};
use utils::pq_proto::StartupMessageParams;
#[derive(Debug, Error, PartialEq, Eq, Clone)]
pub enum ClientCredsParseError {
#[error("Parameter `{0}` is missing in startup packet.")]
#[error("Parameter '{0}' is missing in startup packet.")]
MissingKey(&'static str),
#[error(
"Project name is not specified. \
EITHER please upgrade the postgres client library (libpq) for SNI support \
OR pass the project name as a parameter: '&options=project%3D<project-name>'."
)]
MissingSNIAndProjectName,
#[error("Inconsistent project name inferred from SNI ('{0}') and project option ('{1}').")]
InconsistentProjectNameAndSNI(String, String),
#[error("Common name is not set.")]
CommonNameNotSet,
InconsistentProjectNames(String, String),
#[error(
"SNI ('{1}') inconsistently formatted with respect to common name ('{0}'). \
SNI should be formatted as '<project-name>.<common-name>'."
SNI should be formatted as '<project-name>.{0}'."
)]
InconsistentCommonNameAndSNI(String, String),
InconsistentSni(String, String),
#[error("Project name ('{0}') must contain only alphanumeric characters and hyphens ('-').")]
ProjectNameContainsIllegalChars(String),
#[error("Project name ('{0}') must contain only alphanumeric characters and hyphen.")]
MalformedProjectName(String),
}
impl UserFacingError for ClientCredsParseError {}
@@ -44,286 +30,171 @@ impl UserFacingError for ClientCredsParseError {}
pub struct ClientCredentials {
pub user: String,
pub dbname: String,
pub project_name: Result<String, ClientCredsParseError>,
pub project: Option<String>,
}
impl ClientCredentials {
pub fn is_existing_user(&self) -> bool {
// This logic will likely change in the future.
self.user.ends_with("@zenith")
pub fn project(&self) -> Option<&str> {
self.project.as_deref()
}
}
impl ClientCredentials {
pub fn parse(
mut options: HashMap<String, String>,
sni_data: Option<&str>,
mut options: StartupMessageParams,
sni: Option<&str>,
common_name: Option<&str>,
) -> Result<Self, ClientCredsParseError> {
let mut get_param = |key| {
options
.remove(key)
.ok_or(ClientCredsParseError::MissingKey(key))
};
use ClientCredsParseError::*;
// Some parameters are absolutely necessary, others not so much.
let mut get_param = |key| options.remove(key).ok_or(MissingKey(key));
// Some parameters are stored in the startup message.
let user = get_param("user")?;
let dbname = get_param("database")?;
let project_name = get_param("project").ok();
let project_name = get_project_name(sni_data, common_name, project_name.as_deref());
let project_a = get_param("project").ok();
// Alternative project name is in fact a subdomain from SNI.
// NOTE: we do not consider SNI if `common_name` is missing.
let project_b = sni
.zip(common_name)
.map(|(sni, cn)| {
// TODO: what if SNI is present but just a common name?
subdomain_from_sni(sni, cn)
.ok_or_else(|| InconsistentSni(sni.to_owned(), cn.to_owned()))
})
.transpose()?;
let project = match (project_a, project_b) {
// Invariant: if we have both project name variants, they should match.
(Some(a), Some(b)) if a != b => Some(Err(InconsistentProjectNames(a, b))),
(a, b) => a.or(b).map(|name| {
// Invariant: project name may not contain certain characters.
check_project_name(name).map_err(MalformedProjectName)
}),
}
.transpose()?;
Ok(Self {
user,
dbname,
project_name,
project,
})
}
}
/// Use credentials to authenticate the user.
pub async fn authenticate(
self,
config: &ProxyConfig,
client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin + Send>,
) -> super::Result<compute::NodeInfo> {
// This method is just a convenient facade for `handle_user`
super::backend::handle_user(config, client, self).await
fn check_project_name(name: String) -> Result<String, String> {
if name.chars().all(|c| c.is_alphanumeric() || c == '-') {
Ok(name)
} else {
Err(name)
}
}
/// Inferring project name from sni_data.
fn project_name_from_sni_data(
sni_data: &str,
common_name: &str,
) -> Result<String, ClientCredsParseError> {
let common_name_with_dot = format!(".{common_name}");
// check that ".{common_name_with_dot}" is the actual suffix in sni_data
if !sni_data.ends_with(&common_name_with_dot) {
return Err(ClientCredsParseError::InconsistentCommonNameAndSNI(
common_name.to_string(),
sni_data.to_string(),
fn subdomain_from_sni(sni: &str, common_name: &str) -> Option<String> {
sni.strip_suffix(common_name)?
.strip_suffix('.')
.map(str::to_owned)
}
#[cfg(test)]
mod tests {
use super::*;
fn make_options<'a, const N: usize>(pairs: [(&'a str, &'a str); N]) -> StartupMessageParams {
StartupMessageParams::from(pairs.map(|(k, v)| (k.to_owned(), v.to_owned())))
}
#[test]
#[ignore = "TODO: fix how database is handled"]
fn parse_bare_minimum() -> anyhow::Result<()> {
// According to postgresql, only `user` should be required.
let options = make_options([("user", "john_doe")]);
// TODO: check that `creds.dbname` is None.
let creds = ClientCredentials::parse(options, None, None)?;
assert_eq!(creds.user, "john_doe");
Ok(())
}
#[test]
fn parse_missing_project() -> anyhow::Result<()> {
let options = make_options([("user", "john_doe"), ("database", "world")]);
let creds = ClientCredentials::parse(options, None, None)?;
assert_eq!(creds.user, "john_doe");
assert_eq!(creds.dbname, "world");
assert_eq!(creds.project, None);
Ok(())
}
#[test]
fn parse_project_from_sni() -> anyhow::Result<()> {
let options = make_options([("user", "john_doe"), ("database", "world")]);
let sni = Some("foo.localhost");
let common_name = Some("localhost");
let creds = ClientCredentials::parse(options, sni, common_name)?;
assert_eq!(creds.user, "john_doe");
assert_eq!(creds.dbname, "world");
assert_eq!(creds.project.as_deref(), Some("foo"));
Ok(())
}
#[test]
fn parse_project_from_options() -> anyhow::Result<()> {
let options = make_options([
("user", "john_doe"),
("database", "world"),
("project", "bar"),
]);
let creds = ClientCredentials::parse(options, None, None)?;
assert_eq!(creds.user, "john_doe");
assert_eq!(creds.dbname, "world");
assert_eq!(creds.project.as_deref(), Some("bar"));
Ok(())
}
#[test]
fn parse_projects_identical() -> anyhow::Result<()> {
let options = make_options([
("user", "john_doe"),
("database", "world"),
("project", "baz"),
]);
let sni = Some("baz.localhost");
let common_name = Some("localhost");
let creds = ClientCredentials::parse(options, sni, common_name)?;
assert_eq!(creds.user, "john_doe");
assert_eq!(creds.dbname, "world");
assert_eq!(creds.project.as_deref(), Some("baz"));
Ok(())
}
#[test]
fn parse_projects_different() {
let options = make_options([
("user", "john_doe"),
("database", "world"),
("project", "first"),
]);
let sni = Some("second.localhost");
let common_name = Some("localhost");
assert!(matches!(
ClientCredentials::parse(options, sni, common_name).expect_err("should fail"),
ClientCredsParseError::InconsistentProjectNames(_, _)
));
}
// return sni_data without the common name suffix.
Ok(sni_data
.strip_suffix(&common_name_with_dot)
.unwrap()
.to_string())
}
#[cfg(test)]
mod tests_for_project_name_from_sni_data {
use super::*;
#[test]
fn passing() {
let target_project_name = "my-project-123";
let common_name = "localtest.me";
let sni_data = format!("{target_project_name}.{common_name}");
assert_eq!(
project_name_from_sni_data(&sni_data, common_name),
Ok(target_project_name.to_string())
);
}
#[test]
fn throws_inconsistent_common_name_and_sni_data() {
let target_project_name = "my-project-123";
let common_name = "localtest.me";
let wrong_suffix = "wrongtest.me";
assert_eq!(common_name.len(), wrong_suffix.len());
let wrong_common_name = format!("wrong{wrong_suffix}");
let sni_data = format!("{target_project_name}.{wrong_common_name}");
assert_eq!(
project_name_from_sni_data(&sni_data, common_name),
Err(ClientCredsParseError::InconsistentCommonNameAndSNI(
common_name.to_string(),
sni_data
))
);
}
}
/// Determine project name from SNI or from project_name parameter from options argument.
fn get_project_name(
sni_data: Option<&str>,
common_name: Option<&str>,
project_name: Option<&str>,
) -> Result<String, ClientCredsParseError> {
// determine the project name from sni_data if it exists, otherwise from project_name.
let ret = match sni_data {
Some(sni_data) => {
let common_name = common_name.ok_or(ClientCredsParseError::CommonNameNotSet)?;
let project_name_from_sni = project_name_from_sni_data(sni_data, common_name)?;
// check invariant: project name from options and from sni should match
if let Some(project_name) = &project_name {
if !project_name_from_sni.eq(project_name) {
return Err(ClientCredsParseError::InconsistentProjectNameAndSNI(
project_name_from_sni,
project_name.to_string(),
));
}
}
project_name_from_sni
}
None => project_name
.ok_or(ClientCredsParseError::MissingSNIAndProjectName)?
.to_string(),
};
// check formatting invariant: project name must contain only alphanumeric characters and hyphens.
if !ret.chars().all(|x: char| x.is_alphanumeric() || x == '-') {
return Err(ClientCredsParseError::ProjectNameContainsIllegalChars(ret));
}
Ok(ret)
}
#[cfg(test)]
mod tests_for_project_name_only {
use super::*;
#[test]
fn passing_from_sni_data_only() {
let target_project_name = "my-project-123";
let common_name = "localtest.me";
let sni_data = format!("{target_project_name}.{common_name}");
assert_eq!(
get_project_name(Some(&sni_data), Some(common_name), None),
Ok(target_project_name.to_string())
);
}
#[test]
fn throws_project_name_contains_illegal_chars_from_sni_data_only() {
let project_name_prefix = "my-project";
let project_name_suffix = "123";
let common_name = "localtest.me";
for illegal_char_id in 0..256 {
let illegal_char = char::from_u32(illegal_char_id).unwrap();
if !(illegal_char.is_alphanumeric() || illegal_char == '-')
&& illegal_char.to_string().len() == 1
{
let target_project_name =
format!("{project_name_prefix}{illegal_char}{project_name_suffix}");
let sni_data = format!("{target_project_name}.{common_name}");
assert_eq!(
get_project_name(Some(&sni_data), Some(common_name), None),
Err(ClientCredsParseError::ProjectNameContainsIllegalChars(
target_project_name
))
);
}
}
}
#[test]
fn passing_from_project_name_only() {
let target_project_name = "my-project-123";
let common_names = [Some("localtest.me"), None];
for common_name in common_names {
assert_eq!(
get_project_name(None, common_name, Some(target_project_name)),
Ok(target_project_name.to_string())
);
}
}
#[test]
fn throws_project_name_contains_illegal_chars_from_project_name_only() {
let project_name_prefix = "my-project";
let project_name_suffix = "123";
let common_names = [Some("localtest.me"), None];
for common_name in common_names {
for illegal_char_id in 0..256 {
let illegal_char: char = char::from_u32(illegal_char_id).unwrap();
if !(illegal_char.is_alphanumeric() || illegal_char == '-')
&& illegal_char.to_string().len() == 1
{
let target_project_name =
format!("{project_name_prefix}{illegal_char}{project_name_suffix}");
assert_eq!(
get_project_name(None, common_name, Some(&target_project_name)),
Err(ClientCredsParseError::ProjectNameContainsIllegalChars(
target_project_name
))
);
}
}
}
}
#[test]
fn passing_from_sni_data_and_project_name() {
let target_project_name = "my-project-123";
let common_name = "localtest.me";
let sni_data = format!("{target_project_name}.{common_name}");
assert_eq!(
get_project_name(
Some(&sni_data),
Some(common_name),
Some(target_project_name)
),
Ok(target_project_name.to_string())
);
}
#[test]
fn throws_inconsistent_project_name_and_sni() {
let project_name_param = "my-project-123";
let wrong_project_name = "not-my-project-123";
let common_name = "localtest.me";
let sni_data = format!("{wrong_project_name}.{common_name}");
assert_eq!(
get_project_name(Some(&sni_data), Some(common_name), Some(project_name_param)),
Err(ClientCredsParseError::InconsistentProjectNameAndSNI(
wrong_project_name.to_string(),
project_name_param.to_string()
))
);
}
#[test]
fn throws_common_name_not_set() {
let target_project_name = "my-project-123";
let wrong_project_name = "not-my-project-123";
let common_name = "localtest.me";
let sni_datas = [
Some(format!("{wrong_project_name}.{common_name}")),
Some(format!("{target_project_name}.{common_name}")),
];
let project_names = [None, Some(target_project_name)];
for sni_data in sni_datas {
for project_name_param in project_names {
assert_eq!(
get_project_name(sni_data.as_deref(), None, project_name_param),
Err(ClientCredsParseError::CommonNameNotSet)
);
}
}
}
#[test]
fn throws_inconsistent_common_name_and_sni_data() {
let target_project_name = "my-project-123";
let wrong_project_name = "not-my-project-123";
let common_name = "localtest.me";
let wrong_suffix = "wrongtest.me";
assert_eq!(common_name.len(), wrong_suffix.len());
let wrong_common_name = format!("wrong{wrong_suffix}");
let sni_datas = [
Some(format!("{wrong_project_name}.{wrong_common_name}")),
Some(format!("{target_project_name}.{wrong_common_name}")),
];
let project_names = [None, Some(target_project_name)];
for project_name_param in project_names {
for sni_data in &sni_datas {
assert_eq!(
get_project_name(sni_data.as_deref(), Some(common_name), project_name_param),
Err(ClientCredsParseError::InconsistentCommonNameAndSNI(
common_name.to_string(),
sni_data.clone().unwrap().to_string()
))
);
}
}
}
}
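To summarize the new resolution rules in `credentials.rs` at a glance, here is a hedged standalone sketch (the function names and the simplified `String` error type are invented, and unlike the real code it silently ignores an SNI that doesn't match the common name): the project name may come from the `project` startup option or from the SNI subdomain, the two must agree when both are present, and only alphanumerics and '-' are accepted.

// Standalone sketch of the project-name rules; not the proxy's actual error handling.
fn subdomain_from_sni(sni: &str, common_name: &str) -> Option<String> {
    sni.strip_suffix(common_name)?
        .strip_suffix('.')
        .map(str::to_owned)
}

fn resolve_project(
    option: Option<&str>,
    sni: Option<&str>,
    common_name: Option<&str>,
) -> Result<Option<String>, String> {
    let from_sni = sni
        .zip(common_name)
        .and_then(|(sni, cn)| subdomain_from_sni(sni, cn));
    let name = match (option.map(str::to_owned), from_sni) {
        // Both sources present: they must agree.
        (Some(a), Some(b)) if a != b => return Err(format!("inconsistent: {a} vs {b}")),
        (a, b) => a.or(b),
    };
    match name {
        // Only alphanumeric characters and hyphens are allowed.
        Some(n) if !n.chars().all(|c| c.is_alphanumeric() || c == '-') => Err(n),
        other => Ok(other),
    }
}

fn main() {
    // SNI "foo.localhost" with common name "localhost" yields project "foo".
    assert_eq!(
        resolve_project(None, Some("foo.localhost"), Some("localhost")),
        Ok(Some("foo".to_owned()))
    );
    // Explicit `project` option is used when SNI is absent.
    assert_eq!(resolve_project(Some("bar"), None, None), Ok(Some("bar".to_owned())));
    // A mismatch between the two sources is rejected.
    assert!(resolve_project(Some("first"), Some("second.localhost"), Some("localhost")).is_err());
}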


@@ -1,8 +1,7 @@
//! Main authentication flow.
use super::AuthErrorImpl;
use crate::stream::PqStream;
use crate::{sasl, scram};
use super::{AuthErrorImpl, PasswordHackPayload};
use crate::{sasl, scram, stream::PqStream};
use std::io;
use tokio::io::{AsyncRead, AsyncWrite};
use utils::pq_proto::{BeAuthenticationSaslMessage, BeMessage, BeMessage as Be};
@@ -27,6 +26,17 @@ impl AuthMethod for Scram<'_> {
}
}
/// Use an ad hoc auth flow (for clients which don't support SNI) proposed in
/// <https://github.com/neondatabase/cloud/issues/1620#issuecomment-1165332290>.
pub struct PasswordHack;
impl AuthMethod for PasswordHack {
#[inline(always)]
fn first_message(&self) -> BeMessage<'_> {
Be::AuthenticationCleartextPassword
}
}
/// This wrapper for [`PqStream`] performs client authentication.
#[must_use]
pub struct AuthFlow<'a, Stream, State> {
@@ -57,13 +67,34 @@ impl<'a, S: AsyncWrite + Unpin> AuthFlow<'a, S, Begin> {
}
}
impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, PasswordHack> {
/// Perform user authentication. Raise an error in case authentication failed.
pub async fn authenticate(self) -> super::Result<PasswordHackPayload> {
let msg = self.stream.read_password_message().await?;
let password = msg
.strip_suffix(&[0])
.ok_or(AuthErrorImpl::MalformedPassword("missing terminator"))?;
// The so-called "password" should contain a base64-encoded json.
// We will use it later to route the client to their project.
let bytes = base64::decode(password)
.map_err(|_| AuthErrorImpl::MalformedPassword("bad encoding"))?;
let payload = serde_json::from_slice(&bytes)
.map_err(|_| AuthErrorImpl::MalformedPassword("invalid payload"))?;
Ok(payload)
}
}
/// Stream wrapper for handling [SCRAM](crate::scram) auth.
impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, Scram<'_>> {
/// Perform user authentication. Raise an error in case authentication failed.
pub async fn authenticate(self) -> super::Result<scram::ScramKey> {
// Initial client message contains the chosen auth method's name.
let msg = self.stream.read_password_message().await?;
let sasl = sasl::FirstMessage::parse(&msg).ok_or(AuthErrorImpl::MalformedPassword)?;
let sasl = sasl::FirstMessage::parse(&msg)
.ok_or(AuthErrorImpl::MalformedPassword("bad sasl message"))?;
// Currently, the only supported SASL method is SCRAM.
if !scram::METHODS.contains(&sasl.method) {


@@ -0,0 +1,102 @@
//! Payload for ad hoc authentication method for clients that don't support SNI.
//! See the `impl` for [`super::backend::BackendType<ClientCredentials>`].
//! Read more: <https://github.com/neondatabase/cloud/issues/1620#issuecomment-1165332290>.
use serde::{de, Deserialize, Deserializer};
use std::fmt;
#[derive(Deserialize)]
#[serde(untagged)]
pub enum Password {
/// A regular string for utf-8 encoded passwords.
Simple { password: String },
/// Password is base64-encoded because it may contain arbitrary byte sequences.
Encoded {
#[serde(rename = "password_", deserialize_with = "deserialize_base64")]
password: Vec<u8>,
},
}
impl AsRef<[u8]> for Password {
fn as_ref(&self) -> &[u8] {
match self {
Password::Simple { password } => password.as_ref(),
Password::Encoded { password } => password.as_ref(),
}
}
}
#[derive(Deserialize)]
pub struct PasswordHackPayload {
pub project: String,
#[serde(flatten)]
pub password: Password,
}
fn deserialize_base64<'a, D: Deserializer<'a>>(des: D) -> Result<Vec<u8>, D::Error> {
// It's very tempting to replace this with
//
// ```
// let base64: &str = Deserialize::deserialize(des)?;
// base64::decode(base64).map_err(serde::de::Error::custom)
// ```
//
// Unfortunately, we can't always deserialize into `&str`, so we'd
// have to use an allocating `String` instead. Thus, visitor is better.
struct Visitor;
impl<'de> de::Visitor<'de> for Visitor {
type Value = Vec<u8>;
fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
formatter.write_str("a string")
}
fn visit_str<E: de::Error>(self, v: &str) -> Result<Self::Value, E> {
base64::decode(v).map_err(de::Error::custom)
}
}
des.deserialize_str(Visitor)
}
#[cfg(test)]
mod tests {
use super::*;
use rstest::rstest;
use serde_json::json;
#[test]
fn parse_password() -> anyhow::Result<()> {
let password: Password = serde_json::from_value(json!({
"password": "foo",
}))?;
assert_eq!(password.as_ref(), "foo".as_bytes());
let password: Password = serde_json::from_value(json!({
"password_": base64::encode("foo"),
}))?;
assert_eq!(password.as_ref(), "foo".as_bytes());
Ok(())
}
#[rstest]
#[case("password", str::to_owned)]
#[case("password_", base64::encode)]
fn parse(#[case] key: &str, #[case] encode: fn(&'static str) -> String) -> anyhow::Result<()> {
let (password, project) = ("password", "pie-in-the-sky");
let payload = json!({
"project": project,
key: encode(password),
});
let payload: PasswordHackPayload = serde_json::from_value(payload)?;
assert_eq!(payload.password.as_ref(), password.as_bytes());
assert_eq!(payload.project, project);
Ok(())
}
}
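The `PasswordHackPayload` parsed above is produced by a non-SNI client during the PasswordHack flow shown earlier in this diff: the client smuggles a small JSON document through the cleartext-password message. A hedged sketch of the client side follows, using the `serde_json` and `base64` crates already present in this diff; the project name and password are placeholders.

// What a non-SNI client would send as its "password": base64(JSON with project + password).
fn main() {
    let payload = serde_json::json!({
        "project": "pie-in-the-sky",
        "password": "secret",
    });
    // The proxy strips the trailing NUL, base64-decodes this string, and parses the JSON
    // into `PasswordHackPayload`.
    let password_field = base64::encode(payload.to_string());
    println!("{password_field}");
}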


@@ -1,8 +1,6 @@
use crate::auth::DatabaseInfo;
use crate::cancellation::CancelClosure;
use crate::error::UserFacingError;
use std::io;
use std::net::SocketAddr;
use crate::{cancellation::CancelClosure, error::UserFacingError};
use futures::TryFutureExt;
use std::{io, net::SocketAddr};
use thiserror::Error;
use tokio::net::TcpStream;
use tokio_postgres::NoTls;
@@ -21,44 +19,96 @@ pub enum ConnectionError {
FailedToFetchPgVersion,
}
impl UserFacingError for ConnectionError {}
/// PostgreSQL version as [`String`].
pub type Version = String;
impl UserFacingError for ConnectionError {
fn to_string_client(&self) -> String {
use ConnectionError::*;
match self {
// This helps us drop irrelevant library-specific prefixes.
// TODO: propagate severity level and other parameters.
Postgres(err) => match err.as_db_error() {
Some(err) => err.message().to_string(),
None => err.to_string(),
},
other => other.to_string(),
}
}
}
/// A pair of `ClientKey` & `ServerKey` for `SCRAM-SHA-256`.
pub type ScramKeys = tokio_postgres::config::ScramKeys<32>;
/// Compute node connection params.
pub type ComputeConnCfg = tokio_postgres::Config;
/// Various compute node info for establishing connection etc.
pub struct NodeInfo {
pub db_info: DatabaseInfo,
pub scram_keys: Option<ScramKeys>,
/// Did we send [`utils::pq_proto::BeMessage::AuthenticationOk`]?
pub reported_auth_ok: bool,
/// Compute node connection params.
pub config: tokio_postgres::Config,
}
impl NodeInfo {
async fn connect_raw(&self) -> io::Result<(SocketAddr, TcpStream)> {
let host_port = (self.db_info.host.as_str(), self.db_info.port);
let socket = TcpStream::connect(host_port).await?;
let socket_addr = socket.peer_addr()?;
socket2::SockRef::from(&socket).set_keepalive(true)?;
use tokio_postgres::config::Host;
Ok((socket_addr, socket))
let connect_once = |host, port| {
TcpStream::connect((host, port)).and_then(|socket| async {
let socket_addr = socket.peer_addr()?;
// This prevents load balancer from severing the connection.
socket2::SockRef::from(&socket).set_keepalive(true)?;
Ok((socket_addr, socket))
})
};
// We can't reuse connection establishing logic from `tokio_postgres` here,
// because it has no means for extracting the underlying socket which we
// require for our business.
let mut connection_error = None;
let ports = self.config.get_ports();
for (i, host) in self.config.get_hosts().iter().enumerate() {
let port = ports.get(i).or_else(|| ports.get(0)).unwrap_or(&5432);
let host = match host {
Host::Tcp(host) => host.as_str(),
Host::Unix(_) => continue, // unix sockets are not welcome here
};
// TODO: maybe we should add a timeout.
match connect_once(host, *port).await {
Ok(socket) => return Ok(socket),
Err(err) => {
// We can't throw an error here, as there might be more hosts to try.
println!("failed to connect to compute `{host}:{port}`: {err}");
connection_error = Some(err);
}
}
}
Err(connection_error.unwrap_or_else(|| {
io::Error::new(
io::ErrorKind::Other,
format!("couldn't connect: bad compute config: {:?}", self.config),
)
}))
}
}
pub struct PostgresConnection {
/// Socket connected to a compute node.
pub stream: TcpStream,
/// PostgreSQL version of this instance.
pub version: String,
}
impl NodeInfo {
/// Connect to a corresponding compute node.
pub async fn connect(self) -> Result<(TcpStream, Version, CancelClosure), ConnectionError> {
let (socket_addr, mut socket) = self
pub async fn connect(&self) -> Result<(PostgresConnection, CancelClosure), ConnectionError> {
let (socket_addr, mut stream) = self
.connect_raw()
.await
.map_err(|_| ConnectionError::FailedToConnectToCompute)?;
let mut config = tokio_postgres::Config::from(self.db_info);
if let Some(scram_keys) = self.scram_keys {
config.auth_keys(tokio_postgres::config::AuthKeys::ScramSha256(scram_keys));
}
// TODO: establish a secure connection to the DB
let (client, conn) = config.connect_raw(&mut socket, NoTls).await?;
let (client, conn) = self.config.connect_raw(&mut stream, NoTls).await?;
let version = conn
.parameter("server_version")
.ok_or(ConnectionError::FailedToFetchPgVersion)?
@@ -66,6 +116,8 @@ impl NodeInfo {
let cancel_closure = CancelClosure::new(socket_addr, client.cancel_token());
Ok((socket, version, cancel_closure))
let db = PostgresConnection { stream, version };
Ok((db, cancel_closure))
}
}
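For the multi-host loop in `connect_raw` above, here is a small standalone sketch (hostnames invented) of the host/port pairing rule: each host takes the port at its own index, falls back to the first configured port, and finally to 5432 when none is set.

// Demonstrates the port-selection fallback used by the diff's `connect_raw`.
use tokio_postgres::config::Host;

fn main() {
    let mut config = tokio_postgres::Config::new();
    config.host("alpha.example").host("beta.example").port(5001);

    let ports = config.get_ports();
    for (i, host) in config.get_hosts().iter().enumerate() {
        let port = ports.get(i).or_else(|| ports.get(0)).unwrap_or(&5432);
        if let Host::Tcp(host) = host {
            // Unix-socket hosts are skipped by the proxy; this sketch only prints TCP ones.
            println!("would try {host}:{port}");
        }
    }
}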

Some files were not shown because too many files have changed in this diff Show More