Increase parallel workers to trigger more errors

Hide debug logs in test_wal_acceptor_async
Fix print in last test
2026-01-21 20:32:56 +00:00 · 2021-09-29 12:23:49 +03:00 · 2021-09-29 11:47:53 +03:00 · 2021-09-29 11:47:53 +03:00 · 2021-09-29 11:47:53 +03:00 · 2021-09-29 11:47:52 +03:00
170 changed files with 6303 additions and 18314 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -1,19 +1,20 @@
 version: 2.1

+orbs:
+  python: circleci/python@1.4.0
+
 executors:
  zenith-build-executor:
    resource_class: xlarge
    docker:
-      - image: cimg/rust:1.56.1
-  zenith-python-executor:
-    docker:
-      - image: cimg/python:3.7.10  # Oldest available 3.7 with Ubuntu 20.04 (for GLIBC and Rust) at CirlceCI
+      - image: cimg/rust:1.52.1

 jobs:
-  check-codestyle-rust:
+  check-codestyle:
    executor: zenith-build-executor
    steps:
      - checkout
+
      - run:
          name: rustfmt
          when: always
@@ -23,12 +24,6 @@ jobs:
  # A job to build postgres
  build-postgres:
    executor: zenith-build-executor
-    parameters:
-      build_type:
-        type: enum
-        enum: ["debug", "release"]
-    environment:
-      BUILD_TYPE: << parameters.build_type >>
    steps:
        # Checkout the git repo (circleci doesn't have a flag to enable submodules here)
      - checkout
@@ -44,7 +39,7 @@ jobs:
          name: Restore postgres cache
          keys:
            # Restore ONLY if the rev key matches exactly
-            - v04-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }}
+            - v03-postgres-cache-{{ checksum "/tmp/cache-key-postgres" }}

        # FIXME We could cache our own docker container, instead of installing packages every time.
      - run:
@@ -64,12 +59,12 @@ jobs:
            if [ ! -e tmp_install/bin/postgres ]; then
              # "depth 1" saves some time by not cloning the whole repo
              git submodule update --init --depth 1
-              make postgres -j8
+              make postgres
            fi

      - save_cache:
          name: Save postgres cache
-          key: v04-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }}
+          key: v03-postgres-cache-{{ checksum "/tmp/cache-key-postgres" }}
          paths:
            - tmp_install

@@ -80,8 +75,6 @@ jobs:
      build_type:
        type: enum
        enum: ["debug", "release"]
-    environment:
-      BUILD_TYPE: << parameters.build_type >>
    steps:
      - run:
          name: apt install dependencies
@@ -103,7 +96,7 @@ jobs:
          name: Restore postgres cache
          keys:
            # Restore ONLY if the rev key matches exactly
-            - v04-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }}
+            - v03-postgres-cache-{{ checksum "/tmp/cache-key-postgres" }}

      - restore_cache:
          name: Restore rust cache
@@ -111,26 +104,25 @@ jobs:
            # Require an exact match. While an out of date cache might speed up the build,
            # there's no way to clean out old packages, so the cache grows every time something
            # changes.
-            - v04-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }}
+            - v03-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }}

        # Build the rust code, including test binaries
      - run:
          name: Rust build << parameters.build_type >>
          command: |
-            if [[ $BUILD_TYPE == "debug" ]]; then
-              cov_prefix=(scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage run)
-              CARGO_FLAGS=
-            elif [[ $BUILD_TYPE == "release" ]]; then
-              cov_prefix=()
-              CARGO_FLAGS=--release
-            fi
-
            export CARGO_INCREMENTAL=0
-            "${cov_prefix[@]}" cargo build $CARGO_FLAGS --bins --tests
+            BUILD_TYPE="<< parameters.build_type >>"
+            if [[ $BUILD_TYPE == "debug" ]]; then
+              echo "Build in debug mode"
+              cargo build --bins --tests
+            elif [[ $BUILD_TYPE == "release" ]]; then
+              echo "Build in release mode"
+              cargo build --release --bins --tests
+            fi

      - save_cache:
          name: Save rust cache
-          key: v04-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }}
+          key: v03-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }}
          paths:
            - ~/.cargo/registry
            - ~/.cargo/git
@@ -140,100 +132,53 @@ jobs:
        # has to run separately from cargo fmt section
        # since needs to run with dependencies
      - run:
-          name: cargo clippy
+          name: clippy
          command: |
-            if [[ $BUILD_TYPE == "debug" ]]; then
-              cov_prefix=(scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage run)
-            elif [[ $BUILD_TYPE == "release" ]]; then
-              cov_prefix=()
-            fi
-
-            "${cov_prefix[@]}" ./run_clippy.sh
+            ./run_clippy.sh

        # Run rust unit tests
-      - run:
-          name: cargo test
-          command: |
-            if [[ $BUILD_TYPE == "debug" ]]; then
-              cov_prefix=(scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage run)
-            elif [[ $BUILD_TYPE == "release" ]]; then
-              cov_prefix=()
-            fi
-
-            "${cov_prefix[@]}" cargo test
+      - run: cargo test

        # Install the rust binaries, for use by test jobs
+        # `--locked` is required; otherwise, `cargo install` will ignore Cargo.lock.
+        # FIXME: this is a really silly way to install; maybe we should just output
+        # a tarball as an artifact? Or a .deb package?
      - run:
-          name: Install rust binaries
+          name: cargo install
          command: |
+            export CARGO_INCREMENTAL=0
+            BUILD_TYPE="<< parameters.build_type >>"
            if [[ $BUILD_TYPE == "debug" ]]; then
-              cov_prefix=(scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage run)
+              echo "Install debug mode"
+              CARGO_FLAGS="--debug"
            elif [[ $BUILD_TYPE == "release" ]]; then
-              cov_prefix=()
-            fi
-
-            binaries=$(
-              "${cov_prefix[@]}" cargo metadata --format-version=1 --no-deps |
-              jq -r '.packages[].targets[] | select(.kind | index("bin")) | .name'
-            )
-
-            test_exe_paths=$(
-              "${cov_prefix[@]}" cargo test --message-format=json --no-run |
-              jq -r '.executable | select(. != null)'
-            )
-
-            mkdir -p /tmp/zenith/bin
-            mkdir -p /tmp/zenith/test_bin
-            mkdir -p /tmp/zenith/etc
-
-            # Install target binaries
-            for bin in $binaries; do
-              SRC=target/$BUILD_TYPE/$bin
-              DST=/tmp/zenith/bin/$bin
-              cp $SRC $DST
-              echo $DST >> /tmp/zenith/etc/binaries.list
-            done
-
-            # Install test executables (for code coverage)
-            if [[ $BUILD_TYPE == "debug" ]]; then
-              for bin in $test_exe_paths; do
-                SRC=$bin
-                DST=/tmp/zenith/test_bin/$(basename $bin)
-                cp $SRC $DST
-                echo $DST >> /tmp/zenith/etc/binaries.list
-              done
+              echo "Install release mode"
+              # The default is release mode; there is no --release flag.
+              CARGO_FLAGS=""
            fi
+            cargo install $CARGO_FLAGS --locked --root /tmp/zenith --path pageserver
+            cargo install $CARGO_FLAGS --locked --root /tmp/zenith --path walkeeper
+            cargo install $CARGO_FLAGS --locked --root /tmp/zenith --path zenith

        # Install the postgres binaries, for use by test jobs
+        # FIXME: this is a silly way to do "install"; maybe just output a standard
+        # postgres package, whatever the favored form is (tarball? .deb package?)
+        # Note that pg_regress needs some build artifacts that probably aren't
+        # in the usual package...?
      - run:
-          name: Install postgres binaries
+          name: postgres install
          command: |
            cp -a tmp_install /tmp/zenith/pg_install

-        # Save the rust binaries and coverage data for other jobs in this workflow.
+        # Save the rust output binaries for other jobs in this workflow.
      - persist_to_workspace:
          root: /tmp/zenith
          paths:
            - "*"

-  check-codestyle-python:
-    executor: zenith-python-executor
-    steps:
-      - checkout
-      - run:
-          name: Install deps
-          command: pipenv --python 3.7 install --dev
-      - run:
-          name: Run yapf to ensure code format
-          when: always
-          command: pipenv run yapf --recursive --diff .
-      - run:
-          name: Run mypy to check types
-          when: always
-          command: pipenv run mypy .
-
  run-pytest:
-    executor: zenith-python-executor
+    #description: "Run pytest"
+    executor: python/default
    parameters:
      # pytest args to specify the tests to run.
      #
@@ -259,11 +204,6 @@ jobs:
      run_in_parallel:
        type: boolean
        default: true
-      save_perf_report:
-        type: boolean
-        default: false
-    environment:
-      BUILD_TYPE: << parameters.build_type >>
    steps:
      - attach_workspace:
          at: /tmp/zenith
@@ -273,74 +213,39 @@ jobs:
          steps:
            - run: git submodule update --init --depth 1
      - run:
-          name: Install deps
-          command: pipenv --python 3.7 install
+          name: Install pipenv & deps
+          working_directory: test_runner
+          command: |
+            pip install pipenv
+            pipenv install
      - run:
          name: Run pytest
-          # pytest doesn't output test logs in real time, so CI job may fail with
-          # `Too long with no output` error, if a test is running for a long time.
-          # In that case, tests should have internal timeouts that are less than
-          # no_output_timeout, specified here.
-          no_output_timeout: 10m
+          working_directory: test_runner
          environment:
            - ZENITH_BIN: /tmp/zenith/bin
            - POSTGRES_DISTRIB_DIR: /tmp/zenith/pg_install
            - TEST_OUTPUT: /tmp/test_output
-            # this variable will be embedded in perf test report
-            # and is needed to distinguish different environments
-            - PLATFORM: zenith-local-ci
          command: |
-            PERF_REPORT_DIR="$(realpath test_runner/perf-report-local)"
-
-            TEST_SELECTION="test_runner/<< parameters.test_selection >>"
+            TEST_SELECTION="<< parameters.test_selection >>"
            EXTRA_PARAMS="<< parameters.extra_params >>"
            if [ -z "$TEST_SELECTION" ]; then
              echo "test_selection must be set"
              exit 1
            fi
            if << parameters.run_in_parallel >>; then
-              EXTRA_PARAMS="-n4 $EXTRA_PARAMS"
-            fi
-            if << parameters.save_perf_report >>; then
-              if [[ $CIRCLE_BRANCH == "main" ]]; then
-                mkdir -p "$PERF_REPORT_DIR"
-                EXTRA_PARAMS="--out-dir $PERF_REPORT_DIR $EXTRA_PARAMS"
-              fi
-            fi
-
-            export GITHUB_SHA=$CIRCLE_SHA1
-
-            if [[ $BUILD_TYPE == "debug" ]]; then
-              cov_prefix=(scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage run)
-            elif [[ $BUILD_TYPE == "release" ]]; then
-              cov_prefix=()
-            fi
-
+              EXTRA_PARAMS="-n16 $EXTRA_PARAMS"
+            fi;
            # Run the tests.
            #
            # The junit.xml file allows CircleCI to display more fine-grained test information
            # in its "Tests" tab in the results page.
+            # -s prevents pytest from capturing output, which helps to see
+            # what's going on if the test hangs
            # --verbose prints name of each test (helpful when there are
            # multiple tests in one file)
            # -rA prints summary in the end
-            # -n4 uses four processes to run tests via pytest-xdist
+            # -n16 uses sixteen processes to run tests via pytest-xdist
-            # -s is not used to prevent pytest from capturing output, because tests are running
-            # in parallel and logs are mixed between different tests
-            "${cov_prefix[@]}" pipenv run pytest \
-              --junitxml=$TEST_OUTPUT/junit.xml \
-              --tb=short \
-              --verbose \
-              -m "not remote_cluster" \
-              -rA $TEST_SELECTION $EXTRA_PARAMS
-
-            if << parameters.save_perf_report >>; then
-              if [[ $CIRCLE_BRANCH == "main" ]]; then
-                # TODO: reuse scripts/git-upload
-                export REPORT_FROM="$PERF_REPORT_DIR"
-                export REPORT_TO=local
-                scripts/generate_and_push_perf_report.sh
-              fi
-            fi
+            pipenv run pytest --junitxml=$TEST_OUTPUT/junit.xml --tb=short --verbose -rA $TEST_SELECTION $EXTRA_PARAMS
      - run:
          # CircleCI artifacts are preserved one file at a time, so skipping
          # this step isn't a good idea. If you want to extract the
@@ -349,72 +254,13 @@ jobs:
          when: always
          command: |
            du -sh /tmp/test_output/*
-            find /tmp/test_output -type f ! -name "pg.log" ! -name "pageserver.log" ! -name "safekeeper.log" ! -name "regression.diffs" ! -name "junit.xml" ! -name "*.filediff" ! -name "*.stdout" ! -name "*.stderr" -delete
+            find /tmp/test_output -type f ! -name "pg.log" ! -name "pageserver.log" ! -name "wal_acceptor.log" ! -name "regression.diffs" ! -name "junit.xml" ! -name "*.filediff" ! -name "*.stdout" ! -name "*.stderr" -delete
            du -sh /tmp/test_output/*
      - store_artifacts:
          path: /tmp/test_output
      # The store_test_results step tells CircleCI where to find the junit.xml file.
      - store_test_results:
          path: /tmp/test_output
-      # Save coverage data (if any)
-      - persist_to_workspace:
-          root: /tmp/zenith
-          paths:
-            - "*"
-
-  coverage-report:
-    executor: zenith-build-executor
-    steps:
-      - attach_workspace:
-          at: /tmp/zenith
-      - checkout
-      - restore_cache:
-          name: Restore rust cache
-          keys:
-            # Require an exact match. While an out of date cache might speed up the build,
-            # there's no way to clean out old packages, so the cache grows every time something
-            # changes.
-            - v04-rust-cache-deps-debug-{{ checksum "Cargo.lock" }}
-      - run:
-          name: Install llvm-tools
-          command: |
-            # TODO: install a proper symbol demangler, e.g. rustfilt
-            # TODO: we should embed this into a docker image
-            rustup component add llvm-tools-preview
-      - run:
-          name: Build coverage report
-          command: |
-            COMMIT_URL=https://github.com/zenithdb/zenith/commit/$CIRCLE_SHA1
-
-            scripts/coverage \
-              --dir=/tmp/zenith/coverage report \
-              --input-objects=/tmp/zenith/etc/binaries.list \
-              --commit-url=$COMMIT_URL \
-              --format=github
-      - run:
-          name: Upload coverage report
-          command: |
-            LOCAL_REPO=$CIRCLE_PROJECT_USERNAME/$CIRCLE_PROJECT_REPONAME
-            REPORT_URL=https://zenithdb.github.io/zenith-coverage-data/$CIRCLE_SHA1
-            COMMIT_URL=https://github.com/zenithdb/zenith/commit/$CIRCLE_SHA1
-
-            scripts/git-upload \
-              --repo=https://$VIP_VAP_ACCESS_TOKEN@github.com/zenithdb/zenith-coverage-data.git \
-              --message="Add code coverage for $COMMIT_URL" \
-              copy /tmp/zenith/coverage/report $CIRCLE_SHA1 # COPY FROM TO_RELATIVE
-
-            # Add link to the coverage report to the commit
-            curl -f -X POST \
-            https://api.github.com/repos/$LOCAL_REPO/statuses/$CIRCLE_SHA1 \
-            -H "Accept: application/vnd.github.v3+json" \
-            --user "$CI_ACCESS_TOKEN" \
-            --data \
-              "{
-                \"state\": \"success\",
-                \"context\": \"zenith-coverage\",
-                \"description\": \"Coverage report is ready\",
-                \"target_url\": \"$REPORT_URL\"
-              }"

  # Build zenithdb/zenith:latest image and push it to Docker hub
  docker-image:
@@ -431,7 +277,7 @@ jobs:
          name: Build and push Docker image
          command: |
            echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin
-            docker build --build-arg GIT_VERSION=$CIRCLE_SHA1 -t zenithdb/zenith:latest . && docker push zenithdb/zenith:latest
+            docker build -t zenithdb/zenith:latest . && docker push zenithdb/zenith:latest

  # Trigger a new remote CI job
  remote-ci-trigger:
@@ -473,27 +319,23 @@ jobs:
                \"inputs\": {
                  \"ci_job_name\": \"zenith-remote-ci\",
                  \"commit_hash\": \"$CIRCLE_SHA1\",
-                  \"remote_repo\": \"$LOCAL_REPO\"
+                  \"remote_repo\": \"$LOCAL_REPO\",
+                  \"zenith_image_branch\": \"$CIRCLE_BRANCH\"
                }
              }"

 workflows:
  build_and_test:
    jobs:
-      - check-codestyle-rust
-      - check-codestyle-python
-      - build-postgres:
-          name: build-postgres-<< matrix.build_type >>
-          matrix:
-            parameters:
-              build_type: ["debug", "release"]
+      - check-codestyle
+      - build-postgres
      - build-zenith:
          name: build-zenith-<< matrix.build_type >>
          matrix:
            parameters:
              build_type: ["debug", "release"]
          requires:
-            - build-postgres-<< matrix.build_type >>
+            - build-postgres
      - run-pytest:
          name: pg_regress-tests-<< matrix.build_type >>
          matrix:
@@ -516,15 +358,8 @@ workflows:
          build_type: release
          test_selection: performance
          run_in_parallel: false
-          save_perf_report: true
          requires:
            - build-zenith-release
-      - coverage-report:
-          # Context passes credentials for gh api
-          context: CI_ACCESS_TOKEN
-          requires:
-            # TODO: consider adding more
-            - other-tests-debug
      - docker-image:
          # Context gives an ability to login
          context: Docker Hub
--- a/.dockerignore
+++ b/.dockerignore
@@ -11,8 +11,4 @@ test_output
 .vscode
 .zenith
 integration_tests/.zenith
-.mypy_cache
-
-Dockerfile
-.dockerignore
-
+.mypy_cache
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -1,114 +0,0 @@
-name: benchmarking
-
-on:
-  # uncomment to run on push for debugging your PR
-  # push:
-  #   branches: [ mybranch ]
-  schedule:
-    # * is a special character in YAML so you have to quote this string
-    #          ┌───────────── minute (0 - 59)
-    #          │ ┌───────────── hour (0 - 23)
-    #          │ │ ┌───────────── day of the month (1 - 31)
-    #          │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
-    #          │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
-    - cron:  '36 7 * * *' # run once a day, timezone is utc
-
-  workflow_dispatch: # adds ability to run this manually
-
-env:
-  BASE_URL: "https://console.zenith.tech"
-
-jobs:
-  bench:
-    # this workflow runs on self hosteed runner
-    # it's environment is quite different from usual guthub runner
-    # probably the most important difference is that it doesnt start from clean workspace each time
-    # e g if you install system packages they are not cleaned up since you install them directly in host machine
-    # not a container or something
-    # See documentation for more info: https://docs.github.com/en/actions/hosting-your-own-runners/about-self-hosted-runners
-    runs-on: [self-hosted, zenith-benchmarker]
-
-    env:
-      PG_BIN: "/usr/pgsql-13/bin"
-
-    steps:
-    - name: Checkout zenith repo
-      uses: actions/checkout@v2
-
-    # actions/setup-python@v2 is not working correctly on self-hosted runners
-    # see https://github.com/actions/setup-python/issues/162
-    # and probably https://github.com/actions/setup-python/issues/162#issuecomment-865387976 in particular
-    # so the simplest solution to me is to use already installed system python and spin virtualenvs for job runs.
-    # there is Python 3.7.10 already installed on the machine so use it to install pipenv and then use pipenv's virtuealenvs
-    - name: Install pipenv & deps
-      run: |
-        python3 -m pip install --upgrade pipenv wheel
-        # since pip/pipenv caches are reused there shouldn't be any troubles with install every time
-        pipenv install
-
-    - name: Show versions
-      run: |
-        echo Python
-        python3 --version
-        pipenv run python3 --version
-        echo Pipenv
-        pipenv --version
-        echo Pgbench
-        $PG_BIN/pgbench --version
-
-    # FIXME cluster setup is skipped due to various changes in console API
-    # for now pre created cluster is used. When API gain some stability
-    # after massive changes dynamic cluster setup will be revived.
-    # So use pre created cluster. It needs to be started manually, but stop is automatic after 5 minutes of inactivity
-    - name: Setup cluster
-      env:
-        BENCHMARK_CONSOLE_USER_PASSWORD: "${{ secrets.BENCHMARK_CONSOLE_USER_PASSWORD }}"
-        BENCHMARK_CONSOLE_ACCESS_TOKEN: "${{ secrets.BENCHMARK_CONSOLE_ACCESS_TOKEN }}"
-        BENCHMARK_CLUSTER_ID: "${{ secrets.BENCHMARK_CLUSTER_ID }}"
-      shell: bash
-      run: |
-        set -e
-
-        echo "Starting cluster"
-        CLUSTER=$(curl -s --fail --show-error -X POST $BASE_URL/api/v1/clusters/$BENCHMARK_CLUSTER_ID/start \
-            -H "Authorization: Bearer $BENCHMARK_CONSOLE_ACCESS_TOKEN")
-        echo $CLUSTER | python -m json.tool
-
-        echo "Waiting for cluster to become ready"
-        sleep 10
-
-        echo "CLUSTER_ID=$BENCHMARK_CLUSTER_ID" >> $GITHUB_ENV
-        CLUSTER=$(curl -s --fail --show-error -X GET $BASE_URL/api/v1/clusters/$BENCHMARK_CLUSTER_ID.json \
-            -H "Authorization: Bearer $BENCHMARK_CONSOLE_ACCESS_TOKEN")
-        echo $CLUSTER | python -m json.tool
-
-    - name: Run benchmark
-      # pgbench is installed system wide from official repo
-      # https://download.postgresql.org/pub/repos/yum/13/redhat/rhel-7-x86_64/
-      # via
-      # sudo tee /etc/yum.repos.d/pgdg.repo<<EOF
-      # [pgdg13]
-      # name=PostgreSQL 13 for RHEL/CentOS 7 - x86_64
-      # baseurl=https://download.postgresql.org/pub/repos/yum/13/redhat/rhel-7-x86_64/
-      # enabled=1
-      # gpgcheck=0
-      # EOF
-      # sudo yum makecache
-      # sudo yum install postgresql13-contrib
-      # actual binaries are located in /usr/pgsql-13/bin/
-      env:
-        TEST_PG_BENCH_TRANSACTIONS_MATRIX: "5000,10000,20000"
-        TEST_PG_BENCH_SCALES_MATRIX: "10,15"
-        PLATFORM: "zenith-staging"
-        BENCHMARK_CONSOLE_ACCESS_TOKEN: "${{ secrets.BENCHMARK_CONSOLE_ACCESS_TOKEN }}"
-        BENCHMARK_CONNSTR: "${{ secrets.BENCHMARK_STAGING_CONNSTR }}"
-        REMOTE_ENV: "1" # indicate to test harness that we do not have zenith binaries locally
-      run: |
-        mkdir -p perf-report-staging
-        pipenv run pytest test_runner/performance/ -v -m "remote_cluster" --skip-interfering-proc-check --out-dir perf-report-staging
-
-    - name: Submit result
-      env:
-        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
-      run: |
-        REPORT_FROM=$(realpath perf-report-staging) REPORT_TO=staging scripts/generate_and_push_perf_report.sh
--- a/.gitignore
+++ b/.gitignore
@@ -7,7 +7,3 @@ test_output/
 .vscode
 /.zenith
 /integration_tests/.zenith
-
-# Coverage
-*.profraw
-*.profdata
--- a/.yapfignore
+++ b/.yapfignore
@@ -1,10 +0,0 @@
-# This file is only read when `yapf` is run from this directory.
-# Hence we only top-level directories here to avoid confusion.
-# See source code for the exact file format: https://github.com/google/yapf/blob/c6077954245bc3add82dafd853a1c7305a6ebd20/yapf/yapflib/file_resources.py#L40-L43
-vendor/
-target/
-tmp_install/
-__pycache__/
-test_output/
-.zenith/
-.git/
--- a/Cargo.lock
+++ b/Cargo.lock
--- a/Dockerfile
+++ b/Dockerfile
@@ -10,7 +10,6 @@ FROM zenithdb/build:buster AS pg-build
 WORKDIR /zenith
 COPY ./vendor/postgres vendor/postgres
 COPY ./Makefile Makefile
-ENV BUILD_TYPE release
 RUN make -j $(getconf _NPROCESSORS_ONLN) -s postgres
 RUN rm -rf postgres_install/build

@@ -21,15 +20,11 @@ RUN rm -rf postgres_install/build
 # net time waste in a lot of cases. Copying Cargo.lock with empty lib.rs should do the work.
 #
 FROM zenithdb/build:buster AS build
-
-ARG GIT_VERSION
-RUN if [ -z "$GIT_VERSION" ]; then echo "GIT_VERSION is reqired, use build_arg to pass it"; exit 1; fi
-
 WORKDIR /zenith
 COPY --from=pg-build /zenith/tmp_install/include/postgresql/server tmp_install/include/postgresql/server

 COPY . .
-RUN GIT_VERSION=$GIT_VERSION cargo build --release
+RUN cargo build --release

 #
 # Copy binaries to resulting image.
@@ -41,7 +36,7 @@ RUN apt-get update && apt-get -yq install libreadline-dev libseccomp-dev openssl
    mkdir zenith_install

 COPY --from=build /zenith/target/release/pageserver /usr/local/bin
-COPY --from=build /zenith/target/release/safekeeper /usr/local/bin
+COPY --from=build /zenith/target/release/wal_acceptor /usr/local/bin
 COPY --from=build /zenith/target/release/proxy /usr/local/bin
 COPY --from=pg-build /zenith/tmp_install postgres_install
 COPY docker-entrypoint.sh /docker-entrypoint.sh
--- a/Dockerfile.alpine
+++ b/Dockerfile.alpine
@@ -81,7 +81,7 @@ FROM alpine:3.13
 RUN apk add --update openssl build-base libseccomp-dev
 RUN apk --no-cache --update --repository https://dl-cdn.alpinelinux.org/alpine/edge/testing add rocksdb
 COPY --from=build /zenith/target/release/pageserver /usr/local/bin
-COPY --from=build /zenith/target/release/safekeeper /usr/local/bin
+COPY --from=build /zenith/target/release/wal_acceptor /usr/local/bin
 COPY --from=build /zenith/target/release/proxy /usr/local/bin
 COPY --from=pg-build /zenith/tmp_install /usr/local
 COPY docker-entrypoint.sh /docker-entrypoint.sh
--- a/Makefile
+++ b/Makefile
@@ -6,55 +6,34 @@ else
 	SECCOMP =
 endif

-#
-# We differentiate between release / debug build types using the BUILD_TYPE
-# environment variable.
-#
-BUILD_TYPE ?= debug
-ifeq ($(BUILD_TYPE),release)
-	PG_CONFIGURE_OPTS = --enable-debug
-	PG_CFLAGS = -O2 -g3 $(CFLAGS)
-	# Unfortunately, `--profile=...` is a nightly feature
-	CARGO_BUILD_FLAGS += --release
-else ifeq ($(BUILD_TYPE),debug)
-	PG_CONFIGURE_OPTS = --enable-debug --enable-cassert --enable-depend
-	PG_CFLAGS = -O0 -g3 $(CFLAGS)
-else
-$(error Bad build type `$(BUILD_TYPE)', see Makefile for options)
-endif
-
-# Choose whether we should be silent or verbose
-CARGO_BUILD_FLAGS += --$(if $(filter s,$(MAKEFLAGS)),quiet,verbose)
-# Fix for a corner case when make doesn't pass a jobserver
-CARGO_BUILD_FLAGS += $(filter -j1,$(MAKEFLAGS))
-
-# This option has a side effect of passing make jobserver to cargo.
-# However, we shouldn't do this if `make -n` (--dry-run) has been asked.
-CARGO_CMD_PREFIX += $(if $(filter n,$(MAKEFLAGS)),,+)
-# Force cargo not to print progress bar
-CARGO_CMD_PREFIX += CARGO_TERM_PROGRESS_WHEN=never CI=1
-
 #
 # Top level Makefile to build Zenith and PostgreSQL
 #
-.PHONY: all
 all: zenith postgres

+# We don't want to run 'cargo build' in parallel with the postgres build,
+# because interleaving cargo build output with postgres build output looks
+# confusing. Also, 'cargo build' is parallel on its own, so it would be too
+# much parallelism. (Recursive invocation of postgres target still gets any
+# '-j' flag from the command line, so 'make -j' is still useful.)
+.NOTPARALLEL:
+
 ### Zenith Rust bits
 #
 # The 'postgres_ffi' depends on the Postgres headers.
 .PHONY: zenith
 zenith: postgres-headers
-	+@echo "Compiling Zenith"
-	$(CARGO_CMD_PREFIX) cargo build $(CARGO_BUILD_FLAGS)
+	cargo build

 ### PostgreSQL parts
 tmp_install/build/config.status:
 	+@echo "Configuring postgres build"
 	mkdir -p tmp_install/build
 	(cd tmp_install/build && \
-	../../vendor/postgres/configure CFLAGS='$(PG_CFLAGS)' \
-		$(PG_CONFIGURE_OPTS) \
+	../../vendor/postgres/configure CFLAGS='-O0 -g3 $(CFLAGS)' \
+		--enable-cassert \
+		--enable-debug \
+		--enable-depend \
 		$(SECCOMP) \
 		--prefix=$(abspath tmp_install) > configure.log)

@@ -68,10 +47,10 @@ postgres-headers: postgres-configure
 	+@echo "Installing PostgreSQL headers"
 	$(MAKE) -C tmp_install/build/src/include MAKELEVEL=0 install

+
 # Compile and install PostgreSQL and contrib/zenith
 .PHONY: postgres
-postgres: postgres-configure \
-		  postgres-headers # to prevent `make install` conflicts with zenith's `postgres-headers`
+postgres: postgres-configure
 	+@echo "Compiling PostgreSQL"
 	$(MAKE) -C tmp_install/build MAKELEVEL=0 install
 	+@echo "Compiling contrib/zenith"
@@ -79,21 +58,18 @@ postgres: postgres-configure \
 	+@echo "Compiling contrib/zenith_test_utils"
 	$(MAKE) -C tmp_install/build/contrib/zenith_test_utils install

-.PHONY: postgres-clean
 postgres-clean:
 	$(MAKE) -C tmp_install/build MAKELEVEL=0 clean

 # This doesn't remove the effects of 'configure'.
-.PHONY: clean
 clean:
-	cd tmp_install/build && $(MAKE) clean
-	$(CARGO_CMD_PREFIX) cargo clean
+	cd tmp_install/build && ${MAKE} clean
+	cargo clean

 # This removes everything
-.PHONY: distclean
 distclean:
 	rm -rf tmp_install
-	$(CARGO_CMD_PREFIX) cargo clean
+	cargo clean

 .PHONY: fmt
 fmt:
--- a/30
+++ b/30
@@ -1,30 +0,0 @@
-[[source]]
-url = "https://pypi.python.org/simple"
-verify_ssl = true
-name = "pypi"
-
-[packages]
-pytest = ">=6.0.0"
-typing-extensions = "*"
-pyjwt = {extras = ["crypto"], version = "*"}
-requests = "*"
-pytest-xdist = "*"
-asyncpg = "*"
-cached-property = "*"
-psycopg2-binary = "*"
-jinja2 = "*"
-
-[dev-packages]
-# Behavior may change slightly between versions. These are run continuously,
-# so we pin exact versions to avoid suprising breaks. Update if comfortable.
-yapf = "==0.31.0"
-mypy = "==0.910"
-# Non-pinned packages follow.
-pipenv = "*"
-flake8 = "*"
-types-requests = "*"
-types-psycopg2 = "*"
-
-[requires]
-# we need at least 3.7, but pipenv doesn't allow to say this directly
-python_version = "3"
--- a/1
+++ b/1
@@ -0,0 +1 @@
+./test_runner/Pipfile
--- a/Pipfile.lock
+++ b/Pipfile.lock
@@ -1,652 +0,0 @@
-{
-    "_meta": {
-        "hash": {
-            "sha256": "c309cb963a7b07ae3d30e9cbf08b495f77bdecc0e5356fc89d133c4fbcb65b2b"
-        },
-        "pipfile-spec": 6,
-        "requires": {
-            "python_version": "3"
-        },
-        "sources": [
-            {
-                "name": "pypi",
-                "url": "https://pypi.python.org/simple",
-                "verify_ssl": true
-            }
-        ]
-    },
-    "default": {
-        "asyncpg": {
-            "hashes": [
-                "sha256:129d501f3d30616afd51eb8d3142ef51ba05374256bd5834cec3ef4956a9b317",
-                "sha256:29ef6ae0a617fc13cc2ac5dc8e9b367bb83cba220614b437af9b67766f4b6b20",
-                "sha256:41704c561d354bef01353835a7846e5606faabbeb846214dfcf666cf53319f18",
-                "sha256:556b0e92e2b75dc028b3c4bc9bd5162ddf0053b856437cf1f04c97f9c6837d03",
-                "sha256:8ff5073d4b654e34bd5eaadc01dc4d68b8a9609084d835acd364cd934190a08d",
-                "sha256:a458fc69051fbb67d995fdda46d75a012b5d6200f91e17d23d4751482640ed4c",
-                "sha256:a7095890c96ba36f9f668eb552bb020dddb44f8e73e932f8573efc613ee83843",
-                "sha256:a738f4807c853623d3f93f0fea11f61be6b0e5ca16ea8aeb42c2c7ee742aa853",
-                "sha256:c4fc0205fe4ddd5aeb3dfdc0f7bafd43411181e1f5650189608e5971cceacff1",
-                "sha256:dd2fa063c3344823487d9ddccb40802f02622ddf8bf8a6cc53885ee7a2c1c0c6",
-                "sha256:ddffcb85227bf39cd1bedd4603e0082b243cf3b14ced64dce506a15b05232b83",
-                "sha256:e36c6806883786b19551bb70a4882561f31135dc8105a59662e0376cf5b2cbc5",
-                "sha256:eed43abc6ccf1dc02e0d0efc06ce46a411362f3358847c6b0ec9a43426f91ece"
-            ],
-            "index": "pypi",
-            "version": "==0.24.0"
-        },
-        "attrs": {
-            "hashes": [
-                "sha256:149e90d6d8ac20db7a955ad60cf0e6881a3f20d37096140088356da6c716b0b1",
-                "sha256:ef6aaac3ca6cd92904cdd0d83f629a15f18053ec84e6432106f7a4d04ae4f5fb"
-            ],
-            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
-            "version": "==21.2.0"
-        },
-        "cached-property": {
-            "hashes": [
-                "sha256:9fa5755838eecbb2d234c3aa390bd80fbd3ac6b6869109bfc1b499f7bd89a130",
-                "sha256:df4f613cf7ad9a588cc381aaf4a512d26265ecebd5eb9e1ba12f1319eb85a6a0"
-            ],
-            "index": "pypi",
-            "version": "==1.5.2"
-        },
-        "certifi": {
-            "hashes": [
-                "sha256:78884e7c1d4b00ce3cea67b44566851c4343c120abd683433ce934a68ea58872",
-                "sha256:d62a0163eb4c2344ac042ab2bdf75399a71a2d8c7d47eac2e2ee91b9d6339569"
-            ],
-            "version": "==2021.10.8"
-        },
-        "cffi": {
-            "hashes": [
-                "sha256:00c878c90cb53ccfaae6b8bc18ad05d2036553e6d9d1d9dbcf323bbe83854ca3",
-                "sha256:0104fb5ae2391d46a4cb082abdd5c69ea4eab79d8d44eaaf79f1b1fd806ee4c2",
-                "sha256:06c48159c1abed75c2e721b1715c379fa3200c7784271b3c46df01383b593636",
-                "sha256:0808014eb713677ec1292301ea4c81ad277b6cdf2fdd90fd540af98c0b101d20",
-                "sha256:10dffb601ccfb65262a27233ac273d552ddc4d8ae1bf93b21c94b8511bffe728",
-                "sha256:14cd121ea63ecdae71efa69c15c5543a4b5fbcd0bbe2aad864baca0063cecf27",
-                "sha256:17771976e82e9f94976180f76468546834d22a7cc404b17c22df2a2c81db0c66",
-                "sha256:181dee03b1170ff1969489acf1c26533710231c58f95534e3edac87fff06c443",
-                "sha256:23cfe892bd5dd8941608f93348c0737e369e51c100d03718f108bf1add7bd6d0",
-                "sha256:263cc3d821c4ab2213cbe8cd8b355a7f72a8324577dc865ef98487c1aeee2bc7",
-                "sha256:2756c88cbb94231c7a147402476be2c4df2f6078099a6f4a480d239a8817ae39",
-                "sha256:27c219baf94952ae9d50ec19651a687b826792055353d07648a5695413e0c605",
-                "sha256:2a23af14f408d53d5e6cd4e3d9a24ff9e05906ad574822a10563efcef137979a",
-                "sha256:31fb708d9d7c3f49a60f04cf5b119aeefe5644daba1cd2a0fe389b674fd1de37",
-                "sha256:3415c89f9204ee60cd09b235810be700e993e343a408693e80ce7f6a40108029",
-                "sha256:3773c4d81e6e818df2efbc7dd77325ca0dcb688116050fb2b3011218eda36139",
-                "sha256:3b96a311ac60a3f6be21d2572e46ce67f09abcf4d09344c49274eb9e0bf345fc",
-                "sha256:3f7d084648d77af029acb79a0ff49a0ad7e9d09057a9bf46596dac9514dc07df",
-                "sha256:41d45de54cd277a7878919867c0f08b0cf817605e4eb94093e7516505d3c8d14",
-                "sha256:4238e6dab5d6a8ba812de994bbb0a79bddbdf80994e4ce802b6f6f3142fcc880",
-                "sha256:45db3a33139e9c8f7c09234b5784a5e33d31fd6907800b316decad50af323ff2",
-                "sha256:45e8636704eacc432a206ac7345a5d3d2c62d95a507ec70d62f23cd91770482a",
-                "sha256:4958391dbd6249d7ad855b9ca88fae690783a6be9e86df65865058ed81fc860e",
-                "sha256:4a306fa632e8f0928956a41fa8e1d6243c71e7eb59ffbd165fc0b41e316b2474",
-                "sha256:57e9ac9ccc3101fac9d6014fba037473e4358ef4e89f8e181f8951a2c0162024",
-                "sha256:59888172256cac5629e60e72e86598027aca6bf01fa2465bdb676d37636573e8",
-                "sha256:5e069f72d497312b24fcc02073d70cb989045d1c91cbd53979366077959933e0",
-                "sha256:64d4ec9f448dfe041705426000cc13e34e6e5bb13736e9fd62e34a0b0c41566e",
-                "sha256:6dc2737a3674b3e344847c8686cf29e500584ccad76204efea14f451d4cc669a",
-                "sha256:74fdfdbfdc48d3f47148976f49fab3251e550a8720bebc99bf1483f5bfb5db3e",
-                "sha256:75e4024375654472cc27e91cbe9eaa08567f7fbdf822638be2814ce059f58032",
-                "sha256:786902fb9ba7433aae840e0ed609f45c7bcd4e225ebb9c753aa39725bb3e6ad6",
-                "sha256:8b6c2ea03845c9f501ed1313e78de148cd3f6cad741a75d43a29b43da27f2e1e",
-                "sha256:91d77d2a782be4274da750752bb1650a97bfd8f291022b379bb8e01c66b4e96b",
-                "sha256:91ec59c33514b7c7559a6acda53bbfe1b283949c34fe7440bcf917f96ac0723e",
-                "sha256:920f0d66a896c2d99f0adbb391f990a84091179542c205fa53ce5787aff87954",
-                "sha256:a5263e363c27b653a90078143adb3d076c1a748ec9ecc78ea2fb916f9b861962",
-                "sha256:abb9a20a72ac4e0fdb50dae135ba5e77880518e742077ced47eb1499e29a443c",
-                "sha256:c2051981a968d7de9dd2d7b87bcb9c939c74a34626a6e2f8181455dd49ed69e4",
-                "sha256:c21c9e3896c23007803a875460fb786118f0cdd4434359577ea25eb556e34c55",
-                "sha256:c2502a1a03b6312837279c8c1bd3ebedf6c12c4228ddbad40912d671ccc8a962",
-                "sha256:d4d692a89c5cf08a8557fdeb329b82e7bf609aadfaed6c0d79f5a449a3c7c023",
-                "sha256:da5db4e883f1ce37f55c667e5c0de439df76ac4cb55964655906306918e7363c",
-                "sha256:e7022a66d9b55e93e1a845d8c9eba2a1bebd4966cd8bfc25d9cd07d515b33fa6",
-                "sha256:ef1f279350da2c586a69d32fc8733092fd32cc8ac95139a00377841f59a3f8d8",
-                "sha256:f54a64f8b0c8ff0b64d18aa76675262e1700f3995182267998c31ae974fbc382",
-                "sha256:f5c7150ad32ba43a07c4479f40241756145a1f03b43480e058cfd862bf5041c7",
-                "sha256:f6f824dc3bce0edab5f427efcfb1d63ee75b6fcb7282900ccaf925be84efb0fc",
-                "sha256:fd8a250edc26254fe5b33be00402e6d287f562b6a5b2152dec302fa15bb3e997",
-                "sha256:ffaa5c925128e29efbde7301d8ecaf35c8c60ffbcd6a1ffd3a552177c8e5e796"
-            ],
-            "version": "==1.15.0"
-        },
-        "charset-normalizer": {
-            "hashes": [
-                "sha256:e019de665e2bcf9c2b64e2e5aa025fa991da8720daa3c1138cadd2fd1856aed0",
-                "sha256:f7af805c321bfa1ce6714c51f254e0d5bb5e5834039bc17db7ebe3a4cec9492b"
-            ],
-            "markers": "python_version >= '3'",
-            "version": "==2.0.7"
-        },
-        "cryptography": {
-            "hashes": [
-                "sha256:07bb7fbfb5de0980590ddfc7f13081520def06dc9ed214000ad4372fb4e3c7f6",
-                "sha256:18d90f4711bf63e2fb21e8c8e51ed8189438e6b35a6d996201ebd98a26abbbe6",
-                "sha256:1ed82abf16df40a60942a8c211251ae72858b25b7421ce2497c2eb7a1cee817c",
-                "sha256:22a38e96118a4ce3b97509443feace1d1011d0571fae81fc3ad35f25ba3ea999",
-                "sha256:2d69645f535f4b2c722cfb07a8eab916265545b3475fdb34e0be2f4ee8b0b15e",
-                "sha256:4a2d0e0acc20ede0f06ef7aa58546eee96d2592c00f450c9acb89c5879b61992",
-                "sha256:54b2605e5475944e2213258e0ab8696f4f357a31371e538ef21e8d61c843c28d",
-                "sha256:7075b304cd567694dc692ffc9747f3e9cb393cc4aa4fb7b9f3abd6f5c4e43588",
-                "sha256:7b7ceeff114c31f285528ba8b390d3e9cfa2da17b56f11d366769a807f17cbaa",
-                "sha256:7eba2cebca600a7806b893cb1d541a6e910afa87e97acf2021a22b32da1df52d",
-                "sha256:928185a6d1ccdb816e883f56ebe92e975a262d31cc536429041921f8cb5a62fd",
-                "sha256:9933f28f70d0517686bd7de36166dda42094eac49415459d9bdf5e7df3e0086d",
-                "sha256:a688ebcd08250eab5bb5bca318cc05a8c66de5e4171a65ca51db6bd753ff8953",
-                "sha256:abb5a361d2585bb95012a19ed9b2c8f412c5d723a9836418fab7aaa0243e67d2",
-                "sha256:c10c797ac89c746e488d2ee92bd4abd593615694ee17b2500578b63cad6b93a8",
-                "sha256:ced40344e811d6abba00295ced98c01aecf0c2de39481792d87af4fa58b7b4d6",
-                "sha256:d57e0cdc1b44b6cdf8af1d01807db06886f10177469312fbde8f44ccbb284bc9",
-                "sha256:d99915d6ab265c22873f1b4d6ea5ef462ef797b4140be4c9d8b179915e0985c6",
-                "sha256:eb80e8a1f91e4b7ef8b33041591e6d89b2b8e122d787e87eeb2b08da71bb16ad",
-                "sha256:ebeddd119f526bcf323a89f853afb12e225902a24d29b55fe18dd6fcb2838a76"
-            ],
-            "version": "==35.0.0"
-        },
-        "execnet": {
-            "hashes": [
-                "sha256:8f694f3ba9cc92cab508b152dcfe322153975c29bda272e2fd7f3f00f36e47c5",
-                "sha256:a295f7cc774947aac58dde7fdc85f4aa00c42adf5d8f5468fc630c1acf30a142"
-            ],
-            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
-            "version": "==1.9.0"
-        },
-        "idna": {
-            "hashes": [
-                "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff",
-                "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"
-            ],
-            "markers": "python_version >= '3'",
-            "version": "==3.3"
-        },
-        "importlib-metadata": {
-            "hashes": [
-                "sha256:b618b6d2d5ffa2f16add5697cf57a46c76a56229b0ed1c438322e4e95645bd15",
-                "sha256:f284b3e11256ad1e5d03ab86bb2ccd6f5339688ff17a4d797a0fe7df326f23b1"
-            ],
-            "markers": "python_version < '3.8'",
-            "version": "==4.8.1"
-        },
-        "iniconfig": {
-            "hashes": [
-                "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3",
-                "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"
-            ],
-            "version": "==1.1.1"
-        },
-        "jinja2": {
-            "hashes": [
-                "sha256:827a0e32839ab1600d4eb1c4c33ec5a8edfbc5cb42dafa13b81f182f97784b45",
-                "sha256:8569982d3f0889eed11dd620c706d39b60c36d6d25843961f33f77fb6bc6b20c"
-            ],
-            "index": "pypi",
-            "version": "==3.0.2"
-        },
-        "markupsafe": {
-            "hashes": [
-                "sha256:01a9b8ea66f1658938f65b93a85ebe8bc016e6769611be228d797c9d998dd298",
-                "sha256:023cb26ec21ece8dc3907c0e8320058b2e0cb3c55cf9564da612bc325bed5e64",
-                "sha256:0446679737af14f45767963a1a9ef7620189912317d095f2d9ffa183a4d25d2b",
-                "sha256:04635854b943835a6ea959e948d19dcd311762c5c0c6e1f0e16ee57022669194",
-                "sha256:0717a7390a68be14b8c793ba258e075c6f4ca819f15edfc2a3a027c823718567",
-                "sha256:0955295dd5eec6cb6cc2fe1698f4c6d84af2e92de33fbcac4111913cd100a6ff",
-                "sha256:0d4b31cc67ab36e3392bbf3862cfbadac3db12bdd8b02a2731f509ed5b829724",
-                "sha256:10f82115e21dc0dfec9ab5c0223652f7197feb168c940f3ef61563fc2d6beb74",
-                "sha256:168cd0a3642de83558a5153c8bd34f175a9a6e7f6dc6384b9655d2697312a646",
-                "sha256:1d609f577dc6e1aa17d746f8bd3c31aa4d258f4070d61b2aa5c4166c1539de35",
-                "sha256:1f2ade76b9903f39aa442b4aadd2177decb66525062db244b35d71d0ee8599b6",
-                "sha256:20dca64a3ef2d6e4d5d615a3fd418ad3bde77a47ec8a23d984a12b5b4c74491a",
-                "sha256:2a7d351cbd8cfeb19ca00de495e224dea7e7d919659c2841bbb7f420ad03e2d6",
-                "sha256:2d7d807855b419fc2ed3e631034685db6079889a1f01d5d9dac950f764da3dad",
-                "sha256:2ef54abee730b502252bcdf31b10dacb0a416229b72c18b19e24a4509f273d26",
-                "sha256:36bc903cbb393720fad60fc28c10de6acf10dc6cc883f3e24ee4012371399a38",
-                "sha256:37205cac2a79194e3750b0af2a5720d95f786a55ce7df90c3af697bfa100eaac",
-                "sha256:3c112550557578c26af18a1ccc9e090bfe03832ae994343cfdacd287db6a6ae7",
-                "sha256:3dd007d54ee88b46be476e293f48c85048603f5f516008bee124ddd891398ed6",
-                "sha256:4296f2b1ce8c86a6aea78613c34bb1a672ea0e3de9c6ba08a960efe0b0a09047",
-                "sha256:47ab1e7b91c098ab893b828deafa1203de86d0bc6ab587b160f78fe6c4011f75",
-                "sha256:49e3ceeabbfb9d66c3aef5af3a60cc43b85c33df25ce03d0031a608b0a8b2e3f",
-                "sha256:4dc8f9fb58f7364b63fd9f85013b780ef83c11857ae79f2feda41e270468dd9b",
-                "sha256:4efca8f86c54b22348a5467704e3fec767b2db12fc39c6d963168ab1d3fc9135",
-                "sha256:53edb4da6925ad13c07b6d26c2a852bd81e364f95301c66e930ab2aef5b5ddd8",
-                "sha256:5855f8438a7d1d458206a2466bf82b0f104a3724bf96a1c781ab731e4201731a",
-                "sha256:594c67807fb16238b30c44bdf74f36c02cdf22d1c8cda91ef8a0ed8dabf5620a",
-                "sha256:5b6d930f030f8ed98e3e6c98ffa0652bdb82601e7a016ec2ab5d7ff23baa78d1",
-                "sha256:5bb28c636d87e840583ee3adeb78172efc47c8b26127267f54a9c0ec251d41a9",
-                "sha256:60bf42e36abfaf9aff1f50f52644b336d4f0a3fd6d8a60ca0d054ac9f713a864",
-                "sha256:611d1ad9a4288cf3e3c16014564df047fe08410e628f89805e475368bd304914",
-                "sha256:6300b8454aa6930a24b9618fbb54b5a68135092bc666f7b06901f897fa5c2fee",
-                "sha256:63f3268ba69ace99cab4e3e3b5840b03340efed0948ab8f78d2fd87ee5442a4f",
-                "sha256:6557b31b5e2c9ddf0de32a691f2312a32f77cd7681d8af66c2692efdbef84c18",
-                "sha256:693ce3f9e70a6cf7d2fb9e6c9d8b204b6b39897a2c4a1aa65728d5ac97dcc1d8",
-                "sha256:6a7fae0dd14cf60ad5ff42baa2e95727c3d81ded453457771d02b7d2b3f9c0c2",
-                "sha256:6c4ca60fa24e85fe25b912b01e62cb969d69a23a5d5867682dd3e80b5b02581d",
-                "sha256:6fcf051089389abe060c9cd7caa212c707e58153afa2c649f00346ce6d260f1b",
-                "sha256:7d91275b0245b1da4d4cfa07e0faedd5b0812efc15b702576d103293e252af1b",
-                "sha256:89c687013cb1cd489a0f0ac24febe8c7a666e6e221b783e53ac50ebf68e45d86",
-                "sha256:8d206346619592c6200148b01a2142798c989edcb9c896f9ac9722a99d4e77e6",
-                "sha256:905fec760bd2fa1388bb5b489ee8ee5f7291d692638ea5f67982d968366bef9f",
-                "sha256:97383d78eb34da7e1fa37dd273c20ad4320929af65d156e35a5e2d89566d9dfb",
-                "sha256:984d76483eb32f1bcb536dc27e4ad56bba4baa70be32fa87152832cdd9db0833",
-                "sha256:99df47edb6bda1249d3e80fdabb1dab8c08ef3975f69aed437cb69d0a5de1e28",
-                "sha256:9f02365d4e99430a12647f09b6cc8bab61a6564363f313126f775eb4f6ef798e",
-                "sha256:a30e67a65b53ea0a5e62fe23682cfe22712e01f453b95233b25502f7c61cb415",
-                "sha256:ab3ef638ace319fa26553db0624c4699e31a28bb2a835c5faca8f8acf6a5a902",
-                "sha256:aca6377c0cb8a8253e493c6b451565ac77e98c2951c45f913e0b52facdcff83f",
-                "sha256:add36cb2dbb8b736611303cd3bfcee00afd96471b09cda130da3581cbdc56a6d",
-                "sha256:b2f4bf27480f5e5e8ce285a8c8fd176c0b03e93dcc6646477d4630e83440c6a9",
-                "sha256:b7f2d075102dc8c794cbde1947378051c4e5180d52d276987b8d28a3bd58c17d",
-                "sha256:baa1a4e8f868845af802979fcdbf0bb11f94f1cb7ced4c4b8a351bb60d108145",
-                "sha256:be98f628055368795d818ebf93da628541e10b75b41c559fdf36d104c5787066",
-                "sha256:bf5d821ffabf0ef3533c39c518f3357b171a1651c1ff6827325e4489b0e46c3c",
-                "sha256:c47adbc92fc1bb2b3274c4b3a43ae0e4573d9fbff4f54cd484555edbf030baf1",
-                "sha256:cdfba22ea2f0029c9261a4bd07e830a8da012291fbe44dc794e488b6c9bb353a",
-                "sha256:d6c7ebd4e944c85e2c3421e612a7057a2f48d478d79e61800d81468a8d842207",
-                "sha256:d7f9850398e85aba693bb640262d3611788b1f29a79f0c93c565694658f4071f",
-                "sha256:d8446c54dc28c01e5a2dbac5a25f071f6653e6e40f3a8818e8b45d790fe6ef53",
-                "sha256:deb993cacb280823246a026e3b2d81c493c53de6acfd5e6bfe31ab3402bb37dd",
-                "sha256:e0f138900af21926a02425cf736db95be9f4af72ba1bb21453432a07f6082134",
-                "sha256:e9936f0b261d4df76ad22f8fee3ae83b60d7c3e871292cd42f40b81b70afae85",
-                "sha256:f0567c4dc99f264f49fe27da5f735f414c4e7e7dd850cfd8e69f0862d7c74ea9",
-                "sha256:f5653a225f31e113b152e56f154ccbe59eeb1c7487b39b9d9f9cdb58e6c79dc5",
-                "sha256:f826e31d18b516f653fe296d967d700fddad5901ae07c622bb3705955e1faa94",
-                "sha256:f8ba0e8349a38d3001fae7eadded3f6606f0da5d748ee53cc1dab1d6527b9509",
-                "sha256:f9081981fe268bd86831e5c75f7de206ef275defcb82bc70740ae6dc507aee51",
-                "sha256:fa130dd50c57d53368c9d59395cb5526eda596d3ffe36666cd81a44d56e48872"
-            ],
-            "markers": "python_version >= '3.6'",
-            "version": "==2.0.1"
-        },
-        "packaging": {
-            "hashes": [
-                "sha256:096d689d78ca690e4cd8a89568ba06d07ca097e3306a4381635073ca91479966",
-                "sha256:14317396d1e8cdb122989b916fa2c7e9ca8e2be9e8060a6eff75b6b7b4d8a7e0"
-            ],
-            "markers": "python_version >= '3.6'",
-            "version": "==21.2"
-        },
-        "pluggy": {
-            "hashes": [
-                "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159",
-                "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"
-            ],
-            "markers": "python_version >= '3.6'",
-            "version": "==1.0.0"
-        },
-        "psycopg2-binary": {
-            "hashes": [
-                "sha256:0b7dae87f0b729922e06f85f667de7bf16455d411971b2043bbd9577af9d1975",
-                "sha256:0f2e04bd2a2ab54fa44ee67fe2d002bb90cee1c0f1cc0ebc3148af7b02034cbd",
-                "sha256:123c3fb684e9abfc47218d3784c7b4c47c8587951ea4dd5bc38b6636ac57f616",
-                "sha256:1473c0215b0613dd938db54a653f68251a45a78b05f6fc21af4326f40e8360a2",
-                "sha256:14db1752acdd2187d99cb2ca0a1a6dfe57fc65c3281e0f20e597aac8d2a5bd90",
-                "sha256:1e3a362790edc0a365385b1ac4cc0acc429a0c0d662d829a50b6ce743ae61b5a",
-                "sha256:1e85b74cbbb3056e3656f1cc4781294df03383127a8114cbc6531e8b8367bf1e",
-                "sha256:20f1ab44d8c352074e2d7ca67dc00843067788791be373e67a0911998787ce7d",
-                "sha256:24b0b6688b9f31a911f2361fe818492650795c9e5d3a1bc647acbd7440142a4f",
-                "sha256:2f62c207d1740b0bde5c4e949f857b044818f734a3d57f1d0d0edc65050532ed",
-                "sha256:3242b9619de955ab44581a03a64bdd7d5e470cc4183e8fcadd85ab9d3756ce7a",
-                "sha256:35c4310f8febe41f442d3c65066ca93cccefd75013df3d8c736c5b93ec288140",
-                "sha256:4235f9d5ddcab0b8dbd723dca56ea2922b485ea00e1dafacf33b0c7e840b3d32",
-                "sha256:542875f62bc56e91c6eac05a0deadeae20e1730be4c6334d8f04c944fcd99759",
-                "sha256:5ced67f1e34e1a450cdb48eb53ca73b60aa0af21c46b9b35ac3e581cf9f00e31",
-                "sha256:661509f51531ec125e52357a489ea3806640d0ca37d9dada461ffc69ee1e7b6e",
-                "sha256:7360647ea04db2e7dff1648d1da825c8cf68dc5fbd80b8fb5b3ee9f068dcd21a",
-                "sha256:736b8797b58febabb85494142c627bd182b50d2a7ec65322983e71065ad3034c",
-                "sha256:8c13d72ed6af7fd2c8acbd95661cf9477f94e381fce0792c04981a8283b52917",
-                "sha256:988b47ac70d204aed01589ed342303da7c4d84b56c2f4c4b8b00deda123372bf",
-                "sha256:995fc41ebda5a7a663a254a1dcac52638c3e847f48307b5416ee373da15075d7",
-                "sha256:a36c7eb6152ba5467fb264d73844877be8b0847874d4822b7cf2d3c0cb8cdcb0",
-                "sha256:aed4a9a7e3221b3e252c39d0bf794c438dc5453bc2963e8befe9d4cd324dff72",
-                "sha256:aef9aee84ec78af51107181d02fe8773b100b01c5dfde351184ad9223eab3698",
-                "sha256:b0221ca5a9837e040ebf61f48899926b5783668b7807419e4adae8175a31f773",
-                "sha256:b4d7679a08fea64573c969f6994a2631908bb2c0e69a7235648642f3d2e39a68",
-                "sha256:c250a7ec489b652c892e4f0a5d122cc14c3780f9f643e1a326754aedf82d9a76",
-                "sha256:ca86db5b561b894f9e5f115d6a159fff2a2570a652e07889d8a383b5fae66eb4",
-                "sha256:cfc523edecddaef56f6740d7de1ce24a2fdf94fd5e704091856a201872e37f9f",
-                "sha256:d92272c7c16e105788efe2cfa5d680f07e34e0c29b03c1908f8636f55d5f915a",
-                "sha256:da113b70f6ec40e7d81b43d1b139b9db6a05727ab8be1ee559f3a69854a69d34",
-                "sha256:f6fac64a38f6768e7bc7b035b9e10d8a538a9fadce06b983fb3e6fa55ac5f5ce",
-                "sha256:f8559617b1fcf59a9aedba2c9838b5b6aa211ffedecabca412b92a1ff75aac1a",
-                "sha256:fbb42a541b1093385a2d8c7eec94d26d30437d0e77c1d25dae1dcc46741a385e"
-            ],
-            "index": "pypi",
-            "version": "==2.9.1"
-        },
-        "py": {
-            "hashes": [
-                "sha256:21b81bda15b66ef5e1a777a21c4dcd9c20ad3efd0b3f817e7a809035269e1bd3",
-                "sha256:3b80836aa6d1feeaa108e046da6423ab8f6ceda6468545ae8d02d9d58d18818a"
-            ],
-            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
-            "version": "==1.10.0"
-        },
-        "pycparser": {
-            "hashes": [
-                "sha256:2d475327684562c3a96cc71adf7dc8c4f0565175cf86b6d7a404ff4c771f15f0",
-                "sha256:7582ad22678f0fcd81102833f60ef8d0e57288b6b5fb00323d101be910e35705"
-            ],
-            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
-            "version": "==2.20"
-        },
-        "pyjwt": {
-            "extras": [
-                "crypto"
-            ],
-            "hashes": [
-                "sha256:b888b4d56f06f6dcd777210c334e69c737be74755d3e5e9ee3fe67dc18a0ee41",
-                "sha256:e0c4bb8d9f0af0c7f5b1ec4c5036309617d03d56932877f2f7a0beeb5318322f"
-            ],
-            "index": "pypi",
-            "version": "==2.3.0"
-        },
-        "pyparsing": {
-            "hashes": [
-                "sha256:c203ec8783bf771a155b207279b9bccb8dea02d8f0c9e5f8ead507bc3246ecc1",
-                "sha256:ef9d7589ef3c200abe66653d3f1ab1033c3c419ae9b9bdb1240a85b024efc88b"
-            ],
-            "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
-            "version": "==2.4.7"
-        },
-        "pytest": {
-            "hashes": [
-                "sha256:131b36680866a76e6781d13f101efb86cf674ebb9762eb70d3082b6f29889e89",
-                "sha256:7310f8d27bc79ced999e760ca304d69f6ba6c6649c0b60fb0e04a4a77cacc134"
-            ],
-            "index": "pypi",
-            "version": "==6.2.5"
-        },
-        "pytest-forked": {
-            "hashes": [
-                "sha256:6aa9ac7e00ad1a539c41bec6d21011332de671e938c7637378ec9710204e37ca",
-                "sha256:dc4147784048e70ef5d437951728825a131b81714b398d5d52f17c7c144d8815"
-            ],
-            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
-            "version": "==1.3.0"
-        },
-        "pytest-xdist": {
-            "hashes": [
-                "sha256:7b61ebb46997a0820a263553179d6d1e25a8c50d8a8620cd1aa1e20e3be99168",
-                "sha256:89b330316f7fc475f999c81b577c2b926c9569f3d397ae432c0c2e2496d61ff9"
-            ],
-            "index": "pypi",
-            "version": "==2.4.0"
-        },
-        "requests": {
-            "hashes": [
-                "sha256:6c1246513ecd5ecd4528a0906f910e8f0f9c6b8ec72030dc9fd154dc1a6efd24",
-                "sha256:b8aa58f8cf793ffd8782d3d8cb19e66ef36f7aba4353eec859e74678b01b07a7"
-            ],
-            "index": "pypi",
-            "version": "==2.26.0"
-        },
-        "toml": {
-            "hashes": [
-                "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b",
-                "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"
-            ],
-            "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
-            "version": "==0.10.2"
-        },
-        "typing-extensions": {
-            "hashes": [
-                "sha256:49f75d16ff11f1cd258e1b988ccff82a3ca5570217d7ad8c5f48205dd99a677e",
-                "sha256:d8226d10bc02a29bcc81df19a26e56a9647f8b0a6d4a83924139f4a8b01f17b7",
-                "sha256:f1d25edafde516b146ecd0613dabcc61409817af4766fbbcfb8d1ad4ec441a34"
-            ],
-            "index": "pypi",
-            "version": "==3.10.0.2"
-        },
-        "urllib3": {
-            "hashes": [
-                "sha256:4987c65554f7a2dbf30c18fd48778ef124af6fab771a377103da0585e2336ece",
-                "sha256:c4fdf4019605b6e5423637e01bc9fe4daef873709a7973e195ceba0a62bbc844"
-            ],
-            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'",
-            "version": "==1.26.7"
-        },
-        "zipp": {
-            "hashes": [
-                "sha256:71c644c5369f4a6e07636f0aa966270449561fcea2e3d6747b8d23efaa9d7832",
-                "sha256:9fe5ea21568a0a70e50f273397638d39b03353731e6cbbb3fd8502a33fec40bc"
-            ],
-            "markers": "python_version >= '3.6'",
-            "version": "==3.6.0"
-        }
-    },
-    "develop": {
-        "backports.entry-points-selectable": {
-            "hashes": [
-                "sha256:988468260ec1c196dab6ae1149260e2f5472c9110334e5d51adcb77867361f6a",
-                "sha256:a6d9a871cde5e15b4c4a53e3d43ba890cc6861ec1332c9c2428c92f977192acc"
-            ],
-            "markers": "python_version >= '2.7'",
-            "version": "==1.1.0"
-        },
-        "certifi": {
-            "hashes": [
-                "sha256:78884e7c1d4b00ce3cea67b44566851c4343c120abd683433ce934a68ea58872",
-                "sha256:d62a0163eb4c2344ac042ab2bdf75399a71a2d8c7d47eac2e2ee91b9d6339569"
-            ],
-            "version": "==2021.10.8"
-        },
-        "distlib": {
-            "hashes": [
-                "sha256:c8b54e8454e5bf6237cc84c20e8264c3e991e824ef27e8f1e81049867d861e31",
-                "sha256:d982d0751ff6eaaab5e2ec8e691d949ee80eddf01a62eaa96ddb11531fe16b05"
-            ],
-            "version": "==0.3.3"
-        },
-        "filelock": {
-            "hashes": [
-                "sha256:7afc856f74fa7006a289fd10fa840e1eebd8bbff6bffb69c26c54a0512ea8cf8",
-                "sha256:bb2a1c717df74c48a2d00ed625e5a66f8572a3a30baacb7657add1d7bac4097b"
-            ],
-            "markers": "python_version >= '3.6'",
-            "version": "==3.3.2"
-        },
-        "flake8": {
-            "hashes": [
-                "sha256:479b1304f72536a55948cb40a32dce8bb0ffe3501e26eaf292c7e60eb5e0428d",
-                "sha256:806e034dda44114815e23c16ef92f95c91e4c71100ff52813adf7132a6ad870d"
-            ],
-            "index": "pypi",
-            "version": "==4.0.1"
-        },
-        "importlib-metadata": {
-            "hashes": [
-                "sha256:b618b6d2d5ffa2f16add5697cf57a46c76a56229b0ed1c438322e4e95645bd15",
-                "sha256:f284b3e11256ad1e5d03ab86bb2ccd6f5339688ff17a4d797a0fe7df326f23b1"
-            ],
-            "markers": "python_version < '3.8'",
-            "version": "==4.8.1"
-        },
-        "mccabe": {
-            "hashes": [
-                "sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42",
-                "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"
-            ],
-            "version": "==0.6.1"
-        },
-        "mypy": {
-            "hashes": [
-                "sha256:088cd9c7904b4ad80bec811053272986611b84221835e079be5bcad029e79dd9",
-                "sha256:0aadfb2d3935988ec3815952e44058a3100499f5be5b28c34ac9d79f002a4a9a",
-                "sha256:119bed3832d961f3a880787bf621634ba042cb8dc850a7429f643508eeac97b9",
-                "sha256:1a85e280d4d217150ce8cb1a6dddffd14e753a4e0c3cf90baabb32cefa41b59e",
-                "sha256:3c4b8ca36877fc75339253721f69603a9c7fdb5d4d5a95a1a1b899d8b86a4de2",
-                "sha256:3e382b29f8e0ccf19a2df2b29a167591245df90c0b5a2542249873b5c1d78212",
-                "sha256:42c266ced41b65ed40a282c575705325fa7991af370036d3f134518336636f5b",
-                "sha256:53fd2eb27a8ee2892614370896956af2ff61254c275aaee4c230ae771cadd885",
-                "sha256:704098302473cb31a218f1775a873b376b30b4c18229421e9e9dc8916fd16150",
-                "sha256:7df1ead20c81371ccd6091fa3e2878559b5c4d4caadaf1a484cf88d93ca06703",
-                "sha256:866c41f28cee548475f146aa4d39a51cf3b6a84246969f3759cb3e9c742fc072",
-                "sha256:a155d80ea6cee511a3694b108c4494a39f42de11ee4e61e72bc424c490e46457",
-                "sha256:adaeee09bfde366d2c13fe6093a7df5df83c9a2ba98638c7d76b010694db760e",
-                "sha256:b6fb13123aeef4a3abbcfd7e71773ff3ff1526a7d3dc538f3929a49b42be03f0",
-                "sha256:b94e4b785e304a04ea0828759172a15add27088520dc7e49ceade7834275bedb",
-                "sha256:c0df2d30ed496a08de5daed2a9ea807d07c21ae0ab23acf541ab88c24b26ab97",
-                "sha256:c6c2602dffb74867498f86e6129fd52a2770c48b7cd3ece77ada4fa38f94eba8",
-                "sha256:ceb6e0a6e27fb364fb3853389607cf7eb3a126ad335790fa1e14ed02fba50811",
-                "sha256:d9dd839eb0dc1bbe866a288ba3c1afc33a202015d2ad83b31e875b5905a079b6",
-                "sha256:e4dab234478e3bd3ce83bac4193b2ecd9cf94e720ddd95ce69840273bf44f6de",
-                "sha256:ec4e0cd079db280b6bdabdc807047ff3e199f334050db5cbb91ba3e959a67504",
-                "sha256:ecd2c3fe726758037234c93df7e98deb257fd15c24c9180dacf1ef829da5f921",
-                "sha256:ef565033fa5a958e62796867b1df10c40263ea9ded87164d67572834e57a174d"
-            ],
-            "index": "pypi",
-            "version": "==0.910"
-        },
-        "mypy-extensions": {
-            "hashes": [
-                "sha256:090fedd75945a69ae91ce1303b5824f428daf5a028d2f6ab8a299250a846f15d",
-                "sha256:2d82818f5bb3e369420cb3c4060a7970edba416647068eb4c5343488a6c604a8"
-            ],
-            "version": "==0.4.3"
-        },
-        "pipenv": {
-            "hashes": [
-                "sha256:05958fadcd70b2de6a27542fcd2bd72dd5c59c6d35307fdac3e06361fb06e30e",
-                "sha256:d180f5be4775c552fd5e69ae18a9d6099d9dafb462efe54f11c72cb5f4d5e977"
-            ],
-            "index": "pypi",
-            "version": "==2021.5.29"
-        },
-        "platformdirs": {
-            "hashes": [
-                "sha256:367a5e80b3d04d2428ffa76d33f124cf11e8fff2acdaa9b43d545f5c7d661ef2",
-                "sha256:8868bbe3c3c80d42f20156f22e7131d2fb321f5bc86a2a345375c6481a67021d"
-            ],
-            "markers": "python_version >= '3.6'",
-            "version": "==2.4.0"
-        },
-        "pycodestyle": {
-            "hashes": [
-                "sha256:720f8b39dde8b293825e7ff02c475f3077124006db4f440dcbc9a20b76548a20",
-                "sha256:eddd5847ef438ea1c7870ca7eb78a9d47ce0cdb4851a5523949f2601d0cbbe7f"
-            ],
-            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
-            "version": "==2.8.0"
-        },
-        "pyflakes": {
-            "hashes": [
-                "sha256:05a85c2872edf37a4ed30b0cce2f6093e1d0581f8c19d7393122da7e25b2b24c",
-                "sha256:3bb3a3f256f4b7968c9c788781e4ff07dce46bdf12339dcda61053375426ee2e"
-            ],
-            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
-            "version": "==2.4.0"
-        },
-        "six": {
-            "hashes": [
-                "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926",
-                "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"
-            ],
-            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
-            "version": "==1.16.0"
-        },
-        "toml": {
-            "hashes": [
-                "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b",
-                "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"
-            ],
-            "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
-            "version": "==0.10.2"
-        },
-        "typed-ast": {
-            "hashes": [
-                "sha256:01ae5f73431d21eead5015997ab41afa53aa1fbe252f9da060be5dad2c730ace",
-                "sha256:067a74454df670dcaa4e59349a2e5c81e567d8d65458d480a5b3dfecec08c5ff",
-                "sha256:0fb71b8c643187d7492c1f8352f2c15b4c4af3f6338f21681d3681b3dc31a266",
-                "sha256:1b3ead4a96c9101bef08f9f7d1217c096f31667617b58de957f690c92378b528",
-                "sha256:2068531575a125b87a41802130fa7e29f26c09a2833fea68d9a40cf33902eba6",
-                "sha256:209596a4ec71d990d71d5e0d312ac935d86930e6eecff6ccc7007fe54d703808",
-                "sha256:2c726c276d09fc5c414693a2de063f521052d9ea7c240ce553316f70656c84d4",
-                "sha256:398e44cd480f4d2b7ee8d98385ca104e35c81525dd98c519acff1b79bdaac363",
-                "sha256:52b1eb8c83f178ab787f3a4283f68258525f8d70f778a2f6dd54d3b5e5fb4341",
-                "sha256:5feca99c17af94057417d744607b82dd0a664fd5e4ca98061480fd8b14b18d04",
-                "sha256:7538e495704e2ccda9b234b82423a4038f324f3a10c43bc088a1636180f11a41",
-                "sha256:760ad187b1041a154f0e4d0f6aae3e40fdb51d6de16e5c99aedadd9246450e9e",
-                "sha256:777a26c84bea6cd934422ac2e3b78863a37017618b6e5c08f92ef69853e765d3",
-                "sha256:95431a26309a21874005845c21118c83991c63ea800dd44843e42a916aec5899",
-                "sha256:9ad2c92ec681e02baf81fdfa056fe0d818645efa9af1f1cd5fd6f1bd2bdfd805",
-                "sha256:9c6d1a54552b5330bc657b7ef0eae25d00ba7ffe85d9ea8ae6540d2197a3788c",
-                "sha256:aee0c1256be6c07bd3e1263ff920c325b59849dc95392a05f258bb9b259cf39c",
-                "sha256:af3d4a73793725138d6b334d9d247ce7e5f084d96284ed23f22ee626a7b88e39",
-                "sha256:b36b4f3920103a25e1d5d024d155c504080959582b928e91cb608a65c3a49e1a",
-                "sha256:b9574c6f03f685070d859e75c7f9eeca02d6933273b5e69572e5ff9d5e3931c3",
-                "sha256:bff6ad71c81b3bba8fa35f0f1921fb24ff4476235a6e94a26ada2e54370e6da7",
-                "sha256:c190f0899e9f9f8b6b7863debfb739abcb21a5c054f911ca3596d12b8a4c4c7f",
-                "sha256:c907f561b1e83e93fad565bac5ba9c22d96a54e7ea0267c708bffe863cbe4075",
-                "sha256:cae53c389825d3b46fb37538441f75d6aecc4174f615d048321b716df2757fb0",
-                "sha256:dd4a21253f42b8d2b48410cb31fe501d32f8b9fbeb1f55063ad102fe9c425e40",
-                "sha256:dde816ca9dac1d9c01dd504ea5967821606f02e510438120091b84e852367428",
-                "sha256:f2362f3cb0f3172c42938946dbc5b7843c2a28aec307c49100c8b38764eb6927",
-                "sha256:f328adcfebed9f11301eaedfa48e15bdece9b519fb27e6a8c01aa52a17ec31b3",
-                "sha256:f8afcf15cc511ada719a88e013cec87c11aff7b91f019295eb4530f96fe5ef2f",
-                "sha256:fb1bbeac803adea29cedd70781399c99138358c26d05fcbd23c13016b7f5ec65"
-            ],
-            "markers": "python_version < '3.8'",
-            "version": "==1.4.3"
-        },
-        "types-psycopg2": {
-            "hashes": [
-                "sha256:77ed80f2668582654623e04fb3d741ecce93effcc39c929d7e02f4a917a538ce",
-                "sha256:98a6e0e9580cd7eb4bd4d20f7c7063d154b2589a2b90c0ce4e3ca6085cde77c6"
-            ],
-            "index": "pypi",
-            "version": "==2.9.1"
-        },
-        "types-requests": {
-            "hashes": [
-                "sha256:b279284e51f668e38ee12d9665e4d789089f532dc2a0be4a1508ca0efd98ba9e",
-                "sha256:ba1d108d512e294b6080c37f6ae7cb2a2abf527560e2b671d1786c1fc46b541a"
-            ],
-            "index": "pypi",
-            "version": "==2.25.11"
-        },
-        "typing-extensions": {
-            "hashes": [
-                "sha256:49f75d16ff11f1cd258e1b988ccff82a3ca5570217d7ad8c5f48205dd99a677e",
-                "sha256:d8226d10bc02a29bcc81df19a26e56a9647f8b0a6d4a83924139f4a8b01f17b7",
-                "sha256:f1d25edafde516b146ecd0613dabcc61409817af4766fbbcfb8d1ad4ec441a34"
-            ],
-            "index": "pypi",
-            "version": "==3.10.0.2"
-        },
-        "virtualenv": {
-            "hashes": [
-                "sha256:4b02e52a624336eece99c96e3ab7111f469c24ba226a53ec474e8e787b365814",
-                "sha256:576d05b46eace16a9c348085f7d0dc8ef28713a2cabaa1cf0aea41e8f12c9218"
-            ],
-            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
-            "version": "==20.10.0"
-        },
-        "virtualenv-clone": {
-            "hashes": [
-                "sha256:418ee935c36152f8f153c79824bb93eaf6f0f7984bae31d3f48f350b9183501a",
-                "sha256:44d5263bceed0bac3e1424d64f798095233b64def1c5689afa43dc3223caf5b0"
-            ],
-            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
-            "version": "==0.5.7"
-        },
-        "yapf": {
-            "hashes": [
-                "sha256:408fb9a2b254c302f49db83c59f9aa0b4b0fd0ec25be3a5c51181327922ff63d",
-                "sha256:e3a234ba8455fe201eaa649cdac872d590089a18b661e39bbac7020978dd9c2e"
-            ],
-            "index": "pypi",
-            "version": "==0.31.0"
-        },
-        "zipp": {
-            "hashes": [
-                "sha256:71c644c5369f4a6e07636f0aa966270449561fcea2e3d6747b8d23efaa9d7832",
-                "sha256:9fe5ea21568a0a70e50f273397638d39b03353731e6cbbb3fd8502a33fec40bc"
-            ],
-            "markers": "python_version >= '3.6'",
-            "version": "==3.6.0"
-        }
-    }
-}
--- a/Pipfile.lock
+++ b/Pipfile.lock
@@ -0,0 +1 @@
+./test_runner/Pipfile.lock
--- a/README.md
+++ b/README.md
@@ -25,15 +25,15 @@ Pageserver consists of:
 On Ubuntu or Debian this set of packages should be sufficient to build the code:
 ```text
 apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \
-libssl-dev clang pkg-config libpq-dev
+libssl-dev clang
 ```

-[Rust] 1.55 or later is also required.
+[Rust] 1.52 or later is also required.

 To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `tmp_install/bin` and `tmp_install/lib`, respectively.

-To run the integration tests or Python scripts (not required to use the code), install
-Python (3.7 or higher), and install python3 packages using `pipenv install` in the project directory.
+To run the integration tests (not required to use the code), install
+Python (3.6 or higher), and install the python3 packages by running `pipenv install` in the project directory.

 2. Build zenith and patched postgres
 ```sh
@@ -47,26 +47,17 @@ make -j5
 # Create repository in .zenith with proper paths to binaries and data
 # Later that would be responsibility of a package install script
 > ./target/debug/zenith init
-initializing tenantid c03ba6b7ad4c5e9cf556f059ade44229
-created initial timeline 5b014a9e41b4b63ce1a1febc04503636 timeline.lsn 0/169C3C8
-created main branch
 pageserver init succeeded

-# start pageserver and safekeeper
+# start pageserver
 > ./target/debug/zenith start
-Starting pageserver at 'localhost:64000' in '.zenith'
+Starting pageserver at '127.0.0.1:64000' in .zenith
 Pageserver started
-initializing for single for 7676
-Starting safekeeper at 'localhost:5454' in '.zenith/safekeepers/single'
-Safekeeper started

-# start postgres compute node
+# start postgres on top of the pageserver
 > ./target/debug/zenith pg start main
-Starting new postgres main on main...
-Extracting base backup to create postgres instance: path=.zenith/pgdatadirs/tenants/c03ba6b7ad4c5e9cf556f059ade44229/main port=55432
-Starting postgres node at 'host=127.0.0.1 port=55432 user=zenith_admin dbname=postgres'
+Starting postgres node at 'host=127.0.0.1 port=55432 user=stas'
 waiting for server to start.... done
-server started

 # check list of running postgres instances
 > ./target/debug/zenith pg list
@@ -117,19 +108,13 @@ postgres=# insert into t values(2,2);
 INSERT 0 1
 ```

-6. If you want to run tests afterwards (see below), you have to stop all the running the pageserver, safekeeper and postgres instances
-   you have just started. You can stop them all with one command:
-```sh
-> ./target/debug/zenith stop
-```
-
 ## Running tests

 ```sh
 git clone --recursive https://github.com/zenithdb/zenith.git
 make # builds also postgres and installs it to ./tmp_install
 cd test_runner
-pipenv run pytest
+pytest
 ```

 ## Documentation
--- a/control_plane/Cargo.toml
+++ b/control_plane/Cargo.toml
@@ -18,7 +18,7 @@ regex = "1"
 anyhow = "1.0"
 thiserror = "1"
 bytes = "1.0.1"
-nix = "0.23"
+nix = "0.20"
 url = "2.2.2"
 hex = { version = "0.4.3", features = ["serde"] }
 reqwest = { version = "0.11", features = ["blocking", "json"] }
--- a/control_plane/safekeepers.conf
+++ b/control_plane/safekeepers.conf
@@ -1,20 +0,0 @@
-# Page server and three safekeepers.
-[pageserver]
-pg_port = 64000
-http_port = 9898
-auth_type = 'Trust'
-
-[[safekeepers]]
-name = 'sk1'
-pg_port = 5454
-http_port = 7676
-
-[[safekeepers]]
-name = 'sk2'
-pg_port = 5455
-http_port = 7677
-
-[[safekeepers]]
-name = 'sk3'
-pg_port = 5456
-http_port = 7678
--- a/control_plane/simple.conf
+++ b/control_plane/simple.conf
@@ -1,11 +0,0 @@
-# Minimal zenith environment with one safekeeper. This is equivalent to the built-in
-# defaults that you get with no --config
-[pageserver]
-pg_port = 64000
-http_port = 9898
-auth_type = 'Trust'
-
-[[safekeepers]]
-name = 'single'
-pg_port = 5454
-http_port = 7676
--- a/control_plane/src/compute.rs
+++ b/control_plane/src/compute.rs
@@ -39,6 +39,8 @@ impl ComputeControlPlane {
    // |  |- <tenant_id>
    // |  |   |- <branch name>
    pub fn load(env: LocalEnv) -> Result<ComputeControlPlane> {
+        // TODO: since the pageserver does not have a config file yet, we assume here that
+        // it is running on the default port. Change that once the pageserver has a config.
        let pageserver = Arc::new(PageServerNode::from_env(&env));

        let mut nodes = BTreeMap::default();
@@ -73,59 +75,40 @@ impl ComputeControlPlane {
            .unwrap_or(self.base_port)
    }

-    // FIXME: see also parse_point_in_time in branches.rs.
-    fn parse_point_in_time(
-        &self,
-        tenantid: ZTenantId,
-        s: &str,
-    ) -> Result<(ZTimelineId, Option<Lsn>)> {
-        let mut strings = s.split('@');
-        let name = strings.next().unwrap();
-
-        let lsn: Option<Lsn>;
-        if let Some(lsnstr) = strings.next() {
-            lsn = Some(
-                Lsn::from_str(lsnstr)
-                    .with_context(|| "invalid LSN in point-in-time specification")?,
-            );
-        } else {
-            lsn = None
+    pub fn local(local_env: &LocalEnv, pageserver: &Arc<PageServerNode>) -> ComputeControlPlane {
+        ComputeControlPlane {
+            base_port: 65431,
+            pageserver: Arc::clone(pageserver),
+            nodes: BTreeMap::new(),
+            env: local_env.clone(),
        }
-
-        // Resolve the timeline ID, given the human-readable branch name
-        let timeline_id = self
-            .pageserver
-            .branch_get_by_name(&tenantid, name)?
-            .timeline_id;
-
-        Ok((timeline_id, lsn))
    }

    pub fn new_node(
        &mut self,
        tenantid: ZTenantId,
-        name: &str,
-        timeline_spec: &str,
+        branch_name: &str,
        port: Option<u16>,
    ) -> Result<Arc<PostgresNode>> {
-        // Resolve the human-readable timeline spec into timeline ID and LSN
-        let (timelineid, lsn) = self.parse_point_in_time(tenantid, timeline_spec)?;
+        let timeline_id = self
+            .pageserver
+            .branch_get_by_name(&tenantid, branch_name)?
+            .timeline_id;

        let port = port.unwrap_or_else(|| self.get_port());
        let node = Arc::new(PostgresNode {
-            name: name.to_owned(),
+            name: branch_name.to_owned(),
            address: SocketAddr::new("127.0.0.1".parse().unwrap(), port),
            env: self.env.clone(),
            pageserver: Arc::clone(&self.pageserver),
            is_test: false,
-            timelineid,
-            lsn,
+            timelineid: timeline_id,
            tenantid,
            uses_wal_proposer: false,
        });

        node.create_pgdata()?;
-        node.setup_pg_conf(self.env.pageserver.auth_type)?;
+        node.setup_pg_conf(self.env.auth_type)?;

        self.nodes
            .insert((tenantid, node.name.clone()), Arc::clone(&node));
@@ -144,7 +127,6 @@ pub struct PostgresNode {
    pageserver: Arc<PageServerNode>,
    is_test: bool,
    pub timelineid: ZTimelineId,
-    pub lsn: Option<Lsn>, // if it's a read-only node. None for primary
    pub tenantid: ZTenantId,
    uses_wal_proposer: bool,
 }
@@ -179,11 +161,8 @@ impl PostgresNode {
        let port: u16 = conf.parse_field("port", &context)?;
        let timelineid: ZTimelineId = conf.parse_field("zenith.zenith_timeline", &context)?;
        let tenantid: ZTenantId = conf.parse_field("zenith.zenith_tenant", &context)?;
-        let uses_wal_proposer = conf.get("wal_acceptors").is_some();

-        // parse recovery_target_lsn, if any
-        let recovery_target_lsn: Option<Lsn> =
-            conf.parse_field_optional("recovery_target_lsn", &context)?;
+        let uses_wal_proposer = conf.get("wal_acceptors").is_some();

        // ok now
        Ok(PostgresNode {
@@ -193,13 +172,12 @@ impl PostgresNode {
            pageserver: Arc::clone(pageserver),
            is_test: false,
            timelineid,
-            lsn: recovery_target_lsn,
            tenantid,
            uses_wal_proposer,
        })
    }

-    fn sync_safekeepers(&self) -> Result<Lsn> {
+    fn sync_walkeepers(&self) -> Result<Lsn> {
        let pg_path = self.env.pg_bin_dir().join("postgres");
        let sync_handle = Command::new(pg_path)
            .arg("--sync-safekeepers")
@@ -224,7 +202,7 @@ impl PostgresNode {
        }

        let lsn = Lsn::from_str(std::str::from_utf8(&sync_output.stdout)?.trim())?;
-        println!("Safekeepers synced on {}", lsn);
+        println!("Walkeepers synced on {}", lsn);
        Ok(lsn)
    }

@@ -255,7 +233,7 @@ impl PostgresNode {
        // Read the archive directly from the `CopyOutReader`
        tar::Archive::new(copyreader)
            .unpack(&self.pgdata())
-            .with_context(|| "extracting base backup failed")?;
+            .with_context(|| "extracting page backup failed")?;

        Ok(())
    }
@@ -287,15 +265,10 @@ impl PostgresNode {
        conf.append("max_replication_slots", "10");
        conf.append("hot_standby", "on");
        conf.append("shared_buffers", "1MB");
-        conf.append("max_wal_size", "100GB");
        conf.append("fsync", "off");
        conf.append("max_connections", "100");
+        conf.append("wal_sender_timeout", "0");
        conf.append("wal_level", "replica");
-        // wal_sender_timeout is the maximum time to wait for WAL replication.
-        // It also defines how often the walreciever will send a feedback message to the wal sender.
-        //conf.append("wal_sender_timeout", "5s");
-        //conf.append("max_replication_flush_lag", "160MB");
-        //conf.append("max_replication_apply_lag", "1500MB");
        conf.append("listen_addresses", &self.address.ip().to_string());
        conf.append("port", &self.address.port().to_string());

@@ -328,30 +301,11 @@ impl PostgresNode {
        conf.append("zenith.page_server_connstring", &pageserver_connstr);
        conf.append("zenith.zenith_tenant", &self.tenantid.to_string());
        conf.append("zenith.zenith_timeline", &self.timelineid.to_string());
-        if let Some(lsn) = self.lsn {
-            conf.append("recovery_target_lsn", &lsn.to_string());
-        }
        conf.append_line("");

-        if !self.env.safekeepers.is_empty() {
-            // Configure the node to connect to the safekeepers
-            conf.append("synchronous_standby_names", "walproposer");
-
-            let wal_acceptors = self
-                .env
-                .safekeepers
-                .iter()
-                .map(|sk| format!("localhost:{}", sk.pg_port))
-                .collect::<Vec<String>>()
-                .join(",");
-            conf.append("wal_acceptors", &wal_acceptors);
-        } else {
-            // Configure the node to stream WAL directly to the pageserver
-            // This isn't really a supported configuration, but can be useful for
-            // testing.
-            conf.append("synchronous_standby_names", "pageserver");
-            conf.append("zenith.callmemaybe_connstring", &self.connstr());
-        }
+        // Configure the node to stream WAL directly to the pageserver
+        conf.append("synchronous_standby_names", "pageserver"); // TODO: add a new function arg?
+        conf.append("zenith.callmemaybe_connstring", &self.connstr());

        let mut file = File::create(self.pgdata().join("postgresql.conf"))?;
        file.write_all(conf.to_string().as_bytes())?;
@@ -360,14 +314,12 @@ impl PostgresNode {
    }

    fn load_basebackup(&self) -> Result<()> {
-        let backup_lsn = if let Some(lsn) = self.lsn {
-            Some(lsn)
-        } else if self.uses_wal_proposer {
+        let lsn = if self.uses_wal_proposer {
            // LSN 0 means that it is bootstrap and we need to download just
            // latest data from the pageserver. That is a bit clumsy but whole bootstrap
            // procedure evolves quite actively right now, so let's think about it again
            // when things would be more stable (TODO).
-            let lsn = self.sync_safekeepers()?;
+            let lsn = self.sync_walkeepers()?;
            if lsn == Lsn(0) {
                None
            } else {
@@ -377,7 +329,7 @@ impl PostgresNode {
            None
        };

-        self.do_basebackup(backup_lsn)?;
+        self.do_basebackup(lsn)?;

        Ok(())
    }
@@ -454,10 +406,6 @@ impl PostgresNode {
        // 3. Load basebackup
        self.load_basebackup()?;

-        if self.lsn.is_some() {
-            File::create(self.pgdata().join("standby.signal"))?;
-        }
-
        // 4. Finally start the compute node postgres
        println!("Starting postgres node at '{}'", self.connstr());
        self.pg_ctl(&["start"], auth_token)
@@ -504,7 +452,9 @@ impl PostgresNode {
            .output()
            .expect("failed to execute whoami");

-        assert!(output.status.success(), "whoami failed");
+        if !output.status.success() {
+            panic!("whoami failed");
+        }

        String::from_utf8(output.stdout).unwrap().trim().to_string()
    }
--- a/control_plane/src/lib.rs
+++ b/control_plane/src/lib.rs
@@ -13,7 +13,6 @@ use std::path::Path;
 pub mod compute;
 pub mod local_env;
 pub mod postgresql_conf;
-pub mod safekeeper;
 pub mod storage;

 /// Read a PID file
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -7,102 +7,46 @@
 use anyhow::{Context, Result};
 use serde::{Deserialize, Serialize};
 use std::env;
-use std::fmt::Write;
 use std::fs;
-use std::path::{Path, PathBuf};
+use std::path::PathBuf;
 use std::process::{Command, Stdio};
-use zenith_utils::auth::{encode_from_key_file, Claims, Scope};
+use zenith_utils::auth::{encode_from_key_path, Claims, Scope};
 use zenith_utils::postgres_backend::AuthType;
 use zenith_utils::zid::ZTenantId;

 //
-// This data structures represents zenith CLI config
-//
-// It is deserialized from the .zenith/config file, or the config file passed
-// to 'zenith init --config=<path>' option. See control_plane/simple.conf for
-// an example.
+// This data structure represents the deserialized zenith CLI config
 //
 #[derive(Serialize, Deserialize, Clone, Debug)]
 pub struct LocalEnv {
-    // Base directory for all the nodes (the pageserver, safekeepers and
-    // compute nodes).
-    //
-    // This is not stored in the config file. Rather, this is the path where the
-    // config file itself is. It is read from the ZENITH_REPO_DIR env variable or
-    // '.zenith' if not given.
-    #[serde(skip)]
+    // Pageserver connection settings
+    pub pageserver_pg_port: u16,
+    pub pageserver_http_port: u16,
+
+    // Base directory for both pageserver and compute nodes
    pub base_data_dir: PathBuf,

    // Path to postgres distribution. It's expected that "bin", "include",
    // "lib", "share" from postgres distribution are there. If at some point
    // in time we will be able to run against vanilla postgres we may split that
    // to four separate paths and match OS-specific installation layout.
-    #[serde(default)]
    pub pg_distrib_dir: PathBuf,

    // Path to pageserver binary.
-    #[serde(default)]
    pub zenith_distrib_dir: PathBuf,

-    // Default tenant ID to use with the 'zenith' command line utility, when
-    // --tenantid is not explicitly specified.
-    #[serde(with = "opt_tenantid_serde")]
-    #[serde(default)]
-    pub default_tenantid: Option<ZTenantId>,
+    // keeping tenant id in config to reduce copy paste when running zenith locally with single tenant
+    #[serde(with = "hex")]
+    pub tenantid: ZTenantId,

-    // used to issue tokens during e.g pg start
-    #[serde(default)]
-    pub private_key_path: PathBuf,
-
-    pub pageserver: PageServerConf,
-
-    #[serde(default)]
-    pub safekeepers: Vec<SafekeeperConf>,
-}
-
-#[derive(Serialize, Deserialize, Clone, Debug)]
-#[serde(default)]
-pub struct PageServerConf {
-    // Pageserver connection settings
-    pub pg_port: u16,
-    pub http_port: u16,
+    // jwt auth token used for communication with pageserver
+    pub auth_token: String,

    // used to determine which auth type is used
    pub auth_type: AuthType,

-    // jwt auth token used for communication with pageserver
-    pub auth_token: String,
-}
-
-impl Default for PageServerConf {
-    fn default() -> Self {
-        Self {
-            pg_port: 0,
-            http_port: 0,
-            auth_type: AuthType::Trust,
-            auth_token: "".to_string(),
-        }
-    }
-}
-
-#[derive(Serialize, Deserialize, Clone, Debug)]
-#[serde(default)]
-pub struct SafekeeperConf {
-    pub name: String,
-    pub pg_port: u16,
-    pub http_port: u16,
-    pub sync: bool,
-}
-
-impl Default for SafekeeperConf {
-    fn default() -> Self {
-        Self {
-            name: "".to_string(),
-            pg_port: 0,
-            http_port: 0,
-            sync: true,
-        }
-    }
+    // used to issue tokens during e.g pg start
+    pub private_key_path: PathBuf,
 }

 impl LocalEnv {
@@ -118,10 +62,6 @@ impl LocalEnv {
        Ok(self.zenith_distrib_dir.join("pageserver"))
    }

-    pub fn safekeeper_bin(&self) -> Result<PathBuf> {
-        Ok(self.zenith_distrib_dir.join("safekeeper"))
-    }
-
    pub fn pg_data_dirs_path(&self) -> PathBuf {
        self.base_data_dir.join("pgdatadirs").join("tenants")
    }
@@ -136,187 +76,6 @@ impl LocalEnv {
    pub fn pageserver_data_dir(&self) -> PathBuf {
        self.base_data_dir.clone()
    }
-
-    pub fn safekeeper_data_dir(&self, node_name: &str) -> PathBuf {
-        self.base_data_dir.join("safekeepers").join(node_name)
-    }
-
-    /// Create a LocalEnv from a config file.
-    ///
-    /// Unlike 'load_config', this function fills in any defaults that are missing
-    /// from the config file.
-    pub fn create_config(toml: &str) -> Result<LocalEnv> {
-        let mut env: LocalEnv = toml::from_str(toml)?;
-
-        // Find postgres binaries.
-        // Follow POSTGRES_DISTRIB_DIR if set, otherwise look in "tmp_install".
-        if env.pg_distrib_dir == Path::new("") {
-            if let Some(postgres_bin) = env::var_os("POSTGRES_DISTRIB_DIR") {
-                env.pg_distrib_dir = postgres_bin.into();
-            } else {
-                let cwd = env::current_dir()?;
-                env.pg_distrib_dir = cwd.join("tmp_install")
-            }
-        }
-        if !env.pg_distrib_dir.join("bin/postgres").exists() {
-            anyhow::bail!(
-                "Can't find postgres binary at {}",
-                env.pg_distrib_dir.display()
-            );
-        }
-
-        // Find zenith binaries.
-        if env.zenith_distrib_dir == Path::new("") {
-            env.zenith_distrib_dir = env::current_exe()?.parent().unwrap().to_owned();
-        }
-        if !env.zenith_distrib_dir.join("pageserver").exists() {
-            anyhow::bail!("Can't find pageserver binary.");
-        }
-        if !env.zenith_distrib_dir.join("safekeeper").exists() {
-            anyhow::bail!("Can't find safekeeper binary.");
-        }
-
-        // If no initial tenant ID was given, generate it.
-        if env.default_tenantid.is_none() {
-            env.default_tenantid = Some(ZTenantId::generate());
-        }
-
-        env.base_data_dir = base_path();
-
-        Ok(env)
-    }
-
-    /// Locate and load config
-    pub fn load_config() -> Result<LocalEnv> {
-        let repopath = base_path();
-
-        if !repopath.exists() {
-            anyhow::bail!(
-                "Zenith config is not found in {}. You need to run 'zenith init' first",
-                repopath.to_str().unwrap()
-            );
-        }
-
-        // TODO: check that it looks like a zenith repository
-
-        // load and parse file
-        let config = fs::read_to_string(repopath.join("config"))?;
-        let mut env: LocalEnv = toml::from_str(config.as_str())?;
-
-        env.base_data_dir = repopath;
-
-        Ok(env)
-    }
-
-    // this function is used only for testing purposes in CLI e g generate tokens during init
-    pub fn generate_auth_token(&self, claims: &Claims) -> Result<String> {
-        let private_key_path = if self.private_key_path.is_absolute() {
-            self.private_key_path.to_path_buf()
-        } else {
-            self.base_data_dir.join(&self.private_key_path)
-        };
-
-        let key_data = fs::read(private_key_path)?;
-        encode_from_key_file(claims, &key_data)
-    }
-
-    //
-    // Initialize a new Zenith repository
-    //
-    pub fn init(&mut self) -> Result<()> {
-        // check if config already exists
-        let base_path = &self.base_data_dir;
-        if base_path == Path::new("") {
-            anyhow::bail!("repository base path is missing");
-        }
-        if base_path.exists() {
-            anyhow::bail!(
-                "directory '{}' already exists. Perhaps already initialized?",
-                base_path.to_str().unwrap()
-            );
-        }
-
-        fs::create_dir(&base_path)?;
-
-        // generate keys for jwt
-        // openssl genrsa -out private_key.pem 2048
-        let private_key_path;
-        if self.private_key_path == PathBuf::new() {
-            private_key_path = base_path.join("auth_private_key.pem");
-            let keygen_output = Command::new("openssl")
-                .arg("genrsa")
-                .args(&["-out", private_key_path.to_str().unwrap()])
-                .arg("2048")
-                .stdout(Stdio::null())
-                .output()
-                .with_context(|| "failed to generate auth private key")?;
-            if !keygen_output.status.success() {
-                anyhow::bail!(
-                    "openssl failed: '{}'",
-                    String::from_utf8_lossy(&keygen_output.stderr)
-                );
-            }
-            self.private_key_path = Path::new("auth_private_key.pem").to_path_buf();
-
-            let public_key_path = base_path.join("auth_public_key.pem");
-            // openssl rsa -in private_key.pem -pubout -outform PEM -out public_key.pem
-            let keygen_output = Command::new("openssl")
-                .arg("rsa")
-                .args(&["-in", private_key_path.to_str().unwrap()])
-                .arg("-pubout")
-                .args(&["-outform", "PEM"])
-                .args(&["-out", public_key_path.to_str().unwrap()])
-                .stdout(Stdio::null())
-                .output()
-                .with_context(|| "failed to generate auth private key")?;
-            if !keygen_output.status.success() {
-                anyhow::bail!(
-                    "openssl failed: '{}'",
-                    String::from_utf8_lossy(&keygen_output.stderr)
-                );
-            }
-        }
-
-        self.pageserver.auth_token =
-            self.generate_auth_token(&Claims::new(None, Scope::PageServerApi))?;
-
-        fs::create_dir_all(self.pg_data_dirs_path())?;
-
-        for safekeeper in self.safekeepers.iter() {
-            fs::create_dir_all(self.safekeeper_data_dir(&safekeeper.name))?;
-        }
-
-        let mut conf_content = String::new();
-
-        // Currently, the user first passes a config file with 'zenith init --config=<path>'
-        // We read that in, in `create_config`, and fill any missing defaults. Then it's saved
-        // to .zenith/config. TODO: We lose any formatting and comments along the way, which is
-        // a bit sad.
-        write!(
-            &mut conf_content,
-            r#"# This file describes a locale deployment of the page server
-# and safekeeeper node. It is read by the 'zenith' command-line
-# utility.
-"#
-        )?;
-
-        // Convert the LocalEnv to a toml file.
-        //
-        // This could be as simple as this:
-        //
-        // conf_content += &toml::to_string_pretty(env)?;
-        //
-        // But it results in a "values must be emitted before tables". I'm not sure
-        // why, AFAICS the table, i.e. 'safekeepers: Vec<SafekeeperConf>' is last.
-        // Maybe rust reorders the fields to squeeze avoid padding or something?
-        // In any case, converting to toml::Value first, and serializing that, works.
-        // See https://github.com/alexcrichton/toml-rs/issues/142
-        conf_content += &toml::to_string_pretty(&toml::Value::try_from(&self)?)?;
-
-        fs::write(base_path.join("config"), conf_content)?;
-
-        Ok(())
-    }
 }

 fn base_path() -> PathBuf {
@@ -326,29 +85,118 @@ fn base_path() -> PathBuf {
    }
 }

-/// Serde routines for Option<ZTenantId>. The serialized form is a hex string.
-mod opt_tenantid_serde {
-    use serde::{Deserialize, Deserializer, Serialize, Serializer};
-    use std::str::FromStr;
-    use zenith_utils::zid::ZTenantId;
-
-    pub fn serialize<S>(tenantid: &Option<ZTenantId>, ser: S) -> Result<S::Ok, S::Error>
-    where
-        S: Serializer,
-    {
-        tenantid.map(|t| t.to_string()).serialize(ser)
+//
+// Initialize a new Zenith repository
+//
+pub fn init(
+    pageserver_pg_port: u16,
+    pageserver_http_port: u16,
+    tenantid: ZTenantId,
+    auth_type: AuthType,
+) -> Result<()> {
+    // check if config already exists
+    let base_path = base_path();
+    if base_path.exists() {
+        anyhow::bail!(
+            "{} already exists. Perhaps already initialized?",
+            base_path.to_str().unwrap()
+        );
    }
+    fs::create_dir(&base_path)?;

-    pub fn deserialize<'de, D>(des: D) -> Result<Option<ZTenantId>, D::Error>
-    where
-        D: Deserializer<'de>,
-    {
-        let s: Option<String> = Option::deserialize(des)?;
-        if let Some(s) = s {
-            return Ok(Some(
-                ZTenantId::from_str(&s).map_err(serde::de::Error::custom)?,
-            ));
+    // ok, now check that expected binaries are present
+
+    // Find postgres binaries. Follow POSTGRES_DISTRIB_DIR if set, otherwise look in "tmp_install".
+    let pg_distrib_dir: PathBuf = {
+        if let Some(postgres_bin) = env::var_os("POSTGRES_DISTRIB_DIR") {
+            postgres_bin.into()
+        } else {
+            let cwd = env::current_dir()?;
+            cwd.join("tmp_install")
        }
-        Ok(None)
+    };
+    if !pg_distrib_dir.join("bin/postgres").exists() {
+        anyhow::bail!("Can't find postgres binary at {:?}", pg_distrib_dir);
    }
+
+    // generate keys for jwt
+    // openssl genrsa -out private_key.pem 2048
+    let private_key_path = base_path.join("auth_private_key.pem");
+    let keygen_output = Command::new("openssl")
+        .arg("genrsa")
+        .args(&["-out", private_key_path.to_str().unwrap()])
+        .arg("2048")
+        .stdout(Stdio::null())
+        .output()
+        .with_context(|| "failed to generate auth private key")?;
+    if !keygen_output.status.success() {
+        anyhow::bail!(
+            "openssl failed: '{}'",
+            String::from_utf8_lossy(&keygen_output.stderr)
+        );
+    }
+
+    let public_key_path = base_path.join("auth_public_key.pem");
+    // openssl rsa -in private_key.pem -pubout -outform PEM -out public_key.pem
+    let keygen_output = Command::new("openssl")
+        .arg("rsa")
+        .args(&["-in", private_key_path.to_str().unwrap()])
+        .arg("-pubout")
+        .args(&["-outform", "PEM"])
+        .args(&["-out", public_key_path.to_str().unwrap()])
+        .stdout(Stdio::null())
+        .output()
+        .with_context(|| "failed to generate auth private key")?;
+    if !keygen_output.status.success() {
+        anyhow::bail!(
+            "openssl failed: '{}'",
+            String::from_utf8_lossy(&keygen_output.stderr)
+        );
+    }
+
+    let auth_token =
+        encode_from_key_path(&Claims::new(None, Scope::PageServerApi), &private_key_path)?;
+
+    // Find zenith binaries.
+    let zenith_distrib_dir = env::current_exe()?.parent().unwrap().to_owned();
+    if !zenith_distrib_dir.join("pageserver").exists() {
+        anyhow::bail!("Can't find pageserver binary.",);
+    }
+
+    let conf = LocalEnv {
+        pageserver_pg_port,
+        pageserver_http_port,
+        pg_distrib_dir,
+        zenith_distrib_dir,
+        base_data_dir: base_path,
+        tenantid,
+        auth_token,
+        auth_type,
+        private_key_path,
+    };
+
+    fs::create_dir_all(conf.pg_data_dirs_path())?;
+
+    let toml = toml::to_string_pretty(&conf)?;
+    fs::write(conf.base_data_dir.join("config"), toml)?;
+
+    Ok(())
+}
+
+// Locate and load config
+pub fn load_config() -> Result<LocalEnv> {
+    let repopath = base_path();
+
+    if !repopath.exists() {
+        anyhow::bail!(
+            "Zenith config is not found in {}. You need to run 'zenith init' first",
+            repopath.to_str().unwrap()
+        );
+    }
+
+    // TODO: check that it looks like a zenith repository
+
+    // load and parse file
+    let config = fs::read_to_string(repopath.join("config"))?;
+    toml::from_str(config.as_str()).map_err(|e| e.into())
 }
--- a/control_plane/src/postgresql_conf.rs
+++ b/control_plane/src/postgresql_conf.rs
@@ -83,22 +83,6 @@ impl PostgresConf {
            .with_context(|| format!("could not parse '{}' option {}", field_name, context))
    }

-    pub fn parse_field_optional<T>(&self, field_name: &str, context: &str) -> Result<Option<T>>
-    where
-        T: FromStr,
-        <T as FromStr>::Err: std::error::Error + Send + Sync + 'static,
-    {
-        if let Some(val) = self.get(field_name) {
-            let result = val
-                .parse::<T>()
-                .with_context(|| format!("could not parse '{}' option {}", field_name, context))?;
-
-            Ok(Some(result))
-        } else {
-            Ok(None)
-        }
-    }
-
    ///
    /// Note: if you call this multiple times for the same option, the config
    /// file will have a line for each call. It would be nice to have a function
--- a/control_plane/src/safekeeper.rs
+++ b/control_plane/src/safekeeper.rs
@@ -1,281 +0,0 @@
-use std::io::Write;
-use std::net::TcpStream;
-use std::path::PathBuf;
-use std::process::Command;
-use std::sync::Arc;
-use std::time::Duration;
-use std::{io, result, thread};
-
-use anyhow::bail;
-use nix::errno::Errno;
-use nix::sys::signal::{kill, Signal};
-use nix::unistd::Pid;
-use postgres::Config;
-use reqwest::blocking::{Client, RequestBuilder, Response};
-use reqwest::{IntoUrl, Method};
-use thiserror::Error;
-use zenith_utils::http::error::HttpErrorBody;
-use zenith_utils::postgres_backend::AuthType;
-
-use crate::local_env::{LocalEnv, SafekeeperConf};
-use crate::read_pidfile;
-use crate::storage::PageServerNode;
-use zenith_utils::connstring::connection_address;
-use zenith_utils::connstring::connection_host_port;
-
-#[derive(Error, Debug)]
-pub enum SafekeeperHttpError {
-    #[error("Reqwest error: {0}")]
-    Transport(#[from] reqwest::Error),
-
-    #[error("Error: {0}")]
-    Response(String),
-}
-
-type Result<T> = result::Result<T, SafekeeperHttpError>;
-
-pub trait ResponseErrorMessageExt: Sized {
-    fn error_from_body(self) -> Result<Self>;
-}
-
-impl ResponseErrorMessageExt for Response {
-    fn error_from_body(self) -> Result<Self> {
-        let status = self.status();
-        if !(status.is_client_error() || status.is_server_error()) {
-            return Ok(self);
-        }
-
-        // reqwest does not export its error construction utility functions, so let's craft the message ourselves
-        let url = self.url().to_owned();
-        Err(SafekeeperHttpError::Response(
-            match self.json::<HttpErrorBody>() {
-                Ok(err_body) => format!("Error: {}", err_body.msg),
-                Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url),
-            },
-        ))
-    }
-}
-
-//
-// Control routines for safekeeper.
-//
-// Used in CLI and tests.
-//
-#[derive(Debug)]
-pub struct SafekeeperNode {
-    pub name: String,
-
-    pub conf: SafekeeperConf,
-
-    pub pg_connection_config: Config,
-    pub env: LocalEnv,
-    pub http_client: Client,
-    pub http_base_url: String,
-
-    pub pageserver: Arc<PageServerNode>,
-}
-
-impl SafekeeperNode {
-    pub fn from_env(env: &LocalEnv, conf: &SafekeeperConf) -> SafekeeperNode {
-        let pageserver = Arc::new(PageServerNode::from_env(env));
-
-        println!("initializing for {} for {}", conf.name, conf.http_port);
-
-        SafekeeperNode {
-            name: conf.name.clone(),
-            conf: conf.clone(),
-            pg_connection_config: Self::safekeeper_connection_config(conf.pg_port),
-            env: env.clone(),
-            http_client: Client::new(),
-            http_base_url: format!("http://localhost:{}/v1", conf.http_port),
-            pageserver,
-        }
-    }
-
-    /// Construct libpq connection string for connecting to this safekeeper.
-    fn safekeeper_connection_config(port: u16) -> Config {
-        // TODO safekeeper authentication not implemented yet
-        format!("postgresql://no_user@localhost:{}/no_db", port)
-            .parse()
-            .unwrap()
-    }
-
-    pub fn datadir_path(&self) -> PathBuf {
-        self.env.safekeeper_data_dir(&self.name)
-    }
-
-    pub fn pid_file(&self) -> PathBuf {
-        self.datadir_path().join("safekeeper.pid")
-    }
-
-    pub fn start(&self) -> anyhow::Result<()> {
-        print!(
-            "Starting safekeeper at '{}' in '{}'",
-            connection_address(&self.pg_connection_config),
-            self.datadir_path().display()
-        );
-        io::stdout().flush().unwrap();
-
-        // Configure connection to page server
-        //
-        // FIXME: We extract the host and port from the connection string instead of using
-        // the connection string directly, because the 'safekeeper' binary expects
-        // host:port format. That's a bit silly when we already have a full libpq connection
-        // string at hand.
-        let pageserver_conn = {
-            let (host, port) = connection_host_port(&self.pageserver.pg_connection_config);
-            format!("{}:{}", host, port)
-        };
-
-        let listen_pg = format!("localhost:{}", self.conf.pg_port);
-        let listen_http = format!("localhost:{}", self.conf.http_port);
-
-        let mut cmd = Command::new(self.env.safekeeper_bin()?);
-        cmd.args(&["-D", self.datadir_path().to_str().unwrap()])
-            .args(&["--listen-pg", &listen_pg])
-            .args(&["--listen-http", &listen_http])
-            .args(&["--pageserver", &pageserver_conn])
-            .args(&["--recall", "1 second"])
-            .arg("--daemonize")
-            .env_clear()
-            .env("RUST_BACKTRACE", "1");
-        if !self.conf.sync {
-            cmd.arg("--no-sync");
-        }
-
-        if self.env.pageserver.auth_type == AuthType::ZenithJWT {
-            cmd.env("PAGESERVER_AUTH_TOKEN", &self.env.pageserver.auth_token);
-        }
-
-        let var = "LLVM_PROFILE_FILE";
-        if let Some(val) = std::env::var_os(var) {
-            cmd.env(var, val);
-        }
-
-        if !cmd.status()?.success() {
-            bail!(
-                "Safekeeper failed to start. See '{}' for details.",
-                self.datadir_path().join("safekeeper.log").display()
-            );
-        }
-
-        // It takes a while for the safekeeper to start up. Wait until it is
-        // open for business.
-        const RETRIES: i8 = 15;
-        for retries in 1..RETRIES {
-            match self.check_status() {
-                Ok(_) => {
-                    println!("\nSafekeeper started");
-                    return Ok(());
-                }
-                Err(err) => {
-                    match err {
-                        SafekeeperHttpError::Transport(err) => {
-                            if err.is_connect() && retries < 5 {
-                                print!(".");
-                                io::stdout().flush().unwrap();
-                            } else {
-                                if retries == 5 {
-                                    println!() // put a line break after dots for second message
-                                }
-                                println!(
-                                    "Safekeeper not responding yet, err {} retrying ({})...",
-                                    err, retries
-                                );
-                            }
-                        }
-                        SafekeeperHttpError::Response(msg) => {
-                            bail!("safekeeper failed to start: {} ", msg)
-                        }
-                    }
-                    thread::sleep(Duration::from_secs(1));
-                }
-            }
-        }
-        bail!("safekeeper failed to start in {} seconds", RETRIES);
-    }
-
-    ///
-    /// Stop the server.
-    ///
-    /// If 'immediate' is true, we use SIGQUIT, killing the process immediately.
-    /// Otherwise we use SIGTERM, triggering a clean shutdown
-    ///
-    /// If the server is not running, returns success
-    ///
-    pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
-        let pid_file = self.pid_file();
-        if !pid_file.exists() {
-            println!("Safekeeper {} is already stopped", self.name);
-            return Ok(());
-        }
-        let pid = read_pidfile(&pid_file)?;
-        let pid = Pid::from_raw(pid);
-
-        let sig = if immediate {
-            println!("Stop safekeeper immediately");
-            Signal::SIGQUIT
-        } else {
-            println!("Stop safekeeper gracefully");
-            Signal::SIGTERM
-        };
-        match kill(pid, sig) {
-            Ok(_) => (),
-            Err(Errno::ESRCH) => {
-                println!(
-                    "Safekeeper with pid {} does not exist, but a PID file was found",
-                    pid
-                );
-                return Ok(());
-            }
-            Err(err) => bail!(
-                "Failed to send signal to safekeeper with pid {}: {}",
-                pid,
-                err.desc()
-            ),
-        }
-
-        let address = connection_address(&self.pg_connection_config);
-
-        // TODO Remove this "timeout" and handle it on caller side instead.
-        // Shutting down may take a long time,
-        // if safekeeper flushes a lot of data
-        for _ in 0..100 {
-            if let Err(_e) = TcpStream::connect(&address) {
-                println!("Safekeeper stopped receiving connections");
-
-                //Now check status
-                match self.check_status() {
-                    Ok(_) => {
-                        println!("Safekeeper status is OK. Wait a bit.");
-                        thread::sleep(Duration::from_secs(1));
-                    }
-                    Err(err) => {
-                        println!("Safekeeper status is: {}", err);
-                        return Ok(());
-                    }
-                }
-            } else {
-                println!("Safekeeper still receives connections");
-                thread::sleep(Duration::from_secs(1));
-            }
-        }
-
-        bail!("Failed to stop safekeeper with pid {}", pid);
-    }
-
-    fn http_request<U: IntoUrl>(&self, method: Method, url: U) -> RequestBuilder {
-        // TODO: authentication
-        //if self.env.auth_type == AuthType::ZenithJWT {
-        //    builder = builder.bearer_auth(&self.env.safekeeper_auth_token)
-        //}
-        self.http_client.request(method, url)
-    }
-
-    pub fn check_status(&self) -> Result<()> {
-        self.http_request(Method::GET, format!("{}/{}", self.http_base_url, "status"))
-            .send()?
-            .error_from_body()?;
-        Ok(())
-    }
-}
--- a/control_plane/src/storage.rs
+++ b/control_plane/src/storage.rs
@@ -5,8 +5,7 @@ use std::process::Command;
 use std::time::Duration;
 use std::{io, result, thread};

-use anyhow::bail;
-use nix::errno::Errno;
+use anyhow::{anyhow, bail};
 use nix::sys::signal::{kill, Signal};
 use nix::unistd::Pid;
 use pageserver::http::models::{BranchCreateRequest, TenantCreateRequest};
@@ -21,7 +20,6 @@ use zenith_utils::zid::ZTenantId;
 use crate::local_env::LocalEnv;
 use crate::read_pidfile;
 use pageserver::branches::BranchInfo;
-use pageserver::tenant_mgr::TenantInfo;
 use zenith_utils::connstring::connection_address;

 #[derive(Error, Debug)]
@@ -64,6 +62,7 @@ impl ResponseErrorMessageExt for Response {
 //
 #[derive(Debug)]
 pub struct PageServerNode {
+    pub kill_on_exit: bool,
    pub pg_connection_config: Config,
    pub env: LocalEnv,
    pub http_client: Client,
@@ -72,33 +71,34 @@ pub struct PageServerNode {

 impl PageServerNode {
    pub fn from_env(env: &LocalEnv) -> PageServerNode {
-        let password = if env.pageserver.auth_type == AuthType::ZenithJWT {
-            &env.pageserver.auth_token
+        let password = if env.auth_type == AuthType::ZenithJWT {
+            &env.auth_token
        } else {
            ""
        };

        PageServerNode {
+            kill_on_exit: false,
            pg_connection_config: Self::pageserver_connection_config(
                password,
-                env.pageserver.pg_port,
+                env.pageserver_pg_port,
            ),
            env: env.clone(),
            http_client: Client::new(),
-            http_base_url: format!("http://localhost:{}/v1", env.pageserver.http_port),
+            http_base_url: format!("http://localhost:{}/v1", env.pageserver_http_port),
        }
    }

-    /// Construct libpq connection string for connecting to the pageserver.
    fn pageserver_connection_config(password: &str, port: u16) -> Config {
        format!("postgresql://no_user:{}@localhost:{}/no_db", password, port)
            .parse()
            .unwrap()
    }

-    pub fn init(&self, create_tenant: Option<&str>) -> anyhow::Result<()> {
-        let listen_pg = format!("localhost:{}", self.env.pageserver.pg_port);
-        let listen_http = format!("localhost:{}", self.env.pageserver.http_port);
+    pub fn init(&self, create_tenant: Option<&str>, enable_auth: bool) -> anyhow::Result<()> {
+        let mut cmd = Command::new(self.env.pageserver_bin()?);
+        let listen_pg = format!("localhost:{}", self.env.pageserver_pg_port);
+        let listen_http = format!("localhost:{}", self.env.pageserver_http_port);
        let mut args = vec![
            "--init",
            "-D",
@@ -111,29 +111,27 @@ impl PageServerNode {
            &listen_http,
        ];

-        let auth_type_str = &self.env.pageserver.auth_type.to_string();
-        if self.env.pageserver.auth_type != AuthType::Trust {
+        if enable_auth {
            args.extend(&["--auth-validation-public-key-path", "auth_public_key.pem"]);
+            args.extend(&["--auth-type", "ZenithJWT"]);
        }
-        args.extend(&["--auth-type", auth_type_str]);

        if let Some(tenantid) = create_tenant {
            args.extend(&["--create-tenant", tenantid])
        }

-        let mut cmd = Command::new(self.env.pageserver_bin()?);
-        cmd.args(args).env_clear().env("RUST_BACKTRACE", "1");
+        let status = cmd
+            .args(args)
+            .env_clear()
+            .env("RUST_BACKTRACE", "1")
+            .status()
+            .expect("pageserver init failed");

-        let var = "LLVM_PROFILE_FILE";
-        if let Some(val) = std::env::var_os(var) {
-            cmd.env(var, val);
+        if status.success() {
+            Ok(())
+        } else {
+            Err(anyhow!("pageserver init failed"))
        }
-
-        if !cmd.status()?.success() {
-            bail!("pageserver init failed");
-        }
-
-        Ok(())
    }

    pub fn repo_path(&self) -> PathBuf {
@@ -154,15 +152,10 @@ impl PageServerNode {

        let mut cmd = Command::new(self.env.pageserver_bin()?);
        cmd.args(&["-D", self.repo_path().to_str().unwrap()])
-            .arg("--daemonize")
+            .arg("-d")
            .env_clear()
            .env("RUST_BACKTRACE", "1");

-        let var = "LLVM_PROFILE_FILE";
-        if let Some(val) = std::env::var_os(var) {
-            cmd.env(var, val);
-        }
-
        if !cmd.status()?.success() {
            bail!(
                "Pageserver failed to start. See '{}' for details.",
@@ -206,69 +199,23 @@ impl PageServerNode {
        bail!("pageserver failed to start in {} seconds", RETRIES);
    }

-    ///
-    /// Stop the server.
-    ///
-    /// If 'immediate' is true, we use SIGQUIT, killing the process immediately.
-    /// Otherwise we use SIGTERM, triggering a clean shutdown
-    ///
-    /// If the server is not running, returns success
-    ///
-    pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
-        let pid_file = self.pid_file();
-        if !pid_file.exists() {
-            println!("Pageserver is already stopped");
-            return Ok(());
+    pub fn stop(&self) -> anyhow::Result<()> {
+        let pid = read_pidfile(&self.pid_file())?;
+        let pid = Pid::from_raw(pid);
+        if kill(pid, Signal::SIGTERM).is_err() {
+            bail!("Failed to kill pageserver with pid {}", pid);
        }
-        let pid = Pid::from_raw(read_pidfile(&pid_file)?);

-        let sig = if immediate {
-            println!("Stop pageserver immediately");
-            Signal::SIGQUIT
-        } else {
-            println!("Stop pageserver gracefully");
-            Signal::SIGTERM
-        };
-        match kill(pid, sig) {
-            Ok(_) => (),
-            Err(Errno::ESRCH) => {
-                println!(
-                    "Pageserver with pid {} does not exist, but a PID file was found",
-                    pid
-                );
+        // wait for pageserver stop
+        let address = connection_address(&self.pg_connection_config);
+        for _ in 0..5 {
+            let stream = TcpStream::connect(&address);
+            thread::sleep(Duration::from_secs(1));
+            if let Err(_e) = stream {
+                println!("Pageserver stopped");
                return Ok(());
            }
-            Err(err) => bail!(
-                "Failed to send signal to pageserver with pid {}: {}",
-                pid,
-                err.desc()
-            ),
-        }
-
-        let address = connection_address(&self.pg_connection_config);
-
-        // TODO Remove this "timeout" and handle it on caller side instead.
-        // Shutting down may take a long time,
-        // if pageserver checkpoints a lot of data
-        for _ in 0..100 {
-            if let Err(_e) = TcpStream::connect(&address) {
-                println!("Pageserver stopped receiving connections");
-
-                //Now check status
-                match self.check_status() {
-                    Ok(_) => {
-                        println!("Pageserver status is OK. Wait a bit.");
-                        thread::sleep(Duration::from_secs(1));
-                    }
-                    Err(err) => {
-                        println!("Pageserver status is: {}", err);
-                        return Ok(());
-                    }
-                }
-            } else {
-                println!("Pageserver still receives connections");
-                thread::sleep(Duration::from_secs(1));
-            }
+            println!("Stopping pageserver on {}", address);
        }

        bail!("Failed to stop pageserver with pid {}", pid);
@@ -287,8 +234,8 @@ impl PageServerNode {

    fn http_request<U: IntoUrl>(&self, method: Method, url: U) -> RequestBuilder {
        let mut builder = self.http_client.request(method, url);
-        if self.env.pageserver.auth_type == AuthType::ZenithJWT {
-            builder = builder.bearer_auth(&self.env.pageserver.auth_token)
+        if self.env.auth_type == AuthType::ZenithJWT {
+            builder = builder.bearer_auth(&self.env.auth_token)
        }
        builder
    }
@@ -300,7 +247,7 @@ impl PageServerNode {
        Ok(())
    }

-    pub fn tenant_list(&self) -> Result<Vec<TenantInfo>> {
+    pub fn tenant_list(&self) -> Result<Vec<String>> {
        Ok(self
            .http_request(Method::GET, format!("{}/{}", self.http_base_url, "tenant"))
            .send()?
@@ -363,3 +310,11 @@ impl PageServerNode {
            .json()?)
    }
 }
+
+impl Drop for PageServerNode {
+    fn drop(&mut self) {
+        if self.kill_on_exit {
+            let _ = self.stop();
+        }
+    }
+}
--- a/docker-entrypoint.sh
+++ b/docker-entrypoint.sh
@@ -7,7 +7,7 @@ if [ "$1" = 'pageserver' ]; then
        pageserver --init -D /data --postgres-distrib /usr/local
    fi
    echo "Staring pageserver at 0.0.0.0:6400"
-    pageserver -l 0.0.0.0:6400 --listen-http 0.0.0.0:9898 -D /data
+    pageserver -l 0.0.0.0:6400 -D /data
 else
    "$@"
 fi
--- a/docs/README.md
+++ b/docs/README.md
@@ -10,5 +10,5 @@
 - [pageserver/README](/pageserver/README) — pageserver overview.
 - [postgres_ffi/README](/postgres_ffi/README) — Postgres FFI overview.
 - [test_runner/README.md](/test_runner/README.md) — tests infrastructure overview.
- [walkeeper/README](/walkeeper/README) — WAL service overview.
+- [walkeeper/README](/walkeeper/README.md) — WAL service overview.
 - [core_changes.md](core_changes.md) - Description of Zenith changes in Postgres core
--- a/docs/docker.md
+++ b/docs/docker.md
@@ -4,7 +4,7 @@

 Currently we build two main images:

- [zenithdb/zenith](https://hub.docker.com/repository/docker/zenithdb/zenith) — image with pre-built `pageserver`, `safekeeper` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile).
+- [zenithdb/zenith](https://hub.docker.com/repository/docker/zenithdb/zenith) — image with pre-built `pageserver`, `wal_acceptor` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile).
 - [zenithdb/compute-node](https://hub.docker.com/repository/docker/zenithdb/compute-node) — compute node image with pre-built Postgres binaries from [zenithdb/postgres](https://github.com/zenithdb/postgres).

 And two intermediate images used either to reduce build time or to deliver some additional binary tools from other repos:
--- a/docs/glossary.md
+++ b/docs/glossary.md
@@ -51,14 +51,11 @@ Each PostgreSQL fork is considered a separate relish.

 ### Layer

-A layer contains data needed to reconstruct any page versions within the
-layer's Segment and range of LSNs.
-
+Each layer corresponds to the specific version of a relish Segment in a range of LSNs.
 There are two kinds of layers, in-memory and on-disk layers. In-memory
 layers are used to ingest incoming WAL, and provide fast access
 to the recent page versions. On-disk layers are stored as files on disk, and
-are immutable. See pageserver/src/layered_repository/README.md for more.
-
+are immutable.
 ### Layer file (on-disk layer)

 Layered repository on-disk format is based on immutable files.  The
--- a/docs/multitenancy.md
+++ b/docs/multitenancy.md
@@ -56,4 +56,4 @@ Tenant id is passed to postgres via GUC the same way as the timeline. Tenant id

 ### Safety

-For now particular tenant can only appear on a particular pageserver. Set of safekeepers are also pinned to particular (tenantid, timeline) pair so there can only be one writer for particular (tenantid, timeline).
+For now, a particular tenant can only appear on a particular pageserver. A set of WAL acceptors is also pinned to a particular (tenantid, timeline) pair, so there can only be one writer for a particular (tenantid, timeline).
--- a/docs/settings.md
+++ b/docs/settings.md
@@ -1,128 +0,0 @@
-## Pageserver
-
-### listen_pg_addr
-
-Network interface and port number to listen at for connections from
-the compute nodes and safekeepers. The default is `127.0.0.1:64000`.
-
-### listen_http_addr
-
-Network interface and port number to listen at for admin connections.
-The default is `127.0.0.1:9898`.
-
-### checkpoint_distance
-
-`checkpoint_distance` is the amount of incoming WAL that is held in
-the open layer, before it's flushed to local disk. It puts an upper
-bound on how much WAL needs to be re-processed after a pageserver
-crash. It is a soft limit, the pageserver can momentarily go above it,
-but it will trigger a checkpoint operation to get it back below the
-limit.
-
-`checkpoint_distance` also determines how much WAL needs to be kept
-durable in the safekeeper.  The safekeeper must have capacity to hold
-this much WAL, with some headroom, otherwise you can get stuck in a
-situation where the safekeeper is full and stops accepting new WAL,
-but the pageserver is not flushing out and releasing the space in the
-safekeeper because it hasn't reached checkpoint_distance yet.
-
-`checkpoint_distance` also controls how often the WAL is uploaded to
-S3.
-
-The unit is # of bytes.
-
-### checkpoint_period
-
-The pageserver checks whether `checkpoint_distance` has been reached
-every `checkpoint_period` seconds. Default is 1 s, which should be
-fine.
-
-### gc_horizon
-
-`gc_horizon` determines how much history is retained, to allow
-branching and read replicas at an older point in time. The unit is #
-of bytes of WAL. Page versions older than this are garbage collected
-away.
-
-### gc_period
-
-Interval at which garbage collection is triggered. Default is 100 s.
-
-### superuser
-
-Name of the initial superuser role, passed to initdb when a new tenant
-is initialized. It doesn't affect anything after initialization.
-Note: The default is 'zenith_admin', and the console
-depends on that, so if you change it, bad things will happen.
-
-### page_cache_size
-
-Size of the page cache, to hold materialized page versions. Unit is
-number of 8 kB blocks. The default is 8192, which means 64 MB.
-
-### max_file_descriptors
-
-Max number of file descriptors to hold open concurrently for accessing
-layer files. This should be kept well below the process/container/OS
-limit (see `ulimit -n`), as the pageserver also needs file descriptors
-for other files and for sockets for incoming connections.
-
-### postgres-distrib
-
-A directory with Postgres installation to use during pageserver activities.
-Inside that dir, a `bin/postgres` binary should be present.
-
-The default distrib dir is `./tmp_install/`.
-
-### workdir (-D)
-
-A directory in the file system, where pageserver will store its files.
-The default is `./.zenith/`.
-
-### Remote storage
-
-There's a way to automatically backup and restore some of the pageserver's data from working dir to the remote storage.
-The backup system is disabled by default and can be enabled for either of the currently available storages:
-
-#### Local FS storage
-
-##### remote-storage-local-path
-
-Pageserver can back up and restore some of its workdir contents to another directory.
-For that, only a path to that directory needs to be specified as a parameter.
-
-#### S3 storage
-
-Pageserver can back up and restore some of its workdir contents to S3.
-Full set of S3 credentials is needed for that as parameters:
-
-##### remote-storage-s3-bucket
-
-Name of the bucket to connect to, example: "some-sample-bucket".
-
-##### remote-storage-region
-
-Name of the region where the bucket is located at, example: "eu-north-1"
-
-##### remote-storage-access-key
-
-Access key to connect to the bucket ("login" part of the credentials), example: "AKIAIOSFODNN7EXAMPLE"
-
-##### remote-storage-secret-access-key
-
-Secret access key to connect to the bucket ("password" part of the credentials), example: "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY"
-
-#### General remote storage configuration
-
-Pageserver allows only one remote storage to be configured at a time and returns an error if parameters from multiple different remote configurations are used.
-No default values are used for the remote storage configuration parameters.
-
-##### remote-storage-max-concurrent-sync
-
-Max number of concurrent connections to open for uploading to or
-downloading from S3.
-The default value is 100.
-
-## safekeeper
-
-TODO
--- a/docs/sourcetree.md
+++ b/docs/sourcetree.md
@@ -79,61 +79,3 @@ Helpers for exposing Prometheus metrics from the server.
 `/zenith_utils`:

 Helpers that are shared between other crates in this repository.
-
-## Using Python
-Note that Debian/Ubuntu Python packages are stale, as it commonly happens,
-so manual installation of dependencies is not recommended.
-
-A single virtual environment with all dependencies is described in the single `Pipfile`.
-
-### Prerequisites
- Install Python 3.7 (the minimal supported version)
-    - Later version (e.g. 3.8) is ok if you don't write Python code
-    - You can install Python 3.7 separately, e.g.:
-      ```bash
-      # In Ubuntu
-      sudo add-apt-repository ppa:deadsnakes/ppa
-      sudo apt update
-      sudo apt install python3.7
-      ```
- Install `pipenv`
-    - Exact version of `pipenv` is not important, you can use Debian/Ubuntu package `pipenv`.
- Install dependencies via either
-  * `pipenv --python 3.7 install --dev` if you will write Python code, or
-  * `pipenv install` if you only want to run Python scripts and don't have Python 3.7.
-
-Run `pipenv shell` to activate the virtual environment.
-Alternatively, use `pipenv run` to run a single command in the venv, e.g. `pipenv run pytest`.
-
-### Obligatory checks
-We force code formatting via `yapf` and type hints via `mypy`.
-Run the following commands in the repository's root (next to `setup.cfg`):
-
-```bash
-pipenv run yapf -ri .  # All code is reformatted
-pipenv run mypy .  # Ensure there are no typing errors
-```
-
-**WARNING**: do not run `mypy` from a directory other than the root of the repository.
-Otherwise it will not find its configuration.
-
-Also consider:
-
-* Running `flake8` (or a linter of your choice, e.g. `pycodestyle`) and fixing possible defects, if any.
-* Adding more type hints to your code to avoid `Any`.
-
-### Changing dependencies
-You have to update `Pipfile.lock` if you have changed `Pipfile`:
-
-```bash
-pipenv --python 3.7 install --dev  # Re-create venv for Python 3.7 and install recent pipenv inside
-pipenv run pipenv --version  # Should be at least 2021.5.29
-pipenv run pipenv lock  # Regenerate Pipfile.lock
-```
-
-As the minimal supported version is Python 3.7 and we use it in CI,
-you have to use a Python 3.7 environment when updating `Pipfile.lock`.
-Otherwise some back-compatibility packages will be missing.
-
-It is also important to run recent `pipenv`.
-Older versions remove markers from `Pipfile.lock`.
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -5,7 +5,7 @@ authors = ["Stas Kelvich <stas@zenith.tech>"]
 edition = "2018"

 [dependencies]
-bookfile = { git = "https://github.com/zenithdb/bookfile.git", branch="generic-readext" }
+bookfile = "^0.3"
 chrono = "0.4.19"
 rand = "0.8.3"
 regex = "1.4.5"
@@ -17,7 +17,7 @@ lazy_static = "1.4.0"
 log = "0.4.14"
 clap = "2.33.0"
 daemonize = "0.4.1"
-tokio = { version = "1.11", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] }
+tokio = { version = "1.11", features = ["process", "macros", "fs"] }
 postgres-types = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
 postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
 postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
@@ -32,21 +32,11 @@ serde = { version = "1.0", features = ["derive"] }
 serde_json = "1"
 toml = "0.5"
 scopeguard = "1.1.0"
+rust-s3 = { version = "0.27.0-rc4", features = ["no-verify-ssl"] }
 async-trait = "0.1"
 const_format = "0.2.21"
-tracing = "0.1.27"
-signal-hook = "0.3.10"
-url = "2"
-nix = "0.23"
-once_cell = "1.8.0"
-
-rust-s3 = { version = "0.27.0-rc4", features = ["no-verify-ssl"] }

 postgres_ffi = { path = "../postgres_ffi" }
 zenith_metrics = { path = "../zenith_metrics" }
 zenith_utils = { path = "../zenith_utils" }
 workspace_hack = { path = "../workspace_hack" }
-
-[dev-dependencies]
-hex-literal = "0.3"
-tempfile = "3.2"
--- a/pageserver/README.md
+++ b/pageserver/README.md
@@ -7,9 +7,8 @@ The Page Server has a few different duties:
 - Replay WAL that's applicable to the chunks that the Page Server maintains
 - Backup to S3

-S3 is the main fault-tolerant storage of all data, as there are no Page Server
-replicas. We use a separate fault-tolerant WAL service to reduce latency. It
-keeps track of WAL records which are not synced to S3 yet.
+
+

 The Page Server consists of multiple threads that operate on a shared
 repository of page versions:
@@ -41,7 +40,7 @@ Legend:
 +--+

 ....
-.  .   Component at its early development phase.
+.  .   Component that we will need, but doesn't exist at the moment. A TODO.
 ....

 --->   Data flow
@@ -116,49 +115,13 @@ Remove old on-disk layer files that are no longer needed according to the
 PITR retention policy


-### Backup service
+TODO: Backup service
+--------------------

-The backup service, responsible for storing pageserver recovery data externally.
+The backup service is responsible for periodically pushing the chunks to S3.

-Currently, pageserver stores its files in a filesystem directory it's pointed to.
-That working directory could be rather ephemeral for such cases as "a pageserver pod running in k8s with no persistent volumes attached".
-Therefore, the server interacts with external, more reliable storage to back up and restore its state.
-
-The code for storage support is extensible and can support arbitrary ones as long as they implement a certain Rust trait.
-There are the following implementations present:
-* local filesystem — to use in tests mainly
-* AWS S3           - to use in production
-
-Implementation details are covered in the [backup readme](./src/remote_storage/README.md) and corresponding Rust file docs.
-
-The backup service is disabled by default and can be enabled to interact with a single remote storage.
-
-CLI examples:
-* Local FS: `${PAGESERVER_BIN} --remote-storage-local-path="/some/local/path/"`
-* AWS S3  : `${PAGESERVER_BIN} --remote-storage-s3-bucket="some-sample-bucket" --remote-storage-region="eu-north-1" --remote-storage-access-key="SOMEKEYAAAAASADSAH*#" --remote-storage-secret-access-key="SOMEsEcReTsd292v"`
-
-For Amazon AWS S3, the key id and secret access key can be found in `~/.aws/credentials` if awscli was ever configured to work with the desired bucket, or on the AWS Settings page for a certain user. Also note that bucket names do not contain any protocol prefix when used on AWS.
-For local S3 installations, refer to their documentation for the name format and credentials.
-
-Similar to other pageserver settings, toml config file can be used to configure either of the storages as backup targets.
-Required sections are:
-
-```toml
-[remote_storage]
-local_path = '/Users/someonetoignore/Downloads/tmp_dir/'
-```
-
-or
-
-```toml
-[remote_storage]
-bucket_name = 'some-sample-bucket'
-bucket_region = 'eu-north-1'
-access_key_id = 'SOMEKEYAAAAASADSAH*#'
-secret_access_key = 'SOMEsEcReTsd292v'
-```
-
-Also, `AWS_SECRET_ACCESS_KEY` and `AWS_ACCESS_KEY_ID` variables can be used to specify the credentials instead of any of the ways above.
+TODO: How/when do we restore from S3? Whenever we get a GetPage@LSN request for
+a chunk we don't currently have? Or when an external Control Plane tells us?

 TODO: Sharding
 --------------------
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -10,10 +10,9 @@
 //! This module is responsible for creation of such tarball
 //! from data stored in object storage.
 //!
-use anyhow::{Context, Result};
+use anyhow::Result;
 use bytes::{BufMut, BytesMut};
 use log::*;
-use std::fmt::Write as FmtWrite;
 use std::io;
 use std::io::Write;
 use std::sync::Arc;
@@ -32,7 +31,7 @@ use zenith_utils::lsn::Lsn;
 pub struct Basebackup<'a> {
    ar: Builder<&'a mut dyn Write>,
    timeline: &'a Arc<dyn Timeline>,
-    pub lsn: Lsn,
+    lsn: Lsn,
    prev_record_lsn: Lsn,
 }

@@ -84,7 +83,7 @@ impl<'a> Basebackup<'a> {

        info!(
            "taking basebackup lsn={}, prev_lsn={}",
-            backup_lsn, backup_prev
+            backup_prev, backup_lsn
        );

        Ok(Basebackup {
@@ -98,6 +97,7 @@ impl<'a> Basebackup<'a> {
    pub fn send_tarball(&mut self) -> anyhow::Result<()> {
        // Create pgdata subdirs structure
        for dir in pg_constants::PGDATA_SUBDIRS.iter() {
+            info!("send subdir {:?}", *dir);
            let header = new_tar_header_dir(*dir)?;
            self.ar.append(&header, &mut io::empty())?;
        }
@@ -242,16 +242,20 @@ impl<'a> Basebackup<'a> {
    fn add_pgcontrol_file(&mut self) -> anyhow::Result<()> {
        let checkpoint_bytes = self
            .timeline
-            .get_page_at_lsn(RelishTag::Checkpoint, 0, self.lsn)
-            .context("failed to get checkpoint bytes")?;
-        let pg_control_bytes = self
-            .timeline
-            .get_page_at_lsn(RelishTag::ControlFile, 0, self.lsn)
-            .context("failed get control bytes")?;
+            .get_page_at_lsn(RelishTag::Checkpoint, 0, self.lsn)?;
+        let pg_control_bytes =
+            self.timeline
+                .get_page_at_lsn(RelishTag::ControlFile, 0, self.lsn)?;
        let mut pg_control = ControlFileData::decode(&pg_control_bytes)?;
        let mut checkpoint = CheckPoint::decode(&checkpoint_bytes)?;

-        // Generate new pg_control needed for bootstrap
+        // Generate new pg_control and WAL needed for bootstrap
+        let checkpoint_segno = self.lsn.segment_number(pg_constants::WAL_SEGMENT_SIZE);
+        let checkpoint_lsn = XLogSegNoOffsetToRecPtr(
+            checkpoint_segno,
+            XLOG_SIZE_OF_XLOG_LONG_PHD as u32,
+            pg_constants::WAL_SEGMENT_SIZE,
+        );
        checkpoint.redo = normalize_lsn(self.lsn, pg_constants::WAL_SEGMENT_SIZE).0;

        //reset some fields we don't want to preserve
@@ -260,24 +264,19 @@ impl<'a> Basebackup<'a> {
        checkpoint.oldestActiveXid = 0;

        //save new values in pg_control
-        pg_control.checkPoint = 0;
+        pg_control.checkPoint = checkpoint_lsn;
        pg_control.checkPointCopy = checkpoint;
        pg_control.state = pg_constants::DB_SHUTDOWNED;

        // add zenith.signal file
-        let mut zenith_signal = String::new();
-        if self.prev_record_lsn == Lsn(0) {
-            if self.lsn == self.timeline.get_ancestor_lsn() {
-                write!(zenith_signal, "PREV LSN: none")?;
-            } else {
-                write!(zenith_signal, "PREV LSN: invalid")?;
-            }
+        let xl_prev = if self.prev_record_lsn == Lsn(0) {
+            0xBAD0 // magic value to indicate that we don't know prev_lsn
        } else {
-            write!(zenith_signal, "PREV LSN: {}", self.prev_record_lsn)?;
-        }
+            self.prev_record_lsn.0
+        };
        self.ar.append(
-            &new_tar_header("zenith.signal", zenith_signal.len() as u64)?,
-            zenith_signal.as_bytes(),
+            &new_tar_header("zenith.signal", 8)?,
+            &xl_prev.to_le_bytes()[..],
        )?;

        //send pg_control
@@ -286,11 +285,14 @@ impl<'a> Basebackup<'a> {
        self.ar.append(&header, &pg_control_bytes[..])?;

        //send wal segment
-        let segno = self.lsn.segment_number(pg_constants::WAL_SEGMENT_SIZE);
-        let wal_file_name = XLogFileName(PG_TLI, segno, pg_constants::WAL_SEGMENT_SIZE);
+        let wal_file_name = XLogFileName(
+            1, // FIXME: always use Postgres timeline 1
+            checkpoint_segno,
+            pg_constants::WAL_SEGMENT_SIZE,
+        );
        let wal_file_path = format!("pg_wal/{}", wal_file_name);
        let header = new_tar_header(&wal_file_path, pg_constants::WAL_SEGMENT_SIZE as u64)?;
-        let wal_seg = generate_wal_segment(segno, pg_control.system_identifier);
+        let wal_seg = generate_wal_segment(&pg_control);
        assert!(wal_seg.len() == pg_constants::WAL_SEGMENT_SIZE);
        self.ar.append(&header, &wal_seg[..])?;
        Ok(())
--- a/pageserver/src/bin/dump_layerfile.rs
+++ b/pageserver/src/bin/dump_layerfile.rs
@@ -4,14 +4,11 @@
 use anyhow::Result;
 use clap::{App, Arg};
 use pageserver::layered_repository::dump_layerfile_from_path;
-use pageserver::virtual_file;
 use std::path::PathBuf;
-use zenith_utils::GIT_VERSION;

 fn main() -> Result<()> {
    let arg_matches = App::new("Zenith dump_layerfile utility")
        .about("Dump contents of one layer file, for debugging")
-        .version(GIT_VERSION)
        .arg(
            Arg::with_name("path")
                .help("Path to file to dump")
@@ -22,9 +19,6 @@ fn main() -> Result<()> {

    let path = PathBuf::from(arg_matches.value_of("path").unwrap());

-    // Basic initialization of things that don't change after startup
-    virtual_file::init(10);
-
    dump_layerfile_from_path(&path)?;

    Ok(())
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -2,35 +2,33 @@
 // Main entry point for the Page Server executable
 //

+use log::*;
+use pageserver::defaults::*;
 use serde::{Deserialize, Serialize};
 use std::{
    env,
-    num::{NonZeroU32, NonZeroUsize},
+    net::TcpListener,
    path::{Path, PathBuf},
    str::FromStr,
    thread,
 };
-use tracing::*;
-use zenith_utils::{auth::JwtAuth, logging, postgres_backend::AuthType, tcp_listener, GIT_VERSION};
+use zenith_utils::{auth::JwtAuth, logging, postgres_backend::AuthType};

 use anyhow::{bail, ensure, Context, Result};
-
 use clap::{App, Arg, ArgMatches};
 use daemonize::Daemonize;

 use pageserver::{
-    branches, defaults::*, http, page_cache, page_service, remote_storage, tenant_mgr,
-    virtual_file, PageServerConf, RemoteStorageConfig, RemoteStorageKind, S3Config, LOG_FILE_NAME,
+    branches,
+    defaults::{DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_PG_LISTEN_ADDR},
+    http, page_service, tenant_mgr, PageServerConf, RelishStorageConfig, S3Config, LOG_FILE_NAME,
 };
 use zenith_utils::http::endpoint;
-use zenith_utils::postgres_backend;
-use zenith_utils::shutdown::exit_now;
-use zenith_utils::signals::{self, Signal};

 use const_format::formatcp;

 /// String arguments that can be declared via CLI or config file
-#[derive(Serialize, Deserialize, PartialEq, Eq, Clone)]
+#[derive(Serialize, Deserialize)]
 struct CfgFileParams {
    listen_pg_addr: Option<String>,
    listen_http_addr: Option<String>,
@@ -38,29 +36,15 @@ struct CfgFileParams {
    checkpoint_period: Option<String>,
    gc_horizon: Option<String>,
    gc_period: Option<String>,
-    open_mem_limit: Option<String>,
-    page_cache_size: Option<String>,
-    max_file_descriptors: Option<String>,
    pg_distrib_dir: Option<String>,
    auth_validation_public_key_path: Option<String>,
    auth_type: Option<String>,
-    remote_storage_max_concurrent_sync: Option<String>,
-    remote_storage_max_sync_errors: Option<String>,
-    /////////////////////////////////
-    //// Don't put `Option<String>` and other "simple" values below.
-    ////
-    /// `Option<RemoteStorage>` is a <a href='https://toml.io/en/v1.0.0#table'>table</a> in TOML.
-    /// Values in TOML cannot be defined after tables (other tables can),
-    /// and [`toml`] crate serializes all fields in the order of their appearance.
-    ////////////////////////////////
-    remote_storage: Option<RemoteStorage>,
+    // see https://github.com/alexcrichton/toml-rs/blob/6c162e6562c3e432bf04c82a3d1d789d80761a86/examples/enum_external.rs for enum deserialisation examples
+    relish_storage: Option<RelishStorage>,
 }

-#[derive(Serialize, Deserialize, PartialEq, Eq, Clone)]
-// Without this attribute, enums with values won't be serialized by the `toml` library (but can be deserialized nonetheless!).
-// See https://github.com/alexcrichton/toml-rs/blob/6c162e6562c3e432bf04c82a3d1d789d80761a86/examples/enum_external.rs for the examples
-#[serde(untagged)]
-enum RemoteStorage {
+#[derive(Serialize, Deserialize, Clone)]
+enum RelishStorage {
    Local {
        local_path: String,
    },
@@ -81,37 +65,32 @@ impl CfgFileParams {
            arg_matches.value_of(arg_name).map(str::to_owned)
        };

-        let remote_storage = if let Some(local_path) = get_arg("remote-storage-local-path") {
-            Some(RemoteStorage::Local { local_path })
+        let relish_storage = if let Some(local_path) = get_arg("relish-storage-local-path") {
+            Some(RelishStorage::Local { local_path })
        } else if let Some((bucket_name, bucket_region)) =
-            get_arg("remote-storage-s3-bucket").zip(get_arg("remote-storage-region"))
+            get_arg("relish-storage-s3-bucket").zip(get_arg("relish-storage-region"))
        {
-            Some(RemoteStorage::AwsS3 {
+            Some(RelishStorage::AwsS3 {
                bucket_name,
                bucket_region,
-                access_key_id: get_arg("remote-storage-access-key"),
-                secret_access_key: get_arg("remote-storage-secret-access-key"),
+                access_key_id: get_arg("relish-storage-access-key"),
+                secret_access_key: get_arg("relish-storage-secret-access-key"),
            })
        } else {
            None
        };

        Self {
-            listen_pg_addr: get_arg("listen_pg_addr"),
-            listen_http_addr: get_arg("listen_http_addr"),
+            listen_pg_addr: get_arg("listen-pg"),
+            listen_http_addr: get_arg("listen-http"),
            checkpoint_distance: get_arg("checkpoint_distance"),
            checkpoint_period: get_arg("checkpoint_period"),
            gc_horizon: get_arg("gc_horizon"),
            gc_period: get_arg("gc_period"),
-            open_mem_limit: get_arg("open_mem_limit"),
-            page_cache_size: get_arg("page_cache_size"),
-            max_file_descriptors: get_arg("max_file_descriptors"),
            pg_distrib_dir: get_arg("postgres-distrib"),
            auth_validation_public_key_path: get_arg("auth-validation-public-key-path"),
            auth_type: get_arg("auth-type"),
-            remote_storage,
-            remote_storage_max_concurrent_sync: get_arg("remote-storage-max-concurrent-sync"),
-            remote_storage_max_sync_errors: get_arg("remote-storage-max-sync-errors"),
+            relish_storage,
        }
    }

@@ -125,21 +104,12 @@ impl CfgFileParams {
            checkpoint_period: self.checkpoint_period.or(other.checkpoint_period),
            gc_horizon: self.gc_horizon.or(other.gc_horizon),
            gc_period: self.gc_period.or(other.gc_period),
-            open_mem_limit: self.open_mem_limit.or(other.open_mem_limit),
-            page_cache_size: self.page_cache_size.or(other.page_cache_size),
-            max_file_descriptors: self.max_file_descriptors.or(other.max_file_descriptors),
            pg_distrib_dir: self.pg_distrib_dir.or(other.pg_distrib_dir),
            auth_validation_public_key_path: self
                .auth_validation_public_key_path
                .or(other.auth_validation_public_key_path),
            auth_type: self.auth_type.or(other.auth_type),
-            remote_storage: self.remote_storage.or(other.remote_storage),
-            remote_storage_max_concurrent_sync: self
-                .remote_storage_max_concurrent_sync
-                .or(other.remote_storage_max_concurrent_sync),
-            remote_storage_max_sync_errors: self
-                .remote_storage_max_sync_errors
-                .or(other.remote_storage_max_sync_errors),
+            relish_storage: self.relish_storage.or(other.relish_storage),
        }
    }

@@ -175,21 +145,6 @@ impl CfgFileParams {
            None => DEFAULT_GC_PERIOD,
        };

-        let open_mem_limit: usize = match self.open_mem_limit.as_ref() {
-            Some(open_mem_limit_str) => open_mem_limit_str.parse()?,
-            None => DEFAULT_OPEN_MEM_LIMIT,
-        };
-
-        let page_cache_size: usize = match self.page_cache_size.as_ref() {
-            Some(page_cache_size_str) => page_cache_size_str.parse()?,
-            None => DEFAULT_PAGE_CACHE_SIZE,
-        };
-
-        let max_file_descriptors: usize = match self.max_file_descriptors.as_ref() {
-            Some(max_file_descriptors_str) => max_file_descriptors_str.parse()?,
-            None => DEFAULT_MAX_FILE_DESCRIPTORS,
-        };
-
        let pg_distrib_dir = match self.pg_distrib_dir.as_ref() {
            Some(pg_distrib_dir_str) => PathBuf::from(pg_distrib_dir_str),
            None => env::current_dir()?.join("tmp_install"),
@@ -223,37 +178,25 @@ impl CfgFileParams {
            );
        }

-        let max_concurrent_sync = match self.remote_storage_max_concurrent_sync.as_deref() {
-            Some(number_str) => number_str.parse()?,
-            None => NonZeroUsize::new(DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNC).unwrap(),
-        };
-        let max_sync_errors = match self.remote_storage_max_sync_errors.as_deref() {
-            Some(number_str) => number_str.parse()?,
-            None => NonZeroU32::new(DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS).unwrap(),
-        };
-        let remote_storage_config = self.remote_storage.as_ref().map(|storage_params| {
-            let storage = match storage_params.clone() {
-                RemoteStorage::Local { local_path } => {
-                    RemoteStorageKind::LocalFs(PathBuf::from(local_path))
-                }
-                RemoteStorage::AwsS3 {
-                    bucket_name,
-                    bucket_region,
-                    access_key_id,
-                    secret_access_key,
-                } => RemoteStorageKind::AwsS3(S3Config {
-                    bucket_name,
-                    bucket_region,
-                    access_key_id,
-                    secret_access_key,
-                }),
-            };
-            RemoteStorageConfig {
-                max_concurrent_sync,
-                max_sync_errors,
-                storage,
-            }
-        });
+        let relish_storage_config =
+            self.relish_storage
+                .as_ref()
+                .map(|storage_params| match storage_params.clone() {
+                    RelishStorage::Local { local_path } => {
+                        RelishStorageConfig::LocalFs(PathBuf::from(local_path))
+                    }
+                    RelishStorage::AwsS3 {
+                        bucket_name,
+                        bucket_region,
+                        access_key_id,
+                        secret_access_key,
+                    } => RelishStorageConfig::AwsS3(S3Config {
+                        bucket_name,
+                        bucket_region,
+                        access_key_id,
+                        secret_access_key,
+                    }),
+                });

        Ok(PageServerConf {
            daemonize: false,
@@ -264,9 +207,6 @@ impl CfgFileParams {
            checkpoint_period,
            gc_horizon,
            gc_period,
-            open_mem_limit,
-            page_cache_size,
-            max_file_descriptors,

            superuser: String::from(DEFAULT_SUPERUSER),

@@ -276,28 +216,26 @@ impl CfgFileParams {

            auth_validation_public_key_path,
            auth_type,
-            remote_storage_config,
+            relish_storage_config,
        })
    }
 }

 fn main() -> Result<()> {
-    zenith_metrics::set_common_metrics_prefix("pageserver");
    let arg_matches = App::new("Zenith page server")
        .about("Materializes WAL stream to pages and serves them to the postgres")
-        .version(GIT_VERSION)
        .arg(
-            Arg::with_name("listen_pg_addr")
+            Arg::with_name("listen-pg")
                .short("l")
-                .long("listen_pg_addr")
-                .aliases(&["listen", "listen-pg"]) // keep some compatibility
+                .long("listen-pg")
+                .alias("listen") // keep some compatibility
                .takes_value(true)
                .help(formatcp!("listen for incoming page requests on ip:port (default: {DEFAULT_PG_LISTEN_ADDR})")),
        )
        .arg(
-            Arg::with_name("listen_http_addr")
-                .long("listen_http_addr")
-                .aliases(&["http_endpoint", "listen-http"]) // keep some compatibility
+            Arg::with_name("listen-http")
+                .long("listen-http")
+                .alias("http_endpoint") // keep some compatibility
                .takes_value(true)
                .help(formatcp!("http endpoint address for metrics and management API calls on ip:port (default: {DEFAULT_HTTP_LISTEN_ADDR})")),
        )
@@ -338,25 +276,6 @@ fn main() -> Result<()> {
                .takes_value(true)
                .help("Interval between garbage collector iterations"),
        )
-        .arg(
-            Arg::with_name("open_mem_limit")
-                .long("open_mem_limit")
-                .takes_value(true)
-                .help("Amount of memory reserved for buffering incoming WAL"),
-        )
-        .arg(
-
-            Arg::with_name("page_cache_size")
-                .long("page_cache_size")
-                .takes_value(true)
-                .help("Number of pages in the page cache"),
-        )
-        .arg(
-            Arg::with_name("max_file_descriptors")
-                .long("max_file_descriptors")
-                .takes_value(true)
-                .help("Max number of file descriptors to keep open for files"),
-        )
        .arg(
            Arg::with_name("workdir")
                .short("D")
@@ -390,48 +309,42 @@ fn main() -> Result<()> {
                .help("Authentication scheme type. One of: Trust, MD5, ZenithJWT"),
        )
        .arg(
-            Arg::with_name("remote-storage-local-path")
-                .long("remote-storage-local-path")
+            Arg::with_name("relish-storage-local-path")
+                .long("relish-storage-local-path")
                .takes_value(true)
-                .help("Path to the local directory, to be used as an external remote storage")
+                .help("Path to the local directory, to be used as an external relish storage")
                .conflicts_with_all(&[
-                    "remote-storage-s3-bucket",
-                    "remote-storage-region",
-                    "remote-storage-access-key",
-                    "remote-storage-secret-access-key",
+                    "relish-storage-s3-bucket",
+                    "relish-storage-region",
+                    "relish-storage-access-key",
+                    "relish-storage-secret-access-key",
                ]),
        )
        .arg(
-            Arg::with_name("remote-storage-s3-bucket")
-                .long("remote-storage-s3-bucket")
+            Arg::with_name("relish-storage-s3-bucket")
+                .long("relish-storage-s3-bucket")
                .takes_value(true)
-                .help("Name of the AWS S3 bucket to use an external remote storage")
-                .requires("remote-storage-region"),
+                .help("Name of the AWS S3 bucket to use an external relish storage")
+                .requires("relish-storage-region"),
        )
        .arg(
-            Arg::with_name("remote-storage-region")
-                .long("remote-storage-region")
+            Arg::with_name("relish-storage-region")
+                .long("relish-storage-region")
                .takes_value(true)
                .help("Region of the AWS S3 bucket"),
        )
        .arg(
-            Arg::with_name("remote-storage-access-key")
-                .long("remote-storage-access-key")
+            Arg::with_name("relish-storage-access-key")
+                .long("relish-storage-access-key")
                .takes_value(true)
                .help("Credentials to access the AWS S3 bucket"),
        )
        .arg(
-            Arg::with_name("remote-storage-secret-access-key")
-                .long("remote-storage-secret-access-key")
+            Arg::with_name("relish-storage-secret-access-key")
+                .long("relish-storage-secret-access-key")
                .takes_value(true)
                .help("Credentials to access the AWS S3 bucket"),
        )
-        .arg(
-            Arg::with_name("remote-storage-max-concurrent-sync")
-                .long("remote-storage-max-concurrent-sync")
-                .takes_value(true)
-                .help("Maximum allowed concurrent synchronisations with storage"),
-        )
        .get_matches();

    let workdir = Path::new(arg_matches.value_of("workdir").unwrap_or(".zenith"));
@@ -488,11 +401,6 @@ fn main() -> Result<()> {
    // as a ref.
    let conf: &'static PageServerConf = Box::leak(Box::new(conf));

-    // Basic initialization of things that don't change after startup
-    virtual_file::init(conf.max_file_descriptors);
-
-    page_cache::init(conf);
-
    // Create repo and exit if init was requested
    if init {
        branches::init_pageserver(conf, create_tenant).context("Failed to init pageserver")?;
@@ -514,9 +422,7 @@ fn main() -> Result<()> {

 fn start_pageserver(conf: &'static PageServerConf) -> Result<()> {
    // Initialize logger
-    let log_file = logging::init(LOG_FILE_NAME, conf.daemonize)?;
-
-    info!("version: {}", GIT_VERSION);
+    let (_scope_guard, log_file) = logging::init(LOG_FILE_NAME, conf.daemonize)?;

    // TODO: Check that it looks like a valid repository before going further

@@ -525,15 +431,14 @@ fn start_pageserver(conf: &'static PageServerConf) -> Result<()> {
        "Starting pageserver http handler on {}",
        conf.listen_http_addr
    );
-    let http_listener = tcp_listener::bind(conf.listen_http_addr.clone())?;
+    let http_listener = TcpListener::bind(conf.listen_http_addr.clone())?;

    info!(
        "Starting pageserver pg protocol handler on {}",
        conf.listen_pg_addr
    );
-    let pageserver_listener = tcp_listener::bind(conf.listen_pg_addr.clone())?;
+    let pageserver_listener = TcpListener::bind(conf.listen_pg_addr.clone())?;

-    // XXX: Don't spawn any threads before daemonizing!
    if conf.daemonize {
        info!("daemonizing...");

@@ -548,25 +453,18 @@ fn start_pageserver(conf: &'static PageServerConf) -> Result<()> {
            .stdout(stdout)
            .stderr(stderr);

-        // XXX: The parent process should exit abruptly right after
-        // it has spawned a child to prevent coverage machinery from
-        // dumping stats into a `profraw` file now owned by the child.
-        // Otherwise, the coverage data will be damaged.
-        match daemonize.exit_action(|| exit_now(0)).start() {
+        match daemonize.start() {
            Ok(_) => info!("Success, daemonized"),
-            Err(err) => error!(%err, "could not daemonize"),
+            Err(e) => error!("Error, {}", e),
        }
    }

-    let signals = signals::install_shutdown_handlers()?;
-    let mut threads = vec![];
-
-    if let Some(handle) = remote_storage::run_storage_sync_thread(conf)? {
-        threads.push(handle);
-    }
    // Initialize tenant manager.
    tenant_mgr::init(conf);

+    // keep join handles for spawned threads
+    let mut join_handles = vec![];
+
    // initialize authentication for incoming connections
    let auth = match &conf.auth_type {
        AuthType::Trust | AuthType::MD5 => None,
@@ -581,204 +479,30 @@ fn start_pageserver(conf: &'static PageServerConf) -> Result<()> {
    // Spawn a new thread for the http endpoint
    // bind before launching separate thread so the error reported before startup exits
    let cloned = auth.clone();
-    threads.push(
-        thread::Builder::new()
-            .name("http_endpoint_thread".into())
-            .spawn(move || {
-                let router = http::make_router(conf, cloned);
-                endpoint::serve_thread_main(router, http_listener)
-            })?,
-    );
+    let http_endpoint_thread = thread::Builder::new()
+        .name("http_endpoint_thread".into())
+        .spawn(move || {
+            let router = http::make_router(conf, cloned);
+            endpoint::serve_thread_main(router, http_listener)
+        })?;
+
+    join_handles.push(http_endpoint_thread);

    // Spawn a thread to listen for connections. It will spawn further threads
    // for each connection.
-    threads.push(
-        thread::Builder::new()
-            .name("Page Service thread".into())
-            .spawn(move || {
-                page_service::thread_main(conf, auth, pageserver_listener, conf.auth_type)
-            })?,
-    );
+    let page_service_thread = thread::Builder::new()
+        .name("Page Service thread".into())
+        .spawn(move || {
+            page_service::thread_main(conf, auth, pageserver_listener, conf.auth_type)
+        })?;

-    signals.handle(|signal| match signal {
-        Signal::Quit => {
-            info!(
-                "Got {}. Terminating in immediate shutdown mode",
-                signal.name()
-            );
-            std::process::exit(111);
-        }
+    join_handles.push(page_service_thread);

-        Signal::Interrupt | Signal::Terminate => {
-            info!(
-                "Got {}. Terminating gracefully in fast shutdown mode",
-                signal.name()
-            );
-
-            postgres_backend::set_pgbackend_shutdown_requested();
-            tenant_mgr::shutdown_all_tenants()?;
-            endpoint::shutdown();
-
-            for handle in std::mem::take(&mut threads) {
-                handle
-                    .join()
-                    .expect("thread panicked")
-                    .expect("thread exited with an error");
-            }
-
-            info!("Shut down successfully completed");
-            std::process::exit(0);
-        }
-    })
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn page_server_conf_toml_serde() {
-        let params = CfgFileParams {
-            listen_pg_addr: Some("listen_pg_addr_VALUE".to_string()),
-            listen_http_addr: Some("listen_http_addr_VALUE".to_string()),
-            checkpoint_distance: Some("checkpoint_distance_VALUE".to_string()),
-            checkpoint_period: Some("checkpoint_period_VALUE".to_string()),
-            gc_horizon: Some("gc_horizon_VALUE".to_string()),
-            gc_period: Some("gc_period_VALUE".to_string()),
-            open_mem_limit: Some("open_mem_limit_VALUE".to_string()),
-            page_cache_size: Some("page_cache_size_VALUE".to_string()),
-            max_file_descriptors: Some("max_file_descriptors_VALUE".to_string()),
-            pg_distrib_dir: Some("pg_distrib_dir_VALUE".to_string()),
-            auth_validation_public_key_path: Some(
-                "auth_validation_public_key_path_VALUE".to_string(),
-            ),
-            auth_type: Some("auth_type_VALUE".to_string()),
-            remote_storage: Some(RemoteStorage::Local {
-                local_path: "remote_storage_local_VALUE".to_string(),
-            }),
-            remote_storage_max_concurrent_sync: Some(
-                "remote_storage_max_concurrent_sync_VALUE".to_string(),
-            ),
-            remote_storage_max_sync_errors: Some(
-                "remote_storage_max_sync_errors_VALUE".to_string(),
-            ),
-        };
-
-        let toml_string = toml::to_string(&params).expect("Failed to serialize correct config");
-        let toml_pretty_string =
-            toml::to_string_pretty(&params).expect("Failed to serialize correct config");
-        assert_eq!(
-            r#"listen_pg_addr = 'listen_pg_addr_VALUE'
-listen_http_addr = 'listen_http_addr_VALUE'
-checkpoint_distance = 'checkpoint_distance_VALUE'
-checkpoint_period = 'checkpoint_period_VALUE'
-gc_horizon = 'gc_horizon_VALUE'
-gc_period = 'gc_period_VALUE'
-open_mem_limit = 'open_mem_limit_VALUE'
-page_cache_size = 'page_cache_size_VALUE'
-max_file_descriptors = 'max_file_descriptors_VALUE'
-pg_distrib_dir = 'pg_distrib_dir_VALUE'
-auth_validation_public_key_path = 'auth_validation_public_key_path_VALUE'
-auth_type = 'auth_type_VALUE'
-remote_storage_max_concurrent_sync = 'remote_storage_max_concurrent_sync_VALUE'
-remote_storage_max_sync_errors = 'remote_storage_max_sync_errors_VALUE'
-
-[remote_storage]
-local_path = 'remote_storage_local_VALUE'
-"#,
-            toml_pretty_string
-        );
-
-        let params_from_serialized: CfgFileParams = toml::from_str(&toml_string)
-            .expect("Failed to deserialize the serialization result of the config");
-        let params_from_serialized_pretty: CfgFileParams = toml::from_str(&toml_pretty_string)
-            .expect("Failed to deserialize the prettified serialization result of the config");
-        assert!(
-            params_from_serialized == params,
-            "Expected the same config in the end of config -> serialize -> deserialize chain"
-        );
-        assert!(
-            params_from_serialized_pretty == params,
-            "Expected the same config in the end of config -> serialize pretty -> deserialize chain"
-        );
-    }
-
-    #[test]
-    fn credentials_omitted_during_serialization() {
-        let params = CfgFileParams {
-            listen_pg_addr: Some("listen_pg_addr_VALUE".to_string()),
-            listen_http_addr: Some("listen_http_addr_VALUE".to_string()),
-            checkpoint_distance: Some("checkpoint_distance_VALUE".to_string()),
-            checkpoint_period: Some("checkpoint_period_VALUE".to_string()),
-            gc_horizon: Some("gc_horizon_VALUE".to_string()),
-            gc_period: Some("gc_period_VALUE".to_string()),
-            open_mem_limit: Some("open_mem_limit_VALUE".to_string()),
-            page_cache_size: Some("page_cache_size_VALUE".to_string()),
-            max_file_descriptors: Some("max_file_descriptors_VALUE".to_string()),
-            pg_distrib_dir: Some("pg_distrib_dir_VALUE".to_string()),
-            auth_validation_public_key_path: Some(
-                "auth_validation_public_key_path_VALUE".to_string(),
-            ),
-            auth_type: Some("auth_type_VALUE".to_string()),
-            remote_storage: Some(RemoteStorage::AwsS3 {
-                bucket_name: "bucket_name_VALUE".to_string(),
-                bucket_region: "bucket_region_VALUE".to_string(),
-                access_key_id: Some("access_key_id_VALUE".to_string()),
-                secret_access_key: Some("secret_access_key_VALUE".to_string()),
-            }),
-            remote_storage_max_concurrent_sync: Some(
-                "remote_storage_max_concurrent_sync_VALUE".to_string(),
-            ),
-            remote_storage_max_sync_errors: Some(
-                "remote_storage_max_sync_errors_VALUE".to_string(),
-            ),
-        };
-
-        let toml_string = toml::to_string(&params).expect("Failed to serialize correct config");
-        let toml_pretty_string =
-            toml::to_string_pretty(&params).expect("Failed to serialize correct config");
-        assert_eq!(
-            r#"listen_pg_addr = 'listen_pg_addr_VALUE'
-listen_http_addr = 'listen_http_addr_VALUE'
-checkpoint_distance = 'checkpoint_distance_VALUE'
-checkpoint_period = 'checkpoint_period_VALUE'
-gc_horizon = 'gc_horizon_VALUE'
-gc_period = 'gc_period_VALUE'
-open_mem_limit = 'open_mem_limit_VALUE'
-page_cache_size = 'page_cache_size_VALUE'
-max_file_descriptors = 'max_file_descriptors_VALUE'
-pg_distrib_dir = 'pg_distrib_dir_VALUE'
-auth_validation_public_key_path = 'auth_validation_public_key_path_VALUE'
-auth_type = 'auth_type_VALUE'
-remote_storage_max_concurrent_sync = 'remote_storage_max_concurrent_sync_VALUE'
-remote_storage_max_sync_errors = 'remote_storage_max_sync_errors_VALUE'
-
-[remote_storage]
-bucket_name = 'bucket_name_VALUE'
-bucket_region = 'bucket_region_VALUE'
-"#,
-            toml_pretty_string
-        );
-
-        let params_from_serialized: CfgFileParams = toml::from_str(&toml_string)
-            .expect("Failed to deserialize the serialization result of the config");
-        let params_from_serialized_pretty: CfgFileParams = toml::from_str(&toml_pretty_string)
-            .expect("Failed to deserialize the prettified serialization result of the config");
-
-        let mut expected_params = params;
-        expected_params.remote_storage = Some(RemoteStorage::AwsS3 {
-            bucket_name: "bucket_name_VALUE".to_string(),
-            bucket_region: "bucket_region_VALUE".to_string(),
-            access_key_id: None,
-            secret_access_key: None,
-        });
-        assert!(
-            params_from_serialized == expected_params,
-            "Expected the config without credentials in the end of a 'config -> serialize -> deserialize' chain"
-        );
-        assert!(
-            params_from_serialized_pretty == expected_params,
-            "Expected the config without credentials in the end of a 'config -> serialize pretty -> deserialize' chain"
-        );
+    for handle in join_handles.into_iter() {
+        handle
+            .join()
+            .expect("thread panicked")
+            .expect("thread exited with an error")
    }
+    Ok(())
 }
--- a/pageserver/src/branches.rs
+++ b/pageserver/src/branches.rs
@@ -4,7 +4,7 @@
 // TODO: move all paths construction to conf impl
 //

-use anyhow::{bail, Context, Result};
+use anyhow::{bail, ensure, Context, Result};
 use postgres_ffi::ControlFileData;
 use serde::{Deserialize, Serialize};
 use std::{
@@ -14,16 +14,14 @@ use std::{
    str::FromStr,
    sync::Arc,
 };
-use tracing::*;
+use zenith_utils::zid::{ZTenantId, ZTimelineId};

-use zenith_utils::crashsafe_dir;
+use log::*;
 use zenith_utils::logging;
 use zenith_utils::lsn::Lsn;
-use zenith_utils::zid::{ZTenantId, ZTimelineId};

 use crate::tenant_mgr;
 use crate::walredo::WalRedoManager;
-use crate::CheckpointConfig;
 use crate::{repository::Repository, PageServerConf};
 use crate::{restore_local_repo, LOG_FILE_NAME};

@@ -36,14 +34,15 @@ pub struct BranchInfo {
    pub ancestor_id: Option<String>,
    pub ancestor_lsn: Option<String>,
    pub current_logical_size: usize,
-    pub current_logical_size_non_incremental: Option<usize>,
+    pub current_logical_size_non_incremental: usize,
 }

 impl BranchInfo {
    pub fn from_path<T: AsRef<Path>>(
        path: T,
+        conf: &PageServerConf,
+        tenantid: &ZTenantId,
        repo: &Arc<dyn Repository>,
-        include_non_incremental_logical_size: bool,
    ) -> Result<Self> {
        let name = path
            .as_ref()
@@ -56,22 +55,27 @@ impl BranchInfo {

        let timeline = repo.get_timeline(timeline_id)?;

-        // we use ancestor lsn zero if we don't have an ancestor, so turn this into an option based on timeline id
-        let (ancestor_id, ancestor_lsn) = match timeline.get_ancestor_timeline_id() {
-            Some(ancestor_id) => (
-                Some(ancestor_id.to_string()),
-                Some(timeline.get_ancestor_lsn().to_string()),
-            ),
-            None => (None, None),
-        };
+        let ancestor_path = conf.ancestor_path(&timeline_id, tenantid);
+        let mut ancestor_id: Option<String> = None;
+        let mut ancestor_lsn: Option<String> = None;

-        // non incremental size calculation can be heavy, so let it be optional
-        // needed for tests to check size calculation
-        let current_logical_size_non_incremental = include_non_incremental_logical_size
-            .then(|| {
-                timeline.get_current_logical_size_non_incremental(timeline.get_last_record_lsn())
-            })
-            .transpose()?;
+        if ancestor_path.exists() {
+            let ancestor = std::fs::read_to_string(ancestor_path)?;
+            let mut strings = ancestor.split('@');
+
+            ancestor_id = Some(
+                strings
+                    .next()
+                    .with_context(|| "wrong branch ancestor point in time format")?
+                    .to_owned(),
+            );
+            ancestor_lsn = Some(
+                strings
+                    .next()
+                    .with_context(|| "wrong branch ancestor point in time format")?
+                    .to_owned(),
+            );
+        }

        Ok(BranchInfo {
            name,
@@ -80,7 +84,8 @@ impl BranchInfo {
            ancestor_id,
            ancestor_lsn,
            current_logical_size: timeline.get_current_logical_size(),
-            current_logical_size_non_incremental,
+            current_logical_size_non_incremental: timeline
+                .get_current_logical_size_non_incremental(timeline.get_last_record_lsn())?,
        })
    }
 }
@@ -94,7 +99,7 @@ pub struct PointInTime {
 pub fn init_pageserver(conf: &'static PageServerConf, create_tenant: Option<&str>) -> Result<()> {
    // Initialize logger
    // use true as daemonize parameter because otherwise we pollute zenith cli output with a few pages long output of info messages
-    let _log_file = logging::init(LOG_FILE_NAME, true)?;
+    let (_scope_guard, _log_file) = logging::init(LOG_FILE_NAME, true)?;

    // We don't use the real WAL redo manager, because we don't want to spawn the WAL redo
    // process during repository initialization.
@@ -113,7 +118,7 @@ pub fn init_pageserver(conf: &'static PageServerConf, create_tenant: Option<&str
        println!("initializing tenantid {}", tenantid);
        create_repo(conf, tenantid, dummy_redo_mgr).with_context(|| "failed to create repo")?;
    }
-    crashsafe_dir::create_dir_all(conf.tenants_path())?;
+    fs::create_dir_all(conf.tenants_path())?;

    println!("pageserver init succeeded");
    Ok(())
@@ -130,32 +135,27 @@ pub fn create_repo(
    }

    // top-level dir may exist if we are creating it through CLI
-    crashsafe_dir::create_dir_all(&repo_dir)
+    fs::create_dir_all(&repo_dir)
        .with_context(|| format!("could not create directory {}", repo_dir.display()))?;

-    crashsafe_dir::create_dir(conf.timelines_path(&tenantid))?;
-    crashsafe_dir::create_dir_all(conf.branches_path(&tenantid))?;
-    crashsafe_dir::create_dir_all(conf.tags_path(&tenantid))?;
+    fs::create_dir(conf.timelines_path(&tenantid))?;
+    fs::create_dir_all(conf.branches_path(&tenantid))?;
+    fs::create_dir_all(conf.tags_path(&tenantid))?;

    info!("created directory structure in {}", repo_dir.display());

-    // create a new timeline directory
-    let timeline_id = ZTimelineId::generate();
-    let timelinedir = conf.timeline_path(&timeline_id, &tenantid);
-
-    crashsafe_dir::create_dir(&timelinedir)?;
+    let tli = create_timeline(conf, None, &tenantid)?;

    let repo = Arc::new(crate::layered_repository::LayeredRepository::new(
        conf,
        wal_redo_manager,
        tenantid,
-        false,
    ));

    // Load data into pageserver
    // TODO To implement zenith import we need to
    //      move data loading out of create_repo()
-    bootstrap_timeline(conf, tenantid, timeline_id, repo.as_ref())?;
+    bootstrap_timeline(conf, tenantid, tli, &*repo)?;

    Ok(repo)
 }
@@ -174,16 +174,13 @@ fn get_lsn_from_controlfile(path: &Path) -> Result<Lsn> {
 // to get bootstrap data for timeline initialization.
 //
 fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path) -> Result<()> {
-    info!("running initdb in {}... ", initdbpath.display());
+    info!("running initdb... ");

    let initdb_path = conf.pg_bin_dir().join("initdb");
    let initdb_output = Command::new(initdb_path)
        .args(&["-D", initdbpath.to_str().unwrap()])
        .args(&["-U", &conf.superuser])
        .arg("--no-instructions")
-        // This is only used for a temporary installation that is deleted shortly after,
-        // so no need to fsync it
-        .arg("--no-sync")
        .env_clear()
        .env("LD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap())
        .env("DYLD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap())
@@ -196,6 +193,7 @@ fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path) -> Result<()> {
            String::from_utf8_lossy(&initdb_output.stderr)
        );
    }
+    info!("initdb succeeded");

    Ok(())
 }
@@ -210,8 +208,6 @@ fn bootstrap_timeline(
    tli: ZTimelineId,
    repo: &dyn Repository,
 ) -> Result<()> {
-    let _enter = info_span!("bootstrapping", timeline = %tli, tenant = %tenantid).entered();
-
    let initdb_path = conf.tenant_path(&tenantid).join("tmp");

    // Init temporarily repo to get bootstrap data
@@ -220,17 +216,13 @@ fn bootstrap_timeline(

    let lsn = get_lsn_from_controlfile(&pgdata_path)?.align();

+    info!("bootstrap_timeline {:?} at lsn {}", pgdata_path, lsn);
+
    // Import the contents of the data directory at the initial checkpoint
    // LSN, and any WAL after that.
-    // Initdb lsn will be equal to last_record_lsn which will be set after import.
-    // Because we know it upfront avoid having an option or dummy zero value by passing it to create_empty_timeline.
-    let timeline = repo.create_empty_timeline(tli, lsn)?;
-    restore_local_repo::import_timeline_from_postgres_datadir(
-        &pgdata_path,
-        timeline.writer().as_ref(),
-        lsn,
-    )?;
-    timeline.checkpoint(CheckpointConfig::Forced)?;
+    let timeline = repo.create_empty_timeline(tli)?;
+    restore_local_repo::import_timeline_from_postgres_datadir(&pgdata_path, &*timeline, lsn)?;
+    timeline.checkpoint()?;

    println!(
        "created initial timeline {} timeline.lsn {}",
@@ -248,38 +240,29 @@ fn bootstrap_timeline(
    Ok(())
 }

-pub(crate) fn get_branches(
-    conf: &PageServerConf,
-    tenantid: &ZTenantId,
-    include_non_incremental_logical_size: bool,
-) -> Result<Vec<BranchInfo>> {
+/// Lists tenant directory names; fails on non-directories or non-UTF-8 names.
+pub(crate) fn get_tenants(conf: &PageServerConf) -> Result<Vec<String>> {
+    std::fs::read_dir(conf.tenants_path())?
+        .map(|dir_entry_res| {
+            let dir_entry = dir_entry_res?;
+            ensure!(dir_entry.file_type()?.is_dir());
+            // Report an undecodable name as an error instead of panicking.
+            let name = dir_entry.file_name().into_string();
+            name.map_err(|raw| anyhow::anyhow!("non-UTF-8 tenant directory name {:?}", raw))
+        })
+        .collect()
+}
+
+pub(crate) fn get_branches(conf: &PageServerConf, tenantid: &ZTenantId) -> Result<Vec<BranchInfo>> {
    let repo = tenant_mgr::get_repository_for_tenant(*tenantid)?;

    // Each branch has a corresponding record (text file) in the refs/branches
    // with timeline_id.
    let branches_dir = conf.branches_path(tenantid);

-    std::fs::read_dir(&branches_dir)
-        .with_context(|| {
-            format!(
-                "Found no branches directory '{}' for tenant {}",
-                branches_dir.display(),
-                tenantid
-            )
-        })?
+    std::fs::read_dir(&branches_dir)?
        .map(|dir_entry_res| {
-            let dir_entry = dir_entry_res.with_context(|| {
-                format!(
-                    "Failed to list branches directory '{}' content for tenant {}",
-                    branches_dir.display(),
-                    tenantid
-                )
-            })?;
-            BranchInfo::from_path(
-                dir_entry.path(),
-                &repo,
-                include_non_incremental_logical_size,
-            )
+            let dir_entry = dir_entry_res?;
+            BranchInfo::from_path(dir_entry.path(), conf, tenantid, &repo)
        })
        .collect()
 }
@@ -322,26 +305,26 @@ pub(crate) fn create_branch(
        );
    }

-    let new_timeline_id = ZTimelineId::generate();
+    // create a new timeline directory for it
+    let newtli = create_timeline(conf, Some(startpoint), tenantid)?;

-    // Forward entire timeline creation routine to repository
-    // backend, so it can do all needed initialization
-    repo.branch_timeline(startpoint.timelineid, new_timeline_id, startpoint.lsn)?;
+    // Let the Repository backend do its initialization
+    repo.branch_timeline(startpoint.timelineid, newtli, startpoint.lsn)?;

    // Remember the human-readable branch name for the new timeline.
    // FIXME: there's a race condition, if you create a branch with the same
    // name concurrently.
-    let data = new_timeline_id.to_string();
+    let data = newtli.to_string();
    fs::write(conf.branch_path(branchname, tenantid), data)?;

    Ok(BranchInfo {
        name: branchname.to_string(),
-        timeline_id: new_timeline_id,
+        timeline_id: newtli,
        latest_valid_lsn: startpoint.lsn,
-        ancestor_id: Some(startpoint.timelineid.to_string()),
-        ancestor_lsn: Some(startpoint.lsn.to_string()),
+        ancestor_id: None,
+        ancestor_lsn: None,
        current_logical_size: 0,
-        current_logical_size_non_incremental: Some(0),
+        current_logical_size_non_incremental: 0,
    })
 }

@@ -417,3 +400,25 @@ fn parse_point_in_time(

    bail!("could not parse point-in-time {}", s);
 }
+
+/// Generates a fresh timeline id and creates its on-disk directory together
+/// with a `wal` subdirectory. When `ancestor` is given, the branch point is
+/// recorded as `<timelineid>@<lsn>` in an `ancestor` file in that directory.
+fn create_timeline(
+    conf: &PageServerConf,
+    ancestor: Option<PointInTime>,
+    tenantid: &ZTenantId,
+) -> Result<ZTimelineId> {
+    let new_id = ZTimelineId::generate();
+    let dir = conf.timeline_path(&new_id, tenantid);
+
+    fs::create_dir(&dir)?;
+    fs::create_dir(dir.join("wal"))?;
+
+    if let Some(parent) = ancestor {
+        let branch_point = format!("{}@{}", parent.timelineid, parent.lsn);
+        fs::write(dir.join("ancestor"), branch_point)?;
+    }
+
+    Ok(new_id)
+}
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -25,11 +25,6 @@ paths:
        schema:
          type: string
          format: hex
-      - name: include-non-incremental-logical-size
-        in: query
-        schema:
-          type: string
-          description: Controls calculation of current_logical_size_non_incremental
    get:
      description: Get branches for tenant
      responses:
@@ -78,11 +73,6 @@ paths:
        required: true
        schema:
          type: string
-      - name: include-non-incremental-logical-size
-        in: query
-        schema:
-          type: string
-          description: Controls calculation of current_logical_size_non_incremental
    get:
      description: Get branches for tenant
      responses:
@@ -174,13 +164,13 @@ paths:
      description: Get tenants list
      responses:
        "200":
-          description: TenantInfo
+          description: OK
          content:
            application/json:
              schema:
                type: array
                items:
-                  $ref: "#/components/schemas/TenantInfo"
+                  type: string
        "401":
          description: Unauthorized Error
          content:
@@ -253,16 +243,6 @@ components:
      scheme: bearer
      bearerFormat: JWT
  schemas:
-    TenantInfo:
-      type: object
-      required:
-        - id
-        - state
-      properties:
-        id:
-          type: string
-        state:
-          type: string
    BranchInfo:
      type: object
      required:
@@ -270,6 +250,7 @@ components:
        - timeline_id
        - latest_valid_lsn
        - current_logical_size
+        - current_logical_size_non_incremental
      properties:
        name:
          type: string
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -1,3 +1,4 @@
+use std::str::FromStr;
 use std::sync::Arc;

 use anyhow::Result;
@@ -5,7 +6,6 @@ use hyper::header;
 use hyper::StatusCode;
 use hyper::{Body, Request, Response, Uri};
 use routerify::{ext::RequestExt, RouterBuilder};
-use tracing::*;
 use zenith_utils::auth::JwtAuth;
 use zenith_utils::http::endpoint::attach_openapi_ui;
 use zenith_utils::http::endpoint::auth_middleware;
@@ -15,8 +15,6 @@ use zenith_utils::http::{
    endpoint,
    error::HttpErrorBody,
    json::{json_request, json_response},
-    request::get_request_param,
-    request::parse_request_param,
 };

 use super::models::BranchCreateRequest;
@@ -58,6 +56,33 @@ fn get_config(request: &Request<Body>) -> &'static PageServerConf {
    get_state(request).conf
 }

+/// Looks up the path parameter `param_name` on `request`; a missing
+/// parameter yields `ApiError::BadRequest` naming it.
+fn get_request_param<'a>(
+    request: &'a Request<Body>,
+    param_name: &str,
+) -> Result<&'a str, ApiError> {
+    if let Some(value) = request.param(param_name) {
+        return Ok(value);
+    }
+    Err(ApiError::BadRequest(format!(
+        "no {} specified in path param",
+        param_name
+    )))
+}
+
+/// Parses the path parameter `param_name` into `T`, returning
+/// `ApiError::BadRequest` if it is absent or fails to parse.
+fn parse_request_param<T: FromStr>(
+    request: &Request<Body>,
+    param_name: &str,
+) -> Result<T, ApiError> {
+    get_request_param(request, param_name)?
+        .parse()
+        // Name the offending parameter: this helper is not tenant-id specific.
+        .map_err(|_| ApiError::BadRequest(format!("failed to parse {}", param_name)))
+}
+
 // healthcheck handler
 async fn status_handler(_: Request<Body>) -> Result<Response<Body>, ApiError> {
    Ok(Response::builder()
@@ -73,7 +98,6 @@ async fn branch_create_handler(mut request: Request<Body>) -> Result<Response<Bo
    check_permission(&request, Some(request_data.tenant_id))?;

    let response_data = tokio::task::spawn_blocking(move || {
-        let _enter = info_span!("/branch_create", name = %request_data.name, tenant = %request_data.tenant_id, startpoint=%request_data.start_point).entered();
        branches::create_branch(
            get_config(&request),
            &request_data.name,
@@ -86,53 +110,29 @@ async fn branch_create_handler(mut request: Request<Body>) -> Result<Response<Bo
    Ok(json_response(StatusCode::CREATED, response_data)?)
 }

-// Gate non incremental logical size calculation behind a flag
-// after pgbench -i -s100 calculation took 28ms so if multiplied by the number of timelines
-// and tenants it can take noticeable amount of time. Also the value currently used only in tests
-fn get_include_non_incremental_logical_size(request: &Request<Body>) -> bool {
-    request
-        .uri()
-        .query()
-        .map(|v| {
-            url::form_urlencoded::parse(v.as_bytes())
-                .into_owned()
-                .any(|(param, _)| param == "include-non-incremental-logical-size")
-        })
-        .unwrap_or(false)
-}
-
 async fn branch_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
    let tenantid: ZTenantId = parse_request_param(&request, "tenant_id")?;

-    let include_non_incremental_logical_size = get_include_non_incremental_logical_size(&request);
-
    check_permission(&request, Some(tenantid))?;

    let response_data = tokio::task::spawn_blocking(move || {
-        let _enter = info_span!("branch_list", tenant = %tenantid).entered();
-        crate::branches::get_branches(
-            get_config(&request),
-            &tenantid,
-            include_non_incremental_logical_size,
-        )
+        crate::branches::get_branches(get_config(&request), &tenantid)
    })
    .await
    .map_err(ApiError::from_err)??;
    Ok(json_response(StatusCode::OK, response_data)?)
 }

+// TODO add to swagger
 async fn branch_detail_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
    let tenantid: ZTenantId = parse_request_param(&request, "tenant_id")?;
-    let branch_name: String = get_request_param(&request, "branch_name")?.to_string();
+    let branch_name: &str = get_request_param(&request, "branch_name")?;
    let conf = get_state(&request).conf;
-    let path = conf.branch_path(&branch_name, &tenantid);
-
-    let include_non_incremental_logical_size = get_include_non_incremental_logical_size(&request);
+    let path = conf.branch_path(branch_name, &tenantid);

    let response_data = tokio::task::spawn_blocking(move || {
-        let _enter = info_span!("branch_detail", tenant = %tenantid, branch=%branch_name).entered();
        let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
-        BranchInfo::from_path(path, &repo, include_non_incremental_logical_size)
+        BranchInfo::from_path(path, conf, &tenantid, &repo)
    })
    .await
    .map_err(ApiError::from_err)??;
@@ -144,13 +144,10 @@ async fn tenant_list_handler(request: Request<Body>) -> Result<Response<Body>, A
    // check for management permission
    check_permission(&request, None)?;

-    let response_data = tokio::task::spawn_blocking(move || {
-        let _enter = info_span!("tenant_list").entered();
-        crate::tenant_mgr::list_tenants()
-    })
-    .await
-    .map_err(ApiError::from_err)??;
-
+    let response_data =
+        tokio::task::spawn_blocking(move || crate::branches::get_tenants(get_config(&request)))
+            .await
+            .map_err(ApiError::from_err)??;
    Ok(json_response(StatusCode::OK, response_data)?)
 }

@@ -161,7 +158,6 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
    let request_data: TenantCreateRequest = json_request(&mut request).await?;

    let response_data = tokio::task::spawn_blocking(move || {
-        let _enter = info_span!("tenant_create", tenant = %request_data.tenant_id).entered();
        tenant_mgr::create_repository_for_tenant(get_config(&request), request_data.tenant_id)
    })
    .await
--- a/pageserver/src/layered_repository.rs
+++ b/pageserver/src/layered_repository.rs
--- a/pageserver/src/layered_repository/README.md
+++ b/pageserver/src/layered_repository/README.md
@@ -1,56 +1,12 @@
 # Overview

-The on-disk format is based on immutable files. The page server receives a
-stream of incoming WAL, parses the WAL records to determine which pages they
-apply to, and accumulates the incoming changes in memory. Every now and then,
-the accumulated changes are written out to new immutable files. This process is
-called checkpointing. Old versions of on-disk files that are not needed by any
-timeline are removed by GC process.
-
-The main responsibility of the Page Server is to process the incoming WAL, and
-reprocess it into a format that allows reasonably quick access to any page
-version.
-
-The incoming WAL contains updates to arbitrary pages in the system. The
-distribution depends on the workload: the updates could be totally random, or
-there could be a long stream of updates to a single relation when data is bulk
-loaded, for example, or something in between. The page server slices the
-incoming WAL per relation and page, and packages the sliced WAL into
-suitably-sized "layer files". The layer files contain all the history of the
-database, back to some reasonable retention period. This system replaces the
-base backups and the WAL archive used in a traditional PostgreSQL
-installation. The layer files are immutable, they are not modified in-place
-after creation. New layer files are created for new incoming WAL, and old layer
-files are removed when they are no longer needed. We could also replace layer
-files with new files that contain the same information, merging small files for
-example, but that hasn't been implemented yet.
-
-
-Cloud Storage                   Page Server                   Safekeeper
-                     Local disk                Memory            WAL
-
-|AAAA|               |AAAA|AAAA|               |AA
-|BBBB|               |BBBB|BBBB|               |
-|CCCC|CCCC|  <----   |CCCC|CCCC|CCCC|   <---   |CC     <----   ADEBAABED
-|DDDD|DDDD|          |DDDD|DDDD|               |DDD
-|EEEE|               |EEEE|EEEE|EEEE|          |E
-
-
-In this illustration, WAL is received as a stream from the Safekeeper, from the
-right.  It is immediately captured by the page server and stored quickly in
-memory. The page server memory can be thought of as a quick "reorder buffer",
-used to hold the incoming WAL and reorder it so that we keep the WAL records for
-the same page and relation close to each other.
-
-From the page server memory, whenever enough WAL has been accumulated for one
-relation segment, it is moved to local disk, as a new layer file, and the memory
-is released.
-
-From the local disk, the layers are further copied to Cloud Storage, for
-long-term archival. After a layer has been copied to Cloud Storage, it can be
-removed from local disk, although we currently keep everything locally for fast
-access. If a layer is needed that isn't found locally, it is fetched from Cloud
-Storage and stored in local disk.
+The on-disk format is based on immutable files. The page server
+receives a stream of incoming WAL, parses the WAL records to determine
+which pages they apply to, and accumulates the incoming changes in
+memory. Every now and then, the accumulated changes are written out to
+new immutable files. This process is called checkpointing. Old versions
+of on-disk files that are not needed by any timeline are removed by GC
+process.

 # Terms used in layered repository

@@ -58,9 +14,32 @@ Storage and stored in local disk.
 - Segment - one slice of a Relish that is stored in a LayeredTimeline.
 - Layer -  specific version of a relish Segment in a range of LSNs.

-# Layer map
+Layers can be InMemory or OnDisk:
+- InMemory layer is not durably stored and needs to be rebuilt from WAL on pageserver start.
+- OnDisk layer is durably stored.

-The LayerMap tracks what layers exist for all the relishes in a timeline.
+OnDisk layers can be Image or Delta:
+- ImageLayer represents an image or a snapshot of a segment at one particular LSN.
+- DeltaLayer represents a collection of WAL records or page images in a range of LSNs.
+
+Dropped segments are always represented on disk by DeltaLayer.
+
+The LSN range of a layer is defined by start_lsn and end_lsn:
+- start_lsn is inclusive.
+- end_lsn is exclusive.
+
+For an open in-memory layer, the end_lsn is MAX_LSN. For a frozen
+in-memory layer or a delta layer, it is a valid end bound. An image
+layer represents a snapshot at one LSN, so its end_lsn is always the
+snapshot LSN + 1.
+
+Layers can be open or historical:
+- Open layer is a writeable one. Only InMemory layer can be open.
+FIXME: If an open layer is dropped, it is no longer writeable, so it should be turned into a historical layer,
+but this is not implemented yet - see bug #569.
+- Historical layer is one that cannot be modified anymore. Currently, only OnDisk layers can be historical.
+
+- LayerMap - a map that tracks what layers exist for all the relishes in a timeline.

 LayerMap consists of two data structures:
 - segs - All the layers keyed by segment tag
@@ -75,55 +54,8 @@ TODO: Are there any exceptions to this?
 For example, timeline.list_rels(lsn) will return all segments that are visible in this timeline at the LSN,
 including ones that were not modified in this timeline and thus don't have a layer in the timeline's LayerMap.

-
-# Different kinds of layers
-
-A layer can be in different states:
-
- Open - a layer where new WAL records can be appended to.
- Closed - a layer that is read-only, no new WAL records can be appended to it
- Historic: synonym for closed
- InMemory: A layer that needs to be rebuilt from WAL on pageserver start.
-To avoid OOM errors, InMemory layers can be spilled to disk into ephemeral file.
- OnDisk: A layer that is stored on disk. If its end-LSN is older than
-  disk_consistent_lsn, it is known to be fully flushed and fsync'd to local disk.
- Frozen layer: an in-memory layer that is Closed.
-
-TODO: Clarify the difference between Closed, Historic and Frozen.
-
-There are two kinds of OnDisk layers:
- ImageLayer represents an image or a snapshot of a 10 MB relish segment, at one particular LSN.
- DeltaLayer represents a collection of WAL records or page images in a range of LSNs, for one
-  relish segment.
-
-Dropped segments are always represented on disk by DeltaLayer.
-
-# Layer life cycle
-
-LSN range defined by start_lsn and end_lsn:
- start_lsn is inclusive.
- end_lsn is exclusive.
-
-For an open in-memory layer, the end_lsn is MAX_LSN. For a frozen in-memory
-layer or a delta layer, it is a valid end bound. An image layer represents
-snapshot at one LSN, so end_lsn is always the snapshot LSN + 1
-
-Every layer starts its life as an Open In-Memory layer. When the page server
-receives the first WAL record for a segment, it creates a new In-Memory layer
-for it, and puts it to the layer map. Later, the layer is old enough, its
-contents are written to disk, as On-Disk layers. This process is called
-"evicting" a layer.
-
-Layer eviction is a two-step process: First, the layer is marked as closed, so
-that it no longer accepts new WAL records, and the layer map is updated
-accordingly. If a new WAL record for that segment arrives after this step, a new
-Open layer is created to hold it. After this first step, the layer is a Closed
-InMemory state. This first step is called "freezing" the layer.
-
-In the second step, new Delta and Image layers are created, containing all the
-data in the Frozen InMemory layer. When the new layers are ready, the original
-frozen layer is replaced with the new layers in the layer map, and the original
-frozen layer is dropped, releasing the memory.
+TODO:
+Describe GC and checkpoint interval settings.

 # Layer files (On-disk layers)

@@ -434,8 +366,6 @@ is a newer layer file there. TODO: This optimization hasn't been
 implemented! The GC algorithm will currently keep the file on the
 'main' branch anyway, for as long as the child branch exists.

-TODO:
-Describe GC and checkpoint interval settings.

 # TODO: On LSN ranges

--- a/pageserver/src/layered_repository/blob.rs
+++ b/pageserver/src/layered_repository/blob.rs
@@ -1,5 +1,4 @@
-use std::io::{Read, Write};
-use std::os::unix::prelude::FileExt;
+use std::{fs::File, io::Write};

 use anyhow::Result;
 use bookfile::{BookWriter, BoundedReader, ChapterId, ChapterWriter};
@@ -11,7 +10,7 @@ pub struct BlobRange {
    size: usize,
 }

-pub fn read_blob<F: FileExt>(reader: &BoundedReader<&'_ F>, range: &BlobRange) -> Result<Vec<u8>> {
+pub fn read_blob(reader: &BoundedReader<&'_ File>, range: &BlobRange) -> Result<Vec<u8>> {
    let mut buf = vec![0u8; range.size];
    reader.read_exact_at(&mut buf, range.offset)?;
    Ok(buf)
@@ -29,14 +28,14 @@ impl<W: Write> BlobWriter<W> {
        Self { writer, offset: 0 }
    }

-    pub fn write_blob_from_reader(&mut self, r: &mut impl Read) -> Result<BlobRange> {
-        let len = std::io::copy(r, &mut self.writer)?;
+    pub fn write_blob(&mut self, blob: &[u8]) -> Result<BlobRange> {
+        self.writer.write_all(blob)?;

        let range = BlobRange {
            offset: self.offset,
-            size: len as usize,
+            size: blob.len(),
        };
-        self.offset += len as u64;
+        self.offset += blob.len() as u64;
        Ok(range)
    }

--- a/pageserver/src/layered_repository/delta_layer.rs
+++ b/pageserver/src/layered_repository/delta_layer.rs
@@ -39,26 +39,27 @@
 //!
 use crate::layered_repository::blob::BlobWriter;
 use crate::layered_repository::filename::{DeltaFileName, PathOrConf};
-use crate::layered_repository::page_versions::PageVersions;
 use crate::layered_repository::storage_layer::{
    Layer, PageReconstructData, PageReconstructResult, PageVersion, SegmentTag,
 };
-use crate::virtual_file::VirtualFile;
+use crate::repository::WALRecord;
 use crate::waldecoder;
 use crate::PageServerConf;
 use crate::{ZTenantId, ZTimelineId};
-use anyhow::{bail, ensure, Result};
+use anyhow::{bail, Result};
+use bytes::Bytes;
 use log::*;
 use serde::{Deserialize, Serialize};
-use zenith_utils::vec_map::VecMap;
+use std::collections::BTreeMap;
 // avoid binding to Write (conflicts with std::io::Write)
 // while being able to use std::fmt::Write's methods
 use std::fmt::Write as _;
 use std::fs;
+use std::fs::File;
 use std::io::{BufWriter, Write};
 use std::ops::Bound::Included;
 use std::path::{Path, PathBuf};
-use std::sync::{Mutex, MutexGuard};
+use std::sync::{Arc, Mutex, MutexGuard};

 use bookfile::{Book, BookWriter};

@@ -108,6 +109,12 @@ impl From<&DeltaLayer> for Summary {
    }
 }

+#[derive(Serialize, Deserialize)]
+struct PageVersionMeta {
+    page_image_range: Option<BlobRange>,
+    record_range: Option<BlobRange>,
+}
+
 ///
 /// DeltaLayer is the in-memory data structure associated with an
 /// on-disk delta file.  We keep a DeltaLayer in memory for each
@@ -132,6 +139,9 @@ pub struct DeltaLayer {

    dropped: bool,

+    /// Predecessor layer
+    predecessor: Option<Arc<dyn Layer>>,
+
    inner: Mutex<DeltaLayerInner>,
 }

@@ -140,21 +150,15 @@ pub struct DeltaLayerInner {
    /// loaded into memory yet.
    loaded: bool,

-    book: Option<Book<VirtualFile>>,
-
    /// All versions of all pages in the file are are kept here.
    /// Indexed by block number and LSN.
-    page_version_metas: VecMap<(u32, Lsn), BlobRange>,
+    page_version_metas: BTreeMap<(u32, Lsn), PageVersionMeta>,

    /// `relsizes` tracks the size of the relation at different points in time.
-    relsizes: VecMap<Lsn, u32>,
+    relsizes: BTreeMap<Lsn, u32>,
 }

 impl Layer for DeltaLayer {
-    fn get_tenant_id(&self) -> ZTenantId {
-        self.tenantid
-    }
-
    fn get_timeline_id(&self) -> ZTimelineId {
        self.timelineid
    }
@@ -176,7 +180,29 @@ impl Layer for DeltaLayer {
    }

    fn filename(&self) -> PathBuf {
-        PathBuf::from(self.layer_name().to_string())
+        PathBuf::from(
+            DeltaFileName {
+                seg: self.seg,
+                start_lsn: self.start_lsn,
+                end_lsn: self.end_lsn,
+                dropped: self.dropped,
+            }
+            .to_string(),
+        )
+    }
+
+    fn path(&self) -> Option<PathBuf> {
+        Some(Self::path_for(
+            &self.path_or_conf,
+            self.timelineid,
+            self.tenantid,
+            &DeltaFileName {
+                seg: self.seg,
+                start_lsn: self.start_lsn,
+                end_lsn: self.end_lsn,
+                dropped: self.dropped,
+            },
+        ))
    }

    /// Look up given page in the cache.
@@ -184,63 +210,44 @@ impl Layer for DeltaLayer {
        &self,
        blknum: u32,
        lsn: Lsn,
-        cached_img_lsn: Option<Lsn>,
        reconstruct_data: &mut PageReconstructData,
    ) -> Result<PageReconstructResult> {
        let mut need_image = true;

        assert!(self.seg.blknum_in_seg(blknum));

-        match &cached_img_lsn {
-            Some(cached_lsn) if &self.end_lsn <= cached_lsn => {
-                return Ok(PageReconstructResult::Cached)
-            }
-            _ => {}
-        }
-
        {
            // Open the file and lock the metadata in memory
+            // TODO: avoid opening the file for each read
+            let (_path, book) = self.open_book()?;
+            let page_version_reader = book.chapter_reader(PAGE_VERSIONS_CHAPTER)?;
            let inner = self.load()?;
-            let page_version_reader = inner
-                .book
-                .as_ref()
-                .expect("should be loaded in load call above")
-                .chapter_reader(PAGE_VERSIONS_CHAPTER)?;

            // Scan the metadata BTreeMap backwards, starting from the given entry.
            let minkey = (blknum, Lsn(0));
            let maxkey = (blknum, lsn);
-            let iter = inner
+            let mut iter = inner
                .page_version_metas
-                .slice_range((Included(&minkey), Included(&maxkey)))
-                .iter()
-                .rev();
-            for ((_blknum, pv_lsn), blob_range) in iter {
-                match &cached_img_lsn {
-                    Some(cached_lsn) if pv_lsn <= cached_lsn => {
-                        return Ok(PageReconstructResult::Cached)
-                    }
-                    _ => {}
-                }
-
-                let pv = PageVersion::des(&read_blob(&page_version_reader, blob_range)?)?;
-
-                match pv {
-                    PageVersion::Page(img) => {
-                        // Found a page image, return it
-                        reconstruct_data.page_img = Some(img);
+                .range((Included(&minkey), Included(&maxkey)));
+            while let Some(((_blknum, _entry_lsn), entry)) = iter.next_back() {
+                if let Some(img_range) = &entry.page_image_range {
+                    // Found a page image, return it
+                    let img = Bytes::from(read_blob(&page_version_reader, img_range)?);
+                    reconstruct_data.page_img = Some(img);
+                    need_image = false;
+                    break;
+                } else if let Some(rec_range) = &entry.record_range {
+                    let rec = WALRecord::des(&read_blob(&page_version_reader, rec_range)?)?;
+                    let will_init = rec.will_init;
+                    reconstruct_data.records.push(rec);
+                    if will_init {
+                        // This WAL record initializes the page, so no need to go further back
                        need_image = false;
                        break;
                    }
-                    PageVersion::Wal(rec) => {
-                        let will_init = rec.will_init;
-                        reconstruct_data.records.push((*pv_lsn, rec));
-                        if will_init {
-                            // This WAL record initializes the page, so no need to go further back
-                            need_image = false;
-                            break;
-                        }
-                    }
+                } else {
+                    // No base image, and no WAL record. Huh?
+                    bail!("no page image or WAL record for requested page");
                }
            }

@@ -248,9 +255,16 @@ impl Layer for DeltaLayer {
        }

        // If an older page image is needed to reconstruct the page, let the
-        // caller know.
+        // caller know about the predecessor layer.
        if need_image {
-            Ok(PageReconstructResult::Continue(Lsn(self.start_lsn.0 - 1)))
+            if let Some(cont_layer) = &self.predecessor {
+                Ok(PageReconstructResult::Continue(
+                    self.start_lsn,
+                    Arc::clone(cont_layer),
+                ))
+            } else {
+                Ok(PageReconstructResult::Missing(self.start_lsn))
+            }
        } else {
            Ok(PageReconstructResult::Complete)
        }
@@ -259,22 +273,21 @@ impl Layer for DeltaLayer {
    /// Get size of the relation at given LSN
    fn get_seg_size(&self, lsn: Lsn) -> Result<u32> {
        assert!(lsn >= self.start_lsn);
-        ensure!(
-            self.seg.rel.is_blocky(),
-            "get_seg_size() called on a non-blocky rel"
-        );

        // Scan the BTreeMap backwards, starting from the given entry.
        let inner = self.load()?;
-        let slice = inner
-            .relsizes
-            .slice_range((Included(&Lsn(0)), Included(&lsn)));
+        let mut iter = inner.relsizes.range((Included(&Lsn(0)), Included(&lsn)));

-        if let Some((_entry_lsn, entry)) = slice.last() {
-            Ok(*entry)
+        let result;
+        if let Some((_entry_lsn, entry)) = iter.next_back() {
+            result = *entry;
+        // Use the base image if needed
+        } else if let Some(predecessor) = &self.predecessor {
+            result = predecessor.get_seg_size(lsn)?;
        } else {
-            Err(anyhow::anyhow!("could not find seg size in delta layer"))
+            result = 0;
        }
+        Ok(result)
    }

    /// Does this segment exist at given LSN?
@@ -294,20 +307,17 @@ impl Layer for DeltaLayer {
    ///
    fn unload(&self) -> Result<()> {
        let mut inner = self.inner.lock().unwrap();
-        inner.page_version_metas = VecMap::default();
-        inner.relsizes = VecMap::default();
+        inner.page_version_metas = BTreeMap::new();
+        inner.relsizes = BTreeMap::new();
        inner.loaded = false;
-
-        // Note: we keep the Book open. Is that a good idea? The virtual file
-        // machinery has its own rules for closing the file descriptor if it's not
-        // needed, but the Book struct uses up some memory, too.
-
        Ok(())
    }

    fn delete(&self) -> Result<()> {
        // delete underlying file
-        fs::remove_file(self.path())?;
+        if let Some(path) = self.path() {
+            fs::remove_file(path)?;
+        }
        Ok(())
    }

@@ -315,10 +325,6 @@ impl Layer for DeltaLayer {
        true
    }

-    fn is_in_memory(&self) -> bool {
-        false
-    }
-
    /// debugging function to print out the contents of the layer
    fn dump(&self) -> Result<()> {
        println!(
@@ -328,39 +334,32 @@ impl Layer for DeltaLayer {

        println!("--- relsizes ---");
        let inner = self.load()?;
-        for (k, v) in inner.relsizes.as_slice() {
+        for (k, v) in inner.relsizes.iter() {
            println!("  {}: {}", k, v);
        }
        println!("--- page versions ---");
-
-        let path = self.path();
-        let file = std::fs::File::open(&path)?;
-        let book = Book::new(file)?;
-
+        let (_path, book) = self.open_book()?;
        let chapter = book.chapter_reader(PAGE_VERSIONS_CHAPTER)?;
-        for ((blk, lsn), blob_range) in inner.page_version_metas.as_slice() {
+        for (k, v) in inner.page_version_metas.iter() {
            let mut desc = String::new();

-            let buf = read_blob(&chapter, blob_range)?;
-            let pv = PageVersion::des(&buf)?;
-
-            match pv {
-                PageVersion::Page(img) => {
-                    write!(&mut desc, " img {} bytes", img.len())?;
-                }
-                PageVersion::Wal(rec) => {
-                    let wal_desc = waldecoder::describe_wal_record(&rec.rec);
-                    write!(
-                        &mut desc,
-                        " rec {} bytes will_init: {} {}",
-                        rec.rec.len(),
-                        rec.will_init,
-                        wal_desc
-                    )?;
-                }
+            if let Some(page_image_range) = v.page_image_range.as_ref() {
+                let image = read_blob(&chapter, page_image_range)?;
+                write!(&mut desc, " img {} bytes", image.len())?;
            }
-
-            println!("  blk {} at {}: {}", blk, lsn, desc);
+            if let Some(record_range) = v.record_range.as_ref() {
+                let record_bytes = read_blob(&chapter, record_range)?;
+                let rec = WALRecord::des(&record_bytes)?;
+                let wal_desc = waldecoder::describe_wal_record(&rec.rec);
+                write!(
+                    &mut desc,
+                    " rec {} bytes will_init: {} {}",
+                    rec.rec.len(),
+                    rec.will_init,
+                    wal_desc
+                )?;
+            }
+            println!("  blk {} at {}: {}", k.0, k.1, desc);
        }

        Ok(())
@@ -383,14 +382,14 @@ impl DeltaLayer {
    }

    /// Create a new delta file, using the given page versions and relsizes.
-    /// The page versions are passed in a PageVersions struct. If 'cutoff' is
-    /// given, only page versions with LSN < cutoff are included.
+    /// The page versions are passed by an iterator; the iterator must return
+    /// page versions in blknum+lsn order.
    ///
-    /// This is used to write the in-memory layer to disk. The page_versions and
-    /// relsizes are thus passed in the same format as they are in the in-memory
-    /// layer, as that's expedient.
+    /// This is used to write the in-memory layer to disk. The in-memory layer uses the same
+    /// data structure with two btreemaps as we do, so passing the btreemaps is currently
+    /// expedient.
    #[allow(clippy::too_many_arguments)]
-    pub fn create(
+    pub fn create<'a>(
        conf: &'static PageServerConf,
        timelineid: ZTimelineId,
        tenantid: ZTenantId,
@@ -398,14 +397,10 @@ impl DeltaLayer {
        start_lsn: Lsn,
        end_lsn: Lsn,
        dropped: bool,
-        page_versions: &PageVersions,
-        cutoff: Option<Lsn>,
-        relsizes: VecMap<Lsn, u32>,
+        predecessor: Option<Arc<dyn Layer>>,
+        page_versions: impl Iterator<Item = (&'a (u32, Lsn), &'a PageVersion)>,
+        relsizes: BTreeMap<Lsn, u32>,
    ) -> Result<DeltaLayer> {
-        if seg.rel.is_blocky() {
-            assert!(!relsizes.is_empty());
-        }
-
        let delta_layer = DeltaLayer {
            path_or_conf: PathOrConf::Conf(conf),
            timelineid,
@@ -415,51 +410,65 @@ impl DeltaLayer {
            end_lsn,
            dropped,
            inner: Mutex::new(DeltaLayerInner {
-                loaded: false,
-                book: None,
-                page_version_metas: VecMap::default(),
+                loaded: true,
+                page_version_metas: BTreeMap::new(),
                relsizes,
            }),
+            predecessor,
        };
        let mut inner = delta_layer.inner.lock().unwrap();

-        // Write the data into a file
-        //
-        // Note: Because we open the file in write-only mode, we cannot
-        // reuse the same VirtualFile for reading later. That's why we don't
-        // set inner.book here. The first read will have to re-open it.
-        //
+        // Write the in-memory btreemaps into a file
+        let path = delta_layer
+            .path()
+            .expect("DeltaLayer is supposed to have a layer path on disk");
+
        // Note: This overwrites any existing file. There shouldn't be any.
        // FIXME: throw an error instead?
-        let path = delta_layer.path();
-        let file = VirtualFile::create(&path)?;
+        let file = File::create(&path)?;
        let buf_writer = BufWriter::new(file);
        let book = BookWriter::new(buf_writer, DELTA_FILE_MAGIC)?;

        let mut page_version_writer = BlobWriter::new(book, PAGE_VERSIONS_CHAPTER);

-        let page_versions_iter = page_versions.ordered_page_version_iter(cutoff);
-        for (blknum, lsn, pos) in page_versions_iter {
-            let blob_range =
-                page_version_writer.write_blob_from_reader(&mut page_versions.reader(pos)?)?;
+        for (key, page_version) in page_versions {
+            let page_image_range = page_version
+                .page_image
+                .as_ref()
+                .map(|page_image| page_version_writer.write_blob(page_image))
+                .transpose()?;

-            inner
-                .page_version_metas
-                .append((blknum, lsn), blob_range)
-                .unwrap();
+            let record_range = page_version
+                .record
+                .as_ref()
+                .map(|record| {
+                    let buf = WALRecord::ser(record)?;
+                    page_version_writer.write_blob(&buf)
+                })
+                .transpose()?;
+
+            let old = inner.page_version_metas.insert(
+                *key,
+                PageVersionMeta {
+                    page_image_range,
+                    record_range,
+                },
+            );
+
+            assert!(old.is_none());
        }

        let book = page_version_writer.close()?;

        // Write out page versions
        let mut chapter = book.new_chapter(PAGE_VERSION_METAS_CHAPTER);
-        let buf = VecMap::ser(&inner.page_version_metas)?;
+        let buf = BTreeMap::ser(&inner.page_version_metas)?;
        chapter.write_all(&buf)?;
        let book = chapter.close()?;

        // and relsizes to separate chapter
        let mut chapter = book.new_chapter(REL_SIZES_CHAPTER);
-        let buf = VecMap::ser(&inner.relsizes)?;
+        let buf = BTreeMap::ser(&inner.relsizes)?;
        chapter.write_all(&buf)?;
        let book = chapter.close()?;

@@ -478,8 +487,7 @@ impl DeltaLayer {
        let book = chapter.close()?;

        // This flushes the underlying 'buf_writer'.
-        let writer = book.close()?;
-        writer.get_ref().sync_all()?;
+        book.close()?;

        trace!("saved {}", &path.display());

@@ -488,6 +496,25 @@ impl DeltaLayer {
        Ok(delta_layer)
    }

+    fn open_book(&self) -> Result<(PathBuf, Book<File>)> {
+        let path = Self::path_for(
+            &self.path_or_conf,
+            self.timelineid,
+            self.tenantid,
+            &DeltaFileName {
+                seg: self.seg,
+                start_lsn: self.start_lsn,
+                end_lsn: self.end_lsn,
+                dropped: self.dropped,
+            },
+        );
+
+        let file = File::open(&path)?;
+        let book = Book::new(file)?;
+
+        Ok((path, book))
+    }
+
    ///
    /// Load the contents of the file into memory
    ///
@@ -499,14 +526,7 @@ impl DeltaLayer {
            return Ok(inner);
        }

-        let path = self.path();
-
-        // Open the file if it's not open already.
-        if inner.book.is_none() {
-            let file = VirtualFile::open(&path)?;
-            inner.book = Some(Book::new(file)?);
-        }
-        let book = inner.book.as_ref().unwrap();
+        let (path, book) = self.open_book()?;

        match &self.path_or_conf {
            PathOrConf::Conf(_) => {
@@ -534,16 +554,18 @@ impl DeltaLayer {
        }

        let chapter = book.read_chapter(PAGE_VERSION_METAS_CHAPTER)?;
-        let page_version_metas = VecMap::des(&chapter)?;
+        let page_version_metas = BTreeMap::des(&chapter)?;

        let chapter = book.read_chapter(REL_SIZES_CHAPTER)?;
-        let relsizes = VecMap::des(&chapter)?;
+        let relsizes = BTreeMap::des(&chapter)?;

        debug!("loaded from {}", &path.display());

-        inner.page_version_metas = page_version_metas;
-        inner.relsizes = relsizes;
-        inner.loaded = true;
+        *inner = DeltaLayerInner {
+            loaded: true,
+            page_version_metas,
+            relsizes,
+        };

        Ok(inner)
    }
@@ -554,6 +576,7 @@ impl DeltaLayer {
        timelineid: ZTimelineId,
        tenantid: ZTenantId,
        filename: &DeltaFileName,
+        predecessor: Option<Arc<dyn Layer>>,
    ) -> DeltaLayer {
        DeltaLayer {
            path_or_conf: PathOrConf::Conf(conf),
@@ -565,20 +588,17 @@ impl DeltaLayer {
            dropped: filename.dropped,
            inner: Mutex::new(DeltaLayerInner {
                loaded: false,
-                book: None,
-                page_version_metas: VecMap::default(),
-                relsizes: VecMap::default(),
+                page_version_metas: BTreeMap::new(),
+                relsizes: BTreeMap::new(),
            }),
+            predecessor,
        }
    }

    /// Create a DeltaLayer struct representing an existing file on disk.
    ///
    /// This variant is only used for debugging purposes, by the 'dump_layerfile' binary.
-    pub fn new_for_path<F>(path: &Path, book: &Book<F>) -> Result<Self>
-    where
-        F: std::os::unix::prelude::FileExt,
-    {
+    pub fn new_for_path(path: &Path, book: &Book<File>) -> Result<Self> {
        let chapter = book.read_chapter(SUMMARY_CHAPTER)?;
        let summary = Summary::des(&chapter)?;

@@ -592,29 +612,10 @@ impl DeltaLayer {
            dropped: summary.dropped,
            inner: Mutex::new(DeltaLayerInner {
                loaded: false,
-                book: None,
-                page_version_metas: VecMap::default(),
-                relsizes: VecMap::default(),
+                page_version_metas: BTreeMap::new(),
+                relsizes: BTreeMap::new(),
            }),
+            predecessor: None,
        })
    }
-
-    fn layer_name(&self) -> DeltaFileName {
-        DeltaFileName {
-            seg: self.seg,
-            start_lsn: self.start_lsn,
-            end_lsn: self.end_lsn,
-            dropped: self.dropped,
-        }
-    }
-
-    /// Path to the layer file in pageserver workdir.
-    pub fn path(&self) -> PathBuf {
-        Self::path_for(
-            &self.path_or_conf,
-            self.timelineid,
-            self.tenantid,
-            &self.layer_name(),
-        )
-    }
 }
--- a/pageserver/src/layered_repository/ephemeral_file.rs
+++ b/pageserver/src/layered_repository/ephemeral_file.rs
@@ -1,298 +0,0 @@
-//! Implementation of append-only file data structure
-//! used to keep in-memory layers spilled on disk.
-
-use crate::page_cache;
-use crate::page_cache::PAGE_SZ;
-use crate::page_cache::{ReadBufResult, WriteBufResult};
-use crate::virtual_file::VirtualFile;
-use crate::PageServerConf;
-use lazy_static::lazy_static;
-use std::cmp::min;
-use std::collections::HashMap;
-use std::fs::OpenOptions;
-use std::io::{Error, ErrorKind, Seek, SeekFrom, Write};
-use std::ops::DerefMut;
-use std::path::PathBuf;
-use std::sync::{Arc, RwLock};
-use zenith_utils::zid::ZTenantId;
-use zenith_utils::zid::ZTimelineId;
-
-use std::os::unix::fs::FileExt;
-
-lazy_static! {
-    ///
-    /// This is the global cache of file descriptors (File objects).
-    ///
-    static ref EPHEMERAL_FILES: RwLock<EphemeralFiles> = RwLock::new(EphemeralFiles {
-        next_file_id: 1,
-        files: HashMap::new(),
-    });
-}
-
-pub struct EphemeralFiles {
-    next_file_id: u64,
-
-    files: HashMap<u64, Arc<VirtualFile>>,
-}
-
-pub struct EphemeralFile {
-    file_id: u64,
-    _tenantid: ZTenantId,
-    _timelineid: ZTimelineId,
-    file: Arc<VirtualFile>,
-
-    pos: u64,
-}
-
-impl EphemeralFile {
-    pub fn create(
-        conf: &PageServerConf,
-        tenantid: ZTenantId,
-        timelineid: ZTimelineId,
-    ) -> Result<EphemeralFile, std::io::Error> {
-        let mut l = EPHEMERAL_FILES.write().unwrap();
-        let file_id = l.next_file_id;
-        l.next_file_id += 1;
-
-        let filename = conf
-            .timeline_path(&timelineid, &tenantid)
-            .join(PathBuf::from(format!("ephemeral-{}", file_id)));
-
-        let file = VirtualFile::open_with_options(
-            &filename,
-            OpenOptions::new().read(true).write(true).create(true),
-        )?;
-        let file_rc = Arc::new(file);
-        l.files.insert(file_id, file_rc.clone());
-
-        Ok(EphemeralFile {
-            file_id,
-            _tenantid: tenantid,
-            _timelineid: timelineid,
-            file: file_rc,
-            pos: 0,
-        })
-    }
-
-    pub fn fill_buffer(&self, buf: &mut [u8], blkno: u32) -> Result<(), Error> {
-        let mut off = 0;
-        while off < PAGE_SZ {
-            let n = self
-                .file
-                .read_at(&mut buf[off..], blkno as u64 * PAGE_SZ as u64 + off as u64)?;
-
-            if n == 0 {
-                // Reached EOF. Fill the rest of the buffer with zeros.
-                const ZERO_BUF: [u8; PAGE_SZ] = [0u8; PAGE_SZ];
-
-                buf[off..].copy_from_slice(&ZERO_BUF[off..]);
-                break;
-            }
-
-            off += n as usize;
-        }
-        Ok(())
-    }
-}
-
-impl FileExt for EphemeralFile {
-    fn read_at(&self, dstbuf: &mut [u8], offset: u64) -> Result<usize, Error> {
-        // Look up the right page
-        let blkno = (offset / PAGE_SZ as u64) as u32;
-        let off = offset as usize % PAGE_SZ;
-        let len = min(PAGE_SZ - off, dstbuf.len());
-
-        let read_guard;
-        let mut write_guard;
-
-        let cache = page_cache::get();
-        let buf = match cache.read_ephemeral_buf(self.file_id, blkno) {
-            ReadBufResult::Found(guard) => {
-                read_guard = guard;
-                read_guard.as_ref()
-            }
-            ReadBufResult::NotFound(guard) => {
-                // Read the page from disk into the buffer
-                write_guard = guard;
-                self.fill_buffer(write_guard.deref_mut(), blkno)?;
-                write_guard.mark_valid();
-
-                // And then fall through to read the requested slice from the
-                // buffer.
-                write_guard.as_ref()
-            }
-        };
-
-        dstbuf[0..len].copy_from_slice(&buf[off..(off + len)]);
-        Ok(len)
-    }
-
-    fn write_at(&self, srcbuf: &[u8], offset: u64) -> Result<usize, Error> {
-        // Look up the right page
-        let blkno = (offset / PAGE_SZ as u64) as u32;
-        let off = offset as usize % PAGE_SZ;
-        let len = min(PAGE_SZ - off, srcbuf.len());
-
-        let mut write_guard;
-        let cache = page_cache::get();
-        let buf = match cache.write_ephemeral_buf(self.file_id, blkno) {
-            WriteBufResult::Found(guard) => {
-                write_guard = guard;
-                write_guard.deref_mut()
-            }
-            WriteBufResult::NotFound(guard) => {
-                // Read the page from disk into the buffer
-                // TODO: if we're overwriting the whole page, no need to read it in first
-                write_guard = guard;
-                self.fill_buffer(write_guard.deref_mut(), blkno)?;
-                write_guard.mark_valid();
-
-                // And then fall through to modify it.
-                write_guard.deref_mut()
-            }
-        };
-
-        buf[off..(off + len)].copy_from_slice(&srcbuf[0..len]);
-        write_guard.mark_dirty();
-        Ok(len)
-    }
-}
-
-impl Write for EphemeralFile {
-    fn write(&mut self, buf: &[u8]) -> Result<usize, Error> {
-        let n = self.write_at(buf, self.pos)?;
-        self.pos += n as u64;
-        Ok(n)
-    }
-
-    fn flush(&mut self) -> Result<(), std::io::Error> {
-        todo!()
-    }
-}
-
-impl Seek for EphemeralFile {
-    fn seek(&mut self, pos: SeekFrom) -> Result<u64, Error> {
-        match pos {
-            SeekFrom::Start(offset) => {
-                self.pos = offset;
-            }
-            SeekFrom::End(_offset) => {
-                return Err(Error::new(
-                    ErrorKind::Other,
-                    "SeekFrom::End not supported by EphemeralFile",
-                ));
-            }
-            SeekFrom::Current(offset) => {
-                let pos = self.pos as i128 + offset as i128;
-                if pos < 0 {
-                    return Err(Error::new(
-                        ErrorKind::InvalidInput,
-                        "offset would be negative",
-                    ));
-                }
-                if pos > u64::MAX as i128 {
-                    return Err(Error::new(ErrorKind::InvalidInput, "offset overflow"));
-                }
-                self.pos = pos as u64;
-            }
-        }
-        Ok(self.pos)
-    }
-}
-
-impl Drop for EphemeralFile {
-    fn drop(&mut self) {
-        // drop all pages from page cache
-        let cache = page_cache::get();
-        cache.drop_buffers_for_ephemeral(self.file_id);
-
-        // remove entry from the hash map
-        EPHEMERAL_FILES.write().unwrap().files.remove(&self.file_id);
-
-        // unlink file
-        // FIXME: print error
-        let _ = std::fs::remove_file(&self.file.path);
-    }
-}
-
-pub fn writeback(file_id: u64, blkno: u32, buf: &[u8]) -> Result<(), std::io::Error> {
-    if let Some(file) = EPHEMERAL_FILES.read().unwrap().files.get(&file_id) {
-        file.write_all_at(buf, blkno as u64 * PAGE_SZ as u64)?;
-        Ok(())
-    } else {
-        Err(std::io::Error::new(
-            ErrorKind::Other,
-            "could not write back page, not found in ephemeral files hash",
-        ))
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use rand::seq::SliceRandom;
-    use rand::thread_rng;
-    use std::fs;
-    use std::str::FromStr;
-
-    fn repo_harness(
-        test_name: &str,
-    ) -> Result<(&'static PageServerConf, ZTenantId, ZTimelineId), Error> {
-        let repo_dir = PageServerConf::test_repo_dir(test_name);
-        let _ = fs::remove_dir_all(&repo_dir);
-        let conf = PageServerConf::dummy_conf(repo_dir);
-        // Make a static copy of the config. This can never be free'd, but that's
-        // OK in a test.
-        let conf: &'static PageServerConf = Box::leak(Box::new(conf));
-
-        let tenantid = ZTenantId::from_str("11000000000000000000000000000000").unwrap();
-        let timelineid = ZTimelineId::from_str("22000000000000000000000000000000").unwrap();
-        fs::create_dir_all(conf.timeline_path(&timelineid, &tenantid))?;
-
-        Ok((conf, tenantid, timelineid))
-    }
-
-    // Helper function to slurp contents of a file, starting at the current position,
-    // into a string
-    fn read_string(efile: &EphemeralFile, offset: u64, len: usize) -> Result<String, Error> {
-        let mut buf = Vec::new();
-        buf.resize(len, 0u8);
-
-        efile.read_exact_at(&mut buf, offset)?;
-
-        Ok(String::from_utf8_lossy(&buf)
-            .trim_end_matches('\0')
-            .to_string())
-    }
-
-    #[test]
-    fn test_ephemeral_files() -> Result<(), Error> {
-        let (conf, tenantid, timelineid) = repo_harness("ephemeral_files")?;
-
-        let mut file_a = EphemeralFile::create(conf, tenantid, timelineid)?;
-
-        file_a.write_all(b"foo")?;
-        assert_eq!("foo", read_string(&file_a, 0, 20)?);
-
-        file_a.write_all(b"bar")?;
-        assert_eq!("foobar", read_string(&file_a, 0, 20)?);
-
-        // Open a lot of files, enough to cause some page evictions.
-        let mut efiles = Vec::new();
-        for fileno in 0..100 {
-            let mut efile = EphemeralFile::create(conf, tenantid, timelineid)?;
-            efile.write_all(format!("file {}", fileno).as_bytes())?;
-            assert_eq!(format!("file {}", fileno), read_string(&efile, 0, 10)?);
-            efiles.push((fileno, efile));
-        }
-
-        // Check that all the files can still be read from. Use them in random order for
-        // good measure.
-        efiles.as_mut_slice().shuffle(&mut thread_rng());
-        for (fileno, efile) in efiles.iter_mut() {
-            assert_eq!(format!("file {}", fileno), read_string(efile, 0, 10)?);
-        }
-
-        Ok(())
-    }
-}
--- a/pageserver/src/layered_repository/filename.rs
+++ b/pageserver/src/layered_repository/filename.rs
@@ -13,8 +13,6 @@ use anyhow::Result;
 use log::*;
 use zenith_utils::lsn::Lsn;

-use super::metadata::METADATA_FILE_NAME;
-
 // Note: LayeredTimeline::load_layer_map() relies on this sort order
 #[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)]
 pub struct DeltaFileName {
@@ -37,7 +35,7 @@ impl DeltaFileName {
    /// Parse a string as a delta file name. Returns None if the filename does not
    /// match the expected pattern.
    ///
-    pub fn parse_str(fname: &str) -> Option<Self> {
+    pub fn from_str(fname: &str) -> Option<Self> {
        let rel;
        let mut parts;
        if let Some(rest) = fname.strip_prefix("rel_") {
@@ -170,7 +168,7 @@ impl ImageFileName {
    /// Parse a string as an image file name. Returns None if the filename does not
    /// match the expected pattern.
    ///
-    pub fn parse_str(fname: &str) -> Option<Self> {
+    pub fn from_str(fname: &str) -> Option<Self> {
        let rel;
        let mut parts;
        if let Some(rest) = fname.strip_prefix("rel_") {
@@ -288,11 +286,11 @@ pub fn list_files(
        let fname = direntry?.file_name();
        let fname = fname.to_str().unwrap();

-        if let Some(deltafilename) = DeltaFileName::parse_str(fname) {
+        if let Some(deltafilename) = DeltaFileName::from_str(fname) {
            deltafiles.push(deltafilename);
-        } else if let Some(imgfilename) = ImageFileName::parse_str(fname) {
+        } else if let Some(imgfilename) = ImageFileName::from_str(fname) {
            imgfiles.push(imgfilename);
-        } else if fname == METADATA_FILE_NAME || fname.ends_with(".old") {
+        } else if fname == "wal" || fname == "metadata" || fname == "ancestor" {
            // ignore these
        } else {
            warn!("unrecognized filename in timeline dir: {}", fname);
--- a/pageserver/src/layered_repository/global_layer_map.rs
+++ b/pageserver/src/layered_repository/global_layer_map.rs
@@ -1,142 +0,0 @@
-//!
-//! Global registry of open layers.
-//!
-//! Whenever a new in-memory layer is created to hold incoming WAL, it is registered
-//! in [`GLOBAL_LAYER_MAP`], so that we can keep track of the total number of
-//! in-memory layers in the system, and know when we need to evict some to release
-//! memory.
-//!
-//! Each layer is assigned a unique ID when it's registered in the global registry.
-//! The ID can be used to relocate the layer later, without having to hold locks.
-//!
-
-use std::sync::atomic::{AtomicU8, Ordering};
-use std::sync::{Arc, RwLock};
-
-use super::inmemory_layer::InMemoryLayer;
-
-use lazy_static::lazy_static;
-
-const MAX_USAGE_COUNT: u8 = 5;
-
-lazy_static! {
-    pub static ref GLOBAL_LAYER_MAP: RwLock<InMemoryLayers> =
-        RwLock::new(InMemoryLayers::default());
-}
-
-// TODO these types can probably be smaller
-#[derive(PartialEq, Eq, Clone, Copy)]
-pub struct LayerId {
-    index: usize,
-    tag: u64, // to avoid ABA problem
-}
-
-enum SlotData {
-    Occupied(Arc<InMemoryLayer>),
-    /// Vacant slots form a linked list, the value is the index
-    /// of the next vacant slot in the list.
-    Vacant(Option<usize>),
-}
-
-struct Slot {
-    tag: u64,
-    data: SlotData,
-    usage_count: AtomicU8, // for clock algorithm
-}
-
-#[derive(Default)]
-pub struct InMemoryLayers {
-    slots: Vec<Slot>,
-    num_occupied: usize,
-
-    // Head of free-slot list.
-    next_empty_slot_idx: Option<usize>,
-}
-
-impl InMemoryLayers {
-    pub fn insert(&mut self, layer: Arc<InMemoryLayer>) -> LayerId {
-        let slot_idx = match self.next_empty_slot_idx {
-            Some(slot_idx) => slot_idx,
-            None => {
-                let idx = self.slots.len();
-                self.slots.push(Slot {
-                    tag: 0,
-                    data: SlotData::Vacant(None),
-                    usage_count: AtomicU8::new(0),
-                });
-                idx
-            }
-        };
-        let slots_len = self.slots.len();
-
-        let slot = &mut self.slots[slot_idx];
-
-        match slot.data {
-            SlotData::Occupied(_) => {
-                panic!("an occupied slot was in the free list");
-            }
-            SlotData::Vacant(next_empty_slot_idx) => {
-                self.next_empty_slot_idx = next_empty_slot_idx;
-            }
-        }
-
-        slot.data = SlotData::Occupied(layer);
-        slot.usage_count.store(1, Ordering::Relaxed);
-
-        self.num_occupied += 1;
-        assert!(self.num_occupied <= slots_len);
-
-        LayerId {
-            index: slot_idx,
-            tag: slot.tag,
-        }
-    }
-
-    pub fn get(&self, layer_id: &LayerId) -> Option<Arc<InMemoryLayer>> {
-        let slot = self.slots.get(layer_id.index)?; // TODO should out of bounds indexes just panic?
-        if slot.tag != layer_id.tag {
-            return None;
-        }
-
-        if let SlotData::Occupied(layer) = &slot.data {
-            let _ = slot.usage_count.fetch_update(
-                Ordering::Relaxed,
-                Ordering::Relaxed,
-                |old_usage_count| {
-                    if old_usage_count < MAX_USAGE_COUNT {
-                        Some(old_usage_count + 1)
-                    } else {
-                        None
-                    }
-                },
-            );
-            Some(Arc::clone(layer))
-        } else {
-            None
-        }
-    }
-
-    // TODO this won't be a public API in the future
-    pub fn remove(&mut self, layer_id: &LayerId) {
-        let slot = &mut self.slots[layer_id.index];
-
-        if slot.tag != layer_id.tag {
-            return;
-        }
-
-        match &slot.data {
-            SlotData::Occupied(_layer) => {
-                // TODO evict the layer
-            }
-            SlotData::Vacant(_) => unimplemented!(),
-        }
-
-        slot.data = SlotData::Vacant(self.next_empty_slot_idx);
-        self.next_empty_slot_idx = Some(layer_id.index);
-
-        assert!(self.num_occupied > 0);
-        self.num_occupied -= 1;
-
-        slot.tag = slot.tag.wrapping_add(1);
-    }
-}
--- a/pageserver/src/layered_repository/image_layer.rs
+++ b/pageserver/src/layered_repository/image_layer.rs
@@ -27,15 +27,15 @@ use crate::layered_repository::storage_layer::{
 };
 use crate::layered_repository::LayeredTimeline;
 use crate::layered_repository::RELISH_SEG_SIZE;
-use crate::virtual_file::VirtualFile;
 use crate::PageServerConf;
 use crate::{ZTenantId, ZTimelineId};
-use anyhow::{anyhow, bail, ensure, Context, Result};
+use anyhow::{anyhow, bail, ensure, Result};
 use bytes::Bytes;
 use log::*;
 use serde::{Deserialize, Serialize};
 use std::convert::TryInto;
 use std::fs;
+use std::fs::File;
 use std::io::{BufWriter, Write};
 use std::path::{Path, PathBuf};
 use std::sync::{Mutex, MutexGuard};
@@ -104,8 +104,9 @@ enum ImageType {
 }

 pub struct ImageLayerInner {
-    /// If None, the 'image_type' has not been loaded into memory yet.
-    book: Option<Book<VirtualFile>>,
+    /// If false, the 'image_type' has not been
+    /// loaded into memory yet.
+    loaded: bool,

    /// Derived from filename and bookfile chapter metadata
    image_type: ImageType,
@@ -113,11 +114,25 @@ pub struct ImageLayerInner {

 impl Layer for ImageLayer {
    fn filename(&self) -> PathBuf {
-        PathBuf::from(self.layer_name().to_string())
+        PathBuf::from(
+            ImageFileName {
+                seg: self.seg,
+                lsn: self.lsn,
+            }
+            .to_string(),
+        )
    }

-    fn get_tenant_id(&self) -> ZTenantId {
-        self.tenantid
+    fn path(&self) -> Option<PathBuf> {
+        Some(Self::path_for(
+            &self.path_or_conf,
+            self.timelineid,
+            self.tenantid,
+            &ImageFileName {
+                seg: self.seg,
+                lsn: self.lsn,
+            },
+        ))
    }

    fn get_timeline_id(&self) -> ZTimelineId {
@@ -146,20 +161,16 @@ impl Layer for ImageLayer {
        &self,
        blknum: u32,
        lsn: Lsn,
-        cached_img_lsn: Option<Lsn>,
        reconstruct_data: &mut PageReconstructData,
    ) -> Result<PageReconstructResult> {
        assert!(lsn >= self.lsn);

-        match cached_img_lsn {
-            Some(cached_lsn) if self.lsn <= cached_lsn => return Ok(PageReconstructResult::Cached),
-            _ => {}
-        }
-
        let inner = self.load()?;

        let base_blknum = blknum % RELISH_SEG_SIZE;

+        let (_path, book) = self.open_book()?;
+
        let buf = match &inner.image_type {
            ImageType::Blocky { num_blocks } => {
                if base_blknum >= *num_blocks {
@@ -169,23 +180,14 @@ impl Layer for ImageLayer {
                let mut buf = vec![0u8; BLOCK_SIZE];
                let offset = BLOCK_SIZE as u64 * base_blknum as u64;

-                let chapter = inner
-                    .book
-                    .as_ref()
-                    .unwrap()
-                    .chapter_reader(BLOCKY_IMAGES_CHAPTER)?;
+                let chapter = book.chapter_reader(BLOCKY_IMAGES_CHAPTER)?;
                chapter.read_exact_at(&mut buf, offset)?;

                buf
            }
            ImageType::NonBlocky => {
                ensure!(base_blknum == 0);
-                inner
-                    .book
-                    .as_ref()
-                    .unwrap()
-                    .read_chapter(NONBLOCKY_IMAGE_CHAPTER)?
-                    .into_vec()
+                book.read_chapter(NONBLOCKY_IMAGE_CHAPTER)?.into_vec()
            }
        };

@@ -207,13 +209,22 @@ impl Layer for ImageLayer {
        Ok(true)
    }

+    ///
+    /// Release most of the memory used by this layer. If it's accessed again later,
+    /// it will need to be loaded back.
+    ///
    fn unload(&self) -> Result<()> {
+        let mut inner = self.inner.lock().unwrap();
+        inner.image_type = ImageType::Blocky { num_blocks: 0 };
+        inner.loaded = false;
        Ok(())
    }

    fn delete(&self) -> Result<()> {
        // delete underlying file
-        fs::remove_file(self.path())?;
+        if let Some(path) = self.path() {
+            fs::remove_file(path)?;
+        }
        Ok(())
    }

@@ -221,10 +232,6 @@ impl Layer for ImageLayer {
        false
    }

-    fn is_in_memory(&self) -> bool {
-        false
-    }
-
    /// debugging function to print out the contents of the layer
    fn dump(&self) -> Result<()> {
        println!(
@@ -237,11 +244,8 @@ impl Layer for ImageLayer {
        match inner.image_type {
            ImageType::Blocky { num_blocks } => println!("({}) blocks ", num_blocks),
            ImageType::NonBlocky => {
-                let chapter = inner
-                    .book
-                    .as_ref()
-                    .unwrap()
-                    .read_chapter(NONBLOCKY_IMAGE_CHAPTER)?;
+                let (_path, book) = self.open_book()?;
+                let chapter = book.read_chapter(NONBLOCKY_IMAGE_CHAPTER)?;
                println!("non-blocky ({} bytes)", chapter.len());
            }
        }
@@ -289,22 +293,19 @@ impl ImageLayer {
            seg,
            lsn,
            inner: Mutex::new(ImageLayerInner {
-                book: None,
+                loaded: true,
                image_type: image_type.clone(),
            }),
        };
        let inner = layer.inner.lock().unwrap();

        // Write the images into a file
-        //
-        // Note: Because we open the file in write-only mode, we cannot
-        // reuse the same VirtualFile for reading later. That's why we don't
-        // set inner.book here. The first read will have to re-open it.
-        //
+        let path = layer
+            .path()
+            .expect("ImageLayer is supposed to have a layer path on disk");
        // Note: This overwrites any existing file. There shouldn't be any.
        // FIXME: throw an error instead?
-        let path = layer.path();
-        let file = VirtualFile::create(&path)?;
+        let file = File::create(&path)?;
        let buf_writer = BufWriter::new(file);
        let book = BookWriter::new(buf_writer, IMAGE_FILE_MAGIC)?;

@@ -336,10 +337,9 @@ impl ImageLayer {
        let book = chapter.close()?;

        // This flushes the underlying 'buf_writer'.
-        let writer = book.close()?;
-        writer.get_ref().sync_all()?;
+        book.close()?;

-        trace!("saved {}", path.display());
+        trace!("saved {}", &path.display());

        drop(inner);

@@ -391,19 +391,11 @@ impl ImageLayer {
        // quick exit if already loaded
        let mut inner = self.inner.lock().unwrap();

-        if inner.book.is_some() {
+        if inner.loaded {
            return Ok(inner);
        }

-        let path = self.path();
-        let file = VirtualFile::open(&path)
-            .with_context(|| format!("Failed to open virtual file '{}'", path.display()))?;
-        let book = Book::new(file).with_context(|| {
-            format!(
-                "Failed to open virtual file '{}' as a bookfile",
-                path.display()
-            )
-        })?;
+        let (path, book) = self.open_book()?;

        match &self.path_or_conf {
            PathOrConf::Conf(_) => {
@@ -444,13 +436,30 @@ impl ImageLayer {
        debug!("loaded from {}", &path.display());

        *inner = ImageLayerInner {
-            book: Some(book),
+            loaded: true,
            image_type,
        };

        Ok(inner)
    }

+    fn open_book(&self) -> Result<(PathBuf, Book<File>)> {
+        let path = Self::path_for(
+            &self.path_or_conf,
+            self.timelineid,
+            self.tenantid,
+            &ImageFileName {
+                seg: self.seg,
+                lsn: self.lsn,
+            },
+        );
+
+        let file = File::open(&path)?;
+        let book = Book::new(file)?;
+
+        Ok((path, book))
+    }
+
    /// Create an ImageLayer struct representing an existing file on disk
    pub fn new(
        conf: &'static PageServerConf,
@@ -465,7 +474,7 @@ impl ImageLayer {
            seg: filename.seg,
            lsn: filename.lsn,
            inner: Mutex::new(ImageLayerInner {
-                book: None,
+                loaded: false,
                image_type: ImageType::Blocky { num_blocks: 0 },
            }),
        }
@@ -474,10 +483,7 @@ impl ImageLayer {
    /// Create an ImageLayer struct representing an existing file on disk.
    ///
    /// This variant is only used for debugging purposes, by the 'dump_layerfile' binary.
-    pub fn new_for_path<F>(path: &Path, book: &Book<F>) -> Result<ImageLayer>
-    where
-        F: std::os::unix::prelude::FileExt,
-    {
+    pub fn new_for_path(path: &Path, book: &Book<File>) -> Result<ImageLayer> {
        let chapter = book.read_chapter(SUMMARY_CHAPTER)?;
        let summary = Summary::des(&chapter)?;

@@ -488,26 +494,9 @@ impl ImageLayer {
            seg: summary.seg,
            lsn: summary.lsn,
            inner: Mutex::new(ImageLayerInner {
-                book: None,
+                loaded: false,
                image_type: ImageType::Blocky { num_blocks: 0 },
            }),
        })
    }
-
-    fn layer_name(&self) -> ImageFileName {
-        ImageFileName {
-            seg: self.seg,
-            lsn: self.lsn,
-        }
-    }
-
-    /// Path to the layer file in pageserver workdir.
-    pub fn path(&self) -> PathBuf {
-        Self::path_for(
-            &self.path_or_conf,
-            self.timelineid,
-            self.tenantid,
-            &self.layer_name(),
-        )
-    }
 }
--- a/pageserver/src/layered_repository/inmemory_layer.rs
+++ b/pageserver/src/layered_repository/inmemory_layer.rs
@@ -1,10 +1,7 @@
-//! An in-memory layer stores recently received PageVersions.
-//! The page versions are held in a BTreeMap. To avoid OOM errors, the map size is limited
-//! and layers can be spilled to disk into ephemeral files.
 //!
-//! And there's another BTreeMap to track the size of the relation.
+//! An in-memory layer stores recently received page versions in memory. The page versions
+//! are held in a BTreeMap, and there's another BTreeMap to track the size of the relation.
 //!
-use crate::layered_repository::ephemeral_file::EphemeralFile;
 use crate::layered_repository::filename::DeltaFileName;
 use crate::layered_repository::storage_layer::{
    Layer, PageReconstructData, PageReconstructResult, PageVersion, SegmentTag, RELISH_SEG_SIZE,
@@ -15,15 +12,17 @@ use crate::layered_repository::{DeltaLayer, ImageLayer};
 use crate::repository::WALRecord;
 use crate::PageServerConf;
 use crate::{ZTenantId, ZTimelineId};
-use anyhow::{ensure, Result};
+use anyhow::{bail, Result};
 use bytes::Bytes;
 use log::*;
+use std::cmp::min;
+use std::collections::BTreeMap;
+use std::ops::Bound::Included;
 use std::path::PathBuf;
 use std::sync::{Arc, RwLock};
-use zenith_utils::lsn::Lsn;
-use zenith_utils::vec_map::VecMap;

-use super::page_versions::PageVersions;
+use zenith_utils::accum::Accum;
+use zenith_utils::lsn::Lsn;

 pub struct InMemoryLayer {
    conf: &'static PageServerConf,
@@ -37,87 +36,90 @@ pub struct InMemoryLayer {
    ///
    start_lsn: Lsn,

+    /// Frozen in-memory layers have an inclusive end LSN.
+    end_lsn: Option<Lsn>,
+
    /// LSN of the oldest page version stored in this layer
    oldest_pending_lsn: Lsn,

    /// The above fields never change. The parts that do change are in 'inner',
    /// and protected by mutex.
    inner: RwLock<InMemoryLayerInner>,
-
-    /// Predecessor layer might be needed?
-    incremental: bool,
 }

 pub struct InMemoryLayerInner {
-    /// Frozen layers have an exclusive end LSN.
-    /// Writes are only allowed when this is None
-    end_lsn: Option<Lsn>,
-
    /// If this relation was dropped, remember when that happened.
-    /// The drop LSN is recorded in [`end_lsn`].
-    dropped: bool,
+    drop_lsn: Option<Lsn>,

    ///
    /// All versions of all pages in the layer are are kept here.
    /// Indexed by block number and LSN.
    ///
-    page_versions: PageVersions,
+    page_versions: BTreeMap<(u32, Lsn), PageVersion>,

    ///
    /// `segsizes` tracks the size of the segment at different points in time.
    ///
-    /// For a blocky rel, there is always one entry, at the layer's start_lsn,
-    /// so that determining the size never depends on the predecessor layer. For
-    /// a non-blocky rel, 'segsizes' is not used and is always empty.
-    ///
-    segsizes: VecMap<Lsn, u32>,
+    segsizes: BTreeMap<Lsn, u32>,
+
+    /// Writes are only allowed when true.
+    /// Set to false when this layer is in the process of being replaced.
+    writeable: bool,
+
+    /// Predecessor layer
+    predecessor: Option<Arc<dyn Layer>>,
 }

 impl InMemoryLayerInner {
-    fn assert_writeable(&self) {
-        assert!(self.end_lsn.is_none());
+    fn check_writeable(&self) -> WriteResult<()> {
+        if self.writeable {
+            Ok(())
+        } else {
+            Err(NonWriteableError)
+        }
    }

    fn get_seg_size(&self, lsn: Lsn) -> u32 {
        // Scan the BTreeMap backwards, starting from the given entry.
-        let slice = self.segsizes.slice_range(..=lsn);
+        let mut iter = self.segsizes.range((Included(&Lsn(0)), Included(&lsn)));

-        // We make sure there is always at least one entry
-        if let Some((_entry_lsn, entry)) = slice.last() {
+        if let Some((_entry_lsn, entry)) = iter.next_back() {
            *entry
        } else {
-            panic!("could not find seg size in in-memory layer");
+            0
        }
    }
 }

 impl Layer for InMemoryLayer {
-    // An in-memory layer can be spilled to disk into ephemeral file,
-    // This function is used only for debugging, so we don't need to be very precise.
-    // Construct a filename as if it was a delta layer.
+    // An in-memory layer doesn't really have a filename as it's not stored on disk,
+    // but we construct a filename as if it was a delta layer
    fn filename(&self) -> PathBuf {
        let inner = self.inner.read().unwrap();

        let end_lsn;
-        if let Some(drop_lsn) = inner.end_lsn {
+        let dropped;
+        if let Some(drop_lsn) = inner.drop_lsn {
            end_lsn = drop_lsn;
+            dropped = true;
        } else {
            end_lsn = Lsn(u64::MAX);
+            dropped = false;
        }

        let delta_filename = DeltaFileName {
            seg: self.seg,
            start_lsn: self.start_lsn,
            end_lsn,
-            dropped: inner.dropped,
+            dropped,
        }
        .to_string();

        PathBuf::from(format!("inmem-{}", delta_filename))
    }

-    fn get_tenant_id(&self) -> ZTenantId {
-        self.tenantid
+    fn path(&self) -> Option<PathBuf> {
+        None
    }

    fn get_timeline_id(&self) -> ZTimelineId {
@@ -133,10 +135,14 @@ impl Layer for InMemoryLayer {
    }

    fn get_end_lsn(&self) -> Lsn {
+        if let Some(end_lsn) = self.end_lsn {
+            return Lsn(end_lsn.0 + 1);
+        }
+
        let inner = self.inner.read().unwrap();

-        if let Some(end_lsn) = inner.end_lsn {
-            end_lsn
+        if let Some(drop_lsn) = inner.drop_lsn {
+            drop_lsn
        } else {
            Lsn(u64::MAX)
        }
@@ -144,7 +150,7 @@ impl Layer for InMemoryLayer {

    fn is_dropped(&self) -> bool {
        let inner = self.inner.read().unwrap();
-        inner.dropped
+        inner.drop_lsn.is_some()
    }

    /// Look up given page in the cache.
@@ -152,55 +158,50 @@ impl Layer for InMemoryLayer {
        &self,
        blknum: u32,
        lsn: Lsn,
-        cached_img_lsn: Option<Lsn>,
        reconstruct_data: &mut PageReconstructData,
    ) -> Result<PageReconstructResult> {
        let mut need_image = true;

        assert!(self.seg.blknum_in_seg(blknum));

+        let predecessor: Option<Arc<dyn Layer>>;
+
        {
            let inner = self.inner.read().unwrap();

-            // Scan the page versions backwards, starting from `lsn`.
-            let iter = inner
+            // Scan the BTreeMap backwards, starting from reconstruct_data.lsn.
+            let minkey = (blknum, Lsn(0));
+            let maxkey = (blknum, lsn);
+            let mut iter = inner
                .page_versions
-                .get_block_lsn_range(blknum, ..=lsn)
-                .iter()
-                .rev();
-            for (entry_lsn, pos) in iter {
-                match &cached_img_lsn {
-                    Some(cached_lsn) if entry_lsn <= cached_lsn => {
-                        return Ok(PageReconstructResult::Cached)
-                    }
-                    _ => {}
-                }
-
-                let pv = inner.page_versions.get_page_version(*pos)?;
-                match pv {
-                    PageVersion::Page(img) => {
-                        reconstruct_data.page_img = Some(img);
+                .range((Included(&minkey), Included(&maxkey)));
+            while let Some(((_blknum, _entry_lsn), entry)) = iter.next_back() {
+                if let Some(img) = &entry.page_image {
+                    reconstruct_data.page_img = Some(img.clone());
+                    need_image = false;
+                    break;
+                } else if let Some(rec) = &entry.record {
+                    reconstruct_data.records.push(rec.clone());
+                    if rec.will_init {
+                        // This WAL record initializes the page, so no need to go further back
                        need_image = false;
                        break;
                    }
-                    PageVersion::Wal(rec) => {
-                        reconstruct_data.records.push((*entry_lsn, rec.clone()));
-                        if rec.will_init {
-                            // This WAL record initializes the page, so no need to go further back
-                            need_image = false;
-                            break;
-                        }
-                    }
+                } else {
+                    // No base image, and no WAL record. Huh?
+                    bail!("no page image or WAL record for requested page");
                }
            }
+
+            predecessor = inner.predecessor.clone();
            // release lock on 'inner'
        }

        // If an older page image is needed to reconstruct the page, let the
-        // caller know
+        // caller know about the predecessor layer.
        if need_image {
-            if self.incremental {
-                Ok(PageReconstructResult::Continue(Lsn(self.start_lsn.0 - 1)))
+            if let Some(cont_layer) = predecessor {
+                Ok(PageReconstructResult::Continue(self.start_lsn, cont_layer))
            } else {
                Ok(PageReconstructResult::Missing(self.start_lsn))
            }
@@ -212,10 +213,6 @@ impl Layer for InMemoryLayer {
    /// Get size of the relation at given LSN
    fn get_seg_size(&self, lsn: Lsn) -> Result<u32> {
        assert!(lsn >= self.start_lsn);
-        ensure!(
-            self.seg.rel.is_blocky(),
-            "get_seg_size() called on a non-blocky rel"
-        );

        let inner = self.inner.read().unwrap();
        Ok(inner.get_seg_size(lsn))
@@ -231,13 +228,9 @@ impl Layer for InMemoryLayer {
        assert!(lsn >= self.start_lsn);

        // Is the requested LSN after the segment was dropped?
-        if inner.dropped {
-            if let Some(end_lsn) = inner.end_lsn {
-                if lsn >= end_lsn {
-                    return Ok(false);
-                }
-            } else {
-                panic!("dropped in-memory layer with no end LSN");
+        if let Some(drop_lsn) = inner.drop_lsn {
+            if lsn >= drop_lsn {
+                return Ok(false);
            }
        }

@@ -255,15 +248,12 @@ impl Layer for InMemoryLayer {
    /// Nothing to do here. When you drop the last reference to the layer, it will
    /// be deallocated.
    fn delete(&self) -> Result<()> {
-        panic!("can't delete an InMemoryLayer")
+        Ok(())
    }

    fn is_incremental(&self) -> bool {
-        self.incremental
-    }
-
-    fn is_in_memory(&self) -> bool {
-        true
+        let inner = self.inner.read().unwrap();
+        inner.predecessor.is_some()
    }

    /// debugging function to print out the contents of the layer
@@ -271,41 +261,54 @@ impl Layer for InMemoryLayer {
        let inner = self.inner.read().unwrap();

        let end_str = inner
-            .end_lsn
+            .drop_lsn
            .as_ref()
-            .map(Lsn::to_string)
+            .map(|drop_lsn| drop_lsn.to_string())
            .unwrap_or_default();

        println!(
-            "----- in-memory layer for tli {} seg {} {}-{} {} ----",
-            self.timelineid, self.seg, self.start_lsn, end_str, inner.dropped,
+            "----- in-memory layer for tli {} seg {} {}-{} ----",
+            self.timelineid, self.seg, self.start_lsn, end_str
        );

-        for (k, v) in inner.segsizes.as_slice() {
+        for (k, v) in inner.segsizes.iter() {
            println!("segsizes {}: {}", k, v);
        }

-        for (blknum, lsn, pos) in inner.page_versions.ordered_page_version_iter(None) {
-            let pv = inner.page_versions.get_page_version(pos)?;
-            let pv_description = match pv {
-                PageVersion::Page(_img) => "page",
-                PageVersion::Wal(_rec) => "wal",
-            };
-
-            println!("blk {} at {}: {}\n", blknum, lsn, pv_description);
+        for (k, v) in inner.page_versions.iter() {
+            println!(
+                "blk {} at {}: {}/{}\n",
+                k.0,
+                k.1,
+                v.page_image.is_some(),
+                v.record.is_some()
+            );
        }

        Ok(())
    }
 }

-/// A result of an inmemory layer data being written to disk.
-pub struct LayersOnDisk {
-    pub delta_layers: Vec<DeltaLayer>,
-    pub image_layers: Vec<ImageLayer>,
+/// Write failed because the layer is in process of being replaced.
+/// See [`LayeredTimeline::perform_write_op`] for how to handle this error.
+#[derive(Debug)]
+pub struct NonWriteableError;
+
+pub type WriteResult<T> = std::result::Result<T, NonWriteableError>;
+
+/// Helper struct to cleanup `InMemoryLayer::freeze` return signature.
+pub struct FreezeLayers {
+    /// Replacement layer for the layer which freeze was called on.
+    pub frozen: Arc<InMemoryLayer>,
+    /// New open layer containing leftover data.
+    pub open: Option<Arc<InMemoryLayer>>,
 }

 impl InMemoryLayer {
+    fn assert_not_frozen(&self) {
+        assert!(self.end_lsn.is_none());
+    }
+
    /// Return the oldest page version that's stored in this layer
    pub fn get_oldest_pending_lsn(&self) -> Lsn {
        self.oldest_pending_lsn
@@ -329,27 +332,20 @@ impl InMemoryLayer {
            start_lsn
        );

-        // The segment is initially empty, so initialize 'segsizes' with 0.
-        let mut segsizes = VecMap::default();
-        if seg.rel.is_blocky() {
-            segsizes.append(start_lsn, 0).unwrap();
-        }
-
-        let file = EphemeralFile::create(conf, tenantid, timelineid)?;
-
        Ok(InMemoryLayer {
            conf,
            timelineid,
            tenantid,
            seg,
            start_lsn,
+            end_lsn: None,
            oldest_pending_lsn,
-            incremental: false,
            inner: RwLock::new(InMemoryLayerInner {
-                end_lsn: None,
-                dropped: false,
-                page_versions: PageVersions::new(file),
-                segsizes,
+                drop_lsn: None,
+                page_versions: BTreeMap::new(),
+                segsizes: BTreeMap::new(),
+                writeable: true,
+                predecessor: None,
            }),
        })
    }
@@ -357,18 +353,33 @@ impl InMemoryLayer {
    // Write operations

    /// Remember new page version, as a WAL record over previous version
-    pub fn put_wal_record(&self, lsn: Lsn, blknum: u32, rec: WALRecord) -> Result<u32> {
-        self.put_page_version(blknum, lsn, PageVersion::Wal(rec))
+    pub fn put_wal_record(&self, blknum: u32, rec: WALRecord) -> WriteResult<u32> {
+        self.put_page_version(
+            blknum,
+            rec.lsn,
+            PageVersion {
+                page_image: None,
+                record: Some(rec),
+            },
+        )
    }

    /// Remember new page version, as a full page image
-    pub fn put_page_image(&self, blknum: u32, lsn: Lsn, img: Bytes) -> Result<u32> {
-        self.put_page_version(blknum, lsn, PageVersion::Page(img))
+    pub fn put_page_image(&self, blknum: u32, lsn: Lsn, img: Bytes) -> WriteResult<u32> {
+        self.put_page_version(
+            blknum,
+            lsn,
+            PageVersion {
+                page_image: Some(img),
+                record: None,
+            },
+        )
    }

    /// Common subroutine of the public put_wal_record() and put_page_image() functions.
    /// Adds the page version to the in-memory tree
-    pub fn put_page_version(&self, blknum: u32, lsn: Lsn, pv: PageVersion) -> Result<u32> {
+    pub fn put_page_version(&self, blknum: u32, lsn: Lsn, pv: PageVersion) -> WriteResult<u32> {
+        self.assert_not_frozen();
        assert!(self.seg.blknum_in_seg(blknum));

        trace!(
@@ -380,9 +391,9 @@ impl InMemoryLayer {
        );
        let mut inner = self.inner.write().unwrap();

-        inner.assert_writeable();
+        inner.check_writeable()?;

-        let old = inner.page_versions.append_or_update_last(blknum, lsn, pv)?;
+        let old = inner.page_versions.insert((blknum, lsn), pv);

        if old.is_some() {
            // We already had an entry for this LSN. That's odd..
@@ -418,15 +429,16 @@ impl InMemoryLayer {
                // subsequent call to initialize the gap page.
                let gapstart = self.seg.segno * RELISH_SEG_SIZE + oldsize;
                for gapblknum in gapstart..blknum {
-                    let zeropv = PageVersion::Page(ZERO_PAGE.clone());
+                    let zeropv = PageVersion {
+                        page_image: Some(ZERO_PAGE.clone()),
+                        record: None,
+                    };
                    trace!(
                        "filling gap blk {} with zeros for write of {}",
                        gapblknum,
                        blknum
                    );
-                    let old = inner
-                        .page_versions
-                        .append_or_update_last(gapblknum, lsn, zeropv)?;
+                    let old = inner.page_versions.insert((gapblknum, lsn), zeropv);
                    // We already had an entry for this LSN. That's odd..

                    if old.is_some() {
@@ -437,47 +449,49 @@ impl InMemoryLayer {
                    }
                }

-                inner.segsizes.append_or_update_last(lsn, newsize).unwrap();
+                inner.segsizes.insert(lsn, newsize);
                return Ok(newsize - oldsize);
            }
        }
-
        Ok(0)
    }

    /// Remember that the relation was truncated at given LSN
-    pub fn put_truncation(&self, lsn: Lsn, segsize: u32) {
-        assert!(
-            self.seg.rel.is_blocky(),
-            "put_truncation() called on a non-blocky rel"
-        );
+    pub fn put_truncation(&self, lsn: Lsn, segsize: u32) -> WriteResult<()> {
+        self.assert_not_frozen();

        let mut inner = self.inner.write().unwrap();
-        inner.assert_writeable();
+        inner.check_writeable()?;

        // check that this we truncate to a smaller size than segment was before the truncation
        let oldsize = inner.get_seg_size(lsn);
        assert!(segsize < oldsize);

-        let (old, _delta_size) = inner.segsizes.append_or_update_last(lsn, segsize).unwrap();
+        let old = inner.segsizes.insert(lsn, segsize);

        if old.is_some() {
            // We already had an entry for this LSN. That's odd..
            warn!("Inserting truncation, but had an entry for the LSN already");
        }
+
+        Ok(())
    }

    /// Remember that the segment was dropped at given LSN
-    pub fn drop_segment(&self, lsn: Lsn) {
+    pub fn drop_segment(&self, lsn: Lsn) -> WriteResult<()> {
+        self.assert_not_frozen();
+
        let mut inner = self.inner.write().unwrap();

-        assert!(inner.end_lsn.is_none());
-        assert!(!inner.dropped);
-        inner.dropped = true;
-        assert!(self.start_lsn < lsn);
-        inner.end_lsn = Some(lsn);
+        inner.check_writeable()?;
+
+        assert!(inner.drop_lsn.is_none());
+        inner.drop_lsn = Some(lsn);
+        inner.writeable = false;

        trace!("dropped segment {} at {}", self.seg, lsn);
+
+        Ok(())
    }

    ///
@@ -504,58 +518,137 @@ impl InMemoryLayer {
            start_lsn,
        );

-        // Copy the segment size at the start LSN from the predecessor layer.
-        let mut segsizes = VecMap::default();
+        // For convenience, copy the segment size from the predecessor layer
+        let mut segsizes = BTreeMap::new();
        if seg.rel.is_blocky() {
            let size = src.get_seg_size(start_lsn)?;
-            segsizes.append(start_lsn, size).unwrap();
+            segsizes.insert(start_lsn, size);
        }

-        let file = EphemeralFile::create(conf, tenantid, timelineid)?;
-
        Ok(InMemoryLayer {
            conf,
            timelineid,
            tenantid,
            seg,
            start_lsn,
+            end_lsn: None,
            oldest_pending_lsn,
-            incremental: true,
            inner: RwLock::new(InMemoryLayerInner {
-                end_lsn: None,
-                dropped: false,
-                page_versions: PageVersions::new(file),
+                drop_lsn: None,
+                page_versions: BTreeMap::new(),
                segsizes,
+                writeable: true,
+                predecessor: Some(src),
            }),
        })
    }

    pub fn is_writeable(&self) -> bool {
        let inner = self.inner.read().unwrap();
-        inner.end_lsn.is_none()
+        inner.writeable
    }

-    /// Make the layer non-writeable. Only call once.
-    /// Records the end_lsn for non-dropped layers.
-    /// `end_lsn` is inclusive
-    pub fn freeze(&self, end_lsn: Lsn) {
-        let mut inner = self.inner.write().unwrap();
+    /// Splits `self` into two InMemoryLayers: `frozen` and `open`.
+    /// All data up to and including `cutoff_lsn`
+    /// is copied to `frozen`, while the remaining data is copied to `open`.
+    /// After completion, self is non-writeable, but not frozen.
+    pub fn freeze(self: Arc<Self>, cutoff_lsn: Lsn) -> Result<FreezeLayers> {
+        info!(
+            "freezing in memory layer {} on timeline {} at {} (oldest {})",
+            self.filename().display(),
+            self.timelineid,
+            cutoff_lsn,
+            self.oldest_pending_lsn
+        );

-        if inner.end_lsn.is_some() {
-            assert!(inner.dropped);
-        } else {
-            assert!(!inner.dropped);
-            assert!(self.start_lsn < end_lsn + 1);
-            inner.end_lsn = Some(Lsn(end_lsn.0 + 1));
+        self.assert_not_frozen();

-            if let Some((lsn, _)) = inner.segsizes.as_slice().last() {
-                assert!(lsn <= &end_lsn, "{:?} {:?}", lsn, end_lsn);
-            }
+        let self_ref = self.clone();
+        let mut inner = self_ref.inner.write().unwrap();
+        // Dropped layers don't need any special freeze actions,
+        // they are marked as non-writeable at drop and just
+        // written out to disk by checkpointer.
+        if inner.drop_lsn.is_some() {
+            assert!(!inner.writeable);
+            info!(
+                "freezing in memory layer for {} on timeline {} is dropped at {}",
+                self.seg,
+                self.timelineid,
+                inner.drop_lsn.unwrap()
+            );

-            for (_blk, lsn, _pv) in inner.page_versions.ordered_page_version_iter(None) {
-                assert!(lsn <= end_lsn);
+            // There should be no newer layer that refers this non-writeable layer,
+            // because layer that is created after dropped one represents a new rel.
+            return Ok(FreezeLayers {
+                frozen: self,
+                open: None,
+            });
+        }
+        assert!(inner.writeable);
+        inner.writeable = false;
+
+        // Divide all the page versions into old and new
+        // at the 'cutoff_lsn' point.
+        let mut before_segsizes = BTreeMap::new();
+        let mut after_segsizes = BTreeMap::new();
+        let mut after_oldest_lsn: Accum<Lsn> = Accum(None);
+        for (lsn, size) in inner.segsizes.iter() {
+            if *lsn > cutoff_lsn {
+                after_segsizes.insert(*lsn, *size);
+                after_oldest_lsn.accum(min, *lsn);
+            } else {
+                before_segsizes.insert(*lsn, *size);
            }
        }
+
+        let mut before_page_versions = BTreeMap::new();
+        let mut after_page_versions = BTreeMap::new();
+        for ((blknum, lsn), pv) in inner.page_versions.iter() {
+            if *lsn > cutoff_lsn {
+                after_page_versions.insert((*blknum, *lsn), pv.clone());
+                after_oldest_lsn.accum(min, *lsn);
+            } else {
+                before_page_versions.insert((*blknum, *lsn), pv.clone());
+            }
+        }
+
+        let frozen = Arc::new(InMemoryLayer {
+            conf: self.conf,
+            tenantid: self.tenantid,
+            timelineid: self.timelineid,
+            seg: self.seg,
+            start_lsn: self.start_lsn,
+            end_lsn: Some(cutoff_lsn),
+            oldest_pending_lsn: self.start_lsn,
+            inner: RwLock::new(InMemoryLayerInner {
+                drop_lsn: inner.drop_lsn,
+                page_versions: before_page_versions,
+                segsizes: before_segsizes,
+                writeable: false,
+                predecessor: inner.predecessor.clone(),
+            }),
+        });
+
+        let open = if !after_segsizes.is_empty() || !after_page_versions.is_empty() {
+            let mut new_open = Self::create_successor_layer(
+                self.conf,
+                frozen.clone(),
+                self.timelineid,
+                self.tenantid,
+                cutoff_lsn + 1,
+                after_oldest_lsn.0.unwrap(),
+            )?;
+
+            let new_inner = new_open.inner.get_mut().unwrap();
+            new_inner.page_versions.append(&mut after_page_versions);
+            new_inner.segsizes.append(&mut after_segsizes);
+
+            Some(Arc::new(new_open))
+        } else {
+            None
+        };
+
+        Ok(FreezeLayers { frozen, open })
    }

    /// Write the this frozen in-memory layer to disk.
@@ -566,15 +659,16 @@ impl InMemoryLayer {
    /// WAL records between start and end LSN. (The delta layer is not needed
    /// when a new relish is created with a single LSN, so that the start and
    /// end LSN are the same.)
-    pub fn write_to_disk(&self, timeline: &LayeredTimeline) -> Result<LayersOnDisk> {
+    pub fn write_to_disk(&self, timeline: &LayeredTimeline) -> Result<Vec<Arc<dyn Layer>>> {
        trace!(
-            "write_to_disk {} get_end_lsn is {}",
+            "write_to_disk {} end_lsn is {} get_end_lsn is {}",
            self.filename().display(),
+            self.end_lsn.unwrap_or(Lsn(0)),
            self.get_end_lsn()
        );

        // Grab the lock in read-mode. We hold it over the I/O, but because this
-        // layer is not writeable anymore, no one should be trying to acquire the
+        // layer is not writeable anymore, no one should be trying to aquire the
        // write lock on it, so we shouldn't block anyone. There's one exception
        // though: another thread might have grabbed a reference to this layer
        // in `get_layer_for_write' just before the checkpointer called
@@ -583,42 +677,49 @@ impl InMemoryLayer {
        // would have to wait until we release it. That race condition is very
        // rare though, so we just accept the potential latency hit for now.
        let inner = self.inner.read().unwrap();
-        let end_lsn_exclusive = inner.end_lsn.unwrap();
+        assert!(!inner.writeable);

-        if inner.dropped {
+        let predecessor = inner.predecessor.clone();
+
+        if let Some(drop_lsn) = inner.drop_lsn {
            let delta_layer = DeltaLayer::create(
                self.conf,
                self.timelineid,
                self.tenantid,
                self.seg,
                self.start_lsn,
-                end_lsn_exclusive,
+                drop_lsn,
                true,
-                &inner.page_versions,
-                None,
+                predecessor,
+                inner.page_versions.iter(),
                inner.segsizes.clone(),
            )?;
            trace!(
                "freeze: created delta layer for dropped segment {} {}-{}",
                self.seg,
                self.start_lsn,
-                end_lsn_exclusive
+                drop_lsn
            );
-            return Ok(LayersOnDisk {
-                delta_layers: vec![delta_layer],
-                image_layers: Vec::new(),
-            });
+            return Ok(vec![Arc::new(delta_layer)]);
        }

-        // Since `end_lsn` is inclusive, subtract 1.
-        // We want to make an ImageLayer for the last included LSN,
-        // so the DeltaLayer should exclude that LSN.
-        let end_lsn_inclusive = Lsn(end_lsn_exclusive.0 - 1);
+        let end_lsn = self.end_lsn.unwrap();

-        let mut delta_layers = Vec::new();
+        let mut before_segsizes = BTreeMap::new();
+        for (lsn, size) in inner.segsizes.iter() {
+            if *lsn <= end_lsn {
+                before_segsizes.insert(*lsn, *size);
+            }
+        }
+        let mut before_page_versions = inner.page_versions.iter().filter(|tup| {
+            let ((_blknum, lsn), _pv) = tup;

-        if self.start_lsn != end_lsn_inclusive {
-            let (segsizes, _) = inner.segsizes.split_at(&end_lsn_exclusive);
+            *lsn < end_lsn
+        });
+
+        let mut frozen_layers: Vec<Arc<dyn Layer>> = Vec::new();
+
+        if self.start_lsn != end_lsn {
            // Write the page versions before the cutoff to disk.
            let delta_layer = DeltaLayer::create(
                self.conf,
@@ -626,41 +727,35 @@ impl InMemoryLayer {
                self.tenantid,
                self.seg,
                self.start_lsn,
-                end_lsn_inclusive,
+                end_lsn,
                false,
-                &inner.page_versions,
-                Some(end_lsn_inclusive),
-                segsizes,
+                predecessor,
+                before_page_versions,
+                before_segsizes,
            )?;
-            delta_layers.push(delta_layer);
+            frozen_layers.push(Arc::new(delta_layer));
            trace!(
                "freeze: created delta layer {} {}-{}",
                self.seg,
                self.start_lsn,
-                end_lsn_inclusive
+                end_lsn
            );
        } else {
-            assert!(inner
-                .page_versions
-                .ordered_page_version_iter(None)
-                .next()
-                .is_none());
+            assert!(before_page_versions.next().is_none());
        }

        drop(inner);

        // Write a new base image layer at the cutoff point
-        let image_layer =
-            ImageLayer::create_from_src(self.conf, timeline, self, end_lsn_inclusive)?;
-        trace!(
-            "freeze: created image layer {} at {}",
-            self.seg,
-            end_lsn_inclusive
-        );
+        let image_layer = ImageLayer::create_from_src(self.conf, timeline, self, end_lsn)?;
+        frozen_layers.push(Arc::new(image_layer));
+        trace!("freeze: created image layer {} at {}", self.seg, end_lsn);

-        Ok(LayersOnDisk {
-            delta_layers,
-            image_layers: vec![image_layer],
-        })
+        Ok(frozen_layers)
+    }
+
+    pub fn update_predecessor(&self, predecessor: Arc<dyn Layer>) -> Option<Arc<dyn Layer>> {
+        let mut inner = self.inner.write().unwrap();
+        inner.predecessor.replace(predecessor)
    }
 }
--- a/pageserver/src/layered_repository/interval_tree.rs
+++ b/pageserver/src/layered_repository/interval_tree.rs
@@ -283,7 +283,6 @@ mod tests {
            write!(f, "{}", self.val)
        }
    }
-    #[rustfmt::skip]
    fn assert_search(
        tree: &IntervalTree<MockItem>,
        key: u32,
@@ -292,20 +291,24 @@ mod tests {
        if let Some(v) = tree.search(key) {
            let vstr = v.to_string();

-            assert!(!expected.is_empty(), "search with {} returned {}, expected None", key, v);
-            assert!(
-                expected.contains(&vstr.as_str()),
-                "search with {} returned {}, expected one of: {:?}",
-                key, v, expected,
-            );
+            if expected.is_empty() {
+                panic!("search with {} returned {}, expected None", key, v);
+            }

+            if !expected.contains(&vstr.as_str()) {
+                panic!(
+                    "search with {} returned {}, expected one of: {:?}",
+                    key, v, expected
+                );
+            }
            Some(v)
        } else {
-            assert!(
-                expected.is_empty(),
-                "search with {} returned None, expected one of {:?}",
-                key, expected
-            );
+            if !expected.is_empty() {
+                panic!(
+                    "search with {} returned None, expected one of {:?}",
+                    key, expected
+                );
+            }
            None
        }
    }
--- a/pageserver/src/layered_repository/layer_map.rs
+++ b/pageserver/src/layered_repository/layer_map.rs
@@ -21,8 +21,6 @@ use std::sync::Arc;
 use zenith_metrics::{register_int_gauge, IntGauge};
 use zenith_utils::lsn::Lsn;

-use super::global_layer_map::{LayerId, GLOBAL_LAYER_MAP};
-
 lazy_static! {
    static ref NUM_INMEMORY_LAYERS: IntGauge =
        register_int_gauge!("pageserver_inmemory_layers", "Number of layers in memory")
@@ -70,9 +68,7 @@ impl LayerMap {
    pub fn get_open(&self, tag: &SegmentTag) -> Option<Arc<InMemoryLayer>> {
        let segentry = self.segs.get(tag)?;

-        segentry
-            .open_layer_id
-            .and_then(|layer_id| GLOBAL_LAYER_MAP.read().unwrap().get(&layer_id))
+        segentry.open.as_ref().map(Arc::clone)
    }

    ///
@@ -81,7 +77,7 @@ impl LayerMap {
    pub fn insert_open(&mut self, layer: Arc<InMemoryLayer>) {
        let segentry = self.segs.entry(layer.get_seg_tag()).or_default();

-        let layer_id = segentry.update_open(Arc::clone(&layer));
+        segentry.update_open(Arc::clone(&layer));

        let oldest_pending_lsn = layer.get_oldest_pending_lsn();

@@ -93,7 +89,7 @@ impl LayerMap {
        // Also add it to the binary heap
        let open_layer_entry = OpenLayerEntry {
            oldest_pending_lsn: layer.get_oldest_pending_lsn(),
-            layer_id,
+            layer,
            generation: self.current_generation,
        };
        self.open_layers.push(open_layer_entry);
@@ -101,35 +97,24 @@ impl LayerMap {
        NUM_INMEMORY_LAYERS.inc();
    }

-    /// Remove an open in-memory layer
-    pub fn remove_open(&mut self, layer_id: LayerId) {
-        // Note: we don't try to remove the entry from the binary heap.
-        // It will be removed lazily by peek_oldest_open() when it's made it to
-        // the top of the heap.
+    /// Remove the oldest in-memory layer
+    pub fn pop_oldest_open(&mut self) {
+        // Pop it from the binary heap
+        let oldest_entry = self.open_layers.pop().unwrap();
+        let segtag = oldest_entry.layer.get_seg_tag();

-        let layer_opt = {
-            let mut global_map = GLOBAL_LAYER_MAP.write().unwrap();
-            let layer_opt = global_map.get(&layer_id);
-            global_map.remove(&layer_id);
-            // TODO it's bad that a ref can still exist after being evicted from cache
-            layer_opt
-        };
-
-        if let Some(layer) = layer_opt {
-            let mut segentry = self.segs.get_mut(&layer.get_seg_tag()).unwrap();
-
-            if segentry.open_layer_id == Some(layer_id) {
-                // Also remove it from the SegEntry of this segment
-                segentry.open_layer_id = None;
-            } else {
-                // We could have already updated segentry.open for
-                // dropped (non-writeable) layer. This is fine.
-                assert!(!layer.is_writeable());
-                assert!(layer.is_dropped());
-            }
-
-            NUM_INMEMORY_LAYERS.dec();
+        // Also remove it from the SegEntry of this segment
+        let mut segentry = self.segs.get_mut(&segtag).unwrap();
+        if Arc::ptr_eq(segentry.open.as_ref().unwrap(), &oldest_entry.layer) {
+            segentry.open = None;
+        } else {
+            // We could have already updated segentry.open for
+            // dropped (non-writeable) layer. This is fine.
+            assert!(!oldest_entry.layer.is_writeable());
+            assert!(oldest_entry.layer.is_dropped());
        }
+
+        NUM_INMEMORY_LAYERS.dec();
    }

    ///
@@ -214,17 +199,10 @@ impl LayerMap {
    }

    /// Return the oldest in-memory layer, along with its generation number.
-    pub fn peek_oldest_open(&mut self) -> Option<(LayerId, Arc<InMemoryLayer>, u64)> {
-        let global_map = GLOBAL_LAYER_MAP.read().unwrap();
-
-        while let Some(oldest_entry) = self.open_layers.peek() {
-            if let Some(layer) = global_map.get(&oldest_entry.layer_id) {
-                return Some((oldest_entry.layer_id, layer, oldest_entry.generation));
-            } else {
-                self.open_layers.pop();
-            }
-        }
-        None
+    pub fn peek_oldest_open(&self) -> Option<(Arc<InMemoryLayer>, u64)> {
+        self.open_layers
+            .peek()
+            .map(|oldest_entry| (Arc::clone(&oldest_entry.layer), oldest_entry.generation))
    }

    /// Increment the generation number used to stamp open in-memory layers. Layers
@@ -247,12 +225,8 @@ impl LayerMap {
    pub fn dump(&self) -> Result<()> {
        println!("Begin dump LayerMap");
        for (seg, segentry) in self.segs.iter() {
-            if let Some(open) = &segentry.open_layer_id {
-                if let Some(layer) = GLOBAL_LAYER_MAP.read().unwrap().get(open) {
-                    layer.dump()?;
-                } else {
-                    println!("layer not found in global map");
-                }
+            if let Some(open) = &segentry.open {
+                open.dump()?;
            }

            for layer in segentry.historic.iter() {
@@ -285,7 +259,7 @@ impl IntervalItem for dyn Layer {
 /// IntervalTree.
 #[derive(Default)]
 struct SegEntry {
-    open_layer_id: Option<LayerId>,
+    open: Option<Arc<InMemoryLayer>>,
    historic: IntervalTree<dyn Layer>,
 }

@@ -301,10 +275,10 @@ impl SegEntry {
    }

    pub fn get(&self, lsn: Lsn) -> Option<Arc<dyn Layer>> {
-        if let Some(open_layer_id) = &self.open_layer_id {
-            let open_layer = GLOBAL_LAYER_MAP.read().unwrap().get(open_layer_id)?;
-            if open_layer.get_start_lsn() <= lsn {
-                return Some(open_layer);
+        if let Some(open) = &self.open {
+            if open.get_start_lsn() <= lsn {
+                let x: Arc<dyn Layer> = Arc::clone(open) as _;
+                return Some(x);
            }
        }

@@ -323,16 +297,11 @@ impl SegEntry {
    // Set new open layer for a SegEntry.
    // It's ok to rewrite previous open layer,
    // but only if it is not writeable anymore.
-    pub fn update_open(&mut self, layer: Arc<InMemoryLayer>) -> LayerId {
-        if let Some(prev_open_layer_id) = &self.open_layer_id {
-            if let Some(prev_open_layer) = GLOBAL_LAYER_MAP.read().unwrap().get(prev_open_layer_id)
-            {
-                assert!(!prev_open_layer.is_writeable());
-            }
+    pub fn update_open(&mut self, layer: Arc<InMemoryLayer>) {
+        if let Some(prev_open) = &self.open {
+            assert!(!prev_open.is_writeable());
        }
-        let open_layer_id = GLOBAL_LAYER_MAP.write().unwrap().insert(layer);
-        self.open_layer_id = Some(open_layer_id);
-        open_layer_id
+        self.open = Some(layer);
    }

    pub fn insert_historic(&mut self, layer: Arc<dyn Layer>) {
@@ -347,9 +316,9 @@ impl SegEntry {
 /// recently-added entries (i.e after last call to increment_generation()) from older
 /// entries with the same 'oldest_pending_lsn'.
 struct OpenLayerEntry {
-    oldest_pending_lsn: Lsn, // copy of layer.get_oldest_pending_lsn()
-    generation: u64,
-    layer_id: LayerId,
+    pub oldest_pending_lsn: Lsn, // copy of layer.get_oldest_pending_lsn()
+    pub generation: u64,
+    pub layer: Arc<InMemoryLayer>,
 }
 impl Ord for OpenLayerEntry {
    fn cmp(&self, other: &Self) -> Ordering {
@@ -414,13 +383,6 @@ mod tests {
        forknum: 0,
    });

-    lazy_static! {
-        static ref DUMMY_TIMELINEID: ZTimelineId =
-            ZTimelineId::from_str("00000000000000000000000000000000").unwrap();
-        static ref DUMMY_TENANTID: ZTenantId =
-            ZTenantId::from_str("00000000000000000000000000000000").unwrap();
-    }
-
    /// Construct a dummy InMemoryLayer for testing
    fn dummy_inmem_layer(
        conf: &'static PageServerConf,
@@ -431,8 +393,8 @@ mod tests {
        Arc::new(
            InMemoryLayer::create(
                conf,
-                *DUMMY_TIMELINEID,
-                *DUMMY_TENANTID,
+                ZTimelineId::from_str("00000000000000000000000000000000").unwrap(),
+                ZTenantId::from_str("00000000000000000000000000000000").unwrap(),
                SegmentTag {
                    rel: TESTREL_A,
                    segno,
@@ -448,7 +410,6 @@ mod tests {
    fn test_open_layers() -> Result<()> {
        let conf = PageServerConf::dummy_conf(PageServerConf::test_repo_dir("dummy_inmem_layer"));
        let conf = Box::leak(Box::new(conf));
-        std::fs::create_dir_all(conf.timeline_path(&DUMMY_TIMELINEID, &DUMMY_TENANTID))?;

        let mut layers = LayerMap::default();

@@ -465,10 +426,10 @@ mod tests {
        // A helper function (closure) to pop the next oldest open entry from the layer map,
        // and assert that it is what we'd expect
        let mut assert_pop_layer = |expected_segno: u32, expected_generation: u64| {
-            let (layer_id, l, generation) = layers.peek_oldest_open().unwrap();
+            let (l, generation) = layers.peek_oldest_open().unwrap();
            assert!(l.get_seg_tag().segno == expected_segno);
            assert!(generation == expected_generation);
-            layers.remove_open(layer_id);
+            layers.pop_oldest_open();
        };

        assert_pop_layer(0, gen1); // 0x100
--- a/pageserver/src/layered_repository/metadata.rs
+++ b/pageserver/src/layered_repository/metadata.rs
@@ -1,226 +0,0 @@
-//! Every image of a certain timeline from [`crate::layered_repository::LayeredRepository`]
-//! has a metadata that needs to be stored persistently.
-//!
-//! Later, the file gets is used in [`crate::remote_storage::storage_sync`] as a part of
-//! external storage import and export operations.
-//!
-//! The module contains all structs and related helper methods related to timeline metadata.
-
-use std::{convert::TryInto, path::PathBuf};
-
-use anyhow::ensure;
-use zenith_utils::{
-    bin_ser::BeSer,
-    lsn::Lsn,
-    zid::{ZTenantId, ZTimelineId},
-};
-
-use crate::{
-    layered_repository::{METADATA_CHECKSUM_SIZE, METADATA_MAX_DATA_SIZE, METADATA_MAX_SAFE_SIZE},
-    PageServerConf,
-};
-
-/// The name of the metadata file pageserver creates per timeline.
-pub const METADATA_FILE_NAME: &str = "metadata";
-
-/// Metadata stored on disk for each timeline
-///
-/// The fields correspond to the values we hold in memory, in LayeredTimeline.
-#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
-pub struct TimelineMetadata {
-    disk_consistent_lsn: Lsn,
-    // This is only set if we know it. We track it in memory when the page
-    // server is running, but we only track the value corresponding to
-    // 'last_record_lsn', not 'disk_consistent_lsn' which can lag behind by a
-    // lot. We only store it in the metadata file when we flush *all* the
-    // in-memory data so that 'last_record_lsn' is the same as
-    // 'disk_consistent_lsn'.  That's OK, because after page server restart, as
-    // soon as we reprocess at least one record, we will have a valid
-    // 'prev_record_lsn' value in memory again. This is only really needed when
-    // doing a clean shutdown, so that there is no more WAL beyond
-    // 'disk_consistent_lsn'
-    prev_record_lsn: Option<Lsn>,
-    ancestor_timeline: Option<ZTimelineId>,
-    ancestor_lsn: Lsn,
-    latest_gc_cutoff_lsn: Lsn,
-    initdb_lsn: Lsn,
-}
-
-/// Points to a place in pageserver's local directory,
-/// where certain timeline's metadata file should be located.
-pub fn metadata_path(
-    conf: &'static PageServerConf,
-    timelineid: ZTimelineId,
-    tenantid: ZTenantId,
-) -> PathBuf {
-    conf.timeline_path(&timelineid, &tenantid)
-        .join(METADATA_FILE_NAME)
-}
-
-impl TimelineMetadata {
-    pub fn new(
-        disk_consistent_lsn: Lsn,
-        prev_record_lsn: Option<Lsn>,
-        ancestor_timeline: Option<ZTimelineId>,
-        ancestor_lsn: Lsn,
-        latest_gc_cutoff_lsn: Lsn,
-        initdb_lsn: Lsn,
-    ) -> Self {
-        Self {
-            disk_consistent_lsn,
-            prev_record_lsn,
-            ancestor_timeline,
-            ancestor_lsn,
-            latest_gc_cutoff_lsn,
-            initdb_lsn,
-        }
-    }
-
-    pub fn from_bytes(metadata_bytes: &[u8]) -> anyhow::Result<Self> {
-        ensure!(
-            metadata_bytes.len() == METADATA_MAX_SAFE_SIZE,
-            "metadata bytes size is wrong"
-        );
-
-        let data = &metadata_bytes[..METADATA_MAX_DATA_SIZE];
-        let calculated_checksum = crc32c::crc32c(data);
-
-        let checksum_bytes: &[u8; METADATA_CHECKSUM_SIZE] =
-            metadata_bytes[METADATA_MAX_DATA_SIZE..].try_into()?;
-        let expected_checksum = u32::from_le_bytes(*checksum_bytes);
-        ensure!(
-            calculated_checksum == expected_checksum,
-            "metadata checksum mismatch"
-        );
-
-        let data = TimelineMetadata::from(serialize::DeTimelineMetadata::des_prefix(data)?);
-        assert!(data.disk_consistent_lsn.is_aligned());
-
-        Ok(data)
-    }
-
-    pub fn to_bytes(&self) -> anyhow::Result<Vec<u8>> {
-        let serializeable_metadata = serialize::SeTimelineMetadata::from(self);
-        let mut metadata_bytes = serialize::SeTimelineMetadata::ser(&serializeable_metadata)?;
-        assert!(metadata_bytes.len() <= METADATA_MAX_DATA_SIZE);
-        metadata_bytes.resize(METADATA_MAX_SAFE_SIZE, 0u8);
-
-        let checksum = crc32c::crc32c(&metadata_bytes[..METADATA_MAX_DATA_SIZE]);
-        metadata_bytes[METADATA_MAX_DATA_SIZE..].copy_from_slice(&u32::to_le_bytes(checksum));
-        Ok(metadata_bytes)
-    }
-
-    /// [`Lsn`] that corresponds to the corresponding timeline directory
-    /// contents, stored locally in the pageserver workdir.
-    pub fn disk_consistent_lsn(&self) -> Lsn {
-        self.disk_consistent_lsn
-    }
-
-    pub fn prev_record_lsn(&self) -> Option<Lsn> {
-        self.prev_record_lsn
-    }
-
-    pub fn ancestor_timeline(&self) -> Option<ZTimelineId> {
-        self.ancestor_timeline
-    }
-
-    pub fn ancestor_lsn(&self) -> Lsn {
-        self.ancestor_lsn
-    }
-
-    pub fn latest_gc_cutoff_lsn(&self) -> Lsn {
-        self.latest_gc_cutoff_lsn
-    }
-
-    pub fn initdb_lsn(&self) -> Lsn {
-        self.initdb_lsn
-    }
-}
-
-/// This module is for direct conversion of metadata to bytes and back.
-/// For a certain metadata, besides the conversion a few verification steps has to
-/// be done, so all serde derives are hidden from the user, to avoid accidental
-/// verification-less metadata creation.
-mod serialize {
-    use serde::{Deserialize, Serialize};
-    use zenith_utils::{lsn::Lsn, zid::ZTimelineId};
-
-    use super::TimelineMetadata;
-
-    #[derive(Serialize)]
-    pub(super) struct SeTimelineMetadata<'a> {
-        disk_consistent_lsn: &'a Lsn,
-        prev_record_lsn: &'a Option<Lsn>,
-        ancestor_timeline: &'a Option<ZTimelineId>,
-        ancestor_lsn: &'a Lsn,
-        latest_gc_cutoff_lsn: &'a Lsn,
-        initdb_lsn: &'a Lsn,
-    }
-
-    impl<'a> From<&'a TimelineMetadata> for SeTimelineMetadata<'a> {
-        fn from(other: &'a TimelineMetadata) -> Self {
-            Self {
-                disk_consistent_lsn: &other.disk_consistent_lsn,
-                prev_record_lsn: &other.prev_record_lsn,
-                ancestor_timeline: &other.ancestor_timeline,
-                ancestor_lsn: &other.ancestor_lsn,
-                latest_gc_cutoff_lsn: &other.latest_gc_cutoff_lsn,
-                initdb_lsn: &other.initdb_lsn,
-            }
-        }
-    }
-
-    #[derive(Deserialize)]
-    pub(super) struct DeTimelineMetadata {
-        disk_consistent_lsn: Lsn,
-        prev_record_lsn: Option<Lsn>,
-        ancestor_timeline: Option<ZTimelineId>,
-        ancestor_lsn: Lsn,
-        latest_gc_cutoff_lsn: Lsn,
-        initdb_lsn: Lsn,
-    }
-
-    impl From<DeTimelineMetadata> for TimelineMetadata {
-        fn from(other: DeTimelineMetadata) -> Self {
-            Self {
-                disk_consistent_lsn: other.disk_consistent_lsn,
-                prev_record_lsn: other.prev_record_lsn,
-                ancestor_timeline: other.ancestor_timeline,
-                ancestor_lsn: other.ancestor_lsn,
-                latest_gc_cutoff_lsn: other.latest_gc_cutoff_lsn,
-                initdb_lsn: other.initdb_lsn,
-            }
-        }
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use crate::repository::repo_harness::TIMELINE_ID;
-
-    use super::*;
-
-    #[test]
-    fn metadata_serializes_correctly() {
-        let original_metadata = TimelineMetadata {
-            disk_consistent_lsn: Lsn(0x200),
-            prev_record_lsn: Some(Lsn(0x100)),
-            ancestor_timeline: Some(TIMELINE_ID),
-            ancestor_lsn: Lsn(0),
-            latest_gc_cutoff_lsn: Lsn(0),
-            initdb_lsn: Lsn(0),
-        };
-
-        let metadata_bytes = original_metadata
-            .to_bytes()
-            .expect("Should serialize correct metadata to bytes");
-
-        let deserialized_metadata = TimelineMetadata::from_bytes(&metadata_bytes)
-            .expect("Should deserialize its own bytes");
-
-        assert_eq!(
-            deserialized_metadata, original_metadata,
-            "Metadata that was serialized to bytes and deserialized back should not change"
-        );
-    }
-}
--- a/pageserver/src/layered_repository/page_versions.rs
+++ b/pageserver/src/layered_repository/page_versions.rs
@@ -1,252 +0,0 @@
-//!
-//! Data structure to ingest incoming WAL into an append-only file.
-//!
-//! - The file is considered temporary, and will be discarded on crash
-//! - based on a B-tree
-//!
-
-use std::os::unix::fs::FileExt;
-use std::{collections::HashMap, ops::RangeBounds, slice};
-
-use anyhow::Result;
-
-use std::cmp::min;
-use std::io::Seek;
-
-use zenith_utils::{lsn::Lsn, vec_map::VecMap};
-
-use super::storage_layer::PageVersion;
-use crate::layered_repository::ephemeral_file::EphemeralFile;
-
-use zenith_utils::bin_ser::BeSer;
-
-const EMPTY_SLICE: &[(Lsn, u64)] = &[];
-
-pub struct PageVersions {
-    map: HashMap<u32, VecMap<Lsn, u64>>,
-
-    /// The PageVersion structs are stored in a serialized format in this file.
-    /// Each serialized PageVersion is preceded by a 'u32' length field.
-    /// The 'map' stores offsets into this file.
-    file: EphemeralFile,
-}
-
-impl PageVersions {
-    pub fn new(file: EphemeralFile) -> PageVersions {
-        PageVersions {
-            map: HashMap::new(),
-            file,
-        }
-    }
-
-    pub fn append_or_update_last(
-        &mut self,
-        blknum: u32,
-        lsn: Lsn,
-        page_version: PageVersion,
-    ) -> Result<Option<u64>> {
-        // remember starting position
-        let pos = self.file.stream_position()?;
-
-        // make room for the 'length' field by writing zeros as a placeholder.
-        self.file.seek(std::io::SeekFrom::Start(pos + 4)).unwrap();
-
-        page_version.ser_into(&mut self.file).unwrap();
-
-        // write the 'length' field.
-        let len = self.file.stream_position()? - pos - 4;
-        let lenbuf = u32::to_ne_bytes(len as u32);
-        self.file.write_all_at(&lenbuf, pos)?;
-
-        let map = self.map.entry(blknum).or_insert_with(VecMap::default);
-        Ok(map.append_or_update_last(lsn, pos as u64).unwrap().0)
-    }
-
-    /// Get all [`PageVersion`]s in a block
-    fn get_block_slice(&self, blknum: u32) -> &[(Lsn, u64)] {
-        self.map
-            .get(&blknum)
-            .map(VecMap::as_slice)
-            .unwrap_or(EMPTY_SLICE)
-    }
-
-    /// Get a range of [`PageVersions`] in a block
-    pub fn get_block_lsn_range<R: RangeBounds<Lsn>>(&self, blknum: u32, range: R) -> &[(Lsn, u64)] {
-        self.map
-            .get(&blknum)
-            .map(|vec_map| vec_map.slice_range(range))
-            .unwrap_or(EMPTY_SLICE)
-    }
-
-    /// Iterate through [`PageVersion`]s in (block, lsn) order.
-    /// If a [`cutoff_lsn`] is set, only show versions with `lsn < cutoff_lsn`
-    pub fn ordered_page_version_iter(&self, cutoff_lsn: Option<Lsn>) -> OrderedPageVersionIter<'_> {
-        let mut ordered_blocks: Vec<u32> = self.map.keys().cloned().collect();
-        ordered_blocks.sort_unstable();
-
-        let slice = ordered_blocks
-            .first()
-            .map(|&blknum| self.get_block_slice(blknum))
-            .unwrap_or(EMPTY_SLICE);
-
-        OrderedPageVersionIter {
-            page_versions: self,
-            ordered_blocks,
-            cur_block_idx: 0,
-            cutoff_lsn,
-            cur_slice_iter: slice.iter(),
-        }
-    }
-
-    /// Returns a 'Read' that reads the page version at given offset.
-    pub fn reader(&self, pos: u64) -> Result<PageVersionReader, std::io::Error> {
-        // read length
-        let mut lenbuf = [0u8; 4];
-        self.file.read_exact_at(&mut lenbuf, pos)?;
-        let len = u32::from_ne_bytes(lenbuf);
-
-        Ok(PageVersionReader {
-            file: &self.file,
-            pos: pos + 4,
-            end_pos: pos + 4 + len as u64,
-        })
-    }
-
-    pub fn get_page_version(&self, pos: u64) -> Result<PageVersion> {
-        let mut reader = self.reader(pos)?;
-        Ok(PageVersion::des_from(&mut reader)?)
-    }
-}
-
-pub struct PageVersionReader<'a> {
-    file: &'a EphemeralFile,
-    pos: u64,
-    end_pos: u64,
-}
-
-impl<'a> std::io::Read for PageVersionReader<'a> {
-    fn read(&mut self, buf: &mut [u8]) -> Result<usize, std::io::Error> {
-        let len = min(buf.len(), (self.end_pos - self.pos) as usize);
-        let n = self.file.read_at(&mut buf[..len], self.pos)?;
-        self.pos += n as u64;
-        Ok(n)
-    }
-}
-
-pub struct OrderedPageVersionIter<'a> {
-    page_versions: &'a PageVersions,
-
-    ordered_blocks: Vec<u32>,
-    cur_block_idx: usize,
-
-    cutoff_lsn: Option<Lsn>,
-
-    cur_slice_iter: slice::Iter<'a, (Lsn, u64)>,
-}
-
-impl OrderedPageVersionIter<'_> {
-    fn is_lsn_before_cutoff(&self, lsn: &Lsn) -> bool {
-        if let Some(cutoff_lsn) = self.cutoff_lsn.as_ref() {
-            lsn < cutoff_lsn
-        } else {
-            true
-        }
-    }
-}
-
-impl<'a> Iterator for OrderedPageVersionIter<'a> {
-    type Item = (u32, Lsn, u64);
-
-    fn next(&mut self) -> Option<Self::Item> {
-        loop {
-            if let Some((lsn, pos)) = self.cur_slice_iter.next() {
-                if self.is_lsn_before_cutoff(lsn) {
-                    let blknum = self.ordered_blocks[self.cur_block_idx];
-                    return Some((blknum, *lsn, *pos));
-                }
-            }
-
-            let next_block_idx = self.cur_block_idx + 1;
-            let blknum: u32 = *self.ordered_blocks.get(next_block_idx)?;
-            self.cur_block_idx = next_block_idx;
-            self.cur_slice_iter = self.page_versions.get_block_slice(blknum).iter();
-        }
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use bytes::Bytes;
-
-    use super::*;
-    use crate::PageServerConf;
-    use std::fs;
-    use std::str::FromStr;
-    use zenith_utils::zid::{ZTenantId, ZTimelineId};
-
-    fn repo_harness(test_name: &str) -> Result<(&'static PageServerConf, ZTenantId, ZTimelineId)> {
-        let repo_dir = PageServerConf::test_repo_dir(test_name);
-        let _ = fs::remove_dir_all(&repo_dir);
-        let conf = PageServerConf::dummy_conf(repo_dir);
-        // Make a static copy of the config. This can never be free'd, but that's
-        // OK in a test.
-        let conf: &'static PageServerConf = Box::leak(Box::new(conf));
-
-        let tenantid = ZTenantId::from_str("11000000000000000000000000000000").unwrap();
-        let timelineid = ZTimelineId::from_str("22000000000000000000000000000000").unwrap();
-        fs::create_dir_all(conf.timeline_path(&timelineid, &tenantid))?;
-
-        Ok((conf, tenantid, timelineid))
-    }
-
-    #[test]
-    fn test_ordered_iter() -> Result<()> {
-        let (conf, tenantid, timelineid) = repo_harness("test_ordered_iter")?;
-
-        let file = EphemeralFile::create(conf, tenantid, timelineid)?;
-
-        let mut page_versions = PageVersions::new(file);
-
-        const BLOCKS: u32 = 1000;
-        const LSNS: u64 = 50;
-
-        let empty_page = Bytes::from_static(&[0u8; 8192]);
-        let empty_page_version = PageVersion::Page(empty_page);
-
-        for blknum in 0..BLOCKS {
-            for lsn in 0..LSNS {
-                let old = page_versions.append_or_update_last(
-                    blknum,
-                    Lsn(lsn),
-                    empty_page_version.clone(),
-                )?;
-                assert!(old.is_none());
-            }
-        }
-
-        let mut iter = page_versions.ordered_page_version_iter(None);
-        for blknum in 0..BLOCKS {
-            for lsn in 0..LSNS {
-                let (actual_blknum, actual_lsn, _pv) = iter.next().unwrap();
-                assert_eq!(actual_blknum, blknum);
-                assert_eq!(Lsn(lsn), actual_lsn);
-            }
-        }
-        assert!(iter.next().is_none());
-        assert!(iter.next().is_none()); // should be robust against excessive next() calls
-
-        const CUTOFF_LSN: Lsn = Lsn(30);
-        let mut iter = page_versions.ordered_page_version_iter(Some(CUTOFF_LSN));
-        for blknum in 0..BLOCKS {
-            for lsn in 0..CUTOFF_LSN.0 {
-                let (actual_blknum, actual_lsn, _pv) = iter.next().unwrap();
-                assert_eq!(actual_blknum, blknum);
-                assert_eq!(Lsn(lsn), actual_lsn);
-            }
-        }
-        assert!(iter.next().is_none());
-        assert!(iter.next().is_none()); // should be robust against excessive next() calls
-
-        Ok(())
-    }
-}
--- a/pageserver/src/layered_repository/storage_layer.rs
+++ b/pageserver/src/layered_repository/storage_layer.rs
@@ -4,12 +4,13 @@

 use crate::relish::RelishTag;
 use crate::repository::WALRecord;
-use crate::{ZTenantId, ZTimelineId};
+use crate::ZTimelineId;
 use anyhow::Result;
 use bytes::Bytes;
 use serde::{Deserialize, Serialize};
 use std::fmt;
 use std::path::PathBuf;
+use std::sync::Arc;

 use zenith_utils::lsn::Lsn;

@@ -51,10 +52,23 @@ impl SegmentTag {
 ///
 /// A page version can be stored as a full page image, or as WAL record that needs
 /// to be applied over the previous page version to reconstruct this version.
+///
+/// It's also possible to have both a WAL record and a page image in the same
+/// PageVersion. That happens if a page version is originally stored as a WAL record
+/// but it is later reconstructed by a GetPage@LSN request by performing WAL
+/// redo. The get_page_at_lsn() code will store the reconstructed page image next to
+/// the WAL record in that case. TODO: That's pretty accidental, not the result
+/// of any grand design. If we want to keep reconstructed page versions around, we
+/// probably should have a separate buffer cache so that we could control the
+/// replacement policy globally. Or if we keep a reconstructed page image, we
+/// could throw away the WAL record.
+///
 #[derive(Debug, Clone, Serialize, Deserialize)]
-pub enum PageVersion {
-    Page(Bytes),
-    Wal(WALRecord),
+pub struct PageVersion {
+    /// an 8kb page image
+    pub page_image: Option<Bytes>,
+    /// WAL record to get from previous page version to this one.
+    pub record: Option<WALRecord>,
 }

 ///
@@ -65,7 +79,7 @@ pub enum PageVersion {
 /// 'records' contains the records to apply over the base image.
 ///
 pub struct PageReconstructData {
-    pub records: Vec<(Lsn, WALRecord)>,
+    pub records: Vec<WALRecord>,
    pub page_img: Option<Bytes>,
 }

@@ -73,15 +87,13 @@ pub struct PageReconstructData {
 pub enum PageReconstructResult {
    /// Got all the data needed to reconstruct the requested page
    Complete,
-    /// This layer didn't contain all the required data, the caller should look up
-    /// the predecessor layer at the returned LSN and collect more data from there.
-    Continue(Lsn),
+    /// This layer didn't contain all the required data, the caller should collect
+    /// more data from the returned predecessor layer at the returned LSN.
+    Continue(Lsn, Arc<dyn Layer>),
    /// This layer didn't contain data needed to reconstruct the page version at
    /// the returned LSN. This is usually considered an error, but might be OK
    /// in some circumstances.
    Missing(Lsn),
-    /// Use the cached image at `cached_img_lsn` as the base image
-    Cached,
 }

 ///
@@ -93,8 +105,6 @@ pub enum PageReconstructResult {
 /// in-memory and on-disk layers.
 ///
 pub trait Layer: Send + Sync {
-    fn get_tenant_id(&self) -> ZTenantId;
-
    /// Identify the timeline this relish belongs to
    fn get_timeline_id(&self) -> ZTimelineId;

@@ -114,6 +124,10 @@ pub trait Layer: Send + Sync {
    /// Is the segment represented by this layer dropped by PostgreSQL?
    fn is_dropped(&self) -> bool;

+    /// Gets the physical location of the layer on disk.
+    /// Some layers, such as in-memory ones, might not have a location.
+    fn path(&self) -> Option<PathBuf>;
+
    /// Filename used to store this layer on disk. (Even in-memory layers
    /// implement this, to print a handy unique identifier for the layer for
    /// log messages, even though they're never not on disk.)
@@ -129,19 +143,15 @@ pub trait Layer: Send + Sync {
    /// of the *relish*, not the beginning of the segment. The requested
    /// 'blknum' must be covered by this segment.
    ///
-    /// `cached_img_lsn` should be set to a cached page image's lsn < `lsn`.
-    /// This function will only return data after `cached_img_lsn`.
-    ///
    /// See PageReconstructResult for possible return values. The collected data
    /// is appended to reconstruct_data; the caller should pass an empty struct
-    /// on first call. If this returns PageReconstructResult::Continue, look up
-    /// the predecessor layer and call again with the same 'reconstruct_data'
+    /// on first call. If this returns PageReconstructResult::Continue, call
+    /// again on the returned predecessor layer with the same 'reconstruct_data'
    /// to collect more data.
    fn get_page_reconstruct_data(
        &self,
        blknum: u32,
        lsn: Lsn,
-        cached_img_lsn: Option<Lsn>,
        reconstruct_data: &mut PageReconstructData,
    ) -> Result<PageReconstructResult>;

@@ -157,9 +167,6 @@ pub trait Layer: Send + Sync {
    /// the previous non-incremental layer.
    fn is_incremental(&self) -> bool;

-    /// Returns true for layers that are represented in memory.
-    fn is_in_memory(&self) -> bool;
-
    /// Release memory used by this layer. There is no corresponding 'load'
    /// function, that's done implicitly when you call one of the get-functions.
    fn unload(&self) -> Result<()>;
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -1,8 +1,6 @@
-use layered_repository::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
 use zenith_utils::postgres_backend::AuthType;
 use zenith_utils::zid::{ZTenantId, ZTimelineId};

-use std::num::{NonZeroU32, NonZeroUsize};
 use std::path::PathBuf;
 use std::time::Duration;

@@ -13,15 +11,12 @@ pub mod basebackup;
 pub mod branches;
 pub mod http;
 pub mod layered_repository;
-pub mod page_cache;
 pub mod page_service;
 pub mod relish;
-pub mod remote_storage;
+mod relish_storage;
 pub mod repository;
 pub mod restore_local_repo;
 pub mod tenant_mgr;
-pub mod tenant_threads;
-pub mod virtual_file;
 pub mod waldecoder;
 pub mod walreceiver;
 pub mod walredo;
@@ -39,18 +34,12 @@ pub mod defaults {
    // would be more appropriate. But a low value forces the code to be exercised more,
    // which is good for now to trigger bugs.
    pub const DEFAULT_CHECKPOINT_DISTANCE: u64 = 256 * 1024 * 1024;
-    pub const DEFAULT_CHECKPOINT_PERIOD: Duration = Duration::from_secs(10);
+    pub const DEFAULT_CHECKPOINT_PERIOD: Duration = Duration::from_secs(1);

    pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
-    pub const DEFAULT_GC_PERIOD: Duration = Duration::from_secs(10);
+    pub const DEFAULT_GC_PERIOD: Duration = Duration::from_secs(100);

    pub const DEFAULT_SUPERUSER: &str = "zenith_admin";
-    pub const DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNC: usize = 100;
-    pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10;
-
-    pub const DEFAULT_OPEN_MEM_LIMIT: usize = 128 * 1024 * 1024;
-    pub const DEFAULT_PAGE_CACHE_SIZE: usize = 8192;
-    pub const DEFAULT_MAX_FILE_DESCRIPTORS: usize = 100;
 }

 lazy_static! {
@@ -79,10 +68,6 @@ pub struct PageServerConf {
    pub gc_period: Duration,
    pub superuser: String,

-    pub open_mem_limit: usize,
-    pub page_cache_size: usize,
-    pub max_file_descriptors: usize,
-
    // Repository directory, relative to current working directory.
    // Normally, the page server changes the current working directory
    // to the repository, and 'workdir' is always '.'. But we don't do
@@ -96,7 +81,7 @@ pub struct PageServerConf {
    pub auth_type: AuthType,

    pub auth_validation_public_key_path: Option<PathBuf>,
-    pub remote_storage_config: Option<RemoteStorageConfig>,
+    pub relish_storage_config: Option<RelishStorageConfig>,
 }

 impl PageServerConf {
@@ -105,7 +90,7 @@ impl PageServerConf {
    //

    fn tenants_path(&self) -> PathBuf {
-        self.workdir.join(TENANTS_SEGMENT_NAME)
+        self.workdir.join("tenants")
    }

    fn tenant_path(&self, tenantid: &ZTenantId) -> PathBuf {
@@ -129,13 +114,21 @@ impl PageServerConf {
    }

    fn timelines_path(&self, tenantid: &ZTenantId) -> PathBuf {
-        self.tenant_path(tenantid).join(TIMELINES_SEGMENT_NAME)
+        self.tenant_path(tenantid).join("timelines")
    }

    fn timeline_path(&self, timelineid: &ZTimelineId, tenantid: &ZTenantId) -> PathBuf {
        self.timelines_path(tenantid).join(timelineid.to_string())
    }

+    fn ancestor_path(&self, timelineid: &ZTimelineId, tenantid: &ZTenantId) -> PathBuf {
+        self.timeline_path(timelineid, tenantid).join("ancestor")
+    }
+
+    fn wal_dir_path(&self, timelineid: &ZTimelineId, tenantid: &ZTenantId) -> PathBuf {
+        self.timeline_path(timelineid, tenantid).join("wal")
+    }
+
    //
    // Postgres distribution paths
    //
@@ -161,9 +154,6 @@ impl PageServerConf {
            checkpoint_period: Duration::from_secs(10),
            gc_horizon: defaults::DEFAULT_GC_HORIZON,
            gc_period: Duration::from_secs(10),
-            open_mem_limit: defaults::DEFAULT_OPEN_MEM_LIMIT,
-            page_cache_size: defaults::DEFAULT_PAGE_CACHE_SIZE,
-            max_file_descriptors: defaults::DEFAULT_MAX_FILE_DESCRIPTORS,
            listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(),
            listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(),
            superuser: "zenith_admin".to_string(),
@@ -171,55 +161,25 @@ impl PageServerConf {
            pg_distrib_dir: "".into(),
            auth_type: AuthType::Trust,
            auth_validation_public_key_path: None,
-            remote_storage_config: None,
+            relish_storage_config: None,
        }
    }
 }

-/// Config for the Repository checkpointer
-#[derive(Debug, Clone, Copy)]
-pub enum CheckpointConfig {
-    // Flush in-memory data that is older than this
-    Distance(u64),
-    // Flush all in-memory data
-    Forced,
-}
-
-/// External backup storage configuration, enough for creating a client for that storage.
+/// External relish storage configuration, enough for creating a client for that storage.
 #[derive(Debug, Clone)]
-pub struct RemoteStorageConfig {
-    /// Max allowed number of concurrent sync operations between pageserver and the remote storage.
-    pub max_concurrent_sync: NonZeroUsize,
-    /// Max allowed errors before the sync task is considered failed and evicted.
-    pub max_sync_errors: NonZeroU32,
-    /// The storage connection configuration.
-    pub storage: RemoteStorageKind,
-}
-
-/// A kind of a remote storage to connect to, with its connection configuration.
-#[derive(Debug, Clone)]
-pub enum RemoteStorageKind {
-    /// Storage based on local file system.
-    /// Specify a root folder to place all stored relish data into.
+pub enum RelishStorageConfig {
+    /// Root folder to place all stored relish data into.
    LocalFs(PathBuf),
-    /// AWS S3 based storage, storing all relishes into the root
-    /// of the S3 bucket from the config.
    AwsS3(S3Config),
 }

 /// AWS S3 bucket coordinates and access credentials to manage the bucket contents (read and write).
 #[derive(Clone)]
 pub struct S3Config {
-    /// Name of the bucket to connect to.
    pub bucket_name: String,
-    /// The region where the bucket is located at.
    pub bucket_region: String,
-    /// "Login" to use when connecting to bucket.
-    /// Can be empty for cases like AWS k8s IAM
-    /// where we can allow certain pods to connect
-    /// to the bucket directly without any credentials.
    pub access_key_id: Option<String>,
-    /// "Password" to use when connecting to bucket.
    pub secret_access_key: Option<String>,
 }

--- a/pageserver/src/page_cache.rs
+++ b/pageserver/src/page_cache.rs
@@ -1,778 +0,0 @@
-//!
-//! Global page cache
-//!
-//! The page cache uses up most of the memory in the page server. It is shared
-//! by all tenants, and it is used to store different kinds of pages. Sharing
-//! the cache allows memory to be dynamically allocated where it's needed the
-//! most.
-//!
-//! The page cache consists of fixed-size buffers, 8 kB each to match the
-//! PostgreSQL buffer size, and a Slot struct for each buffer to contain
-//! information about what's stored in the buffer.
-//!
-//! # Locking
-//!
-//! There are two levels of locking involved: There's one lock for the "mapping"
-//! from page identifier (tenant ID, timeline ID, rel, block, LSN) to the buffer
-//! slot, and a separate lock on each slot. To read or write the contents of a
-//! slot, you must hold the lock on the slot in read or write mode,
-//! respectively. To change the mapping of a slot, i.e. to evict a page or to
-//! assign a buffer for a page, you must hold the mapping lock and the lock on
-//! the slot at the same time.
-//!
-//! Whenever you need to hold both locks simultenously, the slot lock must be
-//! acquired first. This consistent ordering avoids deadlocks. To look up a page
-//! in the cache, you would first look up the mapping, while holding the mapping
-//! lock, and then lock the slot. You must release the mapping lock in between,
-//! to obey the lock ordering and avoid deadlock.
-//!
-//! A slot can momentarily have invalid contents, even if it's already been
-//! inserted to the mapping, but you must hold the write-lock on the slot until
-//! the contents are valid. If you need to release the lock without initializing
-//! the contents, you must remove the mapping first. We make that easy for the
-//! callers with PageWriteGuard: when lock_for_write() returns an uninitialized
-//! page, the caller must explicitly call guard.mark_valid() after it has
-//! initialized it. If the guard is dropped without calling mark_valid(), the
-//! mapping is automatically removed and the slot is marked free.
-//!
-
-use std::{
-    collections::{hash_map::Entry, HashMap},
-    convert::TryInto,
-    sync::{
-        atomic::{AtomicU8, AtomicUsize, Ordering},
-        RwLock, RwLockReadGuard, RwLockWriteGuard,
-    },
-};
-
-use once_cell::sync::OnceCell;
-use tracing::error;
-use zenith_utils::{
-    lsn::Lsn,
-    zid::{ZTenantId, ZTimelineId},
-};
-
-use crate::layered_repository::writeback_ephemeral_file;
-use crate::{relish::RelTag, PageServerConf};
-
-static PAGE_CACHE: OnceCell<PageCache> = OnceCell::new();
-const TEST_PAGE_CACHE_SIZE: usize = 10;
-
-///
-/// Initialize the page cache. This must be called once at page server startup.
-///
-pub fn init(conf: &'static PageServerConf) {
-    if PAGE_CACHE
-        .set(PageCache::new(conf.page_cache_size))
-        .is_err()
-    {
-        panic!("page cache already initialized");
-    }
-}
-
-///
-/// Get a handle to the page cache.
-///
-pub fn get() -> &'static PageCache {
-    //
-    // In unit tests, page server startup doesn't happen and no one calls
-    // page_cache::init(). Initialize it here with a tiny cache, so that the
-    // page cache is usable in unit tests.
-    //
-    if cfg!(test) {
-        PAGE_CACHE.get_or_init(|| PageCache::new(TEST_PAGE_CACHE_SIZE))
-    } else {
-        PAGE_CACHE.get().expect("page cache not initialized")
-    }
-}
-
-pub const PAGE_SZ: usize = postgres_ffi::pg_constants::BLCKSZ as usize;
-const MAX_USAGE_COUNT: u8 = 5;
-
-///
-/// CacheKey uniquely identifies a "thing" to cache in the page cache.
-///
-#[derive(Debug, PartialEq, Eq, Clone)]
-enum CacheKey {
-    MaterializedPage {
-        hash_key: MaterializedPageHashKey,
-        lsn: Lsn,
-    },
-    EphemeralPage {
-        file_id: u64,
-        blkno: u32,
-    },
-}
-
-#[derive(Debug, PartialEq, Eq, Hash, Clone)]
-struct MaterializedPageHashKey {
-    tenant_id: ZTenantId,
-    timeline_id: ZTimelineId,
-    rel_tag: RelTag,
-    blknum: u32,
-}
-
-#[derive(Clone)]
-struct Version {
-    lsn: Lsn,
-    slot_idx: usize,
-}
-
-struct Slot {
-    inner: RwLock<SlotInner>,
-    usage_count: AtomicU8,
-}
-
-struct SlotInner {
-    key: Option<CacheKey>,
-    buf: &'static mut [u8; PAGE_SZ],
-    dirty: bool,
-}
-
-impl Slot {
-    /// Increment usage count on the buffer, with ceiling at MAX_USAGE_COUNT.
-    fn inc_usage_count(&self) {
-        let _ = self
-            .usage_count
-            .fetch_update(Ordering::Relaxed, Ordering::Relaxed, |val| {
-                if val == MAX_USAGE_COUNT {
-                    None
-                } else {
-                    Some(val + 1)
-                }
-            });
-    }
-
-    /// Decrement usage count on the buffer, unless it's already zero.  Returns
-    /// the old usage count.
-    fn dec_usage_count(&self) -> u8 {
-        let count_res =
-            self.usage_count
-                .fetch_update(Ordering::Relaxed, Ordering::Relaxed, |val| {
-                    if val == 0 {
-                        None
-                    } else {
-                        Some(val - 1)
-                    }
-                });
-
-        match count_res {
-            Ok(usage_count) => usage_count,
-            Err(usage_count) => usage_count,
-        }
-    }
-}
-
-pub struct PageCache {
-    /// This contains the mapping from the cache key to buffer slot that currently
-    /// contains the page, if any.
-    ///
-    /// TODO: This is protected by a single lock. If that becomes a bottleneck,
-    /// this HashMap can be replaced with a more concurrent version, there are
-    /// plenty of such crates around.
-    ///
-    /// If you add support for caching different kinds of objects, each object kind
-    /// can have a separate mapping map, next to this field.
-    materialized_page_map: RwLock<HashMap<MaterializedPageHashKey, Vec<Version>>>,
-
-    ephemeral_page_map: RwLock<HashMap<(u64, u32), usize>>,
-
-    /// The actual buffers with their metadata.
-    slots: Box<[Slot]>,
-
-    /// Index of the next candidate to evict, for the Clock replacement algorithm.
-    /// This is interpreted modulo the page cache size.
-    next_evict_slot: AtomicUsize,
-}
-
-///
-/// PageReadGuard is a "lease" on a buffer, for reading. The page is kept locked
-/// until the guard is dropped.
-///
-pub struct PageReadGuard<'i>(RwLockReadGuard<'i, SlotInner>);
-
-impl std::ops::Deref for PageReadGuard<'_> {
-    type Target = [u8; PAGE_SZ];
-
-    fn deref(&self) -> &Self::Target {
-        self.0.buf
-    }
-}
-
-///
-/// PageWriteGuard is a lease on a buffer for modifying it. The page is kept locked
-/// until the guard is dropped.
-///
-/// Counterintuitively, this is used even for a read, if the requested page is not
-/// currently found in the page cache. In that case, the caller of lock_for_read()
-/// is expected to fill in the page contents and call mark_valid(). Similarly
-/// lock_for_write() can return an invalid buffer that the caller is expected to
-/// to initialize.
-///
-pub struct PageWriteGuard<'i> {
-    inner: RwLockWriteGuard<'i, SlotInner>,
-
-    // Are the page contents currently valid?
-    valid: bool,
-}
-
-impl std::ops::DerefMut for PageWriteGuard<'_> {
-    fn deref_mut(&mut self) -> &mut Self::Target {
-        self.inner.buf
-    }
-}
-
-impl std::ops::Deref for PageWriteGuard<'_> {
-    type Target = [u8; PAGE_SZ];
-
-    fn deref(&self) -> &Self::Target {
-        self.inner.buf
-    }
-}
-
-impl PageWriteGuard<'_> {
-    /// Mark that the buffer contents are now valid.
-    pub fn mark_valid(&mut self) {
-        assert!(self.inner.key.is_some());
-        assert!(
-            !self.valid,
-            "mark_valid called on a buffer that was already valid"
-        );
-        self.valid = true;
-    }
-    pub fn mark_dirty(&mut self) {
-        // only ephemeral pages can be dirty ATM.
-        assert!(matches!(
-            self.inner.key,
-            Some(CacheKey::EphemeralPage { .. })
-        ));
-        self.inner.dirty = true;
-    }
-}
-
-impl Drop for PageWriteGuard<'_> {
-    ///
-    /// If the buffer was allocated for a page that was not already in the
-    /// cache, but the lock_for_read/write() caller dropped the buffer without
-    /// initializing it, remove the mapping from the page cache.
-    ///
-    fn drop(&mut self) {
-        assert!(self.inner.key.is_some());
-        if !self.valid {
-            let self_key = self.inner.key.as_ref().unwrap();
-            PAGE_CACHE.get().unwrap().remove_mapping(self_key);
-            self.inner.key = None;
-            self.inner.dirty = false;
-        }
-    }
-}
-
-/// lock_for_read() return value
-pub enum ReadBufResult<'a> {
-    Found(PageReadGuard<'a>),
-    NotFound(PageWriteGuard<'a>),
-}
-
-/// lock_for_write() return value
-pub enum WriteBufResult<'a> {
-    Found(PageWriteGuard<'a>),
-    NotFound(PageWriteGuard<'a>),
-}
-
-impl PageCache {
-    //
-    // Section 1.1: Public interface functions for looking up and memorizing materialized page
-    // versions in the page cache
-    //
-
-    /// Look up a materialized page version.
-    ///
-    /// The 'lsn' is an upper bound, this will return the latest version of
-    /// the given block, but not newer than 'lsn'. Returns the actual LSN of the
-    /// returned page.
-    pub fn lookup_materialized_page(
-        &self,
-        tenant_id: ZTenantId,
-        timeline_id: ZTimelineId,
-        rel_tag: RelTag,
-        blknum: u32,
-        lsn: Lsn,
-    ) -> Option<(Lsn, PageReadGuard)> {
-        let mut cache_key = CacheKey::MaterializedPage {
-            hash_key: MaterializedPageHashKey {
-                tenant_id,
-                timeline_id,
-                rel_tag,
-                blknum,
-            },
-            lsn,
-        };
-
-        if let Some(guard) = self.try_lock_for_read(&mut cache_key) {
-            if let CacheKey::MaterializedPage { hash_key: _, lsn } = cache_key {
-                Some((lsn, guard))
-            } else {
-                panic!("unexpected key type in slot");
-            }
-        } else {
-            None
-        }
-    }
-
-    ///
-    /// Store an image of the given page in the cache.
-    ///
-    pub fn memorize_materialized_page(
-        &self,
-        tenant_id: ZTenantId,
-        timeline_id: ZTimelineId,
-        rel_tag: RelTag,
-        blknum: u32,
-        lsn: Lsn,
-        img: &[u8],
-    ) {
-        let cache_key = CacheKey::MaterializedPage {
-            hash_key: MaterializedPageHashKey {
-                tenant_id,
-                timeline_id,
-                rel_tag,
-                blknum,
-            },
-            lsn,
-        };
-
-        match self.lock_for_write(&cache_key) {
-            WriteBufResult::Found(write_guard) => {
-                // We already had it in cache. Another thread must've put it there
-                // concurrently. Check that it had the same contents that we
-                // replayed.
-                assert!(*write_guard == img);
-            }
-            WriteBufResult::NotFound(mut write_guard) => {
-                write_guard.copy_from_slice(img);
-                write_guard.mark_valid();
-            }
-        }
-    }
-
-    // Section 1.2: Public interface functions for working with Ephemeral pages.
-
-    pub fn read_ephemeral_buf(&self, file_id: u64, blkno: u32) -> ReadBufResult {
-        let mut cache_key = CacheKey::EphemeralPage { file_id, blkno };
-
-        self.lock_for_read(&mut cache_key)
-    }
-
-    pub fn write_ephemeral_buf(&self, file_id: u64, blkno: u32) -> WriteBufResult {
-        let cache_key = CacheKey::EphemeralPage { file_id, blkno };
-
-        self.lock_for_write(&cache_key)
-    }
-
-    /// Immediately drop all buffers belonging to given file, without writeback
-    pub fn drop_buffers_for_ephemeral(&self, drop_file_id: u64) {
-        for slot_idx in 0..self.slots.len() {
-            let slot = &self.slots[slot_idx];
-
-            let mut inner = slot.inner.write().unwrap();
-            if let Some(key) = &inner.key {
-                match key {
-                    CacheKey::EphemeralPage { file_id, blkno: _ } if *file_id == drop_file_id => {
-                        // remove mapping for old buffer
-                        self.remove_mapping(key);
-                        inner.key = None;
-                        inner.dirty = false;
-                    }
-                    _ => {}
-                }
-            }
-        }
-    }
-
-    //
-    // Section 2: Internal interface functions for lookup/update.
-    //
-    // To add support for a new kind of "thing" to cache, you will need
-    // to add public interface routines above, and code to deal with the
-    // "mappings" after this section. But the routines in this section should
-    // not require changes.
-
-    /// Look up a page in the cache.
-    ///
-    /// If the search criteria is not exact, *cache_key is updated with the key
-    /// for exact key of the returned page. (For materialized pages, that means
-    /// that the LSN in 'cache_key' is updated with the LSN of the returned page
-    /// version.)
-    ///
-    /// If no page is found, returns None and *cache_key is left unmodified.
-    ///
-    fn try_lock_for_read(&self, cache_key: &mut CacheKey) -> Option<PageReadGuard> {
-        let cache_key_orig = cache_key.clone();
-        if let Some(slot_idx) = self.search_mapping(cache_key) {
-            // The page was found in the mapping. Lock the slot, and re-check
-            // that it's still what we expected (because we released the mapping
-            // lock already, another thread could have evicted the page)
-            let slot = &self.slots[slot_idx];
-            let inner = slot.inner.read().unwrap();
-            if inner.key.as_ref() == Some(cache_key) {
-                slot.inc_usage_count();
-                return Some(PageReadGuard(inner));
-            } else {
-                // search_mapping might have modified the search key; restore it.
-                *cache_key = cache_key_orig;
-            }
-        }
-        None
-    }
-
-    /// Return a locked buffer for given block.
-    ///
-    /// Like try_lock_for_read(), if the search criteria is not exact and the
-    /// page is already found in the cache, *cache_key is updated.
-    ///
-    /// If the page is not found in the cache, this allocates a new buffer for
-    /// it. The caller may then initialize the buffer with the contents, and
-    /// call mark_valid().
-    ///
-    /// Example usage:
-    ///
-    /// ```ignore
-    /// let cache = page_cache::get();
-    ///
-    /// match cache.lock_for_read(&key) {
-    ///     ReadBufResult::Found(read_guard) => {
-    ///         // The page was found in cache. Use it
-    ///     },
-    ///     ReadBufResult::NotFound(write_guard) => {
-    ///         // The page was not found in cache. Read it from disk into the
-    ///         // buffer.
-    ///         //read_my_page_from_disk(write_guard);
-    ///
-    ///         // The buffer contents are now valid. Tell the page cache.
-    ///         write_guard.mark_valid();
-    ///     },
-    /// }
-    /// ```
-    ///
-    fn lock_for_read(&self, cache_key: &mut CacheKey) -> ReadBufResult {
-        loop {
-            // First check if the key already exists in the cache.
-            if let Some(read_guard) = self.try_lock_for_read(cache_key) {
-                return ReadBufResult::Found(read_guard);
-            }
-
-            // Not found. Find a victim buffer
-            let (slot_idx, mut inner) = self.find_victim();
-
-            // Insert mapping for this. At this point, we may find that another
-            // thread did the same thing concurrently. In that case, we evicted
-            // our victim buffer unnecessarily. Put it into the free list and
-            // continue with the slot that the other thread chose.
-            if let Some(_existing_slot_idx) = self.try_insert_mapping(cache_key, slot_idx) {
-                // TODO: put to free list
-
-                // We now just loop back to start from beginning. This is not
-                // optimal, we'll perform the lookup in the mapping again, which
-                // is not really necessary because we already got
-                // 'existing_slot_idx'.  But this shouldn't happen often enough
-                // to matter much.
-                continue;
-            }
-
-            // Make the slot ready
-            let slot = &self.slots[slot_idx];
-            inner.key = Some(cache_key.clone());
-            inner.dirty = false;
-            slot.usage_count.store(1, Ordering::Relaxed);
-
-            return ReadBufResult::NotFound(PageWriteGuard {
-                inner,
-                valid: false,
-            });
-        }
-    }
-
-    /// Look up a page in the cache and lock it in write mode. If it's not
-    /// found, returns None.
-    ///
-    /// When locking a page for writing, the search criteria is always "exact".
-    fn try_lock_for_write(&self, cache_key: &CacheKey) -> Option<PageWriteGuard> {
-        if let Some(slot_idx) = self.search_mapping_for_write(cache_key) {
-            // The page was found in the mapping. Lock the slot, and re-check
-            // that it's still what we expected (because we don't released the mapping
-            // lock already, another thread could have evicted the page)
-            let slot = &self.slots[slot_idx];
-            let inner = slot.inner.write().unwrap();
-            if inner.key.as_ref() == Some(cache_key) {
-                slot.inc_usage_count();
-                return Some(PageWriteGuard { inner, valid: true });
-            }
-        }
-        None
-    }
-
-    /// Return a write-locked buffer for given block.
-    ///
-    /// Similar to lock_for_read(), but the returned buffer is write-locked and
-    /// may be modified by the caller even if it's already found in the cache.
-    fn lock_for_write(&self, cache_key: &CacheKey) -> WriteBufResult {
-        loop {
-            // First check if the key already exists in the cache.
-            if let Some(write_guard) = self.try_lock_for_write(cache_key) {
-                return WriteBufResult::Found(write_guard);
-            }
-
-            // Not found. Find a victim buffer
-            let (slot_idx, mut inner) = self.find_victim();
-
-            // Insert mapping for this. At this point, we may find that another
-            // thread did the same thing concurrently. In that case, we evicted
-            // our victim buffer unnecessarily. Put it into the free list and
-            // continue with the slot that the other thread chose.
-            if let Some(_existing_slot_idx) = self.try_insert_mapping(cache_key, slot_idx) {
-                // TODO: put to free list
-
-                // We now just loop back to start from beginning. This is not
-                // optimal, we'll perform the lookup in the mapping again, which
-                // is not really necessary because we already got
-                // 'existing_slot_idx'.  But this shouldn't happen often enough
-                // to matter much.
-                continue;
-            }
-
-            // Make the slot ready
-            let slot = &self.slots[slot_idx];
-            inner.key = Some(cache_key.clone());
-            inner.dirty = false;
-            slot.usage_count.store(1, Ordering::Relaxed);
-
-            return WriteBufResult::NotFound(PageWriteGuard {
-                inner,
-                valid: false,
-            });
-        }
-    }
-
-    //
-    // Section 3: Mapping functions
-    //
-
-    /// Search for a page in the cache using the given search key.
-    ///
-    /// Returns the slot index, if any. If the search criteria is not exact,
-    /// *cache_key is updated with the actual key of the found page.
-    ///
-    /// NOTE: We don't hold any lock on the mapping on return, so the slot might
-    /// get recycled for an unrelated page immediately after this function
-    /// returns.  The caller is responsible for re-checking that the slot still
-    /// contains the page with the same key before using it.
-    ///
-    fn search_mapping(&self, cache_key: &mut CacheKey) -> Option<usize> {
-        match cache_key {
-            CacheKey::MaterializedPage { hash_key, lsn } => {
-                let map = self.materialized_page_map.read().unwrap();
-                let versions = map.get(hash_key)?;
-
-                let version_idx = match versions.binary_search_by_key(lsn, |v| v.lsn) {
-                    Ok(version_idx) => version_idx,
-                    Err(0) => return None,
-                    Err(version_idx) => version_idx - 1,
-                };
-                let version = &versions[version_idx];
-                *lsn = version.lsn;
-                Some(version.slot_idx)
-            }
-            CacheKey::EphemeralPage { file_id, blkno } => {
-                let map = self.ephemeral_page_map.read().unwrap();
-                Some(*map.get(&(*file_id, *blkno))?)
-            }
-        }
-    }
-
-    /// Search for a page in the cache using the given search key.
-    ///
-    /// Like 'search_mapping, but performs an "exact" search. Used for
-    /// allocating a new buffer.
-    fn search_mapping_for_write(&self, key: &CacheKey) -> Option<usize> {
-        match key {
-            CacheKey::MaterializedPage { hash_key, lsn } => {
-                let map = self.materialized_page_map.read().unwrap();
-                let versions = map.get(hash_key)?;
-
-                if let Ok(version_idx) = versions.binary_search_by_key(lsn, |v| v.lsn) {
-                    Some(versions[version_idx].slot_idx)
-                } else {
-                    None
-                }
-            }
-            CacheKey::EphemeralPage { file_id, blkno } => {
-                let map = self.ephemeral_page_map.read().unwrap();
-                Some(*map.get(&(*file_id, *blkno))?)
-            }
-        }
-    }
-
-    ///
-    /// Remove mapping for given key.
-    ///
-    fn remove_mapping(&self, old_key: &CacheKey) {
-        match old_key {
-            CacheKey::MaterializedPage {
-                hash_key: old_hash_key,
-                lsn: old_lsn,
-            } => {
-                let mut map = self.materialized_page_map.write().unwrap();
-                if let Entry::Occupied(mut old_entry) = map.entry(old_hash_key.clone()) {
-                    let versions = old_entry.get_mut();
-
-                    if let Ok(version_idx) = versions.binary_search_by_key(old_lsn, |v| v.lsn) {
-                        versions.remove(version_idx);
-                        if versions.is_empty() {
-                            old_entry.remove_entry();
-                        }
-                    }
-                } else {
-                    panic!("could not find old key in mapping")
-                }
-            }
-            CacheKey::EphemeralPage { file_id, blkno } => {
-                let mut map = self.ephemeral_page_map.write().unwrap();
-                map.remove(&(*file_id, *blkno))
-                    .expect("could not find old key in mapping");
-            }
-        }
-    }
-
-    ///
-    /// Insert mapping for given key.
-    ///
-    /// If a mapping already existed for the given key, returns the slot index
-    /// of the existing mapping and leaves it untouched.
-    fn try_insert_mapping(&self, new_key: &CacheKey, slot_idx: usize) -> Option<usize> {
-        match new_key {
-            CacheKey::MaterializedPage {
-                hash_key: new_key,
-                lsn: new_lsn,
-            } => {
-                let mut map = self.materialized_page_map.write().unwrap();
-                let versions = map.entry(new_key.clone()).or_default();
-                match versions.binary_search_by_key(new_lsn, |v| v.lsn) {
-                    Ok(version_idx) => Some(versions[version_idx].slot_idx),
-                    Err(version_idx) => {
-                        versions.insert(
-                            version_idx,
-                            Version {
-                                lsn: *new_lsn,
-                                slot_idx,
-                            },
-                        );
-                        None
-                    }
-                }
-            }
-            CacheKey::EphemeralPage { file_id, blkno } => {
-                let mut map = self.ephemeral_page_map.write().unwrap();
-                match map.entry((*file_id, *blkno)) {
-                    Entry::Occupied(entry) => Some(*entry.get()),
-                    Entry::Vacant(entry) => {
-                        entry.insert(slot_idx);
-                        None
-                    }
-                }
-            }
-        }
-    }
-
-    //
-    // Section 4: Misc internal helpers
-    //
-
-    /// Find a slot to evict.
-    ///
-    /// On return, the slot is empty and write-locked.
-    fn find_victim(&self) -> (usize, RwLockWriteGuard<SlotInner>) {
-        let iter_limit = self.slots.len() * 2;
-        let mut iters = 0;
-        loop {
-            let slot_idx = self.next_evict_slot.fetch_add(1, Ordering::Relaxed) % self.slots.len();
-
-            let slot = &self.slots[slot_idx];
-
-            if slot.dec_usage_count() == 0 || iters >= iter_limit {
-                let mut inner = slot.inner.write().unwrap();
-
-                if let Some(old_key) = &inner.key {
-                    if inner.dirty {
-                        if let Err(err) = Self::writeback(old_key, inner.buf) {
-                            // Writing the page to disk failed.
-                            //
-                            // FIXME: What to do here, when? We could propagate the error to the
-                            // caller, but victim buffer is generally unrelated to the original
-                            // call. It can even belong to a different tenant. Currently, we
-                            // report the error to the log and continue the clock sweep to find
-                            // a different victim. But if the problem persists, the page cache
-                            // could fill up with dirty pages that we cannot evict, and we will
-                            // loop retrying the writebacks indefinitely.
-                            error!("writeback of buffer {:?} failed: {}", old_key, err);
-                            continue;
-                        }
-                    }
-
-                    // remove mapping for old buffer
-                    self.remove_mapping(old_key);
-                    inner.dirty = false;
-                    inner.key = None;
-                }
-                return (slot_idx, inner);
-            }
-
-            iters += 1;
-        }
-    }
-
-    fn writeback(cache_key: &CacheKey, buf: &[u8]) -> Result<(), std::io::Error> {
-        match cache_key {
-            CacheKey::MaterializedPage {
-                hash_key: _,
-                lsn: _,
-            } => {
-                panic!("unexpected dirty materialized page");
-            }
-            CacheKey::EphemeralPage { file_id, blkno } => {
-                writeback_ephemeral_file(*file_id, *blkno, buf)
-            }
-        }
-    }
-
-    /// Initialize a new page cache
-    ///
-    /// This should be called only once at page server startup.
-    fn new(num_pages: usize) -> Self {
-        assert!(num_pages > 0, "page cache size must be > 0");
-
-        let page_buffer = Box::leak(vec![0u8; num_pages * PAGE_SZ].into_boxed_slice());
-
-        let slots = page_buffer
-            .chunks_exact_mut(PAGE_SZ)
-            .map(|chunk| {
-                let buf: &mut [u8; PAGE_SZ] = chunk.try_into().unwrap();
-
-                Slot {
-                    inner: RwLock::new(SlotInner {
-                        key: None,
-                        buf,
-                        dirty: false,
-                    }),
-                    usage_count: AtomicU8::new(0),
-                }
-            })
-            .collect();
-
-        Self {
-            materialized_page_map: Default::default(),
-            ephemeral_page_map: Default::default(),
-            slots,
-            next_evict_slot: AtomicUsize::new(0),
-        }
-    }
-}
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -10,9 +10,10 @@
 //     *callmemaybe <zenith timelineid> $url* -- ask pageserver to start walreceiver on $url
 //

-use anyhow::{anyhow, bail, ensure, Context, Result};
+use anyhow::{anyhow, bail, ensure, Result};
 use bytes::{Buf, BufMut, Bytes, BytesMut};
 use lazy_static::lazy_static;
+use log::*;
 use regex::Regex;
 use std::net::TcpListener;
 use std::str;
@@ -20,12 +21,10 @@ use std::str::FromStr;
 use std::sync::Arc;
 use std::thread;
 use std::{io, net::TcpStream};
-use tracing::*;
 use zenith_metrics::{register_histogram_vec, HistogramVec};
 use zenith_utils::auth::{self, JwtAuth};
 use zenith_utils::auth::{Claims, Scope};
 use zenith_utils::lsn::Lsn;
-use zenith_utils::postgres_backend::is_socket_read_timed_out;
 use zenith_utils::postgres_backend::PostgresBackend;
 use zenith_utils::postgres_backend::{self, AuthType};
 use zenith_utils::pq_proto::{
@@ -188,32 +187,17 @@ pub fn thread_main(
    listener: TcpListener,
    auth_type: AuthType,
 ) -> anyhow::Result<()> {
-    let mut join_handles = Vec::new();
-
-    while !tenant_mgr::shutdown_requested() {
+    loop {
        let (socket, peer_addr) = listener.accept()?;
        debug!("accepted connection from {}", peer_addr);
        socket.set_nodelay(true).unwrap();
        let local_auth = auth.clone();
-
-        let handle = thread::Builder::new()
-            .name("serving Page Service thread".into())
-            .spawn(move || {
-                if let Err(err) = page_service_conn_main(conf, local_auth, socket, auth_type) {
-                    error!(%err, "page server thread exited with error");
-                }
-            })
-            .unwrap();
-
-        join_handles.push(handle);
+        thread::spawn(move || {
+            if let Err(err) = page_service_conn_main(conf, local_auth, socket, auth_type) {
+                error!("error: {}", err);
+            }
+        });
    }
-
-    debug!("page_service loop terminated. wait for connections to cancel");
-    for handle in join_handles.into_iter() {
-        handle.join().unwrap();
-    }
-
-    Ok(())
 }

 fn page_service_conn_main(
@@ -232,7 +216,7 @@ fn page_service_conn_main(
    }

    let mut conn_handler = PageServerHandler::new(conf, auth);
-    let pgbackend = PostgresBackend::new(socket, auth_type, None, true)?;
+    let pgbackend = PostgresBackend::new(socket, auth_type, None)?;
    pgbackend.run(&mut conn_handler)
 }

@@ -276,66 +260,50 @@ impl PageServerHandler {
        timelineid: ZTimelineId,
        tenantid: ZTenantId,
    ) -> anyhow::Result<()> {
-        let _enter = info_span!("pagestream", timeline = %timelineid, tenant = %tenantid).entered();
-
        // Check that the timeline exists
        let timeline = tenant_mgr::get_timeline_for_tenant(tenantid, timelineid)?;

        /* switch client to COPYBOTH */
        pgb.write_message(&BeMessage::CopyBothResponse)?;

-        while !tenant_mgr::shutdown_requested() {
-            match pgb.read_message() {
-                Ok(message) => {
-                    if let Some(message) = message {
-                        trace!("query: {:?}", message);
+        while let Some(message) = pgb.read_message()? {
+            trace!("query({:?}): {:?}", timelineid, message);

-                        let copy_data_bytes = match message {
-                            FeMessage::CopyData(bytes) => bytes,
-                            _ => continue,
-                        };
+            let copy_data_bytes = match message {
+                FeMessage::CopyData(bytes) => bytes,
+                _ => continue,
+            };

-                        let zenith_fe_msg = PagestreamFeMessage::parse(copy_data_bytes)?;
+            let zenith_fe_msg = PagestreamFeMessage::parse(copy_data_bytes)?;

-                        let response = match zenith_fe_msg {
-                            PagestreamFeMessage::Exists(req) => SMGR_QUERY_TIME
-                                .with_label_values(&["get_rel_exists"])
-                                .observe_closure_duration(|| {
-                                    self.handle_get_rel_exists_request(&*timeline, &req)
-                                }),
-                            PagestreamFeMessage::Nblocks(req) => SMGR_QUERY_TIME
-                                .with_label_values(&["get_rel_size"])
-                                .observe_closure_duration(|| {
-                                    self.handle_get_nblocks_request(&*timeline, &req)
-                                }),
-                            PagestreamFeMessage::GetPage(req) => SMGR_QUERY_TIME
-                                .with_label_values(&["get_page_at_lsn"])
-                                .observe_closure_duration(|| {
-                                    self.handle_get_page_at_lsn_request(&*timeline, &req)
-                                }),
-                        };
+            let response = match zenith_fe_msg {
+                PagestreamFeMessage::Exists(req) => SMGR_QUERY_TIME
+                    .with_label_values(&["get_rel_exists"])
+                    .observe_closure_duration(|| {
+                        self.handle_get_rel_exists_request(&*timeline, &req)
+                    }),
+                PagestreamFeMessage::Nblocks(req) => SMGR_QUERY_TIME
+                    .with_label_values(&["get_rel_size"])
+                    .observe_closure_duration(|| self.handle_get_nblocks_request(&*timeline, &req)),
+                PagestreamFeMessage::GetPage(req) => SMGR_QUERY_TIME
+                    .with_label_values(&["get_page_at_lsn"])
+                    .observe_closure_duration(|| {
+                        self.handle_get_page_at_lsn_request(&*timeline, &req)
+                    }),
+            };

-                        let response = response.unwrap_or_else(|e| {
-                            // print the all details to the log with {:#}, but for the client the
-                            // error message is enough
-                            error!("error reading relation or page version: {:#}", e);
-                            PagestreamBeMessage::Error(PagestreamErrorResponse {
-                                message: e.to_string(),
-                            })
-                        });
+            let response = response.unwrap_or_else(|e| {
+                // print the all details to the log with {:#}, but for the client the
+                // error message is enough
+                error!("error reading relation or page version: {:#}", e);
+                PagestreamBeMessage::Error(PagestreamErrorResponse {
+                    message: e.to_string(),
+                })
+            });

-                        pgb.write_message(&BeMessage::CopyData(&response.serialize()))?;
-                    } else {
-                        break;
-                    }
-                }
-                Err(e) => {
-                    if !is_socket_read_timed_out(&e) {
-                        return Err(e);
-                    }
-                }
-            }
+            pgb.write_message(&BeMessage::CopyData(&response.serialize()))?;
        }
+
        Ok(())
    }

@@ -395,8 +363,6 @@ impl PageServerHandler {
        timeline: &dyn Timeline,
        req: &PagestreamExistsRequest,
    ) -> Result<PagestreamBeMessage> {
-        let _enter = info_span!("get_rel_exists", rel = %req.rel, req_lsn = %req.lsn).entered();
-
        let tag = RelishTag::Relation(req.rel);
        let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest)?;

@@ -412,7 +378,6 @@ impl PageServerHandler {
        timeline: &dyn Timeline,
        req: &PagestreamNblocksRequest,
    ) -> Result<PagestreamBeMessage> {
-        let _enter = info_span!("get_nblocks", rel = %req.rel, req_lsn = %req.lsn).entered();
        let tag = RelishTag::Relation(req.rel);
        let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest)?;

@@ -432,8 +397,6 @@ impl PageServerHandler {
        timeline: &dyn Timeline,
        req: &PagestreamGetPageRequest,
    ) -> Result<PagestreamBeMessage> {
-        let _enter = info_span!("get_page", rel = %req.rel, blkno = &req.blkno, req_lsn = %req.lsn)
-            .entered();
        let tag = RelishTag::Relation(req.rel);
        let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest)?;

@@ -451,25 +414,17 @@ impl PageServerHandler {
        lsn: Option<Lsn>,
        tenantid: ZTenantId,
    ) -> anyhow::Result<()> {
-        let span = info_span!("basebackup", timeline = %timelineid, tenant = %tenantid, lsn = field::Empty);
-        let _enter = span.enter();
-
        // check that the timeline exists
        let timeline = tenant_mgr::get_timeline_for_tenant(tenantid, timelineid)?;
-        if let Some(lsn) = lsn {
-            timeline
-                .check_lsn_is_in_scope(lsn)
-                .context("invalid basebackup lsn")?;
-        }

-        // switch client to COPYOUT
+        /* switch client to COPYOUT */
        pgb.write_message(&BeMessage::CopyOutResponse)?;
+        info!("sent CopyOut");

        /* Send a tarball of the latest layer on the timeline */
        {
            let mut writer = CopyDataSink { pgb };
            let mut basebackup = basebackup::Basebackup::new(&mut writer, &timeline, lsn)?;
-            span.record("lsn", &basebackup.lsn.to_string().as_str());
            basebackup.send_tarball()?;
        }
        pgb.write_message(&BeMessage::CopyDone)?;
@@ -574,6 +529,11 @@ impl postgres_backend::Handler for PageServerHandler {
                None
            };

+            info!(
+                "got basebackup command. tenantid=\"{}\" timelineid=\"{}\" lsn=\"{:#?}\"",
+                tenantid, timelineid, lsn
+            );
+
            // Check that the timeline exists
            self.handle_basebackup_request(pgb, timelineid, lsn, tenantid)?;
            pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
@@ -591,9 +551,6 @@ impl postgres_backend::Handler for PageServerHandler {

            self.check_permission(Some(tenantid))?;

-            let _enter =
-                info_span!("callmemaybe", timeline = %timelineid, tenant = %tenantid).entered();
-
            // Check that the timeline exists
            tenant_mgr::get_timeline_for_tenant(tenantid, timelineid)?;

@@ -616,9 +573,6 @@ impl postgres_backend::Handler for PageServerHandler {

            self.check_permission(Some(tenantid))?;

-            let _enter =
-                info_span!("branch_create", name = %branchname, tenant = %tenantid).entered();
-
            let branch =
                branches::create_branch(self.conf, &branchname, &startpoint_str, &tenantid)?;
            let branch = serde_json::to_vec(&branch)?;
@@ -635,16 +589,14 @@ impl postgres_backend::Handler for PageServerHandler {

            let tenantid = ZTenantId::from_str(caps.get(1).unwrap().as_str())?;

-            // since these handlers for tenant/branch commands are deprecated (in favor of http based ones)
-            // just use false in place of include non incremental logical size
-            let branches = crate::branches::get_branches(self.conf, &tenantid, false)?;
+            let branches = crate::branches::get_branches(self.conf, &tenantid)?;
            let branches_buf = serde_json::to_vec(&branches)?;

            pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?
                .write_message_noflush(&BeMessage::DataRow(&[Some(&branches_buf)]))?
                .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
        } else if query_string.starts_with("tenant_list") {
-            let tenants = crate::tenant_mgr::list_tenants()?;
+            let tenants = crate::branches::get_tenants(self.conf)?;
            let tenants_buf = serde_json::to_vec(&tenants)?;

            pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?
@@ -696,7 +648,9 @@ impl postgres_backend::Handler for PageServerHandler {
                .unwrap_or(Ok(self.conf.gc_horizon))?;

            let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
+
            let result = repo.gc_iteration(Some(timelineid), gc_horizon, true)?;
+
            pgb.write_message_noflush(&BeMessage::RowDescription(&[
                RowDescriptor::int8_col(b"layer_relfiles_total"),
                RowDescriptor::int8_col(b"layer_relfiles_needed_by_cutoff"),
--- a/pageserver/src/relish_storage.rs
+++ b/pageserver/src/relish_storage.rs
@@ -0,0 +1,54 @@
+//! Abstractions for the page server to store its relish layer data in the external storage.
+//!
+//! Main purpose of this module subtree is to provide a set of abstractions to manage the storage state
+//! in a way, optimal for page server.
+//!
+//! The abstractions hide multiple custom external storage API implementations,
+//! such as AWS S3, local filesystem, etc., located in the submodules.
+
+mod local_fs;
+mod rust_s3;
+/// A queue and the background machinery behind it to upload
+/// local page server layer files to external storage.
+pub mod storage_uploader;
+
+use std::path::Path;
+
+use anyhow::Context;
+
+/// Storage (potentially remote) API to manage its state.
+#[async_trait::async_trait]
+pub trait RelishStorage: Send + Sync {
+    type RelishStoragePath; // how a relish is addressed inside the particular storage
+    /// Derives the storage path of a relish from the page server workdir and the relish local path.
+    fn derive_destination(
+        page_server_workdir: &Path,
+        relish_local_path: &Path,
+    ) -> anyhow::Result<Self::RelishStoragePath>;
+    /// Lists the storage paths of the relishes currently held in the storage.
+    async fn list_relishes(&self) -> anyhow::Result<Vec<Self::RelishStoragePath>>;
+    /// Downloads the relish stored at `from` into the local path `to`.
+    async fn download_relish(
+        &self,
+        from: &Self::RelishStoragePath,
+        to: &Path,
+    ) -> anyhow::Result<()>;
+    /// Deletes the relish stored at `path`.
+    async fn delete_relish(&self, path: &Self::RelishStoragePath) -> anyhow::Result<()>;
+    /// Uploads the local file `from` to the storage path `to`.
+    async fn upload_relish(&self, from: &Path, to: &Self::RelishStoragePath) -> anyhow::Result<()>;
+}
+
+fn strip_workspace_prefix<'a>(
+    page_server_workdir: &'a Path,
+    relish_local_path: &'a Path,
+) -> anyhow::Result<&'a Path> {
+    // The relish path relative to the workdir; an error if it does not start with the workdir.
+    let relative = relish_local_path.strip_prefix(page_server_workdir);
+    relative.with_context(|| {
+        format!(
+            "Unexpected: relish local path '{}' is not relevant to server workdir",
+            relish_local_path.display(),
+        )
+    })
+}
--- a/pageserver/src/relish_storage/local_fs.rs
+++ b/pageserver/src/relish_storage/local_fs.rs
@@ -0,0 +1,158 @@
+//! Local filesystem relish storage.
+//!
+//! Page server already stores layer data on the server, when freezing it.
+//! This storage serves a way to
+//!
+//! * test things locally simply
+//! * allow to compare both binary sets
+//! * help validating the relish storage API
+
+use std::{
+    future::Future,
+    path::{Path, PathBuf},
+    pin::Pin,
+};
+
+use anyhow::{bail, Context};
+
+use super::{strip_workspace_prefix, RelishStorage};
+
+pub struct LocalFs {
+    root: PathBuf, // directory that holds all the files of this storage
+}
+
+impl LocalFs {
+    /// Attempts to create local FS relish storage, also creates the directory provided, if not exists.
+    pub fn new(root: PathBuf) -> anyhow::Result<Self> {
+        if !root.exists() {
+            std::fs::create_dir_all(&root).with_context(|| {
+                format!(
+                    "Failed to create all directories in the given root path {}",
+                    root.display(),
+                )
+            })?;
+        }
+        Ok(Self { root })
+    }
+    /// Resolves `path` against the root: relative paths are joined to it, absolute ones must start with it.
+    fn resolve_in_storage(&self, path: &Path) -> anyhow::Result<PathBuf> {
+        if path.is_relative() {
+            Ok(self.root.join(path))
+        } else if path.starts_with(&self.root) {
+            Ok(path.to_path_buf())
+        } else {
+            bail!(
+                "Path '{}' does not belong to the current storage",
+                path.display()
+            )
+        }
+    }
+}
+
+#[async_trait::async_trait]
+impl RelishStorage for LocalFs {
+    type RelishStoragePath = PathBuf;
+
+    /// The storage path of a relish is its local path relative to the page server workdir.
+    fn derive_destination(
+        page_server_workdir: &Path,
+        relish_local_path: &Path,
+    ) -> anyhow::Result<Self::RelishStoragePath> {
+        let relative_path = strip_workspace_prefix(page_server_workdir, relish_local_path)?;
+        Ok(relative_path.to_path_buf())
+    }
+
+    /// Lists the files found under the storage root, as collected by `get_all_files`.
+    async fn list_relishes(&self) -> anyhow::Result<Vec<Self::RelishStoragePath>> {
+        let stored_files = get_all_files(&self.root).await?;
+        Ok(stored_files.into_iter().collect())
+    }
+
+    /// Copies the stored file `from` to the local path `to`, creating the parent directory of `to`.
+    async fn download_relish(
+        &self,
+        from: &Self::RelishStoragePath,
+        to: &Path,
+    ) -> anyhow::Result<()> {
+        let stored_file = self.resolve_in_storage(from)?;
+        if !stored_file.exists() || !stored_file.is_file() {
+            bail!(
+                "File '{}' either does not exist or is not a file",
+                stored_file.display()
+            )
+        }
+        create_target_directory(to).await?;
+        tokio::fs::copy(stored_file, to).await?;
+        Ok(())
+    }
+
+    /// Removes the stored file `path`; anything that is not an existing file is an error.
+    async fn delete_relish(&self, path: &Self::RelishStoragePath) -> anyhow::Result<()> {
+        let stored_file = self.resolve_in_storage(path)?;
+        if !stored_file.exists() || !stored_file.is_file() {
+            bail!(
+                "File '{}' either does not exist or is not a file",
+                stored_file.display()
+            )
+        }
+        tokio::fs::remove_file(stored_file).await?;
+        Ok(())
+    }
+
+    /// Copies the local file `from` into the storage at `to`, creating the missing parent directory.
+    async fn upload_relish(&self, from: &Path, to: &Self::RelishStoragePath) -> anyhow::Result<()> {
+        let storage_file = self.resolve_in_storage(to)?;
+        create_target_directory(&storage_file).await?;
+        let copy_result = tokio::fs::copy(&from, &storage_file).await;
+        let upload_context = || format!("Failed to upload relish '{}' to local storage", from.display());
+        copy_result.with_context(upload_context)?;
+        Ok(())
+    }
+}
+
+/// Recursively collects the paths of all files under `directory_path`, skipping symlinks;
+/// a missing path yields an empty list, an existing non-directory path is an error.
+fn get_all_files<'a, P: AsRef<Path> + Send + Sync + 'a>(
+    directory_path: P,
+) -> Pin<Box<dyn Future<Output = anyhow::Result<Vec<PathBuf>>> + Send + Sync + 'a>> {
+    // Boxed, since an async function cannot call itself recursively without indirection.
+    Box::pin(async move {
+        let directory_path = directory_path.as_ref();
+        if directory_path.exists() {
+            if directory_path.is_dir() {
+                let mut paths = Vec::new();
+                let mut dir_contents = tokio::fs::read_dir(directory_path).await?;
+                while let Some(dir_entry) = dir_contents.next_entry().await? {
+                    let file_type = dir_entry.file_type().await?;
+                    let entry_path = dir_entry.path();
+                    if file_type.is_symlink() {
+                        log::debug!("{:?} is a symlink, skipping", entry_path)
+                    } else if file_type.is_dir() {
+                        paths.extend(get_all_files(entry_path).await?.into_iter())
+                    } else {
+                        paths.push(entry_path);
+                    }
+                }
+                Ok(paths)
+            } else {
+                bail!("Path '{}' is not a directory", directory_path.display())
+            }
+        } else {
+            Ok(Vec::new())
+        }
+    })
+}
+
+/// Ensures that the directory which will contain `target_file_path` exists.
+async fn create_target_directory(target_file_path: &Path) -> anyhow::Result<()> {
+    let parent_dir = target_file_path.parent().ok_or_else(|| {
+        anyhow::anyhow!(
+            "Relish path '{}' has no parent directory",
+            target_file_path.display()
+        )
+    })?;
+    if parent_dir.exists() {
+        return Ok(());
+    }
+    Ok(tokio::fs::create_dir_all(parent_dir).await?)
+}
--- a/pageserver/src/relish_storage/rust_s3.rs
+++ b/pageserver/src/relish_storage/rust_s3.rs
@@ -0,0 +1,144 @@
+//! A wrapper around AWS S3 client library `rust_s3` to be used as a relish storage.
+
+use std::path::Path;
+
+use anyhow::Context;
+use s3::{bucket::Bucket, creds::Credentials, region::Region};
+
+use crate::{relish_storage::strip_workspace_prefix, S3Config};
+
+use super::RelishStorage;
+
+const S3_FILE_SEPARATOR: char = '/';
+
+#[derive(Debug)]
+pub struct S3ObjectKey(String); // newtype over the raw S3 object key string
+
+impl S3ObjectKey {
+    fn key(&self) -> &str {
+        &self.0 // the wrapped key, borrowed
+    }
+}
+
+/// Relish storage that keeps its data in a single AWS S3 bucket.
+pub struct RustS3 {
+    bucket: Bucket,
+}
+
+impl RustS3 {
+    /// Builds the storage from the given S3 settings; fails when the region cannot be parsed
+    /// or when the credentials or the bucket cannot be created from the config values.
+    pub fn new(aws_config: &S3Config) -> anyhow::Result<Self> {
+        let parsed_region = aws_config.bucket_region.parse::<Region>();
+        let region = parsed_region.context("Failed to parse the s3 region from config")?;
+        let access_key_id = aws_config.access_key_id.as_deref();
+        let secret_access_key = aws_config.secret_access_key.as_deref();
+        let credentials = Credentials::new(
+            access_key_id,
+            secret_access_key,
+            None,
+            None,
+            None,
+        )
+        .context("Failed to create the s3 credentials")?;
+        let bucket = Bucket::new_with_path_style(
+            aws_config.bucket_name.as_str(),
+            region,
+            credentials,
+        )
+        .context("Failed to create the s3 bucket")?;
+        Ok(Self { bucket })
+    }
+}
+
+#[async_trait::async_trait]
+impl RelishStorage for RustS3 {
+    type RelishStoragePath = S3ObjectKey;
+    /// Builds the object key from the workdir-relative path, prefixing every segment with `/`.
+    fn derive_destination(
+        page_server_workdir: &Path,
+        relish_local_path: &Path,
+    ) -> anyhow::Result<Self::RelishStoragePath> {
+        let relative_path = strip_workspace_prefix(page_server_workdir, relish_local_path)?;
+        let mut key = String::new();
+        for segment in relative_path {
+            key.push(S3_FILE_SEPARATOR);
+            key.push_str(&segment.to_string_lossy());
+        }
+        Ok(S3ObjectKey(key))
+    }
+    /// Returns the keys of all objects reported by the bucket listing.
+    async fn list_relishes(&self) -> anyhow::Result<Vec<Self::RelishStoragePath>> {
+        let list_response = self
+            .bucket
+            .list(String::new(), None)
+            .await
+            .context("Failed to list s3 objects")?;
+
+        Ok(list_response
+            .into_iter()
+            .flat_map(|response| response.contents)
+            .map(|s3_object| S3ObjectKey(s3_object.key))
+            .collect())
+    }
+    /// Streams the object `from` into the local file `to`, which is created or truncated first.
+    async fn download_relish(
+        &self,
+        from: &Self::RelishStoragePath,
+        to: &Path,
+    ) -> anyhow::Result<()> {
+        // `File::create` opens for writing, creates a missing file and truncates stale contents;
+        // a bare `write(true)` open fails with NotFound when the download target does not exist yet.
+        let mut target_file = std::fs::File::create(to)
+            .with_context(|| format!("Failed to open target s3 destination at {}", to.display()))?;
+        let code = self
+            .bucket
+            .get_object_stream(from.key(), &mut target_file)
+            .await
+            .with_context(|| format!("Failed to download s3 object with key {}", from.key()))?;
+        if code != 200 {
+            Err(anyhow::format_err!(
+                "Received non-200 exit code during downloading object from directory, code: {}",
+                code
+            ))
+        } else {
+            Ok(())
+        }
+    }
+    /// Deletes the object `path`; both 200 and 204 (the usual S3 delete reply) count as success.
+    async fn delete_relish(&self, path: &Self::RelishStoragePath) -> anyhow::Result<()> {
+        let (_, code) = self
+            .bucket
+            .delete_object(path.key())
+            .await
+            .with_context(|| format!("Failed to delete s3 object with key {}", path.key()))?;
+        if code != 200 && code != 204 {
+            Err(anyhow::format_err!(
+                "Received unexpected exit code during deleting object with key '{}', code: {}",
+                path.key(),
+                code
+            ))
+        } else {
+            Ok(())
+        }
+    }
+    /// Streams the local file `from` into the object stored under the key `to`.
+    async fn upload_relish(&self, from: &Path, to: &Self::RelishStoragePath) -> anyhow::Result<()> {
+        let mut local_file = tokio::fs::OpenOptions::new().read(true).open(from).await?;
+
+        let code = self
+            .bucket
+            .put_object_stream(&mut local_file, to.key())
+            .await
+            .with_context(|| format!("Failed to create s3 object with key {}", to.key()))?;
+        if code != 200 {
+            Err(anyhow::format_err!(
+                "Received non-200 exit code during creating object with key '{}', code: {}",
+                to.key(),
+                code
+            ))
+        } else {
+            Ok(())
+        }
+    }
+}
--- a/pageserver/src/relish_storage/storage_uploader.rs
+++ b/pageserver/src/relish_storage/storage_uploader.rs
@@ -0,0 +1,116 @@
+use std::{
+    collections::VecDeque,
+    path::{Path, PathBuf},
+    sync::{Arc, Mutex},
+    thread,
+};
+
+use zenith_utils::zid::ZTimelineId;
+
+use crate::{relish_storage::RelishStorage, RelishStorageConfig};
+
+use super::{local_fs::LocalFs, rust_s3::RustS3};
+
+/// Uploader that keeps an in-memory queue of relish files and a background thread draining it.
+pub struct QueueBasedRelishUploader {
+    upload_queue: Arc<Mutex<VecDeque<(ZTimelineId, PathBuf)>>>,
+}
+
+impl QueueBasedRelishUploader {
+    /// Creates an empty queue and spawns the upload thread for the storage kind chosen in `config`.
+    pub fn new(
+        config: &RelishStorageConfig,
+        page_server_workdir: &'static Path,
+    ) -> anyhow::Result<Self> {
+        let upload_queue = Arc::new(Mutex::new(VecDeque::new()));
+        let thread_queue = Arc::clone(&upload_queue);
+        // The join handle is not kept: the upload thread runs detached.
+        let _handle = match config {
+            RelishStorageConfig::LocalFs(root) => {
+                create_upload_thread(
+                    thread_queue,
+                    LocalFs::new(root.clone())?,
+                    page_server_workdir,
+                )?
+            }
+            RelishStorageConfig::AwsS3(s3_config) => {
+                create_upload_thread(
+                    thread_queue,
+                    RustS3::new(s3_config)?,
+                    page_server_workdir,
+                )?
+            }
+        };
+        Ok(Self { upload_queue })
+    }
+
+    /// Appends the relish file to the end of the upload queue.
+    pub fn schedule_upload(&self, timeline_id: ZTimelineId, relish_path: PathBuf) {
+        let mut queue = self.upload_queue.lock().unwrap();
+        queue.push_back((timeline_id, relish_path))
+    }
+}
+
+/// Spawns a named thread that drives `upload_loop_step` in a loop on its own single-threaded
+/// tokio runtime; returns the thread handle, or the I/O error of runtime/thread creation.
+fn create_upload_thread<P, S: 'static + RelishStorage<RelishStoragePath = P>>(
+    upload_queue: Arc<Mutex<VecDeque<(ZTimelineId, PathBuf)>>>,
+    relish_storage: S,
+    page_server_workdir: &'static Path,
+) -> std::io::Result<thread::JoinHandle<()>> {
+    let mut runtime_builder = tokio::runtime::Builder::new_current_thread();
+    let runtime = runtime_builder.enable_all().build()?;
+    let uploader_thread = thread::Builder::new().name("Queue based relish uploader".to_string());
+    // The closure loops without a break condition, running one upload step per iteration.
+    uploader_thread.spawn(move || loop {
+        let step = upload_loop_step(&upload_queue, &relish_storage, page_server_workdir);
+        runtime.block_on(step);
+    })
+}
+
+/// Runs one iteration of the upload loop: takes the next queued relish and uploads it,
+/// putting it back to the end of the queue if the upload fails.
+async fn upload_loop_step<P, S: 'static + RelishStorage<RelishStoragePath = P>>(
+    upload_queue: &Mutex<VecDeque<(ZTimelineId, PathBuf)>>,
+    relish_storage: &S,
+    page_server_workdir: &Path,
+) {
+    let next_upload = {
+        // The guard lives only inside this block, so the queue is unlocked during the upload.
+        let mut queue = upload_queue.lock().unwrap();
+        log::debug!("current upload queue length: {}", queue.len());
+        queue.pop_front()
+    };
+    let (relish_timeline_id, relish_local_path) = if let Some(data) = next_upload {
+        data
+    } else {
+        // Don't spin and allow others to use the queue.
+        // In future, could be improved to be more clever about delays depending on relish upload stats
+        thread::sleep(std::time::Duration::from_secs(1));
+        return;
+    };
+    match upload_relish(relish_storage, page_server_workdir, &relish_local_path).await {
+        Ok(()) => log::debug!("Relish successfully uploaded"),
+        Err(e) => {
+            log::error!(
+                "Failed to upload relish '{}' for timeline {}, reason: {}",
+                relish_local_path.display(),
+                relish_timeline_id,
+                e
+            );
+            let mut queue = upload_queue.lock().unwrap();
+            queue.push_back((relish_timeline_id, relish_local_path));
+        }
+    }
+}
+
+/// Uploads a single local relish file to the destination `S` derives for it in the storage.
+async fn upload_relish<P, S: RelishStorage<RelishStoragePath = P>>(
+    relish_storage: &S,
+    page_server_workdir: &Path,
+    relish_local_path: &Path,
+) -> anyhow::Result<()> {
+    let upload_target = S::derive_destination(page_server_workdir, relish_local_path)?;
+    let upload = relish_storage.upload_relish(relish_local_path, &upload_target);
+    upload.await
+}
--- a/pageserver/src/remote_storage.rs
+++ b/pageserver/src/remote_storage.rs
@@ -1,182 +0,0 @@
-//! A set of generic storage abstractions for the page server to use when backing up and restoring its state from the external storage.
-//! This particular module serves as a public API border between pageserver and the internal storage machinery.
-//! No other modules from this tree are supposed to be used directly by the external code.
-//!
-//! There are a few components the storage machinery consists of:
-//! * [`RemoteStorage`] trait, a CRUD-like generic abstraction to use for adapting external storages, with a few implementations:
-//!     * [`local_fs`] allows to use local file system as an external storage
-//!     * [`rust_s3`] uses AWS S3 bucket entirely as an external storage
-//!
-//! * synchronization logic at [`storage_sync`] module that keeps pageserver state (both runtime one and the workdir files) and storage state in sync.
-//!
-//! * public API to interact with the external world: [`run_storage_sync_thread`] and [`schedule_timeline_checkpoint_upload`]
-//!
-//! Here's a schematic overview of all interactions backup and the rest of the pageserver perform:
-//!
-//! +------------------------+                                    +--------->-------+
-//! |                        |  - - - (init async loop) - - - ->  |                 |
-//! |                        |                                    |                 |
-//! |                        |  ------------------------------->  |      async      |
-//! |       pageserver       |    (schedule checkpoint upload)    | upload/download |
-//! |                        |                                    |      loop       |
-//! |                        |  <-------------------------------  |                 |
-//! |                        |    (register downloaded layers)    |                 |
-//! +------------------------+                                    +---------<-------+
-//!                                                                         |
-//!                                                                         |
-//!                                          CRUD layer file operations     |
-//!                                     (upload/download/delete/list, etc.) |
-//!                                                                         V
-//!                                                            +------------------------+
-//!                                                            |                        |
-//!                                                            | [`RemoteStorage`] impl |
-//!                                                            |                        |
-//!                                                            | pageserver assumes it  |
-//!                                                            | owns exclusive write   |
-//!                                                            | access to this storage |
-//!                                                            +------------------------+
-//!
-//! First, during startup, the pageserver inits the storage sync thread with the async loop, or leaves the loop uninitialised, if configured so.
-//! Some time later, during pageserver checkpoints, in-memory data is flushed onto disk along with its metadata.
-//! If the storage sync loop was successfully started before, pageserver schedules the new image uploads after every checkpoint.
-//! See [`crate::layered_repository`] for the upload calls and the adjacent logic.
-//!
-//! The storage logic considers `image` as a set of local files, fully representing a certain timeline at given moment (identified with `disk_consistent_lsn`).
-//! Timeline can change its state, by adding more files on disk and advancing its `disk_consistent_lsn`: this happens after pageserver checkpointing and is followed
-//! by the storage upload, if enabled.
-//! When a certain image gets uploaded, the sync loop remembers the fact, preventing further reuploads of the same image state.
-//! No files are deleted from either local or remote storage, only the missing ones locally/remotely get downloaded/uploaded, local metadata file will be overwritten
-//! when the newer timeline is downloaded.
-//!
-//! Meanwhile, the loop inits the storage connection and checks the remote files stored.
-//! This is done once at startup only, relying on the fact that pageserver uses the storage alone (ergo, nobody else uploads the files to the storage but this server).
-//! Based on the remote image data, the storage sync logic queues image downloads, while accepting any potential upload tasks from pageserver and managing the tasks by their priority.
-//! On the image download, a [`crate::tenant_mgr::register_relish_download`] function is called to register the new image in pageserver, initializing all related threads and internal state.
-//!
-//! When the pageserver terminates, the upload loop finishes a current image sync task (if any) and exits.
-//!
-//! NOTES:
-//! * pageserver assumes it has exclusive write access to the remote storage. Multiple pageservers can be separated in the same storage, if the storage supports that
-//! (i.e. using different directories in the local filesystem external storage), but this is totally up to the storage implementation and is not covered by the trait API.
-//!
-//! * the uploads do not happen right after pageserver startup, they are registered when
-//!     1. pageserver does the checkpoint, which happens further in the future after the server start
-//!     2. pageserver loads the timeline from disk for the first time
-//!
-//! * the uploads do not happen right after the upload registration: the sync loop might be occupied with other tasks, or tasks with bigger priority could be waiting already
-//!
-//! * all synchronization tasks (including the public API to register uploads and downloads and the sync queue management) happen on an image scale: a big set of remote files,
-//! enough to represent (and recover, if needed) a certain timeline state. On the contrary, all internal storage CRUD calls are made per relish file from those images.
-//! This way, the synchronization is able to download the image partially, if some state was synced before, but exposes correctly synced images only.
-
-mod local_fs;
-mod rust_s3;
-mod storage_sync;
-
-use std::{
-    path::{Path, PathBuf},
-    thread,
-};
-
-use anyhow::Context;
-use tokio::io;
-use zenith_utils::zid::{ZTenantId, ZTimelineId};
-
-pub use self::storage_sync::schedule_timeline_checkpoint_upload;
-use self::{local_fs::LocalFs, rust_s3::S3};
-use crate::{PageServerConf, RemoteStorageKind};
-
-/// Identifies a timeline in the sync processes: a pair of the tenant id and the timeline id.
-///
-/// Any timeline has its own id and its own tenant it belongs to,
-/// the sync processes group timelines by both for simplicity.
-#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Hash)]
-pub struct TimelineSyncId(ZTenantId, ZTimelineId);
-
-/// Starts a separate thread that keeps the pageserver and the remote storage in sync with each other,
-/// creating the storage (local FS or AWS S3 one) that the config asks for.
-/// If the config has no remote storage configuration, no thread or storage initialization is done and `Ok(None)` is returned.
-pub fn run_storage_sync_thread(
-    config: &'static PageServerConf,
-) -> anyhow::Result<Option<thread::JoinHandle<anyhow::Result<()>>>> {
-    let storage_config = match &config.remote_storage_config {
-        Some(storage_config) => storage_config,
-        // Nothing to sync with.
-        None => return Ok(None),
-    };
-    let max_sync_errors = storage_config.max_sync_errors;
-    let max_concurrent_sync = storage_config.max_concurrent_sync;
-
-    let sync_thread_handle = match &storage_config.storage {
-        RemoteStorageKind::AwsS3(s3_config) => {
-            let s3_storage = S3::new(s3_config, &config.workdir)?;
-            storage_sync::spawn_storage_sync_thread(
-                config,
-                s3_storage,
-                max_concurrent_sync,
-                max_sync_errors,
-            )
-        }
-        RemoteStorageKind::LocalFs(root) => {
-            let local_fs_storage = LocalFs::new(root.clone(), &config.workdir)?;
-            storage_sync::spawn_storage_sync_thread(
-                config,
-                local_fs_storage,
-                max_concurrent_sync,
-                max_sync_errors,
-            )
-        }
-    };
-    sync_thread_handle.map(Some)
-}
-
-/// Storage (potentially remote) API to manage its state.
-/// This storage tries to be unaware of any layered repository context,
-/// providing basic CRUD operations with storage files.
-#[async_trait::async_trait]
-trait RemoteStorage: Send + Sync {
-    /// A way to uniquely reference a file in the remote storage.
-    type StoragePath;
-
-    /// Attempts to derive the storage path out of the local path, if the latter is correct.
-    fn storage_path(&self, local_path: &Path) -> anyhow::Result<Self::StoragePath>;
-
-    /// Gets the download path of the given storage file.
-    fn local_path(&self, storage_path: &Self::StoragePath) -> anyhow::Result<PathBuf>;
-
-    /// Lists all items the storage has right now.
-    async fn list(&self) -> anyhow::Result<Vec<Self::StoragePath>>;
-
-    /// Streams the contents of the `from` reader into the remote storage entry at the `to` path.
-    async fn upload(
-        &self,
-        from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
-        to: &Self::StoragePath,
-    ) -> anyhow::Result<()>;
-
-    /// Streams the remote storage entry contents into the writer given.
-    async fn download(
-        &self,
-        from: &Self::StoragePath,
-        to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
-    ) -> anyhow::Result<()>;
-
-    /// Streams a given byte range of the remote storage entry contents into the writer given.
-    /// The range starts at the `start_inclusive` byte and ends right before the `end_exclusive` one.
-    /// NOTE(review): `None` for the range end presumably means "till the end of the entry" (that's what `LocalFs` does) — confirm for the other implementations.
-    async fn download_range(
-        &self,
-        from: &Self::StoragePath,
-        start_inclusive: u64,
-        end_exclusive: Option<u64>,
-        to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
-    ) -> anyhow::Result<()>;
-
-    /// Removes the entry with the given path from the storage.
-    async fn delete(&self, path: &Self::StoragePath) -> anyhow::Result<()>;
-}
-
-/// Strips the `prefix` from the beginning of the `path`, returning the remaining part of the path.
-/// Fails if the path does not start with the prefix or if both are equal.
-fn strip_path_prefix<'a>(prefix: &'a Path, path: &'a Path) -> anyhow::Result<&'a Path> {
-    if prefix == path {
-        anyhow::bail!(
-            "Prefix and the path are equal, cannot strip: '{}'",
-            prefix.display()
-        );
-    }
-
-    let path_without_prefix = path.strip_prefix(prefix).with_context(|| {
-        format!(
-            "Path '{}' is not prefixed with '{}'",
-            path.display(),
-            prefix.display(),
-        )
-    })?;
-    Ok(path_without_prefix)
-}
--- a/pageserver/src/remote_storage/README.md
+++ b/pageserver/src/remote_storage/README.md
@@ -1,75 +0,0 @@
-# Non-implementation details
-
-This document describes the current state of the backup system in pageserver, existing limitations and concerns, why some things are done the way they are, and the future development plans.
-Detailed description on how the synchronization works and how it fits into the rest of the pageserver can be found in the [storage module](./../remote_storage.rs) and its submodules.
-Ideally, this document should disappear after current implementation concerns are mitigated, with the remaining useful knowledge bits moved into rustdocs.
-
-## Approach
-
-Backup functionality is a new component, appeared way after the core DB functionality was implemented.
-Pageserver layer functionality is also quite volatile at the moment, there's a risk its local file management changes over time.
-
-To avoid adding more chaos into that, backup functionality is currently designed as a relatively standalone component, with the majority of its logic placed in a standalone async loop.
-This way, the backups are managed in background, not affecting directly other pageserver parts: this way the backup and restoration process may lag behind, but eventually keep up with the reality. To track that, a set of prometheus metrics is exposed from pageserver.
-
-## What's done
-
-Current implementation
-* provides remote storage wrappers for AWS S3 and local FS
-* uploads layers, frozen by pageserver checkpoint thread
-* downloads and registers layers, found on the remote storage, but missing locally
-
-No good optimisations or performance testing is done, the feature is disabled by default and gets polished over time.
-It's planned to deal with all questions that are currently on and prepare the feature to be enabled by default in cloud environments.
-
-### Peculiarities
-
-As mentioned, the backup component is rather new and under development currently, so not all things are done properly from the start.
-Here's the list of known compromises with comments:
-
-* Remote storage model is the same as the `tenants/` directory contents of the pageserver's local workdir storage.
-This is relatively simple to implement, but may be costly to use in AWS S3: an initial data image contains ~782 relish files and a metadata file, ~31 MB combined.
-AWS charges both per API call and for traffic, and layers are expected to be updated frequently, so this model is most probably inefficient.
-Additionally, pageservers might need to migrate images between tenants, which does not improve the situation.
-
-Storage sync API operates on images when backing up or restoring a backup, so we're free to repack the layer contents the way we want to, which most probably will be done later.
-
-* no proper file comparison
-
-Currently, every layer contains `Lsn` in their name, to map the data it holds against a certain DB state.
-Then the images with same ids and different `Lsn`'s are compared, files are considered equal if their local file paths are equal (for remote files, "local file path" is their download destination).
-No file contents assertion is done currently, but should be.
-AWS S3 returns file checksums during the `list` operation, so that can be used to ensure the backup consistency, but that needs further research, since the current pageserver impl would also need to deal with layer file checksums.
-
-For now, due to this, we consider local workdir files as source of truth, not removing them ever and adjusting remote files instead, if image files mismatch.
-
-* sad rust-s3 api
-
-rust-s3 is not very pleasant to use:
-1. it returns `anyhow::Result` and it's hard to distinguish "missing file" cases from "no connection" one, for instance
-2. at least one function in its API that we need (`get_object_stream`) has `async` keyword and blocks (!), see details [here](https://github.com/zenithdb/zenith/pull/752#discussion_r728373091)
-3. it's a prerelease library with unclear maintenance status
-4. noisy on debug level
-
-But it's already used in the project, so for now it's reused to avoid bloating the dependency tree.
-Based on previous evaluation, even `rusoto-s3` could be a better choice over this library, but needs further benchmarking.
-
-
-* gc and branches are ignored
-
-So far, we don't consider non-main images and don't adjust the remote storage based on GC thread loop results.
-Only checkpointer loop affects the remote storage.
-
-* more layers should be downloaded on demand
-
-Since we download and load remote layers into pageserver, there's a possibility that a need for those layers' ancestors arises.
-Most probably, every downloaded image's ancestor is not present locally either, but currently there's no logic for downloading such ancestors and their metadata,
-so the pageserver is unable to respond properly to requests for such ancestors.
-
-To implement the downloading, more `tenant_mgr` refactoring is needed to properly handle web requests for layers and handle the state changes.
-[Here](https://github.com/zenithdb/zenith/pull/689#issuecomment-931216193) are the details about initial state management updates needed.
-
-* no IT tests
-
-Automated S3 testing is lacking currently, due to no convenient way to enable backups during the tests.
-After it's fixed, benchmark runs should also be carried out to find bottlenecks.
--- a/pageserver/src/remote_storage/local_fs.rs
+++ b/pageserver/src/remote_storage/local_fs.rs
@@ -1,689 +0,0 @@
-//! Local filesystem acting as a remote storage.
-//! Multiple pageservers can use the same "storage" of this kind by using different storage roots.
-//!
-//! This storage used in pageserver tests, but can also be used in cases when a certain persistent
-//! volume is mounted to the local FS.
-
-use std::{
-    future::Future,
-    path::{Path, PathBuf},
-    pin::Pin,
-};
-
-use anyhow::{bail, ensure, Context};
-use tokio::{
-    fs,
-    io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt},
-};
-use tracing::*;
-
-use super::{strip_path_prefix, RemoteStorage};
-
-/// Storage implementation that keeps its files in a directory of the local file system.
-pub struct LocalFs {
-    /// Pageserver working directory: this prefix is replaced with the storage root to map local paths to the storage ones and back.
-    pageserver_workdir: &'static Path,
-    /// Directory that all paths of this storage are resolved against.
-    root: PathBuf,
-}
-
-impl LocalFs {
-    /// Creates the local FS storage with the given root, creating the root directory (and all its parents) if the path does not exist yet.
-    pub fn new(root: PathBuf, pageserver_workdir: &'static Path) -> anyhow::Result<Self> {
-        let root_is_missing = !root.exists();
-        if root_is_missing {
-            let creation_result = std::fs::create_dir_all(&root);
-            creation_result.with_context(|| {
-                format!(
-                    "Failed to create all directories in the given root path '{}'",
-                    root.display(),
-                )
-            })?;
-        }
-
-        Ok(Self {
-            root,
-            pageserver_workdir,
-        })
-    }
-
-    /// Resolves the path inside the storage: relative paths are joined to the storage root,
-    /// other paths are returned as is if they start with the root and rejected otherwise.
-    fn resolve_in_storage(&self, path: &Path) -> anyhow::Result<PathBuf> {
-        if path.is_relative() {
-            return Ok(self.root.join(path));
-        }
-
-        if !path.starts_with(&self.root) {
-            bail!(
-                "Path '{}' does not belong to the current storage",
-                path.display()
-            )
-        }
-        Ok(path.to_path_buf())
-    }
-}
-
-#[async_trait::async_trait]
-impl RemoteStorage for LocalFs {
-    type StoragePath = PathBuf;
-
-    /// Maps a path inside the pageserver workdir to the path with the same relative part under the storage root.
-    /// Fails if the path is not located inside the workdir or is the workdir itself.
-    fn storage_path(&self, local_path: &Path) -> anyhow::Result<Self::StoragePath> {
-        Ok(self.root.join(
-            strip_path_prefix(self.pageserver_workdir, local_path)
-                .context("local path does not belong to this storage")?,
-        ))
-    }
-
-    /// Maps a path under the storage root to the path with the same relative part inside the pageserver workdir.
-    /// Fails if the path is not located under the storage root or is the root itself.
-    fn local_path(&self, storage_path: &Self::StoragePath) -> anyhow::Result<PathBuf> {
-        let relative_path = strip_path_prefix(&self.root, storage_path)
-            .context("local path does not belong to this storage")?;
-        Ok(self.pageserver_workdir.join(relative_path))
-    }
-
-    /// Recursively lists all files under the storage root, symlinks are skipped.
-    async fn list(&self) -> anyhow::Result<Vec<Self::StoragePath>> {
-        get_all_files(&self.root).await
-    }
-
-    /// Copies the reader contents into the file at the given storage path,
-    /// creating the file and its parent directories, if needed, and replacing the previous file contents, if any.
-    async fn upload(
-        &self,
-        mut from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
-        to: &Self::StoragePath,
-    ) -> anyhow::Result<()> {
-        let target_file_path = self.resolve_in_storage(to)?;
-        create_target_directory(&target_file_path).await?;
-        let mut destination = io::BufWriter::new(
-            fs::OpenOptions::new()
-                .write(true)
-                .create(true)
-                // Without the truncation, overwriting a file with shorter contents leaves the tail of the old contents in place.
-                .truncate(true)
-                .open(&target_file_path)
-                .await
-                .with_context(|| {
-                    format!(
-                        "Failed to open target fs destination at '{}'",
-                        target_file_path.display()
-                    )
-                })?,
-        );
-
-        io::copy(&mut from, &mut destination)
-            .await
-            .with_context(|| {
-                format!(
-                    "Failed to upload file to the local storage at '{}'",
-                    target_file_path.display()
-                )
-            })?;
-        // The writer is buffered: the data is not guaranteed to reach the file before the flush.
-        destination.flush().await.with_context(|| {
-            format!(
-                "Failed to upload file to the local storage at '{}'",
-                target_file_path.display()
-            )
-        })?;
-        Ok(())
-    }
-
-    /// Copies the whole contents of the storage file into the writer given and flushes the writer.
-    /// Fails if the path does not belong to the storage, does not exist or is not a file.
-    async fn download(
-        &self,
-        from: &Self::StoragePath,
-        to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
-    ) -> anyhow::Result<()> {
-        let file_path = self.resolve_in_storage(from)?;
-
-        // `is_file` returns `false` for the missing paths too.
-        if !file_path.is_file() {
-            bail!(
-                "File '{}' either does not exist or is not a file",
-                file_path.display()
-            )
-        }
-
-        let mut source = io::BufReader::new(
-            fs::OpenOptions::new()
-                .read(true)
-                .open(&file_path)
-                .await
-                .with_context(|| {
-                    format!(
-                        "Failed to open source file '{}' to use in the download",
-                        file_path.display()
-                    )
-                })?,
-        );
-        io::copy(&mut source, to).await.with_context(|| {
-            format!(
-                "Failed to download file '{}' from the local storage",
-                file_path.display()
-            )
-        })?;
-        // It's the destination that may hold unwritten data, the source is only read from.
-        to.flush().await?;
-        Ok(())
-    }
-
-    /// Copies the `[start_inclusive, end_exclusive)` byte range of the storage file into the writer given and flushes the writer.
-    /// With no range end given, everything from the range start till the end of the file is copied.
-    /// Fails if the range end is not bigger than its start, or if the path does not belong to the storage, does not exist or is not a file.
-    async fn download_range(
-        &self,
-        from: &Self::StoragePath,
-        start_inclusive: u64,
-        end_exclusive: Option<u64>,
-        to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
-    ) -> anyhow::Result<()> {
-        if let Some(end_exclusive) = end_exclusive {
-            // The end is exclusive, so `start + 1 == end` is a valid one-byte range that has to be copied as any other one.
-            ensure!(
-                end_exclusive > start_inclusive,
-                "Invalid range, start ({}) is bigger then end ({:?})",
-                start_inclusive,
-                end_exclusive
-            );
-        }
-        let file_path = self.resolve_in_storage(from)?;
-
-        // `is_file` returns `false` for the missing paths too.
-        if !file_path.is_file() {
-            bail!(
-                "File '{}' either does not exist or is not a file",
-                file_path.display()
-            )
-        }
-
-        let mut source = io::BufReader::new(
-            fs::OpenOptions::new()
-                .read(true)
-                .open(&file_path)
-                .await
-                .with_context(|| {
-                    format!(
-                        "Failed to open source file '{}' to use in the download",
-                        file_path.display()
-                    )
-                })?,
-        );
-        source
-            .seek(io::SeekFrom::Start(start_inclusive))
-            .await
-            .context("Failed to seek to the range start in a local storage file")?;
-        match end_exclusive {
-            Some(end_exclusive) => {
-                // The range is not empty due to the check above, so the subtraction cannot underflow.
-                io::copy(&mut source.take(end_exclusive - start_inclusive), to).await
-            }
-            None => io::copy(&mut source, to).await,
-        }
-        .with_context(|| {
-            format!(
-                "Failed to download file '{}' range from the local storage",
-                file_path.display()
-            )
-        })?;
-        to.flush().await?;
-        Ok(())
-    }
-
-    /// Removes the file at the given storage path.
-    /// Fails if the path does not belong to the storage, does not exist or is not a file.
-    async fn delete(&self, path: &Self::StoragePath) -> anyhow::Result<()> {
-        let file_path = self.resolve_in_storage(path)?;
-        if !file_path.is_file() {
-            bail!(
-                "File '{}' either does not exist or is not a file",
-                file_path.display()
-            )
-        }
-        Ok(fs::remove_file(file_path).await?)
-    }
-}
-
-/// Recursively collects the paths of all files in the directory given, symlinks are skipped.
-/// Returns an empty list if the path does not exist and fails if it exists, but is not a directory.
-///
-/// The function calls itself for every nested directory, hence the boxed future in the return type.
-fn get_all_files<'a, P>(
-    directory_path: P,
-) -> Pin<Box<dyn Future<Output = anyhow::Result<Vec<PathBuf>>> + Send + Sync + 'a>>
-where
-    P: AsRef<Path> + Send + Sync + 'a,
-{
-    Box::pin(async move {
-        let directory_path = directory_path.as_ref();
-        if !directory_path.exists() {
-            return Ok(Vec::new());
-        }
-        if !directory_path.is_dir() {
-            bail!("Path '{}' is not a directory", directory_path.display())
-        }
-
-        let mut paths = Vec::new();
-        let mut dir_contents = fs::read_dir(directory_path).await?;
-        while let Some(dir_entry) = dir_contents.next_entry().await? {
-            let file_type = dir_entry.file_type().await?;
-            let entry_path = dir_entry.path();
-            if file_type.is_symlink() {
-                debug!("{:?} is a symlink, skipping", entry_path)
-            } else if file_type.is_dir() {
-                paths.extend(get_all_files(entry_path).await?)
-            } else {
-                paths.push(entry_path);
-            }
-        }
-        Ok(paths)
-    })
-}
-
-/// Ensures that the parent directory of the file path given exists, creating it (with all its parents) otherwise.
-/// Fails if the file path has no parent directory.
-async fn create_target_directory(target_file_path: &Path) -> anyhow::Result<()> {
-    match target_file_path.parent() {
-        None => bail!(
-            "File path '{}' has no parent directory",
-            target_file_path.display()
-        ),
-        // Nothing to create.
-        Some(existing_dir) if existing_dir.exists() => Ok(()),
-        Some(missing_dir) => {
-            fs::create_dir_all(missing_dir).await?;
-            Ok(())
-        }
-    }
-}
-
-/// Tests for the path conversions of the local FS storage, none of them touches the storage files.
-#[cfg(test)]
-mod pure_tests {
-    use crate::{
-        layered_repository::metadata::METADATA_FILE_NAME,
-        repository::repo_harness::{RepoHarness, TIMELINE_ID},
-    };
-
-    use super::*;
-
-    /// A file inside the workdir maps to the same relative path under the storage root.
-    #[test]
-    fn storage_path_positive() -> anyhow::Result<()> {
-        let harness = RepoHarness::create("storage_path_positive")?;
-        let root = PathBuf::from("somewhere").join("else");
-        let local_fs = LocalFs {
-            root: root.clone(),
-            pageserver_workdir: &harness.conf.workdir,
-        };
-
-        let file_in_workdir = harness.timeline_path(&TIMELINE_ID).join("file_name");
-        let relative_file_path = file_in_workdir.strip_prefix(&harness.conf.workdir)?;
-        let expected_storage_path = root.join(relative_file_path);
-        let actual_storage_path = local_fs
-            .storage_path(&file_in_workdir)
-            .expect("Matching path should map to storage path normally");
-
-        assert_eq!(
-            expected_storage_path,
-            actual_storage_path,
-            "File paths from pageserver workdir should be stored in local fs storage with the same path they have relative to the workdir"
-        );
-
-        Ok(())
-    }
-
-    /// Neither the workdir itself nor the paths outside of it can be mapped to a storage path.
-    #[test]
-    fn storage_path_negatives() -> anyhow::Result<()> {
-        /// Returns the debug representation of the error that the path conversion is expected to end with.
-        #[track_caller]
-        fn storage_path_error(storage: &LocalFs, mismatching_path: &Path) -> String {
-            match storage.storage_path(mismatching_path) {
-                Err(e) => format!("{:?}", e),
-                Ok(wrong_path) => panic!(
-                    "Expected path '{}' to error, but got storage path: {:?}",
-                    mismatching_path.display(),
-                    wrong_path,
-                ),
-            }
-        }
-
-        let harness = RepoHarness::create("storage_path_negatives")?;
-        let local_fs = LocalFs {
-            root: PathBuf::from("somewhere").join("else"),
-            pageserver_workdir: &harness.conf.workdir,
-        };
-        let workdir_str = harness.conf.workdir.to_str().unwrap();
-
-        // The workdir itself has no relative part to store.
-        let workdir_error = storage_path_error(&local_fs, &harness.conf.workdir);
-        assert!(workdir_error.contains("does not belong to this storage"));
-        assert!(workdir_error.contains(workdir_str));
-
-        let path_outside_workdir = "/something/else";
-        let outside_error = storage_path_error(&local_fs, Path::new(path_outside_workdir));
-        assert!(
-            outside_error.contains(path_outside_workdir),
-            "Error should mention wrong path"
-        );
-        assert!(
-            outside_error.contains(workdir_str),
-            "Error should mention server workdir"
-        );
-        assert!(outside_error.contains("does not belong to this storage"));
-
-        Ok(())
-    }
-
-    /// Storage paths of both regular and metadata files map back to their paths inside the workdir.
-    #[test]
-    fn local_path_positive() -> anyhow::Result<()> {
-        let harness = RepoHarness::create("local_path_positive")?;
-        let root = PathBuf::from("somewhere").join("else");
-        let local_fs = LocalFs {
-            root: root.clone(),
-            pageserver_workdir: &harness.conf.workdir,
-        };
-
-        let file_name = "not a metadata";
-        let file_in_workdir = harness.timeline_path(&TIMELINE_ID).join(file_name);
-        let file_in_storage = root.join(file_in_workdir.strip_prefix(&harness.conf.workdir)?);
-        assert_eq!(
-            file_in_workdir,
-            local_fs
-                .local_path(&file_in_storage)
-                .expect("For a valid input, valid S3 info should be parsed"),
-            "Should be able to parse metadata out of the correctly named remote delta file"
-        );
-
-        let metadata_in_workdir = harness
-            .timeline_path(&TIMELINE_ID)
-            .join(METADATA_FILE_NAME);
-        let metadata_in_storage = local_fs.storage_path(&metadata_in_workdir)?;
-        assert_eq!(
-            metadata_in_workdir,
-            local_fs
-                .local_path(&metadata_in_storage)
-                .expect("For a valid input, valid local path should be parsed"),
-            "Should be able to parse metadata out of the correctly named remote metadata file"
-        );
-
-        Ok(())
-    }
-
-    /// A path outside of the storage root cannot be mapped to a local path.
-    #[test]
-    fn local_path_negatives() -> anyhow::Result<()> {
-        /// Returns the debug representation of the error that the path conversion is expected to end with.
-        #[track_caller]
-        #[allow(clippy::ptr_arg)] // have to use &PathBuf due to `storage.local_path` parameter requirements
-        fn local_path_error(storage: &LocalFs, storage_path: &PathBuf) -> String {
-            match storage.local_path(storage_path) {
-                Err(e) => format!("{:?}", e),
-                Ok(wrong_path) => panic!(
-                    "Expected local path input {:?} to cause an error, but got file path: {:?}",
-                    storage_path, wrong_path,
-                ),
-            }
-        }
-
-        let harness = RepoHarness::create("local_path_negatives")?;
-        let local_fs = LocalFs {
-            root: PathBuf::from("somewhere").join("else"),
-            pageserver_workdir: &harness.conf.workdir,
-        };
-
-        let path_outside_storage = "wrong_wrong_wrong";
-        let conversion_error = local_path_error(&local_fs, &PathBuf::from(path_outside_storage));
-        assert!(conversion_error.contains(path_outside_storage));
-
-        Ok(())
-    }
-
-    /// Converting a local path to the storage one and back results in the original path.
-    #[test]
-    fn download_destination_matches_original_path() -> anyhow::Result<()> {
-        let harness = RepoHarness::create("download_destination_matches_original_path")?;
-        let original_path = harness.timeline_path(&TIMELINE_ID).join("some name");
-
-        let local_fs = LocalFs {
-            root: PathBuf::from("somewhere").join("else"),
-            pageserver_workdir: &harness.conf.workdir,
-        };
-
-        let path_in_storage = local_fs.storage_path(&original_path)?;
-        let download_destination = local_fs.local_path(&path_in_storage)?;
-
-        assert_eq!(
-            original_path, download_destination,
-            "'original path -> storage path -> matching fs path' transformation should produce the same path as the input one for the correct path"
-        );
-
-        Ok(())
-    }
-}
-
-#[cfg(test)]
-mod fs_tests {
-    use super::*;
-    use crate::repository::repo_harness::{RepoHarness, TIMELINE_ID};
-
-    use std::io::Write;
-    use tempfile::tempdir;
-
-    /// Checks that an upload to a path outside of the storage is rejected and stores nothing,
-    /// and that every successful upload appears in the storage listing.
-    #[tokio::test]
-    async fn upload_file() -> anyhow::Result<()> {
-        let repo_harness = RepoHarness::create("upload_file")?;
-        let storage = create_storage()?;
-
-        let file_to_upload = create_file_for_upload(
-            &storage.pageserver_workdir.join("whatever"),
-            "whatever_contents",
-        )
-        .await?;
-        let path_outside_storage = PathBuf::from("/").join("somewhere").join("else");
-        match storage.upload(file_to_upload, &path_outside_storage).await {
-            Err(e) => {
-                let error_text = format!("{:?}", e);
-                assert!(error_text.contains(&path_outside_storage.display().to_string()));
-                assert!(error_text.contains("does not belong to the current storage"));
-            }
-            Ok(()) => panic!("Should not allow storing files with wrong target path"),
-        }
-        assert!(storage.list().await?.is_empty());
-
-        let first_upload = upload_dummy_file(&repo_harness, &storage, "upload_1").await?;
-        assert_eq!(
-            storage.list().await?,
-            vec![first_upload.clone()],
-            "Should list a single file after first upload"
-        );
-
-        let second_upload = upload_dummy_file(&repo_harness, &storage, "upload_2").await?;
-        assert_eq!(
-            list_files_sorted(&storage).await?,
-            vec![first_upload.clone(), second_upload.clone()],
-            "Should list a two different files after second upload"
-        );
-
-        Ok(())
-    }
-
-    fn create_storage() -> anyhow::Result<LocalFs> {
-        let pageserver_workdir = Box::leak(Box::new(tempdir()?.path().to_owned()));
-        let storage = LocalFs::new(tempdir()?.path().to_owned(), pageserver_workdir)?;
-        Ok(storage)
-    }
-
-    #[tokio::test]
-    async fn download_file() -> anyhow::Result<()> {
-        let repo_harness = RepoHarness::create("download_file")?;
-        let storage = create_storage()?;
-        let upload_name = "upload_1";
-        let upload_target = upload_dummy_file(&repo_harness, &storage, upload_name).await?;
-
-        let mut content_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
-        storage.download(&upload_target, &mut content_bytes).await?;
-        content_bytes.flush().await?;
-
-        let contents = String::from_utf8(content_bytes.into_inner().into_inner())?;
-        assert_eq!(
-            dummy_contents(upload_name),
-            contents,
-            "We should upload and download the same contents"
-        );
-
-        let non_existing_path = PathBuf::from("somewhere").join("else");
-        match storage.download(&non_existing_path, &mut io::sink()).await {
-            Ok(_) => panic!("Should not allow downloading non-existing storage files"),
-            Err(e) => {
-                let error_string = e.to_string();
-                assert!(error_string.contains("does not exist"));
-                assert!(error_string.contains(&non_existing_path.display().to_string()));
-            }
-        }
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn download_file_range_positive() -> anyhow::Result<()> {
-        let repo_harness = RepoHarness::create("download_file_range_positive")?;
-        let storage = create_storage()?;
-        let upload_name = "upload_1";
-        let upload_target = upload_dummy_file(&repo_harness, &storage, upload_name).await?;
-
-        let mut full_range_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
-        storage
-            .download_range(&upload_target, 0, None, &mut full_range_bytes)
-            .await?;
-        full_range_bytes.flush().await?;
-        assert_eq!(
-            dummy_contents(upload_name),
-            String::from_utf8(full_range_bytes.into_inner().into_inner())?,
-            "Download full range should return the whole upload"
-        );
-
-        let mut zero_range_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
-        let same_byte = 1_000_000_000;
-        storage
-            .download_range(
-                &upload_target,
-                same_byte,
-                Some(same_byte + 1), // exclusive end
-                &mut zero_range_bytes,
-            )
-            .await?;
-        zero_range_bytes.flush().await?;
-        assert!(
-            zero_range_bytes.into_inner().into_inner().is_empty(),
-            "Zero byte range should not download any part of the file"
-        );
-
-        let uploaded_bytes = dummy_contents(upload_name).into_bytes();
-        let (first_part_local, second_part_local) = uploaded_bytes.split_at(3);
-
-        let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
-        storage
-            .download_range(
-                &upload_target,
-                0,
-                Some(first_part_local.len() as u64),
-                &mut first_part_remote,
-            )
-            .await?;
-        first_part_remote.flush().await?;
-        let first_part_remote = first_part_remote.into_inner().into_inner();
-        assert_eq!(
-            first_part_local,
-            first_part_remote.as_slice(),
-            "First part bytes should be returned when requrested"
-        );
-
-        let mut second_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
-        storage
-            .download_range(
-                &upload_target,
-                first_part_local.len() as u64,
-                Some((first_part_local.len() + second_part_local.len()) as u64),
-                &mut second_part_remote,
-            )
-            .await?;
-        second_part_remote.flush().await?;
-        let second_part_remote = second_part_remote.into_inner().into_inner();
-        assert_eq!(
-            second_part_local,
-            second_part_remote.as_slice(),
-            "Second part bytes should be returned when requrested"
-        );
-
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn download_file_range_negative() -> anyhow::Result<()> {
-        let repo_harness = RepoHarness::create("download_file_range_negative")?;
-        let storage = create_storage()?;
-        let upload_name = "upload_1";
-        let upload_target = upload_dummy_file(&repo_harness, &storage, upload_name).await?;
-
-        let start = 10000;
-        let end = 234;
-        assert!(start > end, "Should test an incorrect range");
-        match storage
-            .download_range(&upload_target, start, Some(end), &mut io::sink())
-            .await
-        {
-            Ok(_) => panic!("Should not allow downloading wrong ranges"),
-            Err(e) => {
-                let error_string = e.to_string();
-                assert!(error_string.contains("Invalid range"));
-                assert!(error_string.contains(&start.to_string()));
-                assert!(error_string.contains(&end.to_string()));
-            }
-        }
-
-        let non_existing_path = PathBuf::from("somewhere").join("else");
-        match storage
-            .download_range(&non_existing_path, 1, Some(3), &mut io::sink())
-            .await
-        {
-            Ok(_) => panic!("Should not allow downloading non-existing storage file ranges"),
-            Err(e) => {
-                let error_string = e.to_string();
-                assert!(error_string.contains("does not exist"));
-                assert!(error_string.contains(&non_existing_path.display().to_string()));
-            }
-        }
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn delete_file() -> anyhow::Result<()> {
-        let repo_harness = RepoHarness::create("delete_file")?;
-        let storage = create_storage()?;
-        let upload_name = "upload_1";
-        let upload_target = upload_dummy_file(&repo_harness, &storage, upload_name).await?;
-
-        storage.delete(&upload_target).await?;
-        assert!(storage.list().await?.is_empty());
-
-        match storage.delete(&upload_target).await {
-            Ok(()) => panic!("Should not allow deleting non-existing storage files"),
-            Err(e) => {
-                let error_string = e.to_string();
-                assert!(error_string.contains("does not exist"));
-                assert!(error_string.contains(&upload_target.display().to_string()));
-            }
-        }
-        Ok(())
-    }
-
-    async fn upload_dummy_file(
-        harness: &RepoHarness,
-        storage: &LocalFs,
-        name: &str,
-    ) -> anyhow::Result<PathBuf> {
-        let timeline_path = harness.timeline_path(&TIMELINE_ID);
-        let relative_timeline_path = timeline_path.strip_prefix(&harness.conf.workdir)?;
-        let storage_path = storage.root.join(relative_timeline_path).join(name);
-        storage
-            .upload(
-                create_file_for_upload(
-                    &storage.pageserver_workdir.join(name),
-                    &dummy_contents(name),
-                )
-                .await?,
-                &storage_path,
-            )
-            .await?;
-        Ok(storage_path)
-    }
-
-    async fn create_file_for_upload(
-        path: &Path,
-        contents: &str,
-    ) -> anyhow::Result<io::BufReader<fs::File>> {
-        std::fs::create_dir_all(path.parent().unwrap())?;
-        let mut file_for_writing = std::fs::OpenOptions::new()
-            .write(true)
-            .create_new(true)
-            .open(path)?;
-        write!(file_for_writing, "{}", contents)?;
-        drop(file_for_writing);
-        Ok(io::BufReader::new(
-            fs::OpenOptions::new().read(true).open(&path).await?,
-        ))
-    }
-
-    fn dummy_contents(name: &str) -> String {
-        format!("contents for {}", name)
-    }
-
-    async fn list_files_sorted(storage: &LocalFs) -> anyhow::Result<Vec<PathBuf>> {
-        let mut files = storage.list().await?;
-        files.sort();
-        Ok(files)
-    }
-}
--- a/pageserver/src/remote_storage/rust_s3.rs
+++ b/pageserver/src/remote_storage/rust_s3.rs
@@ -1,373 +0,0 @@
-//! AWS S3 storage wrapper around `rust_s3` library.
-//! Currently does not allow multiple pageservers to use the same bucket concurrently: objects are
-//! placed in the root of the bucket.
-
-use std::path::{Path, PathBuf};
-
-use anyhow::Context;
-use s3::{bucket::Bucket, creds::Credentials, region::Region};
-use tokio::io::{self, AsyncWriteExt};
-
-use crate::{
-    remote_storage::{strip_path_prefix, RemoteStorage},
-    S3Config,
-};
-
-const S3_FILE_SEPARATOR: char = '/';
-
-#[derive(Debug, Eq, PartialEq)]
-pub struct S3ObjectKey(String);
-
-impl S3ObjectKey {
-    fn key(&self) -> &str {
-        &self.0
-    }
-
-    fn download_destination(&self, pageserver_workdir: &Path) -> PathBuf {
-        pageserver_workdir.join(self.0.split(S3_FILE_SEPARATOR).collect::<PathBuf>())
-    }
-}
-
-/// AWS S3 storage.
-pub struct S3 {
-    pageserver_workdir: &'static Path,
-    bucket: Bucket,
-}
-
-impl S3 {
-    /// Creates the storage, errors if incorrect AWS S3 configuration provided.
-    pub fn new(aws_config: &S3Config, pageserver_workdir: &'static Path) -> anyhow::Result<Self> {
-        let region = aws_config
-            .bucket_region
-            .parse::<Region>()
-            .context("Failed to parse the s3 region from config")?;
-        let credentials = Credentials::new(
-            aws_config.access_key_id.as_deref(),
-            aws_config.secret_access_key.as_deref(),
-            None,
-            None,
-            None,
-        )
-        .context("Failed to create the s3 credentials")?;
-        Ok(Self {
-            bucket: Bucket::new_with_path_style(
-                aws_config.bucket_name.as_str(),
-                region,
-                credentials,
-            )
-            .context("Failed to create the s3 bucket")?,
-            pageserver_workdir,
-        })
-    }
-}
-
-#[async_trait::async_trait]
-impl RemoteStorage for S3 {
-    type StoragePath = S3ObjectKey;
-
-    fn storage_path(&self, local_path: &Path) -> anyhow::Result<Self::StoragePath> {
-        let relative_path = strip_path_prefix(self.pageserver_workdir, local_path)?;
-        let mut key = String::new();
-        for segment in relative_path {
-            key.push(S3_FILE_SEPARATOR);
-            key.push_str(&segment.to_string_lossy());
-        }
-        Ok(S3ObjectKey(key))
-    }
-
-    fn local_path(&self, storage_path: &Self::StoragePath) -> anyhow::Result<PathBuf> {
-        Ok(storage_path.download_destination(self.pageserver_workdir))
-    }
-
-    async fn list(&self) -> anyhow::Result<Vec<Self::StoragePath>> {
-        let list_response = self
-            .bucket
-            .list(String::new(), None)
-            .await
-            .context("Failed to list s3 objects")?;
-
-        Ok(list_response
-            .into_iter()
-            .flat_map(|response| response.contents)
-            .map(|s3_object| S3ObjectKey(s3_object.key))
-            .collect())
-    }
-
-    async fn upload(
-        &self,
-        mut from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
-        to: &Self::StoragePath,
-    ) -> anyhow::Result<()> {
-        let mut upload_contents = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
-        io::copy(&mut from, &mut upload_contents)
-            .await
-            .context("Failed to read the upload contents")?;
-        upload_contents
-            .flush()
-            .await
-            .context("Failed to read the upload contents")?;
-        let upload_contents = upload_contents.into_inner().into_inner();
-
-        let (_, code) = self
-            .bucket
-            .put_object(to.key(), &upload_contents)
-            .await
-            .with_context(|| format!("Failed to create s3 object with key {}", to.key()))?;
-        if code != 200 {
-            Err(anyhow::format_err!(
-                "Received non-200 exit code during creating object with key '{}', code: {}",
-                to.key(),
-                code
-            ))
-        } else {
-            Ok(())
-        }
-    }
-
-    async fn download(
-        &self,
-        from: &Self::StoragePath,
-        to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
-    ) -> anyhow::Result<()> {
-        let (data, code) = self
-            .bucket
-            .get_object(from.key())
-            .await
-            .with_context(|| format!("Failed to download s3 object with key {}", from.key()))?;
-        if code != 200 {
-            Err(anyhow::format_err!(
-                "Received non-200 exit code during downloading object, code: {}",
-                code
-            ))
-        } else {
-            // we don't have to write vector into the destination this way, `to_write_all` would be enough.
-            // but we want to prepare for migration on `rusoto`, that has a streaming HTTP body instead here, with
-            // which it makes more sense to use `io::copy`.
-            io::copy(&mut data.as_slice(), to)
-                .await
-                .context("Failed to write downloaded data into the destination buffer")?;
-            Ok(())
-        }
-    }
-
-    async fn download_range(
-        &self,
-        from: &Self::StoragePath,
-        start_inclusive: u64,
-        end_exclusive: Option<u64>,
-        to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
-    ) -> anyhow::Result<()> {
-        // S3 accepts ranges as https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.35
-        // and needs both ends to be exclusive
-        let end_inclusive = end_exclusive.map(|end| end.saturating_sub(1));
-        let (data, code) = self
-            .bucket
-            .get_object_range(from.key(), start_inclusive, end_inclusive)
-            .await
-            .with_context(|| format!("Failed to download s3 object with key {}", from.key()))?;
-        if code != 206 {
-            Err(anyhow::format_err!(
-                "Received non-206 exit code during downloading object range, code: {}",
-                code
-            ))
-        } else {
-            // see `download` function above for the comment on why `Vec<u8>` buffer is copied this way
-            io::copy(&mut data.as_slice(), to)
-                .await
-                .context("Failed to write downloaded range into the destination buffer")?;
-            Ok(())
-        }
-    }
-
-    async fn delete(&self, path: &Self::StoragePath) -> anyhow::Result<()> {
-        let (_, code) = self
-            .bucket
-            .delete_object(path.key())
-            .await
-            .with_context(|| format!("Failed to delete s3 object with key {}", path.key()))?;
-        if code != 204 {
-            Err(anyhow::format_err!(
-                "Received non-204 exit code during deleting object with key '{}', code: {}",
-                path.key(),
-                code
-            ))
-        } else {
-            Ok(())
-        }
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use crate::{
-        layered_repository::metadata::METADATA_FILE_NAME,
-        repository::repo_harness::{RepoHarness, TIMELINE_ID},
-    };
-
-    use super::*;
-
-    #[test]
-    fn download_destination() -> anyhow::Result<()> {
-        let repo_harness = RepoHarness::create("download_destination")?;
-
-        let local_path = repo_harness.timeline_path(&TIMELINE_ID).join("test_name");
-        let relative_path = local_path.strip_prefix(&repo_harness.conf.workdir)?;
-
-        let key = S3ObjectKey(format!(
-            "{}{}",
-            S3_FILE_SEPARATOR,
-            relative_path
-                .iter()
-                .map(|segment| segment.to_str().unwrap())
-                .collect::<Vec<_>>()
-                .join(&S3_FILE_SEPARATOR.to_string()),
-        ));
-
-        assert_eq!(
-            local_path,
-            key.download_destination(&repo_harness.conf.workdir),
-            "Download destination should consist of s3 path joined with the pageserver workdir prefix"
-        );
-
-        Ok(())
-    }
-
-    #[test]
-    fn storage_path_positive() -> anyhow::Result<()> {
-        let repo_harness = RepoHarness::create("storage_path_positive")?;
-
-        let segment_1 = "matching";
-        let segment_2 = "file";
-        let local_path = &repo_harness.conf.workdir.join(segment_1).join(segment_2);
-        let expected_key = S3ObjectKey(format!(
-            "{SEPARATOR}{}{SEPARATOR}{}",
-            segment_1,
-            segment_2,
-            SEPARATOR = S3_FILE_SEPARATOR,
-        ));
-
-        let actual_key = dummy_storage(&repo_harness.conf.workdir)
-            .storage_path(local_path)
-            .expect("Matching path should map to S3 path normally");
-        assert_eq!(
-            expected_key,
-            actual_key,
-            "S3 key from the matching path should contain all segments after the workspace prefix, separated with S3 separator"
-        );
-
-        Ok(())
-    }
-
-    #[test]
-    fn storage_path_negatives() -> anyhow::Result<()> {
-        #[track_caller]
-        fn storage_path_error(storage: &S3, mismatching_path: &Path) -> String {
-            match storage.storage_path(mismatching_path) {
-                Ok(wrong_key) => panic!(
-                    "Expected path '{}' to error, but got S3 key: {:?}",
-                    mismatching_path.display(),
-                    wrong_key,
-                ),
-                Err(e) => e.to_string(),
-            }
-        }
-
-        let repo_harness = RepoHarness::create("storage_path_negatives")?;
-        let storage = dummy_storage(&repo_harness.conf.workdir);
-
-        let error_message = storage_path_error(&storage, &repo_harness.conf.workdir);
-        assert!(
-            error_message.contains("Prefix and the path are equal"),
-            "Message '{}' does not contain the required string",
-            error_message
-        );
-
-        let mismatching_path = PathBuf::from("somewhere").join("else");
-        let error_message = storage_path_error(&storage, &mismatching_path);
-        assert!(
-            error_message.contains(mismatching_path.to_str().unwrap()),
-            "Error should mention wrong path"
-        );
-        assert!(
-            error_message.contains(repo_harness.conf.workdir.to_str().unwrap()),
-            "Error should mention server workdir"
-        );
-        assert!(
-            error_message.contains("is not prefixed with"),
-            "Message '{}' does not contain a required string",
-            error_message
-        );
-
-        Ok(())
-    }
-
-    #[test]
-    fn local_path_positive() -> anyhow::Result<()> {
-        let repo_harness = RepoHarness::create("local_path_positive")?;
-        let storage = dummy_storage(&repo_harness.conf.workdir);
-        let timeline_dir = repo_harness.timeline_path(&TIMELINE_ID);
-        let relative_timeline_path = timeline_dir.strip_prefix(&repo_harness.conf.workdir)?;
-
-        let s3_key = create_s3_key(&relative_timeline_path.join("not a metadata"));
-        assert_eq!(
-            s3_key.download_destination(&repo_harness.conf.workdir),
-            storage
-                .local_path(&s3_key)
-                .expect("For a valid input, valid S3 info should be parsed"),
-            "Should be able to parse metadata out of the correctly named remote delta file"
-        );
-
-        let s3_key = create_s3_key(&relative_timeline_path.join(METADATA_FILE_NAME));
-        assert_eq!(
-            s3_key.download_destination(&repo_harness.conf.workdir),
-            storage
-                .local_path(&s3_key)
-                .expect("For a valid input, valid S3 info should be parsed"),
-            "Should be able to parse metadata out of the correctly named remote metadata file"
-        );
-
-        Ok(())
-    }
-
-    #[test]
-    fn download_destination_matches_original_path() -> anyhow::Result<()> {
-        let repo_harness = RepoHarness::create("download_destination_matches_original_path")?;
-        let original_path = repo_harness.timeline_path(&TIMELINE_ID).join("some name");
-
-        let dummy_storage = dummy_storage(&repo_harness.conf.workdir);
-
-        let key = dummy_storage.storage_path(&original_path)?;
-        let download_destination = dummy_storage.local_path(&key)?;
-
-        assert_eq!(
-            original_path, download_destination,
-            "'original path -> storage key -> matching fs path' transformation should produce the same path as the input one for the correct path"
-        );
-
-        Ok(())
-    }
-
-    fn dummy_storage(pageserver_workdir: &'static Path) -> S3 {
-        S3 {
-            pageserver_workdir,
-            bucket: Bucket::new(
-                "dummy-bucket",
-                "us-east-1".parse().unwrap(),
-                Credentials::anonymous().unwrap(),
-            )
-            .unwrap(),
-        }
-    }
-
-    fn create_s3_key(relative_file_path: &Path) -> S3ObjectKey {
-        S3ObjectKey(
-            relative_file_path
-                .iter()
-                .fold(String::new(), |mut path_string, segment| {
-                    path_string.push(S3_FILE_SEPARATOR);
-                    path_string.push_str(segment.to_str().unwrap());
-                    path_string
-                }),
-        )
-    }
-}
--- a/pageserver/src/remote_storage/storage_sync.rs
+++ b/pageserver/src/remote_storage/storage_sync.rs
--- a/pageserver/src/repository.rs
+++ b/pageserver/src/repository.rs
@@ -1,10 +1,9 @@
 use crate::relish::*;
-use crate::CheckpointConfig;
 use anyhow::Result;
-use bytes::Bytes;
+use bytes::{Buf, BufMut, Bytes, BytesMut};
 use serde::{Deserialize, Serialize};
 use std::collections::HashSet;
-use std::ops::{AddAssign, Deref};
+use std::ops::AddAssign;
 use std::sync::Arc;
 use std::time::Duration;
 use zenith_utils::lsn::{Lsn, RecordLsn};
@@ -14,28 +13,18 @@ use zenith_utils::zid::ZTimelineId;
 /// A repository corresponds to one .zenith directory. One repository holds multiple
 /// timelines, forked off from the same initial call to 'initdb'.
 pub trait Repository: Send + Sync {
-    fn shutdown(&self) -> Result<()>;
-
-    /// Stops all timeline-related process in the repository and removes the timeline data from memory.
-    fn unload_timeline(&self, timeline_id: ZTimelineId) -> Result<()>;
-
    /// Get Timeline handle for given zenith timeline ID.
    fn get_timeline(&self, timelineid: ZTimelineId) -> Result<Arc<dyn Timeline>>;

    /// Create a new, empty timeline. The caller is responsible for loading data into it
-    /// Initdb lsn is provided for timeline impl to be able to perform checks for some operations against it.
-    fn create_empty_timeline(
-        &self,
-        timelineid: ZTimelineId,
-        initdb_lsn: Lsn,
-    ) -> Result<Arc<dyn Timeline>>;
+    fn create_empty_timeline(&self, timelineid: ZTimelineId) -> Result<Arc<dyn Timeline>>;

    /// Branch a timeline
    fn branch_timeline(&self, src: ZTimelineId, dst: ZTimelineId, start_lsn: Lsn) -> Result<()>;

-    /// perform one garbage collection iteration, removing old data files from disk.
-    /// this funtion is periodically called by gc thread.
-    /// also it can be explicitly requested through page server api 'do_gc' command.
+    /// perform one garbage collection iteration.
+    /// garbage collection is periodically performed by gc thread,
+    /// but it can be explicitly requested through page server api.
    ///
    /// 'timelineid' specifies the timeline to GC, or None for all.
    /// `horizon` specifies delta from last lsn to preserve all object versions (pitr interval).
@@ -48,10 +37,6 @@ pub trait Repository: Send + Sync {
        horizon: u64,
        checkpoint_before_gc: bool,
    ) -> Result<GcResult>;
-
-    /// perform one checkpoint iteration, flushing in-memory data on disk.
-    /// this function is periodically called by checkponter thread.
-    fn checkpoint_iteration(&self, cconf: CheckpointConfig) -> Result<()>;
 }

 ///
@@ -132,64 +117,17 @@ pub trait Timeline: Send + Sync {
    /// Get a list of all existing non-relational objects
    fn list_nonrels(&self, lsn: Lsn) -> Result<HashSet<RelishTag>>;

-    /// Get the ancestor's timeline id
-    fn get_ancestor_timeline_id(&self) -> Option<ZTimelineId>;
-
-    /// Get the LSN where this branch was created
-    fn get_ancestor_lsn(&self) -> Lsn;
-
    //------------------------------------------------------------------------------
    // Public PUT functions, to update the repository with new page versions.
    //
    // These are called by the WAL receiver to digest WAL records.
    //------------------------------------------------------------------------------

-    /// Atomically get both last and prev.
-    fn get_last_record_rlsn(&self) -> RecordLsn;
-    /// Get last or prev record separately. Same as get_last_record_rlsn().last/prev.
-    fn get_last_record_lsn(&self) -> Lsn;
-    fn get_prev_record_lsn(&self) -> Lsn;
-    fn get_start_lsn(&self) -> Lsn;
-    fn get_disk_consistent_lsn(&self) -> Lsn;
-
-    /// Mutate the timeline with a [`TimelineWriter`].
-    fn writer<'a>(&'a self) -> Box<dyn TimelineWriter + 'a>;
-
-    ///
-    /// Flush to disk all data that was written with the put_* functions
-    ///
-    /// NOTE: This has nothing to do with checkpoint in PostgreSQL. We don't
-    /// know anything about them here in the repository.
-    fn checkpoint(&self, cconf: CheckpointConfig) -> Result<()>;
-
-    ///
-    /// Check that it is valid to request operations with that lsn.
-    fn check_lsn_is_in_scope(&self, lsn: Lsn) -> Result<()>;
-
-    /// Retrieve current logical size of the timeline
-    ///
-    /// NOTE: counted incrementally, includes ancestors,
-    /// doesnt support TwoPhase relishes yet
-    fn get_current_logical_size(&self) -> usize;
-
-    /// Does the same as get_current_logical_size but counted on demand.
-    /// Used in tests to ensure thet incremental and non incremental variants match.
-    fn get_current_logical_size_non_incremental(&self, lsn: Lsn) -> Result<usize>;
-
-    /// An escape hatch to allow "casting" a generic Timeline to LayeredTimeline.
-    fn upgrade_to_layered_timeline(&self) -> &crate::layered_repository::LayeredTimeline;
-}
-
-/// Various functions to mutate the timeline.
-// TODO Currently, Deref is used to allow easy access to read methods from this trait.
-// This is probably considered a bad practice in Rust and should be fixed eventually,
-// but will cause large code changes.
-pub trait TimelineWriter: Deref<Target = dyn Timeline> {
    /// Put a new page version that can be constructed from a WAL record
    ///
    /// This will implicitly extend the relation, if the page is beyond the
    /// current end-of-file.
-    fn put_wal_record(&self, lsn: Lsn, tag: RelishTag, blknum: u32, rec: WALRecord) -> Result<()>;
+    fn put_wal_record(&self, tag: RelishTag, blknum: u32, rec: WALRecord) -> Result<()>;

    /// Like put_wal_record, but with ready-made image of the page.
    fn put_page_image(&self, tag: RelishTag, blknum: u32, lsn: Lsn, img: Bytes) -> Result<()>;
@@ -205,10 +143,34 @@ pub trait TimelineWriter: Deref<Target = dyn Timeline> {
    /// Advance requires aligned LSN as an argument and would wake wait_lsn() callers.
    /// Previous last record LSN is stored alongside the latest and can be read.
    fn advance_last_record_lsn(&self, lsn: Lsn);
+    /// Atomically get both last and prev.
+    fn get_last_record_rlsn(&self) -> RecordLsn;
+    /// Get last or prev record separately. Same as get_last_record_rlsn().last/prev.
+    fn get_last_record_lsn(&self) -> Lsn;
+    fn get_prev_record_lsn(&self) -> Lsn;
+    fn get_start_lsn(&self) -> Lsn;
+
+    ///
+    /// Flush to disk all data that was written with the put_* functions
+    ///
+    /// NOTE: This has nothing to do with checkpoint in PostgreSQL. We don't
+    /// know anything about them here in the repository.
+    fn checkpoint(&self) -> Result<()>;
+
+    /// Retrieve current logical size of the timeline
+    ///
+    /// NOTE: counted incrementally, includes ancestors,
+    /// doesn't support TwoPhase relishes yet
+    fn get_current_logical_size(&self) -> usize;
+
+    /// Does the same as get_current_logical_size but counted on demand.
+    /// Used in tests to ensure that incremental and non-incremental variants match.
+    fn get_current_logical_size_non_incremental(&self, lsn: Lsn) -> Result<usize>;
 }

 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
 pub struct WALRecord {
+    pub lsn: Lsn, // LSN at the *end* of the record
    pub will_init: bool,
    pub rec: Bytes,
    // Remember the offset of main_data in rec,
@@ -217,101 +179,25 @@ pub struct WALRecord {
    pub main_data_offset: u32,
 }

-#[cfg(test)]
-pub mod repo_harness {
-    use bytes::BytesMut;
-    use std::{fs, path::PathBuf};
-
-    use crate::{
-        layered_repository::{LayeredRepository, TIMELINES_SEGMENT_NAME},
-        walredo::{WalRedoError, WalRedoManager},
-        PageServerConf,
-    };
-
-    use super::*;
-    use hex_literal::hex;
-    use zenith_utils::zid::ZTenantId;
-
-    pub const TIMELINE_ID: ZTimelineId =
-        ZTimelineId::from_array(hex!("11223344556677881122334455667788"));
-    pub const NEW_TIMELINE_ID: ZTimelineId =
-        ZTimelineId::from_array(hex!("AA223344556677881122334455667788"));
-
-    /// Convenience function to create a page image with given string as the only content
-    #[allow(non_snake_case)]
-    pub fn TEST_IMG(s: &str) -> Bytes {
-        let mut buf = BytesMut::new();
-        buf.extend_from_slice(s.as_bytes());
-        buf.resize(8192, 0);
-
-        buf.freeze()
+impl WALRecord {
+    pub fn pack(&self, buf: &mut BytesMut) {
+        buf.put_u64(self.lsn.0);
+        buf.put_u8(self.will_init as u8);
+        buf.put_u32(self.main_data_offset);
+        buf.put_u32(self.rec.len() as u32);
+        buf.put_slice(&self.rec[..]);
    }
-
-    pub struct RepoHarness {
-        pub conf: &'static PageServerConf,
-        pub tenant_id: ZTenantId,
-    }
-
-    impl RepoHarness {
-        pub fn create(test_name: &'static str) -> Result<Self> {
-            let repo_dir = PageServerConf::test_repo_dir(test_name);
-            let _ = fs::remove_dir_all(&repo_dir);
-            fs::create_dir_all(&repo_dir)?;
-            fs::create_dir_all(&repo_dir.join(TIMELINES_SEGMENT_NAME))?;
-
-            let conf = PageServerConf::dummy_conf(repo_dir);
-            // Make a static copy of the config. This can never be free'd, but that's
-            // OK in a test.
-            let conf: &'static PageServerConf = Box::leak(Box::new(conf));
-
-            let tenant_id = ZTenantId::generate();
-            fs::create_dir_all(conf.tenant_path(&tenant_id))?;
-
-            Ok(Self { conf, tenant_id })
-        }
-
-        pub fn load(&self) -> Box<dyn Repository> {
-            let walredo_mgr = Arc::new(TestRedoManager);
-
-            Box::new(LayeredRepository::new(
-                self.conf,
-                walredo_mgr,
-                self.tenant_id,
-                false,
-            ))
-        }
-
-        pub fn timeline_path(&self, timeline_id: &ZTimelineId) -> PathBuf {
-            self.conf.timeline_path(timeline_id, &self.tenant_id)
-        }
-    }
-
-    // Mock WAL redo manager that doesn't do much
-    struct TestRedoManager;
-
-    impl WalRedoManager for TestRedoManager {
-        fn request_redo(
-            &self,
-            rel: RelishTag,
-            blknum: u32,
-            lsn: Lsn,
-            base_img: Option<Bytes>,
-            records: Vec<(Lsn, WALRecord)>,
-        ) -> Result<Bytes, WalRedoError> {
-            let s = format!(
-                "redo for {} blk {} to get to {}, with {} and {} records",
-                rel,
-                blknum,
-                lsn,
-                if base_img.is_some() {
-                    "base image"
-                } else {
-                    "no base image"
-                },
-                records.len()
-            );
-            println!("{}", s);
-            Ok(TEST_IMG(&s))
+    pub fn unpack(buf: &mut Bytes) -> WALRecord {
+        let lsn = Lsn::from(buf.get_u64());
+        let will_init = buf.get_u8() != 0;
+        let main_data_offset = buf.get_u32();
+        let mut dst = vec![0u8; buf.get_u32() as usize];
+        buf.copy_to_slice(&mut dst);
+        WALRecord {
+            lsn,
+            will_init,
+            rec: Bytes::from(dst),
+            main_data_offset,
        }
    }
 }
@@ -322,21 +208,23 @@ pub mod repo_harness {
 #[allow(clippy::bool_assert_comparison)]
 #[cfg(test)]
 mod tests {
-    use crate::layered_repository::metadata::METADATA_FILE_NAME;
-
-    use super::repo_harness::*;
    use super::*;
-    use postgres_ffi::{pg_constants, xlog_utils::SIZEOF_CHECKPOINT};
+    use crate::layered_repository::LayeredRepository;
+    use crate::walredo::{WalRedoError, WalRedoManager};
+    use crate::PageServerConf;
+    use postgres_ffi::pg_constants;
+    use postgres_ffi::xlog_utils::SIZEOF_CHECKPOINT;
    use std::fs;
+    use std::str::FromStr;
+    use zenith_utils::zid::ZTenantId;

    /// Arbitrary relation tag, for testing.
-    const TESTREL_A_REL_TAG: RelTag = RelTag {
+    const TESTREL_A: RelishTag = RelishTag::Relation(RelTag {
        spcnode: 0,
        dbnode: 111,
        relnode: 1000,
        forknum: 0,
-    };
-    const TESTREL_A: RelishTag = RelishTag::Relation(TESTREL_A_REL_TAG);
+    });
    const TESTREL_B: RelishTag = RelishTag::Relation(RelTag {
        spcnode: 0,
        dbnode: 111,
@@ -344,6 +232,16 @@ mod tests {
        forknum: 0,
    });

+    /// Build an 8192-byte page image: the bytes of `s` first, the remainder zero-filled.
+    #[allow(non_snake_case)]
+    fn TEST_IMG(s: &str) -> Bytes {
+        // `resize` zero-pads up to 8192 bytes (and would cut off a longer string).
+        let mut page = BytesMut::from(s.as_bytes());
+        page.resize(8192, 0);
+
+        page.freeze()
+    }
+
    fn assert_current_logical_size(timeline: &Arc<dyn Timeline>, lsn: Lsn) {
        let incremental = timeline.get_current_logical_size();
        let non_incremental = timeline
@@ -355,23 +253,47 @@ mod tests {
    static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]);
    static ZERO_CHECKPOINT: Bytes = Bytes::from_static(&[0u8; SIZEOF_CHECKPOINT]);

+    /// Set up a clean repo directory for `test_name` and open a `LayeredRepository` on it.
+    fn get_test_repo(test_name: &str) -> Result<Box<dyn Repository>> {
+        let repo_dir = PageServerConf::test_repo_dir(test_name);
+        let _ = fs::remove_dir_all(&repo_dir); // errors deliberately ignored
+        fs::create_dir_all(&repo_dir)?;
+        fs::create_dir_all(&repo_dir.join("timelines"))?;
+
+        let conf = PageServerConf::dummy_conf(repo_dir);
+        // Make a static copy of the config. This can never be free'd, but that's
+        // OK in a test.
+        let conf: &'static PageServerConf = Box::leak(Box::new(conf));
+        let tenantid = ZTenantId::generate();
+        fs::create_dir_all(conf.tenant_path(&tenantid))?; // return Err instead of panicking
+
+        let walredo_mgr = TestRedoManager {};
+        let repo = Box::new(LayeredRepository::new(
+            conf,
+            Arc::new(walredo_mgr),
+            tenantid,
+        ));
+
+        Ok(repo)
+    }
+
    #[test]
    fn test_relsize() -> Result<()> {
-        let repo = RepoHarness::create("test_relsize")?.load();
+        let repo = get_test_repo("test_relsize")?;
        // get_timeline() with non-existent timeline id should fail
        //repo.get_timeline("11223344556677881122334455667788");

        // Create timeline to work on
-        let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
-        let writer = tline.writer();
+        let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
+        let tline = repo.create_empty_timeline(timelineid)?;

-        writer.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
-        writer.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
-        writer.put_page_image(TESTREL_A, 0, Lsn(0x30), TEST_IMG("foo blk 0 at 3"))?;
-        writer.put_page_image(TESTREL_A, 1, Lsn(0x40), TEST_IMG("foo blk 1 at 4"))?;
-        writer.put_page_image(TESTREL_A, 2, Lsn(0x50), TEST_IMG("foo blk 2 at 5"))?;
+        tline.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
+        tline.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
+        tline.put_page_image(TESTREL_A, 0, Lsn(0x30), TEST_IMG("foo blk 0 at 3"))?;
+        tline.put_page_image(TESTREL_A, 1, Lsn(0x40), TEST_IMG("foo blk 1 at 4"))?;
+        tline.put_page_image(TESTREL_A, 2, Lsn(0x50), TEST_IMG("foo blk 2 at 5"))?;

-        writer.advance_last_record_lsn(Lsn(0x50));
+        tline.advance_last_record_lsn(Lsn(0x50));

        assert_current_logical_size(&tline, Lsn(0x50));

@@ -417,8 +339,8 @@ mod tests {
        );

        // Truncate last block
-        writer.put_truncation(TESTREL_A, Lsn(0x60), 2)?;
-        writer.advance_last_record_lsn(Lsn(0x60));
+        tline.put_truncation(TESTREL_A, Lsn(0x60), 2)?;
+        tline.advance_last_record_lsn(Lsn(0x60));
        assert_current_logical_size(&tline, Lsn(0x60));

        // Check reported size and contents after truncation
@@ -440,13 +362,13 @@ mod tests {
        );

        // Truncate to zero length
-        writer.put_truncation(TESTREL_A, Lsn(0x68), 0)?;
-        writer.advance_last_record_lsn(Lsn(0x68));
+        tline.put_truncation(TESTREL_A, Lsn(0x68), 0)?;
+        tline.advance_last_record_lsn(Lsn(0x68));
        assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x68))?.unwrap(), 0);

        // Extend from 0 to 2 blocks, leaving a gap
-        writer.put_page_image(TESTREL_A, 1, Lsn(0x70), TEST_IMG("foo blk 1"))?;
-        writer.advance_last_record_lsn(Lsn(0x70));
+        tline.put_page_image(TESTREL_A, 1, Lsn(0x70), TEST_IMG("foo blk 1"))?;
+        tline.advance_last_record_lsn(Lsn(0x70));
        assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x70))?.unwrap(), 2);
        assert_eq!(tline.get_page_at_lsn(TESTREL_A, 0, Lsn(0x70))?, ZERO_PAGE);
        assert_eq!(
@@ -477,30 +399,30 @@ mod tests {
    // and then created it again within the same layer.
    #[test]
    fn test_drop_extend() -> Result<()> {
-        let repo = RepoHarness::create("test_drop_extend")?.load();
+        let repo = get_test_repo("test_drop_extend")?;

        // Create timeline to work on
-        let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
-        let writer = tline.writer();
+        let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
+        let tline = repo.create_empty_timeline(timelineid)?;

-        writer.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
-        writer.advance_last_record_lsn(Lsn(0x20));
+        tline.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
+        tline.advance_last_record_lsn(Lsn(0x20));

        // Check that rel exists and size is correct
        assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x20))?, true);
        assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x20))?.unwrap(), 1);

        // Drop relish
-        writer.drop_relish(TESTREL_A, Lsn(0x30))?;
-        writer.advance_last_record_lsn(Lsn(0x30));
+        tline.drop_relish(TESTREL_A, Lsn(0x30))?;
+        tline.advance_last_record_lsn(Lsn(0x30));

        // Check that rel is not visible anymore
        assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x30))?, false);
        assert!(tline.get_relish_size(TESTREL_A, Lsn(0x30))?.is_none());

        // Extend it again
-        writer.put_page_image(TESTREL_A, 0, Lsn(0x40), TEST_IMG("foo blk 0 at 4"))?;
-        writer.advance_last_record_lsn(Lsn(0x40));
+        tline.put_page_image(TESTREL_A, 0, Lsn(0x40), TEST_IMG("foo blk 0 at 4"))?;
+        tline.advance_last_record_lsn(Lsn(0x40));

        // Check that rel exists and size is correct
        assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x40))?, true);
@@ -514,11 +436,11 @@ mod tests {
    // and then extended it again within the same layer.
    #[test]
    fn test_truncate_extend() -> Result<()> {
-        let repo = RepoHarness::create("test_truncate_extend")?.load();
+        let repo = get_test_repo("test_truncate_extend")?;

        // Create timeline to work on
-        let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
-        let writer = tline.writer();
+        let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
+        let tline = repo.create_empty_timeline(timelineid)?;

        //from storage_layer.rs
        const RELISH_SEG_SIZE: u32 = 10 * 1024 * 1024 / 8192;
@@ -528,10 +450,10 @@ mod tests {
        for blkno in 0..relsize {
            let lsn = Lsn(0x20);
            let data = format!("foo blk {} at {}", blkno, lsn);
-            writer.put_page_image(TESTREL_A, blkno, lsn, TEST_IMG(&data))?;
+            tline.put_page_image(TESTREL_A, blkno, lsn, TEST_IMG(&data))?;
        }

-        writer.advance_last_record_lsn(Lsn(0x20));
+        tline.advance_last_record_lsn(Lsn(0x20));

        // The relation was created at LSN 2, not visible at LSN 1 yet.
        assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x10))?, false);
@@ -555,8 +477,8 @@ mod tests {

        // Truncate relation so that second segment was dropped
        // - only leave one page
-        writer.put_truncation(TESTREL_A, Lsn(0x60), 1)?;
-        writer.advance_last_record_lsn(Lsn(0x60));
+        tline.put_truncation(TESTREL_A, Lsn(0x60), 1)?;
+        tline.advance_last_record_lsn(Lsn(0x60));

        // Check reported size and contents after truncation
        assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x60))?.unwrap(), 1);
@@ -589,9 +511,9 @@ mod tests {
        for blkno in 0..relsize {
            let lsn = Lsn(0x80);
            let data = format!("foo blk {} at {}", blkno, lsn);
-            writer.put_page_image(TESTREL_A, blkno, lsn, TEST_IMG(&data))?;
+            tline.put_page_image(TESTREL_A, blkno, lsn, TEST_IMG(&data))?;
        }
-        writer.advance_last_record_lsn(Lsn(0x80));
+        tline.advance_last_record_lsn(Lsn(0x80));

        assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x80))?, true);
        assert_eq!(
@@ -615,17 +537,17 @@ mod tests {
    /// split into multiple 1 GB segments in Postgres.
    #[test]
    fn test_large_rel() -> Result<()> {
-        let repo = RepoHarness::create("test_large_rel")?.load();
-        let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
-        let writer = tline.writer();
+        let repo = get_test_repo("test_large_rel")?;
+        let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
+        let tline = repo.create_empty_timeline(timelineid)?;

        let mut lsn = 0x10;
        for blknum in 0..pg_constants::RELSEG_SIZE + 1 {
            let img = TEST_IMG(&format!("foo blk {} at {}", blknum, Lsn(lsn)));
            lsn += 0x10;
-            writer.put_page_image(TESTREL_A, blknum as u32, Lsn(lsn), img)?;
+            tline.put_page_image(TESTREL_A, blknum as u32, Lsn(lsn), img)?;
        }
-        writer.advance_last_record_lsn(Lsn(lsn));
+        tline.advance_last_record_lsn(Lsn(lsn));

        assert_current_logical_size(&tline, Lsn(lsn));

@@ -636,8 +558,8 @@ mod tests {

        // Truncate one block
        lsn += 0x10;
-        writer.put_truncation(TESTREL_A, Lsn(lsn), pg_constants::RELSEG_SIZE)?;
-        writer.advance_last_record_lsn(Lsn(lsn));
+        tline.put_truncation(TESTREL_A, Lsn(lsn), pg_constants::RELSEG_SIZE)?;
+        tline.advance_last_record_lsn(Lsn(lsn));
        assert_eq!(
            tline.get_relish_size(TESTREL_A, Lsn(lsn))?.unwrap(),
            pg_constants::RELSEG_SIZE
@@ -646,8 +568,8 @@ mod tests {

        // Truncate another block
        lsn += 0x10;
-        writer.put_truncation(TESTREL_A, Lsn(lsn), pg_constants::RELSEG_SIZE - 1)?;
-        writer.advance_last_record_lsn(Lsn(lsn));
+        tline.put_truncation(TESTREL_A, Lsn(lsn), pg_constants::RELSEG_SIZE - 1)?;
+        tline.advance_last_record_lsn(Lsn(lsn));
        assert_eq!(
            tline.get_relish_size(TESTREL_A, Lsn(lsn))?.unwrap(),
            pg_constants::RELSEG_SIZE - 1
@@ -659,8 +581,8 @@ mod tests {
        let mut size: i32 = 3000;
        while size >= 0 {
            lsn += 0x10;
-            writer.put_truncation(TESTREL_A, Lsn(lsn), size as u32)?;
-            writer.advance_last_record_lsn(Lsn(lsn));
+            tline.put_truncation(TESTREL_A, Lsn(lsn), size as u32)?;
+            tline.advance_last_record_lsn(Lsn(lsn));
            assert_eq!(
                tline.get_relish_size(TESTREL_A, Lsn(lsn))?.unwrap(),
                size as u32
@@ -678,19 +600,19 @@ mod tests {
    ///
    #[test]
    fn test_list_rels_drop() -> Result<()> {
-        let repo = RepoHarness::create("test_list_rels_drop")?.load();
-        let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
-        let writer = tline.writer();
+        let repo = get_test_repo("test_list_rels_drop")?;
+        let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
+        let tline = repo.create_empty_timeline(timelineid)?;
        const TESTDB: u32 = 111;

        // Import initial dummy checkpoint record, otherwise the get_timeline() call
        // after branching fails below
-        writer.put_page_image(RelishTag::Checkpoint, 0, Lsn(0x10), ZERO_CHECKPOINT.clone())?;
+        tline.put_page_image(RelishTag::Checkpoint, 0, Lsn(0x10), ZERO_CHECKPOINT.clone())?;

        // Create a relation on the timeline
-        writer.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
+        tline.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;

-        writer.advance_last_record_lsn(Lsn(0x30));
+        tline.advance_last_record_lsn(Lsn(0x30));

        // Check that list_rels() lists it after LSN 2, but no before it
        assert!(!tline.list_rels(0, TESTDB, Lsn(0x10))?.contains(&TESTREL_A));
@@ -698,19 +620,17 @@ mod tests {
        assert!(tline.list_rels(0, TESTDB, Lsn(0x30))?.contains(&TESTREL_A));

        // Create a branch, check that the relation is visible there
-        repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x30))?;
-        let newtline = repo.get_timeline(NEW_TIMELINE_ID)?;
-        let new_writer = newtline.writer();
+        let newtimelineid = ZTimelineId::from_str("AA223344556677881122334455667788").unwrap();
+        repo.branch_timeline(timelineid, newtimelineid, Lsn(0x30))?;
+        let newtline = repo.get_timeline(newtimelineid)?;

        assert!(newtline
            .list_rels(0, TESTDB, Lsn(0x30))?
            .contains(&TESTREL_A));

        // Drop it on the branch
-        new_writer.drop_relish(TESTREL_A, Lsn(0x40))?;
-        new_writer.advance_last_record_lsn(Lsn(0x40));
-
-        drop(new_writer);
+        newtline.drop_relish(TESTREL_A, Lsn(0x40))?;
+        newtline.advance_last_record_lsn(Lsn(0x40));

        // Check that it's no longer listed on the branch after the point where it was dropped
        assert!(newtline
@@ -721,8 +641,8 @@ mod tests {
            .contains(&TESTREL_A));

        // Run checkpoint and garbage collection and check that it's still not visible
-        newtline.checkpoint(CheckpointConfig::Forced)?;
-        repo.gc_iteration(Some(NEW_TIMELINE_ID), 0, true)?;
+        newtline.checkpoint()?;
+        repo.gc_iteration(Some(newtimelineid), 0, true)?;

        assert!(!newtline
            .list_rels(0, TESTDB, Lsn(0x40))?
@@ -736,32 +656,32 @@ mod tests {
    ///
    #[test]
    fn test_branch() -> Result<()> {
-        let repo = RepoHarness::create("test_branch")?.load();
-        let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
-        let writer = tline.writer();
+        let repo = get_test_repo("test_branch")?;
+        let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
+        let tline = repo.create_empty_timeline(timelineid)?;

        // Import initial dummy checkpoint record, otherwise the get_timeline() call
        // after branching fails below
-        writer.put_page_image(RelishTag::Checkpoint, 0, Lsn(0x10), ZERO_CHECKPOINT.clone())?;
+        tline.put_page_image(RelishTag::Checkpoint, 0, Lsn(0x10), ZERO_CHECKPOINT.clone())?;

        // Create a relation on the timeline
-        writer.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
-        writer.put_page_image(TESTREL_A, 0, Lsn(0x30), TEST_IMG("foo blk 0 at 3"))?;
-        writer.put_page_image(TESTREL_A, 0, Lsn(0x40), TEST_IMG("foo blk 0 at 4"))?;
+        tline.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
+        tline.put_page_image(TESTREL_A, 0, Lsn(0x30), TEST_IMG("foo blk 0 at 3"))?;
+        tline.put_page_image(TESTREL_A, 0, Lsn(0x40), TEST_IMG("foo blk 0 at 4"))?;

        // Create another relation
-        writer.put_page_image(TESTREL_B, 0, Lsn(0x20), TEST_IMG("foobar blk 0 at 2"))?;
+        tline.put_page_image(TESTREL_B, 0, Lsn(0x20), TEST_IMG("foobar blk 0 at 2"))?;

-        writer.advance_last_record_lsn(Lsn(0x40));
+        tline.advance_last_record_lsn(Lsn(0x40));
        assert_current_logical_size(&tline, Lsn(0x40));

        // Branch the history, modify relation differently on the new timeline
-        repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x30))?;
-        let newtline = repo.get_timeline(NEW_TIMELINE_ID)?;
-        let new_writer = newtline.writer();
+        let newtimelineid = ZTimelineId::from_str("AA223344556677881122334455667788").unwrap();
+        repo.branch_timeline(timelineid, newtimelineid, Lsn(0x30))?;
+        let newtline = repo.get_timeline(newtimelineid)?;

-        new_writer.put_page_image(TESTREL_A, 0, Lsn(0x40), TEST_IMG("bar blk 0 at 4"))?;
-        new_writer.advance_last_record_lsn(Lsn(0x40));
+        newtline.put_page_image(TESTREL_A, 0, Lsn(0x40), TEST_IMG("bar blk 0 at 4"))?;
+        newtline.advance_last_record_lsn(Lsn(0x40));

        // Check page contents on both branches
        assert_eq!(
@@ -786,302 +706,32 @@ mod tests {
        Ok(())
    }

-    fn make_some_layers(tline: &Arc<dyn Timeline>, start_lsn: Lsn) -> Result<()> {
-        let mut lsn = start_lsn;
-        {
-            let writer = tline.writer();
-            // Create a relation on the timeline
-            writer.put_page_image(
-                TESTREL_A,
-                0,
+    // Mock WAL redo manager that doesn't do much
+    struct TestRedoManager {}
+
+    impl WalRedoManager for TestRedoManager {
+        fn request_redo(
+            &self,
+            rel: RelishTag,
+            blknum: u32,
+            lsn: Lsn,
+            base_img: Option<Bytes>,
+            records: Vec<WALRecord>,
+        ) -> Result<Bytes, WalRedoError> {
+            let s = format!(
+                "redo for {} blk {} to get to {}, with {} and {} records",
+                rel,
+                blknum,
                lsn,
-                TEST_IMG(&format!("foo blk 0 at {}", lsn)),
-            )?;
-            lsn += 0x10;
-            writer.put_page_image(
-                TESTREL_A,
-                0,
-                lsn,
-                TEST_IMG(&format!("foo blk 0 at {}", lsn)),
-            )?;
-            writer.advance_last_record_lsn(lsn);
-        }
-        tline.checkpoint(CheckpointConfig::Forced)?;
-        {
-            let writer = tline.writer();
-            lsn += 0x10;
-            writer.put_page_image(
-                TESTREL_A,
-                0,
-                lsn,
-                TEST_IMG(&format!("foo blk 0 at {}", lsn)),
-            )?;
-            lsn += 0x10;
-            writer.put_page_image(
-                TESTREL_A,
-                0,
-                lsn,
-                TEST_IMG(&format!("foo blk 0 at {}", lsn)),
-            )?;
-            writer.advance_last_record_lsn(lsn);
-        }
-        tline.checkpoint(CheckpointConfig::Forced)
-    }
-
-    #[test]
-    fn test_prohibit_branch_creation_on_garbage_collected_data() -> Result<()> {
-        let repo =
-            RepoHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")?.load();
-
-        let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
-        make_some_layers(&tline, Lsn(0x20))?;
-
-        // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50
-        repo.gc_iteration(Some(TIMELINE_ID), 0x10, false)?;
-
-        // try to branch at lsn 25, should fail because we already garbage collected the data
-        match repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x25)) {
-            Ok(_) => panic!("branching should have failed"),
-            Err(err) => {
-                assert!(err.to_string().contains("invalid branch start lsn"));
-                assert!(err
-                    .source()
-                    .unwrap()
-                    .to_string()
-                    .contains("we might've already garbage collected needed data"))
-            }
-        }
-
-        Ok(())
-    }
-
-    #[test]
-    fn test_prohibit_branch_creation_on_pre_initdb_lsn() -> Result<()> {
-        let repo = RepoHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn")?.load();
-
-        repo.create_empty_timeline(TIMELINE_ID, Lsn(0x50))?;
-        // try to branch at lsn 0x25, should fail because initdb lsn is 0x50
-        match repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x25)) {
-            Ok(_) => panic!("branching should have failed"),
-            Err(err) => {
-                assert!(&err.to_string().contains("invalid branch start lsn"));
-                assert!(&err
-                    .source()
-                    .unwrap()
-                    .to_string()
-                    .contains("is earlier than initdb lsn"));
-            }
-        }
-
-        Ok(())
-    }
-
-    #[test]
-    fn test_prohibit_get_page_at_lsn_for_garbage_collected_pages() -> Result<()> {
-        let repo =
-            RepoHarness::create("test_prohibit_get_page_at_lsn_for_garbage_collected_pages")?
-                .load();
-
-        let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
-        make_some_layers(&tline, Lsn(0x20))?;
-
-        repo.gc_iteration(Some(TIMELINE_ID), 0x10, false)?;
-
-        match tline.get_page_at_lsn(TESTREL_A, 0, Lsn(0x25)) {
-            Ok(_) => panic!("request for page should have failed"),
-            Err(err) => assert!(err
-                .to_string()
-                .contains("tried to request a page version that was garbage collected")),
-        }
-        Ok(())
-    }
-
-    #[test]
-    fn test_retain_data_in_parent_which_is_needed_for_child() -> Result<()> {
-        let repo =
-            RepoHarness::create("test_retain_data_in_parent_which_is_needed_for_child")?.load();
-        let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
-
-        make_some_layers(&tline, Lsn(0x20))?;
-
-        repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x40))?;
-        let newtline = repo.get_timeline(NEW_TIMELINE_ID)?;
-
-        // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50
-        repo.gc_iteration(Some(TIMELINE_ID), 0x10, false)?;
-        assert!(newtline.get_page_at_lsn(TESTREL_A, 0, Lsn(0x25)).is_ok());
-
-        Ok(())
-    }
-
-    #[test]
-    fn test_parent_keeps_data_forever_after_branching() -> Result<()> {
-        let harness = RepoHarness::create("test_parent_keeps_data_forever_after_branching")?;
-        let repo = harness.load();
-        let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
-
-        make_some_layers(&tline, Lsn(0x20))?;
-
-        repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x40))?;
-        let newtline = repo.get_timeline(NEW_TIMELINE_ID)?;
-
-        make_some_layers(&newtline, Lsn(0x60))?;
-
-        // run gc on parent
-        repo.gc_iteration(Some(TIMELINE_ID), 0x10, false)?;
-
-        // check that the layer in parent before the branching point is still there
-        let tline_dir = harness.conf.timeline_path(&TIMELINE_ID, &harness.tenant_id);
-
-        let expected_image_layer_path = tline_dir.join(format!(
-            "rel_{}_{}_{}_{}_{}_{:016X}_{:016X}",
-            TESTREL_A_REL_TAG.spcnode,
-            TESTREL_A_REL_TAG.dbnode,
-            TESTREL_A_REL_TAG.relnode,
-            TESTREL_A_REL_TAG.forknum,
-            0, // seg is 0
-            0x20,
-            0x30,
-        ));
-        assert!(fs::metadata(&expected_image_layer_path).is_ok());
-
-        Ok(())
-    }
-
-    #[test]
-    fn corrupt_metadata() -> Result<()> {
-        const TEST_NAME: &str = "corrupt_metadata";
-        let harness = RepoHarness::create(TEST_NAME)?;
-        let repo = harness.load();
-
-        repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
-        drop(repo);
-
-        let metadata_path = harness.timeline_path(&TIMELINE_ID).join(METADATA_FILE_NAME);
-
-        assert!(metadata_path.is_file());
-
-        let mut metadata_bytes = std::fs::read(&metadata_path)?;
-        assert_eq!(metadata_bytes.len(), 512);
-        metadata_bytes[512 - 4 - 2] ^= 1;
-        std::fs::write(metadata_path, metadata_bytes)?;
-
-        let new_repo = harness.load();
-        let err = new_repo.get_timeline(TIMELINE_ID).err().unwrap();
-        assert_eq!(err.to_string(), "failed to load metadata");
-        assert_eq!(
-            err.source().unwrap().to_string(),
-            "metadata checksum mismatch"
-        );
-
-        Ok(())
-    }
-
-    #[test]
-    fn future_layerfiles() -> Result<()> {
-        const TEST_NAME: &str = "future_layerfiles";
-        let harness = RepoHarness::create(TEST_NAME)?;
-        let repo = harness.load();
-
-        // Create a timeline with disk_consistent_lsn = 8000
-        let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0x8000))?;
-        let writer = tline.writer();
-        writer.advance_last_record_lsn(Lsn(0x8000));
-        drop(writer);
-        repo.checkpoint_iteration(CheckpointConfig::Forced)?;
-        drop(repo);
-
-        let timeline_path = harness.timeline_path(&TIMELINE_ID);
-
-        let make_empty_file = |filename: &str| -> std::io::Result<()> {
-            let path = timeline_path.join(filename);
-
-            assert!(!path.exists());
-            std::fs::write(&path, &[])?;
-
-            Ok(())
-        };
-
-        // Helper function to check that a relation file exists, and a corresponding
-        // <filename>.0.old file does not.
-        let assert_exists = |filename: &str| {
-            let path = timeline_path.join(filename);
-            assert!(path.exists(), "file {} was removed", filename);
-
-            // Check that there is no .old file
-            let backup_path = timeline_path.join(format!("{}.0.old", filename));
-            assert!(
-                !backup_path.exists(),
-                "unexpected backup file {}",
-                backup_path.display()
+                if base_img.is_some() {
+                    "base image"
+                } else {
+                    "no base image"
+                },
+                records.len()
            );
-        };
-
-        // Helper function to check that a relation file does *not* exists, and a corresponding
-        // <filename>.<num>.old file does.
-        let assert_is_renamed = |filename: &str, num: u32| {
-            let path = timeline_path.join(filename);
-            assert!(
-                !path.exists(),
-                "file {} was not removed as expected",
-                filename
-            );
-
-            let backup_path = timeline_path.join(format!("{}.{}.old", filename, num));
-            assert!(
-                backup_path.exists(),
-                "backup file {} was not created",
-                backup_path.display()
-            );
-        };
-
-        // These files are considered to be in the future and will be renamed out
-        // of the way
-        let future_filenames = vec![
-            format!("pg_control_0_{:016X}", 0x8001),
-            format!("pg_control_0_{:016X}_{:016X}", 0x8001, 0x8008),
-        ];
-        // But these are not:
-        let past_filenames = vec![
-            format!("pg_control_0_{:016X}", 0x8000),
-            format!("pg_control_0_{:016X}_{:016X}", 0x7000, 0x8001),
-        ];
-
-        for filename in future_filenames.iter().chain(past_filenames.iter()) {
-            make_empty_file(filename)?;
+            println!("{}", s);
+            Ok(TEST_IMG(&s))
        }
-
-        // Load the timeline. This will cause the files in the "future" to be renamed
-        // away.
-        let new_repo = harness.load();
-        new_repo.get_timeline(TIMELINE_ID).unwrap();
-        drop(new_repo);
-
-        for filename in future_filenames.iter() {
-            assert_is_renamed(filename, 0);
-        }
-        for filename in past_filenames.iter() {
-            assert_exists(filename);
-        }
-
-        // Create the future files again, and load again. They should be renamed to
-        // *.1.old this time.
-        for filename in future_filenames.iter() {
-            make_empty_file(filename)?;
-        }
-
-        let new_repo = harness.load();
-        new_repo.get_timeline(TIMELINE_ID).unwrap();
-        drop(new_repo);
-
-        for filename in future_filenames.iter() {
-            assert_is_renamed(filename, 0);
-            assert_is_renamed(filename, 1);
-        }
-        for filename in past_filenames.iter() {
-            assert_exists(filename);
-        }
-
-        Ok(())
    }
 }
--- a/pageserver/src/restore_local_repo.rs
+++ b/pageserver/src/restore_local_repo.rs
@@ -2,17 +2,17 @@
 //! Import data and WAL from a PostgreSQL data directory and WAL segments into
 //! zenith Timeline.
 //!
+use log::*;
 use postgres_ffi::nonrelfile_utils::clogpage_precedes;
 use postgres_ffi::nonrelfile_utils::slru_may_delete_clogsegment;
 use std::cmp::min;
 use std::fs;
 use std::fs::File;
-use std::io::{Read, Seek, SeekFrom};
-use std::path::{Path, PathBuf};
+use std::io::Read;
+use std::path::Path;

-use anyhow::{anyhow, bail, Result};
-use bytes::{Buf, Bytes, BytesMut};
-use tracing::*;
+use anyhow::Result;
+use bytes::{Buf, Bytes};

 use crate::relish::*;
 use crate::repository::*;
@@ -34,11 +34,9 @@ static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]);
 ///
 pub fn import_timeline_from_postgres_datadir(
    path: &Path,
-    writer: &dyn TimelineWriter,
+    timeline: &dyn Timeline,
    lsn: Lsn,
 ) -> Result<()> {
-    let mut pg_control: Option<ControlFileData> = None;
-
    // Scan 'global'
    for direntry in fs::read_dir(path.join("global"))? {
        let direntry = direntry?;
@@ -46,10 +44,10 @@ pub fn import_timeline_from_postgres_datadir(
            None => continue,

            Some("pg_control") => {
-                pg_control = Some(import_control_file(writer, lsn, &direntry.path())?);
+                import_control_file(timeline, lsn, &direntry.path())?;
            }
            Some("pg_filenode.map") => import_nonrel_file(
-                writer,
+                timeline,
                lsn,
                RelishTag::FileNodeMap {
                    spcnode: pg_constants::GLOBALTABLESPACE_OID,
@@ -61,7 +59,7 @@ pub fn import_timeline_from_postgres_datadir(
            // Load any relation files into the page server
            _ => import_relfile(
                &direntry.path(),
-                writer,
+                timeline,
                lsn,
                pg_constants::GLOBALTABLESPACE_OID,
                0,
@@ -88,7 +86,7 @@ pub fn import_timeline_from_postgres_datadir(

                Some("PG_VERSION") => continue,
                Some("pg_filenode.map") => import_nonrel_file(
-                    writer,
+                    timeline,
                    lsn,
                    RelishTag::FileNodeMap {
                        spcnode: pg_constants::DEFAULTTABLESPACE_OID,
@@ -100,7 +98,7 @@ pub fn import_timeline_from_postgres_datadir(
                // Load any relation files into the page server
                _ => import_relfile(
                    &direntry.path(),
-                    writer,
+                    timeline,
                    lsn,
                    pg_constants::DEFAULTTABLESPACE_OID,
                    dboid,
@@ -110,36 +108,24 @@ pub fn import_timeline_from_postgres_datadir(
    }
    for entry in fs::read_dir(path.join("pg_xact"))? {
        let entry = entry?;
-        import_slru_file(writer, lsn, SlruKind::Clog, &entry.path())?;
+        import_slru_file(timeline, lsn, SlruKind::Clog, &entry.path())?;
    }
    for entry in fs::read_dir(path.join("pg_multixact").join("members"))? {
        let entry = entry?;
-        import_slru_file(writer, lsn, SlruKind::MultiXactMembers, &entry.path())?;
+        import_slru_file(timeline, lsn, SlruKind::MultiXactMembers, &entry.path())?;
    }
    for entry in fs::read_dir(path.join("pg_multixact").join("offsets"))? {
        let entry = entry?;
-        import_slru_file(writer, lsn, SlruKind::MultiXactOffsets, &entry.path())?;
+        import_slru_file(timeline, lsn, SlruKind::MultiXactOffsets, &entry.path())?;
    }
    for entry in fs::read_dir(path.join("pg_twophase"))? {
        let entry = entry?;
        let xid = u32::from_str_radix(entry.path().to_str().unwrap(), 16)?;
-        import_nonrel_file(writer, lsn, RelishTag::TwoPhase { xid }, &entry.path())?;
+        import_nonrel_file(timeline, lsn, RelishTag::TwoPhase { xid }, &entry.path())?;
    }
    // TODO: Scan pg_tblspc

-    writer.advance_last_record_lsn(lsn);
-
-    // Import WAL. This is needed even when starting from a shutdown checkpoint, because
-    // this reads the checkpoint record itself, advancing the tip of the timeline to
-    // *after* the checkpoint record. And crucially, it initializes the 'prev_lsn'
-    let pg_control = pg_control.ok_or_else(|| anyhow!("pg_control file not found"))?;
-    import_wal(
-        &path.join("pg_wal"),
-        writer,
-        Lsn(pg_control.checkPointCopy.redo),
-        lsn,
-        &mut pg_control.checkPointCopy.clone(),
-    )?;
+    timeline.advance_last_record_lsn(lsn);

    Ok(())
 }
@@ -147,13 +133,12 @@ pub fn import_timeline_from_postgres_datadir(
 // subroutine of import_timeline_from_postgres_datadir(), to load one relation file.
 fn import_relfile(
    path: &Path,
-    timeline: &dyn TimelineWriter,
+    timeline: &dyn Timeline,
    lsn: Lsn,
    spcoid: Oid,
    dboid: Oid,
 ) -> Result<()> {
    // Does it look like a relation file?
-    trace!("importing rel file {}", path.display());

    let p = parse_relfilename(path.file_name().unwrap().to_str().unwrap());
    if let Err(e) = p {
@@ -181,14 +166,15 @@ fn import_relfile(
            }

            // TODO: UnexpectedEof is expected
-            Err(err) => match err.kind() {
+            Err(e) => match e.kind() {
                std::io::ErrorKind::UnexpectedEof => {
                    // reached EOF. That's expected.
                    // FIXME: maybe check that we read the full length of the file?
                    break;
                }
                _ => {
-                    bail!("error reading file {}: {:#}", path.display(), err);
+                    error!("error reading file: {:?} ({})", path, e);
+                    break;
                }
            },
        };
@@ -205,7 +191,7 @@ fn import_relfile(
 /// are just slurped into the repository as one blob.
 ///
 fn import_nonrel_file(
-    timeline: &dyn TimelineWriter,
+    timeline: &dyn Timeline,
    lsn: Lsn,
    tag: RelishTag,
    path: &Path,
@@ -215,7 +201,7 @@ fn import_nonrel_file(
    // read the whole file
    file.read_to_end(&mut buffer)?;

-    trace!("importing non-rel file {}", path.display());
+    info!("importing non-rel file {}", path.display());

    timeline.put_page_image(tag, 0, lsn, Bytes::copy_from_slice(&buffer[..]))?;
    Ok(())
@@ -226,17 +212,13 @@ fn import_nonrel_file(
 ///
 /// The control file is imported as is, but we also extract the checkpoint record
 /// from it and store it separated.
-fn import_control_file(
-    timeline: &dyn TimelineWriter,
-    lsn: Lsn,
-    path: &Path,
-) -> Result<ControlFileData> {
+fn import_control_file(timeline: &dyn Timeline, lsn: Lsn, path: &Path) -> Result<()> {
    let mut file = File::open(path)?;
    let mut buffer = Vec::new();
    // read the whole file
    file.read_to_end(&mut buffer)?;

-    trace!("importing control file {}", path.display());
+    info!("importing control file {}", path.display());

    // Import it as ControlFile
    timeline.put_page_image(
@@ -251,24 +233,19 @@ fn import_control_file(
    let checkpoint_bytes = pg_control.checkPointCopy.encode();
    timeline.put_page_image(RelishTag::Checkpoint, 0, lsn, checkpoint_bytes)?;

-    Ok(pg_control)
+    Ok(())
 }

 ///
 /// Import an SLRU segment file
 ///
-fn import_slru_file(
-    timeline: &dyn TimelineWriter,
-    lsn: Lsn,
-    slru: SlruKind,
-    path: &Path,
-) -> Result<()> {
+fn import_slru_file(timeline: &dyn Timeline, lsn: Lsn, slru: SlruKind, path: &Path) -> Result<()> {
    // Does it look like an SLRU file?
    let mut file = File::open(path)?;
    let mut buf: [u8; 8192] = [0u8; 8192];
    let segno = u32::from_str_radix(path.file_name().unwrap().to_str().unwrap(), 16)?;

-    trace!("importing slru file {}", path.display());
+    info!("importing slru file {}", path.display());

    let mut rpageno = 0;
    loop {
@@ -284,14 +261,15 @@ fn import_slru_file(
            }

            // TODO: UnexpectedEof is expected
-            Err(err) => match err.kind() {
+            Err(e) => match e.kind() {
                std::io::ErrorKind::UnexpectedEof => {
                    // reached EOF. That's expected.
                    // FIXME: maybe check that we read the full length of the file?
                    break;
                }
                _ => {
-                    bail!("error reading file {}: {:#}", path.display(), err);
+                    error!("error reading file: {:?} ({})", path, e);
+                    break;
                }
            },
        };
@@ -303,119 +281,19 @@ fn import_slru_file(
    Ok(())
 }

-/// Scan PostgreSQL WAL files in given directory and load all records between
-/// 'startpoint' and 'endpoint' into the repository.
-fn import_wal(
-    walpath: &Path,
-    timeline: &dyn TimelineWriter,
-    startpoint: Lsn,
-    endpoint: Lsn,
-    checkpoint: &mut CheckPoint,
-) -> Result<()> {
-    let mut waldecoder = WalStreamDecoder::new(startpoint);
-
-    let mut segno = startpoint.segment_number(pg_constants::WAL_SEGMENT_SIZE);
-    let mut offset = startpoint.segment_offset(pg_constants::WAL_SEGMENT_SIZE);
-    let mut last_lsn = startpoint;
-
-    while last_lsn <= endpoint {
-        // FIXME: assume postgresql tli 1 for now
-        let filename = XLogFileName(1, segno, pg_constants::WAL_SEGMENT_SIZE);
-        let mut buf = Vec::new();
-
-        // Read local file
-        let mut path = walpath.join(&filename);
-
-        // It could be as .partial
-        if !PathBuf::from(&path).exists() {
-            path = walpath.join(filename + ".partial");
-        }
-
-        // Slurp the WAL file
-        let mut file = File::open(&path)?;
-
-        if offset > 0 {
-            file.seek(SeekFrom::Start(offset as u64))?;
-        }
-
-        let nread = file.read_to_end(&mut buf)?;
-        if nread != pg_constants::WAL_SEGMENT_SIZE - offset as usize {
-            // Maybe allow this for .partial files?
-            error!("read only {} bytes from WAL file", nread);
-        }
-
-        waldecoder.feed_bytes(&buf);
-
-        let mut nrecords = 0;
-        while last_lsn <= endpoint {
-            if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
-                let mut checkpoint_modified = false;
-
-                let decoded = decode_wal_record(recdata.clone());
-                save_decoded_record(
-                    checkpoint,
-                    &mut checkpoint_modified,
-                    timeline,
-                    &decoded,
-                    recdata,
-                    lsn,
-                )?;
-                last_lsn = lsn;
-
-                if checkpoint_modified {
-                    let checkpoint_bytes = checkpoint.encode();
-                    timeline.put_page_image(
-                        RelishTag::Checkpoint,
-                        0,
-                        last_lsn,
-                        checkpoint_bytes,
-                    )?;
-                }
-
-                // Now that this record has been fully handled, including updating the
-                // checkpoint data, let the repository know that it is up-to-date to this LSN
-                timeline.advance_last_record_lsn(last_lsn);
-                nrecords += 1;
-
-                trace!("imported record at {} (end {})", lsn, endpoint);
-            }
-        }
-
-        debug!("imported {} records up to {}", nrecords, last_lsn);
-
-        segno += 1;
-        offset = 0;
-    }
-
-    if last_lsn != startpoint {
-        debug!(
-            "reached end of WAL at {}, updating checkpoint info",
-            last_lsn
-        );
-
-        timeline.advance_last_record_lsn(last_lsn);
-    } else {
-        info!("no WAL to import at {}", last_lsn);
-    }
-
-    Ok(())
-}
-
 ///
 /// Helper function to parse a WAL record and call the Timeline's PUT functions for all the
 /// relations/pages that the record affects.
 ///
 pub fn save_decoded_record(
    checkpoint: &mut CheckPoint,
-    checkpoint_modified: &mut bool,
-    timeline: &dyn TimelineWriter,
+    timeline: &dyn Timeline,
    decoded: &DecodedWALRecord,
    recdata: Bytes,
    lsn: Lsn,
 ) -> Result<()> {
-    if checkpoint.update_next_xid(decoded.xl_xid) {
-        *checkpoint_modified = true;
-    }
+    checkpoint.update_next_xid(decoded.xl_xid);
+
    // Iterate through all the blocks that the record modifies, and
    // "put" a separate copy of the record for each block.
    for blk in decoded.blocks.iter() {
@@ -426,43 +304,14 @@ pub fn save_decoded_record(
            forknum: blk.forknum as u8,
        });

-        //
-        // Instead of storing full-page-image WAL record,
-        // it is better to store extracted image: we can skip wal-redo
-        // in this case. Also some FPI records may contain multiple (up to 32) pages,
-        // so them have to be copied multiple times.
-        //
-        if blk.apply_image
-            && blk.has_image
-            && decoded.xl_rmid == pg_constants::RM_XLOG_ID
-            && (decoded.xl_info == pg_constants::XLOG_FPI
-                || decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT)
-            // compression of WAL is not yet supported: fall back to storing the original WAL record
-            && (blk.bimg_info & pg_constants::BKPIMAGE_IS_COMPRESSED) == 0
-        {
-            // Extract page image from FPI record
-            let img_len = blk.bimg_len as usize;
-            let img_offs = blk.bimg_offset as usize;
-            let mut image = BytesMut::with_capacity(pg_constants::BLCKSZ as usize);
-            image.extend_from_slice(&recdata[img_offs..img_offs + img_len]);
+        let rec = WALRecord {
+            lsn,
+            will_init: blk.will_init || blk.apply_image,
+            rec: recdata.clone(),
+            main_data_offset: decoded.main_data_offset as u32,
+        };

-            if blk.hole_length != 0 {
-                let tail = image.split_off(blk.hole_offset as usize);
-                image.resize(image.len() + blk.hole_length as usize, 0u8);
-                image.unsplit(tail);
-            }
-            image[0..4].copy_from_slice(&((lsn.0 >> 32) as u32).to_le_bytes());
-            image[4..8].copy_from_slice(&(lsn.0 as u32).to_le_bytes());
-            assert_eq!(image.len(), pg_constants::BLCKSZ as usize);
-            timeline.put_page_image(tag, blk.blkno, lsn, image.freeze())?;
-        } else {
-            let rec = WALRecord {
-                will_init: blk.will_init || blk.apply_image,
-                rec: recdata.clone(),
-                main_data_offset: decoded.main_data_offset as u32,
-            };
-            timeline.put_wal_record(lsn, tag, blk.blkno, rec)?;
-        }
+        timeline.put_wal_record(tag, blk.blkno, rec)?;
    }

    let mut buf = decoded.record.clone();
@@ -527,7 +376,7 @@ pub fn save_decoded_record(
        } else {
            assert!(info == pg_constants::CLOG_TRUNCATE);
            let xlrec = XlClogTruncate::decode(&mut buf);
-            save_clog_truncate_record(checkpoint, checkpoint_modified, timeline, lsn, &xlrec)?;
+            save_clog_truncate_record(checkpoint, timeline, lsn, &xlrec)?;
        }
    } else if decoded.xl_rmid == pg_constants::RM_XACT_ID {
        let info = decoded.xl_info & pg_constants::XLOG_XACT_OPMASK;
@@ -596,17 +445,10 @@ pub fn save_decoded_record(
            )?;
        } else if info == pg_constants::XLOG_MULTIXACT_CREATE_ID {
            let xlrec = XlMultiXactCreate::decode(&mut buf);
-            save_multixact_create_record(
-                checkpoint,
-                checkpoint_modified,
-                timeline,
-                lsn,
-                &xlrec,
-                decoded,
-            )?;
+            save_multixact_create_record(checkpoint, timeline, lsn, &xlrec, decoded)?;
        } else if info == pg_constants::XLOG_MULTIXACT_TRUNCATE_ID {
            let xlrec = XlMultiXactTruncate::decode(&mut buf);
-            save_multixact_truncate_record(checkpoint, checkpoint_modified, timeline, lsn, &xlrec)?;
+            save_multixact_truncate_record(checkpoint, timeline, lsn, &xlrec)?;
        }
    } else if decoded.xl_rmid == pg_constants::RM_RELMAP_ID {
        let xlrec = XlRelmapUpdate::decode(&mut buf);
@@ -615,10 +457,7 @@ pub fn save_decoded_record(
        let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
        if info == pg_constants::XLOG_NEXTOID {
            let next_oid = buf.get_u32_le();
-            if checkpoint.nextOid != next_oid {
-                checkpoint.nextOid = next_oid;
-                *checkpoint_modified = true;
-            }
+            checkpoint.nextOid = next_oid;
        } else if info == pg_constants::XLOG_CHECKPOINT_ONLINE
            || info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN
        {
@@ -634,7 +473,6 @@ pub fn save_decoded_record(
            );
            if (checkpoint.oldestXid.wrapping_sub(xlog_checkpoint.oldestXid) as i32) < 0 {
                checkpoint.oldestXid = xlog_checkpoint.oldestXid;
-                *checkpoint_modified = true;
            }
        }
    }
@@ -642,11 +480,7 @@ pub fn save_decoded_record(
 }

 /// Subroutine of save_decoded_record(), to handle an XLOG_DBASE_CREATE record.
-fn save_xlog_dbase_create(
-    timeline: &dyn TimelineWriter,
-    lsn: Lsn,
-    rec: &XlCreateDatabase,
-) -> Result<()> {
+fn save_xlog_dbase_create(timeline: &dyn Timeline, lsn: Lsn, rec: &XlCreateDatabase) -> Result<()> {
    let db_id = rec.db_id;
    let tablespace_id = rec.tablespace_id;
    let src_db_id = rec.src_db_id;
@@ -723,11 +557,7 @@ fn save_xlog_dbase_create(
 /// Subroutine of save_decoded_record(), to handle an XLOG_SMGR_TRUNCATE record.
 ///
 /// This is the same logic as in PostgreSQL's smgr_redo() function.
-fn save_xlog_smgr_truncate(
-    timeline: &dyn TimelineWriter,
-    lsn: Lsn,
-    rec: &XlSmgrTruncate,
-) -> Result<()> {
+fn save_xlog_smgr_truncate(timeline: &dyn Timeline, lsn: Lsn, rec: &XlSmgrTruncate) -> Result<()> {
    let spcnode = rec.rnode.spcnode;
    let dbnode = rec.rnode.dbnode;
    let relnode = rec.rnode.relnode;
@@ -789,7 +619,7 @@ fn save_xlog_smgr_truncate(
 /// Subroutine of save_decoded_record(), to handle an XLOG_XACT_* records.
 ///
 fn save_xact_record(
-    timeline: &dyn TimelineWriter,
+    timeline: &dyn Timeline,
    lsn: Lsn,
    parsed: &XlXactParsedRecord,
    decoded: &DecodedWALRecord,
@@ -800,12 +630,12 @@ fn save_xact_record(
    let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
    let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
    let rec = WALRecord {
+        lsn,
        will_init: false,
        rec: decoded.record.clone(),
        main_data_offset: decoded.main_data_offset as u32,
    };
    timeline.put_wal_record(
-        lsn,
        RelishTag::Slru {
            slru: SlruKind::Clog,
            segno,
@@ -821,7 +651,6 @@ fn save_xact_record(
            let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
            let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
            timeline.put_wal_record(
-                lsn,
                RelishTag::Slru {
                    slru: SlruKind::Clog,
                    segno,
@@ -847,8 +676,7 @@ fn save_xact_record(

 fn save_clog_truncate_record(
    checkpoint: &mut CheckPoint,
-    checkpoint_modified: &mut bool,
-    timeline: &dyn TimelineWriter,
+    timeline: &dyn Timeline,
    lsn: Lsn,
    xlrec: &XlClogTruncate,
 ) -> Result<()> {
@@ -866,7 +694,6 @@ fn save_clog_truncate_record(
    // TODO Figure out if there will be any issues with replica.
    checkpoint.oldestXid = xlrec.oldest_xid;
    checkpoint.oldestXidDB = xlrec.oldest_xid_db;
-    *checkpoint_modified = true;

    // TODO Treat AdvanceOldestClogXid() or write a comment why we don't need it

@@ -909,13 +736,13 @@ fn save_clog_truncate_record(

 fn save_multixact_create_record(
    checkpoint: &mut CheckPoint,
-    checkpoint_modified: &mut bool,
-    timeline: &dyn TimelineWriter,
+    timeline: &dyn Timeline,
    lsn: Lsn,
    xlrec: &XlMultiXactCreate,
    decoded: &DecodedWALRecord,
 ) -> Result<()> {
    let rec = WALRecord {
+        lsn,
        will_init: false,
        rec: decoded.record.clone(),
        main_data_offset: decoded.main_data_offset as u32,
@@ -924,7 +751,6 @@ fn save_multixact_create_record(
    let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
    let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
    timeline.put_wal_record(
-        lsn,
        RelishTag::Slru {
            slru: SlruKind::MultiXactOffsets,
            segno,
@@ -944,7 +770,6 @@ fn save_multixact_create_record(
        let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
        let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
        timeline.put_wal_record(
-            lsn,
            RelishTag::Slru {
                slru: SlruKind::MultiXactMembers,
                segno,
@@ -967,11 +792,9 @@ fn save_multixact_create_record(
    }
    if xlrec.mid >= checkpoint.nextMulti {
        checkpoint.nextMulti = xlrec.mid + 1;
-        *checkpoint_modified = true;
    }
    if xlrec.moff + xlrec.nmembers > checkpoint.nextMultiOffset {
        checkpoint.nextMultiOffset = xlrec.moff + xlrec.nmembers;
-        *checkpoint_modified = true;
    }
    let max_mbr_xid = xlrec.members.iter().fold(0u32, |acc, mbr| {
        if mbr.xid.wrapping_sub(acc) as i32 > 0 {
@@ -981,22 +804,18 @@ fn save_multixact_create_record(
        }
    });

-    if checkpoint.update_next_xid(max_mbr_xid) {
-        *checkpoint_modified = true;
-    }
+    checkpoint.update_next_xid(max_mbr_xid);
    Ok(())
 }

 fn save_multixact_truncate_record(
    checkpoint: &mut CheckPoint,
-    checkpoint_modified: &mut bool,
-    timeline: &dyn TimelineWriter,
+    timeline: &dyn Timeline,
    lsn: Lsn,
    xlrec: &XlMultiXactTruncate,
 ) -> Result<()> {
    checkpoint.oldestMulti = xlrec.end_trunc_off;
    checkpoint.oldestMultiDB = xlrec.oldest_multi_db;
-    *checkpoint_modified = true;

    // PerformMembersTruncation
    let maxsegment: i32 = mx_offset_to_member_segment(pg_constants::MAX_MULTIXACT_OFFSET);
@@ -1030,7 +849,7 @@ fn save_multixact_truncate_record(
 }

 fn save_relmap_page(
-    timeline: &dyn TimelineWriter,
+    timeline: &dyn Timeline,
    lsn: Lsn,
    xlrec: &XlRelmapUpdate,
    decoded: &DecodedWALRecord,
--- a/pageserver/src/tenant_mgr.rs
+++ b/pageserver/src/tenant_mgr.rs
@@ -4,267 +4,74 @@
 use crate::branches;
 use crate::layered_repository::LayeredRepository;
 use crate::repository::{Repository, Timeline};
-use crate::tenant_threads;
 use crate::walredo::PostgresRedoManager;
 use crate::PageServerConf;
 use anyhow::{anyhow, bail, Context, Result};
 use lazy_static::lazy_static;
-use log::*;
-use serde::{Deserialize, Serialize};
+use log::info;
 use std::collections::HashMap;
-use std::fmt;
 use std::fs;
 use std::str::FromStr;
-use std::sync::atomic::{AtomicBool, Ordering};
-use std::sync::{Arc, Mutex, MutexGuard};
+use std::sync::{Arc, Mutex};
 use zenith_utils::zid::{ZTenantId, ZTimelineId};

 lazy_static! {
-    static ref TENANTS: Mutex<HashMap<ZTenantId, Tenant>> = Mutex::new(HashMap::new());
+    pub static ref REPOSITORY: Mutex<HashMap<ZTenantId, Arc<dyn Repository>>> =
+        Mutex::new(HashMap::new());
 }

-struct Tenant {
-    state: TenantState,
-    repo: Option<Arc<dyn Repository>>,
-}
-
-#[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq, Eq)]
-pub enum TenantState {
-    // This tenant only exists in cloud storage. It cannot be accessed.
-    CloudOnly,
-    // This tenant exists in cloud storage, and we are currently downloading it to local disk.
-    // It cannot be accessed yet, not until it's been fully downloaded to local disk.
-    Downloading,
-    // All data for this tenant is complete on local disk, but we haven't loaded the Repository,
-    // Timeline and Layer structs into memory yet, so it cannot be accessed yet.
-    //Ready,
-    // This tenant exists on local disk, and the layer map has been loaded into memory.
-    // The local disk might have some newer files that don't exist in cloud storage yet.
-    Active,
-    // Tenant is active, but there is no walreceiver connection.
-    Idle,
-    // This tenant exists on local disk, and the layer map has been loaded into memory.
-    // The local disk might have some newer files that don't exist in cloud storage yet.
-    // The tenant cannot be accessed anymore for any reason, but graceful shutdown.
-    Stopping,
-}
-
-/// A remote storage timeline synchronization event, that needs another step
-/// to be fully completed.
-#[derive(Debug)]
-pub enum PostTimelineSyncStep {
-    /// The timeline cannot be synchronized anymore due to some sync issues.
-    /// Needs to be removed from pageserver, to avoid further data diverging.
-    Evict,
-    /// A new timeline got downloaded and needs to be loaded into pageserver.
-    RegisterDownload,
-}
-
-impl fmt::Display for TenantState {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        match self {
-            TenantState::CloudOnly => f.write_str("CloudOnly"),
-            TenantState::Downloading => f.write_str("Downloading"),
-            TenantState::Active => f.write_str("Active"),
-            TenantState::Idle => f.write_str("Idle"),
-            TenantState::Stopping => f.write_str("Stopping"),
-        }
-    }
-}
-
-fn access_tenants() -> MutexGuard<'static, HashMap<ZTenantId, Tenant>> {
-    TENANTS.lock().unwrap()
-}
-
-static SHUTDOWN_REQUESTED: AtomicBool = AtomicBool::new(false);
-
 pub fn init(conf: &'static PageServerConf) {
+    let mut m = REPOSITORY.lock().unwrap();
+
    for dir_entry in fs::read_dir(conf.tenants_path()).unwrap() {
        let tenantid =
            ZTenantId::from_str(dir_entry.unwrap().file_name().to_str().unwrap()).unwrap();

-        {
-            let mut m = access_tenants();
-            let tenant = Tenant {
-                state: TenantState::CloudOnly,
-                repo: None,
-            };
-            m.insert(tenantid, tenant);
-        }
+        // Set up a WAL redo manager, for applying WAL records.
+        let walredo_mgr = PostgresRedoManager::new(conf, tenantid);
+
+        // Set up an object repository, for actual data storage.
+        let repo = Arc::new(LayeredRepository::new(
+            conf,
+            Arc::new(walredo_mgr),
+            tenantid,
+        ));
+        LayeredRepository::launch_checkpointer_thread(conf, repo.clone());
+        LayeredRepository::launch_gc_thread(conf, repo.clone());

-        init_repo(conf, tenantid);
        info!("initialized storage for tenant: {}", &tenantid);
+        m.insert(tenantid, repo);
    }
 }

-fn init_repo(conf: &'static PageServerConf, tenant_id: ZTenantId) {
-    // Set up a WAL redo manager, for applying WAL records.
-    let walredo_mgr = PostgresRedoManager::new(conf, tenant_id);
-
-    // Set up an object repository, for actual data storage.
-    let repo = Arc::new(LayeredRepository::new(
-        conf,
-        Arc::new(walredo_mgr),
-        tenant_id,
-        false,
-    ));
-
-    let mut m = access_tenants();
-    let tenant = m.get_mut(&tenant_id).unwrap();
-    tenant.repo = Some(repo);
-    tenant.state = TenantState::Idle;
-}
-
-pub fn perform_post_timeline_sync_steps(
-    conf: &'static PageServerConf,
-    post_sync_steps: HashMap<(ZTenantId, ZTimelineId), PostTimelineSyncStep>,
-) {
-    if post_sync_steps.is_empty() {
-        return;
-    }
-
-    info!("Performing {} post-sync steps", post_sync_steps.len());
-    trace!("Steps: {:?}", post_sync_steps);
-
-    {
-        let mut m = access_tenants();
-        for &(tenant_id, timeline_id) in post_sync_steps.keys() {
-            let tenant = m.entry(tenant_id).or_insert_with(|| Tenant {
-                state: TenantState::Downloading,
-                repo: None,
-            });
-            tenant.state = TenantState::Downloading;
-            match &tenant.repo {
-                Some(repo) => {
-                    init_timeline(repo.as_ref(), timeline_id);
-                    tenant.state = TenantState::Idle;
-                    return;
-                }
-                None => log::warn!("Initialize new repo"),
-            }
-            tenant.state = TenantState::Idle;
-        }
-    }
-
-    for ((tenant_id, timeline_id), post_sync_step) in post_sync_steps {
-        match post_sync_step {
-            PostTimelineSyncStep::Evict => {
-                if let Err(e) = get_repository_for_tenant(tenant_id)
-                    .and_then(|repo| repo.unload_timeline(timeline_id))
-                {
-                    error!(
-                        "Failed to remove repository for tenant {}, timeline {}: {:#}",
-                        tenant_id, timeline_id, e
-                    )
-                }
-            }
-            PostTimelineSyncStep::RegisterDownload => {
-                // init repo updates Tenant state
-                init_repo(conf, tenant_id);
-                let new_repo = get_repository_for_tenant(tenant_id).unwrap();
-                init_timeline(new_repo.as_ref(), timeline_id);
-            }
-        }
-    }
-}
-
-fn init_timeline(repo: &dyn Repository, timeline_id: ZTimelineId) {
-    match repo.get_timeline(timeline_id) {
-        Ok(_timeline) => log::info!("Successfully initialized timeline {}", timeline_id),
-        Err(e) => log::error!("Failed to init timeline {}, reason: {:#}", timeline_id, e),
-    }
-}
-
-// Check this flag in the thread loops to know when to exit
-pub fn shutdown_requested() -> bool {
-    SHUTDOWN_REQUESTED.load(Ordering::Relaxed)
-}
-
-pub fn shutdown_all_tenants() -> Result<()> {
-    SHUTDOWN_REQUESTED.swap(true, Ordering::Relaxed);
-
-    let tenantids = list_tenantids()?;
-
-    for tenantid in &tenantids {
-        set_tenant_state(*tenantid, TenantState::Stopping)?;
-    }
-
-    for tenantid in tenantids {
-        // Wait for checkpointer and GC to finish their job
-        tenant_threads::wait_for_tenant_threads_to_stop(tenantid);
-
-        let repo = get_repository_for_tenant(tenantid)?;
-        debug!("shutdown tenant {}", tenantid);
-        repo.shutdown()?;
-    }
-    Ok(())
-}
-
 pub fn create_repository_for_tenant(
    conf: &'static PageServerConf,
    tenantid: ZTenantId,
 ) -> Result<()> {
-    {
-        let mut m = access_tenants();
-        // First check that the tenant doesn't exist already
-        if m.get(&tenantid).is_some() {
-            bail!("tenant {} already exists", tenantid);
-        }
-        let tenant = Tenant {
-            state: TenantState::CloudOnly,
-            repo: None,
-        };
-        m.insert(tenantid, tenant);
-    }
+    let mut m = REPOSITORY.lock().unwrap();

+    // First check that the tenant doesn't exist already
+    if m.get(&tenantid).is_some() {
+        bail!("tenant {} already exists", tenantid);
+    }
    let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenantid));
    let repo = branches::create_repo(conf, tenantid, wal_redo_manager)?;

-    let mut m = access_tenants();
-    let tenant = m.get_mut(&tenantid).unwrap();
-    tenant.repo = Some(repo);
-    tenant.state = TenantState::Idle;
+    m.insert(tenantid, repo);

    Ok(())
 }

-// If tenant is not found in the repository, return CloudOnly state
-pub fn get_tenant_state(tenantid: ZTenantId) -> TenantState {
-    let m = access_tenants();
-    match m.get(&tenantid) {
-        Some(tenant) => tenant.state,
-        None => TenantState::CloudOnly,
-    }
-}
-
-pub fn set_tenant_state(tenantid: ZTenantId, newstate: TenantState) -> Result<TenantState> {
-    let mut m = access_tenants();
-    let tenant = m.get_mut(&tenantid);
-
-    match tenant {
-        Some(tenant) => {
-            if newstate == TenantState::Idle && tenant.state != TenantState::Active {
-                // Only Active tenant can become Idle
-                return Ok(tenant.state);
-            }
-            info!("set_tenant_state: {} -> {}", tenant.state, newstate);
-            tenant.state = newstate;
-            Ok(tenant.state)
-        }
-        None => bail!("Tenant not found for tenant {}", tenantid),
-    }
+pub fn insert_repository_for_tenant(tenantid: ZTenantId, repo: Arc<dyn Repository>) {
+    let o = &mut REPOSITORY.lock().unwrap();
+    o.insert(tenantid, repo);
 }

 pub fn get_repository_for_tenant(tenantid: ZTenantId) -> Result<Arc<dyn Repository>> {
-    let m = access_tenants();
-    let tenant = m
-        .get(&tenantid)
-        .ok_or_else(|| anyhow!("Tenant not found for tenant {}", tenantid))?;
-
-    match &tenant.repo {
-        Some(repo) => Ok(Arc::clone(repo)),
-        None => anyhow::bail!("Repository for tenant {} is not yet valid", tenantid),
-    }
+    let o = &REPOSITORY.lock().unwrap();
+    o.get(&tenantid)
+        .map(|repo| Arc::clone(repo))
+        .ok_or_else(|| anyhow!("repository not found for tenant name {}", tenantid))
 }

 pub fn get_timeline_for_tenant(
@@ -275,33 +82,3 @@ pub fn get_timeline_for_tenant(
        .get_timeline(timelineid)
        .with_context(|| format!("cannot fetch timeline {}", timelineid))
 }
-
-fn list_tenantids() -> Result<Vec<ZTenantId>> {
-    let m = access_tenants();
-    m.iter()
-        .map(|v| {
-            let (tenantid, _) = v;
-            Ok(*tenantid)
-        })
-        .collect()
-}
-
-#[derive(Serialize, Deserialize, Clone)]
-pub struct TenantInfo {
-    #[serde(with = "hex")]
-    pub id: ZTenantId,
-    pub state: TenantState,
-}
-
-pub fn list_tenants() -> Result<Vec<TenantInfo>> {
-    let m = access_tenants();
-    m.iter()
-        .map(|v| {
-            let (id, tenant) = v;
-            Ok(TenantInfo {
-                id: *id,
-                state: tenant.state,
-            })
-        })
-        .collect()
-}
--- a/pageserver/src/tenant_threads.rs
+++ b/pageserver/src/tenant_threads.rs
@@ -1,149 +0,0 @@
-//! This module contains functions to serve per-tenant background processes,
-//! such as checkpointer and GC
-use crate::tenant_mgr;
-use crate::tenant_mgr::TenantState;
-use crate::CheckpointConfig;
-use crate::PageServerConf;
-use anyhow::Result;
-use lazy_static::lazy_static;
-use std::collections::HashMap;
-use std::sync::Mutex;
-use std::thread::JoinHandle;
-use std::time::Duration;
-use tracing::*;
-use zenith_metrics::{register_int_gauge_vec, IntGaugeVec};
-use zenith_utils::zid::ZTenantId;
-
-struct TenantHandleEntry {
-    checkpointer_handle: Option<JoinHandle<()>>,
-    gc_handle: Option<JoinHandle<()>>,
-}
-
-// Preserve handles to wait for thread completion
-// at shutdown
-lazy_static! {
-    static ref TENANT_HANDLES: Mutex<HashMap<ZTenantId, TenantHandleEntry>> =
-        Mutex::new(HashMap::new());
-}
-
-lazy_static! {
-    static ref TENANT_THREADS_COUNT: IntGaugeVec = register_int_gauge_vec!(
-        "tenant_threads_count",
-        "Number of live tenant threads",
-        &["tenant_thread_type"]
-    )
-    .expect("failed to define a metric");
-}
-
-// Launch checkpointer and GC for the tenant.
-// It's possible that the threads are running already,
-// if so, just don't spawn new ones.
-pub fn start_tenant_threads(conf: &'static PageServerConf, tenantid: ZTenantId) {
-    let mut handles = TENANT_HANDLES.lock().unwrap();
-    let h = handles
-        .entry(tenantid)
-        .or_insert_with(|| TenantHandleEntry {
-            checkpointer_handle: None,
-            gc_handle: None,
-        });
-
-    if h.checkpointer_handle.is_none() {
-        h.checkpointer_handle = std::thread::Builder::new()
-            .name("Checkpointer thread".into())
-            .spawn(move || {
-                checkpoint_loop(tenantid, conf).expect("Checkpointer thread died");
-            })
-            .ok();
-    }
-
-    if h.gc_handle.is_none() {
-        h.gc_handle = std::thread::Builder::new()
-            .name("GC thread".into())
-            .spawn(move || {
-                gc_loop(tenantid, conf).expect("GC thread died");
-            })
-            .ok();
-    }
-}
-
-pub fn wait_for_tenant_threads_to_stop(tenantid: ZTenantId) {
-    let mut handles = TENANT_HANDLES.lock().unwrap();
-    if let Some(h) = handles.get_mut(&tenantid) {
-        h.checkpointer_handle.take().map(JoinHandle::join);
-        trace!("checkpointer for tenant {} has stopped", tenantid);
-        h.gc_handle.take().map(JoinHandle::join);
-        trace!("gc for tenant {} has stopped", tenantid);
-    }
-    handles.remove(&tenantid);
-}
-
-///
-/// Checkpointer thread's main loop
-///
-fn checkpoint_loop(tenantid: ZTenantId, conf: &'static PageServerConf) -> Result<()> {
-    let gauge = TENANT_THREADS_COUNT.with_label_values(&["checkpointer"]);
-    gauge.inc();
-    scopeguard::defer! {
-        gauge.dec();
-    }
-
-    loop {
-        if tenant_mgr::get_tenant_state(tenantid) != TenantState::Active {
-            break;
-        }
-
-        std::thread::sleep(conf.checkpoint_period);
-        trace!("checkpointer thread for tenant {} waking up", tenantid);
-
-        // checkpoint timelines that have accumulated more than CHECKPOINT_DISTANCE
-        // bytes of WAL since last checkpoint.
-        let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
-        repo.checkpoint_iteration(CheckpointConfig::Distance(conf.checkpoint_distance))?;
-    }
-
-    trace!(
-        "checkpointer thread stopped for tenant {} state is {}",
-        tenantid,
-        tenant_mgr::get_tenant_state(tenantid)
-    );
-    Ok(())
-}
-
-///
-/// GC thread's main loop
-///
-fn gc_loop(tenantid: ZTenantId, conf: &'static PageServerConf) -> Result<()> {
-    let gauge = TENANT_THREADS_COUNT.with_label_values(&["gc"]);
-    gauge.inc();
-    scopeguard::defer! {
-        gauge.dec();
-    }
-
-    loop {
-        if tenant_mgr::get_tenant_state(tenantid) != TenantState::Active {
-            break;
-        }
-
-        trace!("gc thread for tenant {} waking up", tenantid);
-
-        // Garbage collect old files that are not needed for PITR anymore
-        if conf.gc_horizon > 0 {
-            let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
-            repo.gc_iteration(None, conf.gc_horizon, false).unwrap();
-        }
-
-        // TODO Write it in more adequate way using
-        // condvar.wait_timeout() or something
-        let mut sleep_time = conf.gc_period.as_secs();
-        while sleep_time > 0 && tenant_mgr::get_tenant_state(tenantid) == TenantState::Active {
-            sleep_time -= 1;
-            std::thread::sleep(Duration::from_secs(1));
-        }
-    }
-    trace!(
-        "GC thread stopped for tenant {} state is {}",
-        tenantid,
-        tenant_mgr::get_tenant_state(tenantid)
-    );
-    Ok(())
-}
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -1,619 +0,0 @@
-//!
-//! VirtualFile is like a normal File, but it's not bound directly to
-//! a file descriptor. Instead, the file is opened when it's read from,
-//! and if too many files are open globally in the system, least-recently
-//! used ones are closed.
-//!
-//! To track which files have been recently used, we use the clock algorithm
-//! with a 'recently_used' flag on each slot.
-//!
-//! This is similar to PostgreSQL's virtual file descriptor facility in
-//! src/backend/storage/file/fd.c
-//!
-use std::fs::{File, OpenOptions};
-use std::io::{Error, ErrorKind, Read, Seek, SeekFrom, Write};
-use std::os::unix::fs::FileExt;
-use std::path::{Path, PathBuf};
-use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
-use std::sync::{RwLock, RwLockWriteGuard};
-
-use once_cell::sync::OnceCell;
-
-///
-/// A virtual file descriptor. You can use this just like std::fs::File, but internally
-/// the underlying file is closed if the system is low on file descriptors,
-/// and re-opened when it's accessed again.
-///
-/// Like with std::fs::File, multiple threads can read/write the file concurrently,
-/// holding just a shared reference the same VirtualFile, using the read_at() / write_at()
-/// functions from the FileExt trait. But the functions from the Read/Write/Seek traits
-/// require a mutable reference, because they modify the "current position".
-///
-/// Each VirtualFile has a physical file descriptor in the global OPEN_FILES array, at the
-/// slot that 'handle points to, if the underlying file is currently open. If it's not
-/// currently open, the 'handle' can still point to the slot where it was last kept. The
-/// 'tag' field is used to detect whether the handle still is valid or not.
-///
-pub struct VirtualFile {
-    /// Lazy handle to the global file descriptor cache. The slot that this points to
-    /// might contain our File, or it may be empty, or it may contain a File that
-    /// belongs to a different VirtualFile.
-    handle: RwLock<SlotHandle>,
-
-    /// Current file position
-    pos: u64,
-
-    /// File path and options to use to open it.
-    ///
-    /// Note: this only contains the options needed to re-open it. For example,
-    /// if a new file is created, we only pass the create flag when it's initially
-    /// opened, in the VirtualFile::create() function, and strip the flag before
-    /// storing it here.
-    pub path: PathBuf,
-    open_options: OpenOptions,
-}
-
-#[derive(PartialEq, Clone, Copy)]
-struct SlotHandle {
-    /// Index into OPEN_FILES.slots
-    index: usize,
-
-    /// Value of 'tag' in the slot. If slot's tag doesn't match, then the slot has
-    /// been recycled and no longer contains the FD for this virtual file.
-    tag: u64,
-}
-
-/// OPEN_FILES is the global array that holds the physical file descriptors that
-/// are currently open. Each slot in the array is protected by a separate lock,
-/// so that different files can be accessed independently. The lock must be held
-/// in write mode to replace the slot with a different file, but a read mode
-/// is enough to operate on the file, whether you're reading or writing to it.
-///
-/// OPEN_FILES starts in uninitialized state, and it's initialized by
-/// the virtual_file::init() function. It must be called exactly once at page
-/// server startup.
-static OPEN_FILES: OnceCell<OpenFiles> = OnceCell::new();
-
-struct OpenFiles {
-    slots: &'static [Slot],
-
-    /// clock arm for the clock algorithm
-    next: AtomicUsize,
-}
-
-struct Slot {
-    inner: RwLock<SlotInner>,
-
-    /// has this file been used since last clock sweep?
-    recently_used: AtomicBool,
-}
-
-struct SlotInner {
-    /// Counter that's incremented every time a different file is stored here.
-    /// To avoid the ABA problem.
-    tag: u64,
-
-    /// the underlying file
-    file: Option<File>,
-}
-
-impl OpenFiles {
-    /// Find a slot to use, evicting an existing file descriptor if needed.
-    ///
-    /// On return, we hold a lock on the slot, and its 'tag' has been updated
-    /// recently_used has been set. It's all ready for reuse.
-    fn find_victim_slot(&self) -> (SlotHandle, RwLockWriteGuard<SlotInner>) {
-        //
-        // Run the clock algorithm to find a slot to replace.
-        //
-        let num_slots = self.slots.len();
-        let mut retries = 0;
-        let mut slot;
-        let mut slot_guard;
-        let index;
-        loop {
-            let next = self.next.fetch_add(1, Ordering::AcqRel) % num_slots;
-            slot = &self.slots[next];
-
-            // If the recently_used flag on this slot is set, continue the clock
-            // sweep. Otherwise try to use this slot. If we cannot acquire the
-            // lock, also continue the clock sweep.
-            //
-            // We only continue in this manner for a while, though. If we loop
-            // through the array twice without finding a victim, just pick the
-            // next slot and wait until we can reuse it. This way, we avoid
-            // spinning in the extreme case that all the slots are busy with an
-            // I/O operation.
-            if retries < num_slots * 2 {
-                if !slot.recently_used.swap(false, Ordering::Release) {
-                    if let Ok(guard) = slot.inner.try_write() {
-                        slot_guard = guard;
-                        index = next;
-                        break;
-                    }
-                }
-                retries += 1;
-            } else {
-                slot_guard = slot.inner.write().unwrap();
-                index = next;
-                break;
-            }
-        }
-
-        //
-        // We now have the victim slot locked. If it was in use previously, close the
-        // old file.
-        //
-        if let Some(old_file) = slot_guard.file.take() {
-            drop(old_file);
-        }
-
-        // Prepare the slot for reuse and return it
-        slot_guard.tag += 1;
-        slot.recently_used.store(true, Ordering::Relaxed);
-        (
-            SlotHandle {
-                index,
-                tag: slot_guard.tag,
-            },
-            slot_guard,
-        )
-    }
-}
-
-impl VirtualFile {
-    /// Open a file in read-only mode. Like File::open.
-    pub fn open(path: &Path) -> Result<VirtualFile, std::io::Error> {
-        Self::open_with_options(path, OpenOptions::new().read(true))
-    }
-
-    /// Create a new file for writing. If the file exists, it will be truncated.
-    /// Like File::create.
-    pub fn create(path: &Path) -> Result<VirtualFile, std::io::Error> {
-        Self::open_with_options(
-            path,
-            OpenOptions::new().write(true).create(true).truncate(true),
-        )
-    }
-
-    /// Open a file with given options.
-    ///
-    /// Note: If any custom flags were set in 'open_options' through OpenOptionsExt,
-    /// they will be applied also when the file is subsequently re-opened, not only
-    /// on the first time. Make sure that's sane!
-    pub fn open_with_options(
-        path: &Path,
-        open_options: &OpenOptions,
-    ) -> Result<VirtualFile, std::io::Error> {
-        let (handle, mut slot_guard) = get_open_files().find_victim_slot();
-
-        let file = open_options.open(path)?;
-
-        // Strip all options other than read and write.
-        //
-        // It would perhaps be nicer to check just for the read and write flags
-        // explicitly, but OpenOptions doesn't contain any functions to read flags,
-        // only to set them.
-        let mut reopen_options = open_options.clone();
-        reopen_options.create(false);
-        reopen_options.create_new(false);
-        reopen_options.truncate(false);
-
-        let vfile = VirtualFile {
-            handle: RwLock::new(handle),
-            pos: 0,
-            path: path.to_path_buf(),
-            open_options: reopen_options,
-        };
-
-        slot_guard.file.replace(file);
-
-        Ok(vfile)
-    }
-
-    /// Call File::sync_all() on the underlying File.
-    pub fn sync_all(&self) -> Result<(), Error> {
-        self.with_file(|file| file.sync_all())?
-    }
-
-    /// Helper function that looks up the underlying File for this VirtualFile,
-    /// opening it and evicting some other File if necessary. It calls 'func'
-    /// with the physical File.
-    fn with_file<F, R>(&self, mut func: F) -> Result<R, Error>
-    where
-        F: FnMut(&File) -> R,
-    {
-        let open_files = get_open_files();
-
-        let mut handle_guard = {
-            // Read the cached slot handle, and see if the slot that it points to still
-            // contains our File.
-            //
-            // We only need to hold the handle lock while we read the current handle. If
-            // another thread closes the file and recycles the slot for a different file,
-            // we will notice that the handle we read is no longer valid and retry.
-            let mut handle = *self.handle.read().unwrap();
-            loop {
-                // Check if the slot contains our File
-                {
-                    let slot = &open_files.slots[handle.index];
-                    let slot_guard = slot.inner.read().unwrap();
-                    if slot_guard.tag == handle.tag {
-                        if let Some(file) = &slot_guard.file {
-                            // Found a cached file descriptor.
-                            slot.recently_used.store(true, Ordering::Relaxed);
-                            return Ok(func(file));
-                        }
-                    }
-                }
-
-                // The slot didn't contain our File. We will have to open it ourselves,
-                // but before that, grab a write lock on handle in the VirtualFile, so
-                // that no other thread will try to concurrently open the same file.
-                let handle_guard = self.handle.write().unwrap();
-
-                // If another thread changed the handle while we were not holding the lock,
-                // then the handle might now be valid again. Loop back to retry.
-                if *handle_guard != handle {
-                    handle = *handle_guard;
-                    continue;
-                }
-                break handle_guard;
-            }
-        };
-
-        // We need to open the file ourselves. The handle in the VirtualFile is
-        // now locked in write-mode. Find a free slot to put it in.
-        let (handle, mut slot_guard) = open_files.find_victim_slot();
-
-        // Open the physical file
-        let file = self.open_options.open(&self.path)?;
-
-        // Perform the requested operation on it
-        //
-        // TODO: We could downgrade the locks to read mode before calling
-        // 'func', to allow a little bit more concurrency, but the standard
-        // library RwLock doesn't allow downgrading without releasing the lock,
-        // and that doesn't seem worth the trouble. (parking_lot RwLock would
-        // allow it)
-        let result = func(&file);
-
-        // Store the File in the slot and update the handle in the VirtualFile
-        // to point to it.
-        slot_guard.file.replace(file);
-
-        *handle_guard = handle;
-
-        Ok(result)
-    }
-}
-
-impl Drop for VirtualFile {
-    /// If a VirtualFile is dropped, close the underlying file if it was open.
-    fn drop(&mut self) {
-        let handle = self.handle.get_mut().unwrap();
-
-        // We could check with a read-lock first, to avoid waiting on an
-        // unrelated I/O.
-        let slot = &get_open_files().slots[handle.index];
-        let mut slot_guard = slot.inner.write().unwrap();
-        if slot_guard.tag == handle.tag {
-            slot.recently_used.store(false, Ordering::Relaxed);
-            slot_guard.file.take();
-        }
-    }
-}
-
-impl Read for VirtualFile {
-    fn read(&mut self, buf: &mut [u8]) -> Result<usize, Error> {
-        let pos = self.pos;
-        let n = self.read_at(buf, pos)?;
-        self.pos += n as u64;
-        Ok(n)
-    }
-}
-
-impl Write for VirtualFile {
-    fn write(&mut self, buf: &[u8]) -> Result<usize, std::io::Error> {
-        let pos = self.pos;
-        let n = self.write_at(buf, pos)?;
-        self.pos += n as u64;
-        Ok(n)
-    }
-
-    fn flush(&mut self) -> Result<(), std::io::Error> {
-        // flush is no-op for File (at least on unix), so we don't need to do
-        // anything here either.
-        Ok(())
-    }
-}
-
-impl Seek for VirtualFile {
-    fn seek(&mut self, pos: SeekFrom) -> Result<u64, Error> {
-        match pos {
-            SeekFrom::Start(offset) => {
-                self.pos = offset;
-            }
-            SeekFrom::End(offset) => {
-                self.pos = self.with_file(|mut file| file.seek(SeekFrom::End(offset)))??
-            }
-            SeekFrom::Current(offset) => {
-                let pos = self.pos as i128 + offset as i128;
-                if pos < 0 {
-                    return Err(Error::new(
-                        ErrorKind::InvalidInput,
-                        "offset would be negative",
-                    ));
-                }
-                if pos > u64::MAX as i128 {
-                    return Err(Error::new(ErrorKind::InvalidInput, "offset overflow"));
-                }
-                self.pos = pos as u64;
-            }
-        }
-        Ok(self.pos)
-    }
-}
-
-impl FileExt for VirtualFile {
-    fn read_at(&self, buf: &mut [u8], offset: u64) -> Result<usize, Error> {
-        self.with_file(|file| file.read_at(buf, offset))?
-    }
-
-    fn write_at(&self, buf: &[u8], offset: u64) -> Result<usize, Error> {
-        self.with_file(|file| file.write_at(buf, offset))?
-    }
-}
-
-impl OpenFiles {
-    fn new(num_slots: usize) -> OpenFiles {
-        let mut slots = Box::new(Vec::with_capacity(num_slots));
-        for _ in 0..num_slots {
-            let slot = Slot {
-                recently_used: AtomicBool::new(false),
-                inner: RwLock::new(SlotInner { tag: 0, file: None }),
-            };
-            slots.push(slot);
-        }
-
-        OpenFiles {
-            next: AtomicUsize::new(0),
-            slots: Box::leak(slots),
-        }
-    }
-}
-
-///
-/// Initialize the virtual file module. This must be called once at page
-/// server startup.
-///
-pub fn init(num_slots: usize) {
-    if OPEN_FILES.set(OpenFiles::new(num_slots)).is_err() {
-        panic!("virtual_file::init called twice");
-    }
-}
-
-const TEST_MAX_FILE_DESCRIPTORS: usize = 10;
-
-// Get a handle to the global slots array.
-fn get_open_files() -> &'static OpenFiles {
-    //
-    // In unit tests, page server startup doesn't happen and no one calls
-    // virtual_file::init(). Initialize it here, with a small array.
-    //
-    // This applies to the virtual file tests below, but all other unit
-    // tests too, so the virtual file facility is always usable in
-    // unit tests.
-    //
-    if cfg!(test) {
-        OPEN_FILES.get_or_init(|| OpenFiles::new(TEST_MAX_FILE_DESCRIPTORS))
-    } else {
-        OPEN_FILES.get().expect("virtual_file::init not called yet")
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use rand::seq::SliceRandom;
-    use rand::thread_rng;
-    use rand::Rng;
-    use std::sync::Arc;
-    use std::thread;
-
-    // Helper function to slurp contents of a file, starting at the current position,
-    // into a string
-    fn read_string<FD>(vfile: &mut FD) -> Result<String, Error>
-    where
-        FD: Read,
-    {
-        let mut buf = String::new();
-        vfile.read_to_string(&mut buf)?;
-        Ok(buf)
-    }
-
-    // Helper function to slurp a portion of a file into a string
-    fn read_string_at<FD>(vfile: &mut FD, pos: u64, len: usize) -> Result<String, Error>
-    where
-        FD: FileExt,
-    {
-        let mut buf = Vec::new();
-        buf.resize(len, 0);
-        vfile.read_exact_at(&mut buf, pos)?;
-        Ok(String::from_utf8(buf).unwrap())
-    }
-
-    #[test]
-    fn test_virtual_files() -> Result<(), Error> {
-        // The real work is done in the test_files() helper function. This
-        // allows us to run the same set of tests against a native File, and
-        // VirtualFile. We trust the native Files and wouldn't need to test them,
-        // but this allows us to verify that the operations return the same
-        // results with VirtualFiles as with native Files. (Except that with
-        // native files, you will run out of file descriptors if the ulimit
-        // is low enough.)
-        test_files("virtual_files", |path, open_options| {
-            VirtualFile::open_with_options(path, open_options)
-        })
-    }
-
-    #[test]
-    fn test_physical_files() -> Result<(), Error> {
-        test_files("physical_files", |path, open_options| {
-            open_options.open(path)
-        })
-    }
-
-    fn test_files<OF, FD>(testname: &str, openfunc: OF) -> Result<(), Error>
-    where
-        FD: Read + Write + Seek + FileExt,
-        OF: Fn(&Path, &OpenOptions) -> Result<FD, std::io::Error>,
-    {
-        let testdir = crate::PageServerConf::test_repo_dir(testname);
-        std::fs::create_dir_all(&testdir)?;
-
-        let path_a = testdir.join("file_a");
-        let mut file_a = openfunc(
-            &path_a,
-            OpenOptions::new().write(true).create(true).truncate(true),
-        )?;
-        file_a.write_all(b"foobar")?;
-
-        // cannot read from a file opened in write-only mode
-        assert!(read_string(&mut file_a).is_err());
-
-        // Close the file and re-open for reading
-        let mut file_a = openfunc(&path_a, OpenOptions::new().read(true))?;
-
-        // cannot write to a file opened in read-only mode
-        assert!(file_a.write(b"bar").is_err());
-
-        // Try simple read
-        assert_eq!("foobar", read_string(&mut file_a)?);
-
-        // It's positioned at the EOF now.
-        assert_eq!("", read_string(&mut file_a)?);
-
-        // Test seeks.
-        assert_eq!(file_a.seek(SeekFrom::Start(1))?, 1);
-        assert_eq!("oobar", read_string(&mut file_a)?);
-
-        assert_eq!(file_a.seek(SeekFrom::End(-2))?, 4);
-        assert_eq!("ar", read_string(&mut file_a)?);
-
-        assert_eq!(file_a.seek(SeekFrom::Start(1))?, 1);
-        assert_eq!(file_a.seek(SeekFrom::Current(2))?, 3);
-        assert_eq!("bar", read_string(&mut file_a)?);
-
-        assert_eq!(file_a.seek(SeekFrom::Current(-5))?, 1);
-        assert_eq!("oobar", read_string(&mut file_a)?);
-
-        // Test erroneous seeks to before byte 0
-        assert!(file_a.seek(SeekFrom::End(-7)).is_err());
-        assert_eq!(file_a.seek(SeekFrom::Start(1))?, 1);
-        assert!(file_a.seek(SeekFrom::Current(-2)).is_err());
-
-        // the erroneous seek should have left the position unchanged
-        assert_eq!("oobar", read_string(&mut file_a)?);
-
-        // Create another test file, and try FileExt functions on it.
-        let path_b = testdir.join("file_b");
-        let mut file_b = openfunc(
-            &path_b,
-            OpenOptions::new()
-                .read(true)
-                .write(true)
-                .create(true)
-                .truncate(true),
-        )?;
-        file_b.write_all_at(b"BAR", 3)?;
-        file_b.write_all_at(b"FOO", 0)?;
-
-        assert_eq!(read_string_at(&mut file_b, 2, 3)?, "OBA");
-
-        // Open a lot of files, enough to cause some evictions. (Or to be precise,
-        // open the same file many times. The effect is the same.)
-        //
-        // leave file_a positioned at offset 1 before we start
-        assert_eq!(file_a.seek(SeekFrom::Start(1))?, 1);
-
-        let mut vfiles = Vec::new();
-        for _ in 0..100 {
-            let mut vfile = openfunc(&path_b, OpenOptions::new().read(true))?;
-            assert_eq!("FOOBAR", read_string(&mut vfile)?);
-            vfiles.push(vfile);
-        }
-
-        // make sure we opened enough files to definitely cause evictions.
-        assert!(vfiles.len() > TEST_MAX_FILE_DESCRIPTORS * 2);
-
-        // The underlying file descriptor for 'file_a' should be closed now. Try to read
-        // from it again. We left the file positioned at offset 1 above.
-        assert_eq!("oobar", read_string(&mut file_a)?);
-
-        // Check that all the other FDs still work too. Use them in random order for
-        // good measure.
-        vfiles.as_mut_slice().shuffle(&mut thread_rng());
-        for vfile in vfiles.iter_mut() {
-            assert_eq!("OOBAR", read_string_at(vfile, 1, 5)?);
-        }
-
-        Ok(())
-    }
-
-    /// Test using VirtualFiles from many threads concurrently. This tests both using
-    /// a lot of VirtualFiles concurrently, causing evictions, and also using the same
-    /// VirtualFile from multiple threads concurrently.
-    #[test]
-    fn test_vfile_concurrency() -> Result<(), Error> {
-        const SIZE: usize = 8 * 1024;
-        const VIRTUAL_FILES: usize = 100;
-        const THREADS: usize = 100;
-        const SAMPLE: [u8; SIZE] = [0xADu8; SIZE];
-
-        let testdir = crate::PageServerConf::test_repo_dir("vfile_concurrency");
-        std::fs::create_dir_all(&testdir)?;
-
-        // Create a test file.
-        let test_file_path = testdir.join("concurrency_test_file");
-        {
-            let file = File::create(&test_file_path)?;
-            file.write_all_at(&SAMPLE, 0)?;
-        }
-
-        // Open the file many times.
-        let mut files = Vec::new();
-        for _ in 0..VIRTUAL_FILES {
-            let f = VirtualFile::open_with_options(&test_file_path, OpenOptions::new().read(true))?;
-            files.push(f);
-        }
-        let files = Arc::new(files);
-
-        // Launch many threads, and use the virtual files concurrently in random order.
-        let mut threads = Vec::new();
-        for threadno in 0..THREADS {
-            let builder =
-                thread::Builder::new().name(format!("test_vfile_concurrency thread {}", threadno));
-
-            let files = files.clone();
-            let thread = builder
-                .spawn(move || {
-                    let mut buf = [0u8; SIZE];
-                    let mut rng = rand::thread_rng();
-                    for _ in 1..1000 {
-                        let f = &files[rng.gen_range(0..files.len())];
-                        f.read_exact_at(&mut buf, 0).unwrap();
-                        assert!(buf == SAMPLE);
-                    }
-                })
-                .unwrap();
-            threads.push(thread);
-        }
-
-        for thread in threads {
-            thread.join().unwrap();
-        }
-
-        Ok(())
-    }
-}
--- a/pageserver/src/waldecoder.rs
+++ b/pageserver/src/waldecoder.rs
@@ -72,10 +72,6 @@ impl WalStreamDecoder {
    ///     Err(WalDecodeError): an error occured while decoding, meaning the input was invalid.
    ///
    pub fn poll_decode(&mut self) -> Result<Option<(Lsn, Bytes)>, WalDecodeError> {
-        let recordbuf;
-
-        // Run state machine that validates page headers, and reassembles records
-        // that cross page boundaries.
        loop {
            // parse and verify page boundaries as we go
            if self.lsn.segment_offset(pg_constants::WAL_SEGMENT_SIZE) == 0 {
@@ -124,41 +120,29 @@ impl WalStreamDecoder {
                self.lsn += self.padlen as u64;
                self.padlen = 0;
            } else if self.contlen == 0 {
-                assert!(self.recordbuf.is_empty());
-
                // need to have at least the xl_tot_len field
+
                if self.inputbuf.remaining() < 4 {
                    return Ok(None);
                }

-                // peek xl_tot_len at the beginning of the record.
-                // FIXME: assumes little-endian
+                // read xl_tot_len FIXME: assumes little-endian
                self.startlsn = self.lsn;
-                let xl_tot_len = (&self.inputbuf[0..4]).get_u32_le();
+                let xl_tot_len = self.inputbuf.get_u32_le();
                if (xl_tot_len as usize) < XLOG_SIZE_OF_XLOG_RECORD {
                    return Err(WalDecodeError {
                        msg: format!("invalid xl_tot_len {}", xl_tot_len),
                        lsn: self.lsn,
                    });
                }
+                self.lsn += 4;

-                // Fast path for the common case that the whole record fits on the page.
-                let pageleft = self.lsn.remaining_in_block() as u32;
-                if self.inputbuf.remaining() >= xl_tot_len as usize && xl_tot_len <= pageleft {
-                    // Take the record from the 'inputbuf', and validate it.
-                    recordbuf = self.inputbuf.copy_to_bytes(xl_tot_len as usize);
-                    self.lsn += xl_tot_len as u64;
-                    break;
-                } else {
-                    // Need to assemble the record from pieces. Remember the size of the
-                    // record, and loop back. On next iteration, we will reach the 'else'
-                    // branch below, and copy the part of the record that was on this page
-                    // to 'recordbuf'.  Subsequent iterations will skip page headers, and
-                    // append the continuations from the next pages to 'recordbuf'.
-                    self.recordbuf.reserve(xl_tot_len as usize);
-                    self.contlen = xl_tot_len;
-                    continue;
-                }
+                self.recordbuf.clear();
+                self.recordbuf.reserve(xl_tot_len as usize);
+                self.recordbuf.put_u32_le(xl_tot_len);
+
+                self.contlen = xl_tot_len - 4;
+                continue;
            } else {
                // we're continuing a record, possibly from previous page.
                let pageleft = self.lsn.remaining_in_block() as u32;
@@ -175,42 +159,47 @@ impl WalStreamDecoder {
                self.contlen -= n as u32;

                if self.contlen == 0 {
-                    // The record is now complete.
-                    recordbuf = std::mem::replace(&mut self.recordbuf, BytesMut::new()).freeze();
-                    break;
+                    let recordbuf = std::mem::replace(&mut self.recordbuf, BytesMut::new());
+
+                    let recordbuf = recordbuf.freeze();
+                    let mut buf = recordbuf.clone();
+
+                    let xlogrec = XLogRecord::from_bytes(&mut buf);
+
+                    // XLOG_SWITCH records are special. If we see one, we need to skip
+                    // to the next WAL segment.
+                    if xlogrec.is_xlog_switch_record() {
+                        trace!("saw xlog switch record at {}", self.lsn);
+                        self.padlen =
+                            self.lsn.calc_padding(pg_constants::WAL_SEGMENT_SIZE as u64) as u32;
+                    } else {
+                        // Pad to an 8-byte boundary
+                        self.padlen = self.lsn.calc_padding(8u32) as u32;
+                    }
+
+                    let mut crc = crc32c_append(0, &recordbuf[XLOG_RECORD_CRC_OFFS + 4..]);
+                    crc = crc32c_append(crc, &recordbuf[0..XLOG_RECORD_CRC_OFFS]);
+                    if crc != xlogrec.xl_crc {
+                        return Err(WalDecodeError {
+                            msg: "WAL record crc mismatch".into(),
+                            lsn: self.lsn,
+                        });
+                    }
+
+                    // Always align resulting LSN on 0x8 boundary -- that is important for getPage()
+                    // and WalReceiver integration. Since this code is used both for WalReceiver and
+                    // initial WAL import let's force alignment right here.
+                    let result = (self.lsn.align(), recordbuf);
+                    return Ok(Some(result));
                }
                continue;
            }
        }
+        // check record boundaries

-        // We now have a record in the 'recordbuf' local variable.
-        let xlogrec = XLogRecord::from_slice(&recordbuf[0..XLOG_SIZE_OF_XLOG_RECORD]);
+        // deal with continuation records

-        let mut crc = 0;
-        crc = crc32c_append(crc, &recordbuf[XLOG_RECORD_CRC_OFFS + 4..]);
-        crc = crc32c_append(crc, &recordbuf[0..XLOG_RECORD_CRC_OFFS]);
-        if crc != xlogrec.xl_crc {
-            return Err(WalDecodeError {
-                msg: "WAL record crc mismatch".into(),
-                lsn: self.lsn,
-            });
-        }
-
-        // XLOG_SWITCH records are special. If we see one, we need to skip
-        // to the next WAL segment.
-        if xlogrec.is_xlog_switch_record() {
-            trace!("saw xlog switch record at {}", self.lsn);
-            self.padlen = self.lsn.calc_padding(pg_constants::WAL_SEGMENT_SIZE as u64) as u32;
-        } else {
-            // Pad to an 8-byte boundary
-            self.padlen = self.lsn.calc_padding(8u32) as u32;
-        }
-
-        // Always align resulting LSN on 0x8 boundary -- that is important for getPage()
-        // and WalReceiver integration. Since this code is used both for WalReceiver and
-        // initial WAL import let's force alignment right here.
-        let result = (self.lsn.align(), recordbuf);
-        Ok(Some(result))
+        // deal with xlog_switch records
    }
 }

@@ -229,18 +218,17 @@ pub struct DecodedBkpBlock {
    pub blkno: u32,

    /* copy of the fork_flags field from the XLogRecordBlockHeader */
-    pub flags: u8,
+    flags: u8,

    /* Information on full-page image, if any */
-    pub has_image: bool,   /* has image, even for consistency checking */
+    has_image: bool,       /* has image, even for consistency checking */
    pub apply_image: bool, /* has image that should be restored */
    pub will_init: bool,   /* record doesn't need previous page version to apply */
    //char	   *bkp_image;
-    pub hole_offset: u16,
-    pub hole_length: u16,
-    pub bimg_offset: u32,
-    pub bimg_len: u16,
-    pub bimg_info: u8,
+    hole_offset: u16,
+    hole_length: u16,
+    bimg_len: u16,
+    bimg_info: u8,

    /* Buffer holding the rmgr-specific data associated with this block */
    has_data: bool,
@@ -860,19 +848,8 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
    }

    // 3. Decode blocks.
-    let mut ptr = record.len() - buf.remaining();
-    for blk in blocks.iter_mut() {
-        if blk.has_image {
-            blk.bimg_offset = ptr as u32;
-            ptr += blk.bimg_len as usize;
-        }
-        if blk.has_data {
-            ptr += blk.data_len as usize;
-        }
-    }
    // We don't need them, so just skip blocks_total_len bytes
    buf.advance(blocks_total_len as usize);
-    assert_eq!(ptr, record.len() - buf.remaining());

    let main_data_offset = (xlogrec.xl_tot_len - main_data_len) as usize;

--- a/pageserver/src/walreceiver.rs
+++ b/pageserver/src/walreceiver.rs
@@ -8,28 +8,26 @@
 use crate::relish::*;
 use crate::restore_local_repo;
 use crate::tenant_mgr;
-use crate::tenant_mgr::TenantState;
-use crate::tenant_threads;
 use crate::waldecoder::*;
 use crate::PageServerConf;
-use anyhow::{bail, Error, Result};
+use anyhow::{Error, Result};
 use lazy_static::lazy_static;
+use log::*;
 use postgres::fallible_iterator::FallibleIterator;
 use postgres::replication::ReplicationIter;
 use postgres::{Client, NoTls, SimpleQueryMessage, SimpleQueryRow};
+use postgres_ffi::xlog_utils::*;
 use postgres_ffi::*;
 use postgres_protocol::message::backend::ReplicationMessage;
 use postgres_types::PgLsn;
-use std::cell::Cell;
+use std::cmp::{max, min};
 use std::collections::HashMap;
+use std::fs;
 use std::str::FromStr;
 use std::sync::Mutex;
 use std::thread;
 use std::thread::sleep;
-use std::thread::JoinHandle;
-use std::thread_local;
 use std::time::{Duration, SystemTime};
-use tracing::*;
 use zenith_utils::lsn::Lsn;
 use zenith_utils::zid::ZTenantId;
 use zenith_utils::zid::ZTimelineId;
@@ -39,8 +37,6 @@ use zenith_utils::zid::ZTimelineId;
 //
 struct WalReceiverEntry {
    wal_producer_connstr: String,
-    wal_receiver_handle: Option<JoinHandle<()>>,
-    tenantid: ZTenantId,
 }

 lazy_static! {
@@ -48,43 +44,6 @@ lazy_static! {
        Mutex::new(HashMap::new());
 }

-thread_local! {
-    // Boolean that is true only for WAL receiver threads
-    //
-    // This is used in `wait_lsn` to guard against usage that might lead to a deadlock.
-    pub(crate) static IS_WAL_RECEIVER: Cell<bool> = Cell::new(false);
-}
-
-// Wait for walreceiver to stop
-// Now it stops when pageserver shutdown is requested.
-// In future we can make this more granular and send shutdown signals
-// per tenant/timeline to cancel inactive walreceivers.
-// TODO deal with blocking pg connections
-pub fn stop_wal_receiver(timelineid: ZTimelineId) {
-    let mut receivers = WAL_RECEIVERS.lock().unwrap();
-    if let Some(r) = receivers.get_mut(&timelineid) {
-        r.wal_receiver_handle.take();
-        // r.wal_receiver_handle.take().map(JoinHandle::join);
-    }
-}
-
-pub fn drop_wal_receiver(timelineid: ZTimelineId, tenantid: ZTenantId) {
-    let mut receivers = WAL_RECEIVERS.lock().unwrap();
-    receivers.remove(&timelineid);
-
-    // Check if it was the last walreceiver of the tenant.
-    // TODO now we store one WalReceiverEntry per timeline,
-    // so this iterator looks a bit strange.
-    for (_timelineid, entry) in receivers.iter() {
-        if entry.tenantid == tenantid {
-            return;
-        }
-    }
-
-    // When last walreceiver of the tenant is gone, change state to Idle
-    tenant_mgr::set_tenant_state(tenantid, TenantState::Idle).unwrap();
-}
-
 // Launch a new WAL receiver, or tell one that's running about change in connection string
 pub fn launch_wal_receiver(
    conf: &'static PageServerConf,
@@ -99,24 +58,21 @@ pub fn launch_wal_receiver(
            receiver.wal_producer_connstr = wal_producer_connstr.into();
        }
        None => {
-            let wal_receiver_handle = thread::Builder::new()
-                .name("WAL receiver thread".into())
-                .spawn(move || {
-                    IS_WAL_RECEIVER.with(|c| c.set(true));
-                    thread_main(conf, timelineid, tenantid);
-                })
-                .unwrap();
-
            let receiver = WalReceiverEntry {
                wal_producer_connstr: wal_producer_connstr.into(),
-                wal_receiver_handle: Some(wal_receiver_handle),
-                tenantid,
            };
            receivers.insert(timelineid, receiver);

-            // Update tenant state and start tenant threads, if they are not running yet.
-            tenant_mgr::set_tenant_state(tenantid, TenantState::Active).unwrap();
-            tenant_threads::start_tenant_threads(conf, tenantid);
+            // Also launch a new thread to handle this connection
+            //
+            // NOTE: This thread name is checked in the assertion in wait_lsn. If you change
+            // this, make sure you update the assertion too.
+            let _walreceiver_thread = thread::Builder::new()
+                .name("WAL receiver thread".into())
+                .spawn(move || {
+                    thread_main(conf, timelineid, tenantid);
+                })
+                .unwrap();
        }
    };
 }
@@ -136,18 +92,16 @@ fn get_wal_producer_connstr(timelineid: ZTimelineId) -> String {
 // This is the entry point for the WAL receiver thread.
 //
 fn thread_main(conf: &'static PageServerConf, timelineid: ZTimelineId, tenantid: ZTenantId) {
-    let _enter = info_span!("WAL receiver", timeline = %timelineid, tenant = %tenantid).entered();
-    info!("WAL receiver thread started");
-
-    let mut retry_count = 10;
+    info!(
+        "WAL receiver thread started for timeline : '{}'",
+        timelineid
+    );

    //
    // Make a connection to the WAL safekeeper, or directly to the primary PostgreSQL server,
    // and start streaming WAL from it. If the connection is lost, keep retrying.
-    // TODO How long should we retry in case of losing connection?
-    // Should we retry at all or we can wait for the next callmemaybe request?
    //
-    while !tenant_mgr::shutdown_requested() && retry_count > 0 {
+    loop {
        // Look up the current WAL producer address
        let wal_producer_connstr = get_wal_producer_connstr(timelineid);

@@ -158,24 +112,13 @@ fn thread_main(conf: &'static PageServerConf, timelineid: ZTimelineId, tenantid:
                "WAL streaming connection failed ({}), retrying in 1 second",
                e
            );
-            retry_count -= 1;
            sleep(Duration::from_secs(1));
-        } else {
-            info!(
-                "walreceiver disconnected tenant {}, timelineid {}",
-                tenantid, timelineid
-            );
-            break;
        }
    }
-    info!("WAL streaming shut down");
-    // Drop it from list of active WAL_RECEIVERS
-    // so that next callmemaybe request launched a new thread
-    drop_wal_receiver(timelineid, tenantid);
 }

 fn walreceiver_main(
-    _conf: &PageServerConf,
+    conf: &PageServerConf,
    timelineid: ZTimelineId,
    wal_producer_connstr: &str,
    tenantid: ZTenantId,
@@ -215,15 +158,15 @@ fn walreceiver_main(
    let mut startpoint = last_rec_lsn;

    if startpoint == Lsn(0) {
-        bail!("No previous WAL position");
+        error!("No previous WAL position");
    }

    // There might be some padding after the last full record, skip it.
    startpoint += startpoint.calc_padding(8u32);

    info!(
-        "last_record_lsn {} starting replication from {}, server is at {}...",
-        last_rec_lsn, startpoint, end_of_wal
+        "last_record_lsn {} starting replication from {} for timeline {}, server is at {}...",
+        last_rec_lsn, startpoint, timelineid, end_of_wal
    );

    let query = format!("START_REPLICATION PHYSICAL {}", startpoint);
@@ -245,38 +188,34 @@ fn walreceiver_main(
                let data = xlog_data.data();
                let startlsn = Lsn::from(xlog_data.wal_start());
                let endlsn = startlsn + data.len() as u64;
+                let prev_last_rec_lsn = last_rec_lsn;

                trace!("received XLogData between {} and {}", startlsn, endlsn);

                waldecoder.feed_bytes(data);

                while let Some((lsn, recdata)) = waldecoder.poll_decode()? {
-                    let _enter = info_span!("processing record", lsn = %lsn).entered();
+                    // Save old checkpoint value to compare with it after decoding WAL record
+                    let old_checkpoint_bytes = checkpoint.encode();
+                    let decoded = decode_wal_record(recdata.clone());

                    // It is important to deal with the aligned records as lsn in getPage@LSN is
                    // aligned and can be several bytes bigger. Without this alignment we are
                    // at risk of hittind a deadlock.
                    assert!(lsn.is_aligned());

-                    let writer = timeline.writer();
-
-                    let mut checkpoint_modified = false;
-
-                    let decoded = decode_wal_record(recdata.clone());
                    restore_local_repo::save_decoded_record(
                        &mut checkpoint,
-                        &mut checkpoint_modified,
-                        writer.as_ref(),
+                        &*timeline,
                        &decoded,
                        recdata,
                        lsn,
                    )?;

+                    let new_checkpoint_bytes = checkpoint.encode();
                    // Check if checkpoint data was updated by save_decoded_record
-                    if checkpoint_modified {
-                        let new_checkpoint_bytes = checkpoint.encode();
-
-                        writer.put_page_image(
+                    if new_checkpoint_bytes != old_checkpoint_bytes {
+                        timeline.put_page_image(
                            RelishTag::Checkpoint,
                            0,
                            lsn,
@@ -286,10 +225,38 @@ fn walreceiver_main(

                    // Now that this record has been fully handled, including updating the
                    // checkpoint data, let the repository know that it is up-to-date to this LSN
-                    writer.advance_last_record_lsn(lsn);
+                    timeline.advance_last_record_lsn(lsn);
                    last_rec_lsn = lsn;
                }

+                // Somewhat arbitrarily, if we have at least 10 complete wal segments (16 MB each),
+                // "checkpoint" the repository to flush all the changes from WAL we've processed
+                // so far to disk. After this, we don't need the original WAL anymore, and it
+                // can be removed. This is probably too aggressive for production, but it's useful
+                // to expose bugs now.
+                //
+                // TODO: We don't actually dare to remove the WAL. It's useful for debugging,
+                // and we might need it for logical decoding or other things in the future. Although
+                // we should also be able to fetch it back from the WAL safekeepers or S3 if
+                // needed.
+                if prev_last_rec_lsn.segment_number(pg_constants::WAL_SEGMENT_SIZE)
+                    != last_rec_lsn.segment_number(pg_constants::WAL_SEGMENT_SIZE)
+                {
+                    info!("switched segment {} to {}", prev_last_rec_lsn, last_rec_lsn);
+                    let (oldest_segno, newest_segno) = find_wal_file_range(
+                        conf,
+                        &timelineid,
+                        pg_constants::WAL_SEGMENT_SIZE,
+                        last_rec_lsn,
+                        &tenantid,
+                    )?;
+
+                    if newest_segno - oldest_segno >= 10 {
+                        // TODO: This is where we could remove WAL older than last_rec_lsn.
+                        //remove_wal_files(timelineid, pg_constants::WAL_SEGMENT_SIZE, last_rec_lsn)?;
+                    }
+                }
+
                if !caught_up && endlsn >= end_of_wal {
                    info!("caught up at LSN {}", endlsn);
                    caught_up = true;
@@ -311,7 +278,7 @@ fn walreceiver_main(
                );

                if reply_requested {
-                    Some(last_rec_lsn)
+                    Some(timeline.get_last_record_lsn())
                } else {
                    None
                }
@@ -321,36 +288,59 @@ fn walreceiver_main(
        };

        if let Some(last_lsn) = status_update {
+            // TODO: More thought should go into what values are sent here.
            let last_lsn = PgLsn::from(u64::from(last_lsn));
-
-            // The last LSN we processed. It is not guaranteed to survive pageserver crash.
            let write_lsn = last_lsn;
-            // This value doesn't guarantee data durability, but it's ok.
-            // In setup with WAL service, pageserver durability is guaranteed by safekeepers.
-            // In setup without WAL service, we just don't care.
-            let flush_lsn = write_lsn;
-            // `disk_consistent_lsn` is the LSN at which page server guarantees persistence of all received data
-            // Depending on the setup we recieve WAL directly from Compute Node or
-            // from a WAL service.
-            //
-            // Senders use the feedback to determine if we are caught up:
-            // - Safekeepers are free to remove WAL preceding `apply_lsn`,
-            // as it will never be requested by this page server.
-            // - Compute Node uses 'apply_lsn' to calculate a lag for back pressure mechanism
-            // (delay WAL inserts to avoid lagging pageserver responses and WAL overflow).
-            let apply_lsn = PgLsn::from(u64::from(timeline.get_disk_consistent_lsn()));
+            let flush_lsn = last_lsn;
+            let apply_lsn = PgLsn::from(0);
            let ts = SystemTime::now();
            const NO_REPLY: u8 = 0;
+
            physical_stream.standby_status_update(write_lsn, flush_lsn, apply_lsn, ts, NO_REPLY)?;
        }
+    }
+    Ok(())
+}

-        if tenant_mgr::shutdown_requested() {
-            debug!("stop walreceiver because pageserver shutdown is requested");
-            break;
+fn find_wal_file_range(
+    conf: &PageServerConf,
+    timeline: &ZTimelineId,
+    wal_seg_size: usize,
+    written_upto: Lsn,
+    tenant: &ZTenantId,
+) -> Result<(u64, u64)> {
+    let written_upto_segno = written_upto.segment_number(wal_seg_size);
+
+    let mut oldest_segno = written_upto_segno;
+    let mut newest_segno = written_upto_segno;
+    // Scan the wal directory, and count how many WAL files we could remove
+    let wal_dir = conf.wal_dir_path(timeline, tenant);
+    for entry in fs::read_dir(wal_dir)? {
+        let entry = entry?;
+        let path = entry.path();
+
+        if path.is_dir() {
+            continue;
+        }
+
+        let filename = path.file_name().unwrap().to_str().unwrap();
+
+        if IsXLogFileName(filename) {
+            let (segno, _tli) = XLogFromFileName(filename, wal_seg_size);
+
+            if segno > written_upto_segno {
+                // that's strange.
+                warn!("there is a WAL file from future at {}", path.display());
+                continue;
+            }
+
+            oldest_segno = min(oldest_segno, segno);
+            newest_segno = max(newest_segno, segno);
        }
    }
+    // FIXME: would be good to assert that there are no gaps in the WAL files

-    Ok(())
+    Ok((oldest_segno, newest_segno))
 }

 /// Data returned from the postgres `IDENTIFY_SYSTEM` command
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -22,24 +22,23 @@ use byteorder::{ByteOrder, LittleEndian};
 use bytes::{Buf, BufMut, Bytes, BytesMut};
 use lazy_static::lazy_static;
 use log::*;
-use nix::poll::*;
 use serde::Serialize;
 use std::fs;
 use std::fs::OpenOptions;
 use std::io::prelude::*;
-use std::io::{Error, ErrorKind};
-use std::os::unix::io::AsRawFd;
+use std::io::Error;
 use std::path::PathBuf;
 use std::process::Stdio;
-use std::process::{Child, ChildStderr, ChildStdin, ChildStdout, Command};
-use std::sync::atomic::{AtomicUsize, Ordering};
 use std::sync::Mutex;
 use std::time::Duration;
 use std::time::Instant;
+use tokio::io::AsyncBufReadExt;
+use tokio::io::{AsyncReadExt, AsyncWriteExt};
+use tokio::process::{ChildStdin, ChildStdout, Command};
+use tokio::time::timeout;
 use zenith_metrics::{register_histogram, register_int_counter, Histogram, IntCounter};
 use zenith_utils::bin_ser::BeSer;
 use zenith_utils::lsn::Lsn;
-use zenith_utils::nonblock::set_nonblock;
 use zenith_utils::zid::ZTenantId;

 use crate::relish::*;
@@ -54,8 +53,6 @@ use postgres_ffi::nonrelfile_utils::transaction_id_set_status;
 use postgres_ffi::pg_constants;
 use postgres_ffi::XLogRecord;

-const N_WAL_REDO_PROCS: usize = 1;
-
 ///
 /// `RelTag` + block number (`blknum`) gives us a unique id of the page in the cluster.
 ///
@@ -85,7 +82,7 @@ pub trait WalRedoManager: Send + Sync {
        blknum: u32,
        lsn: Lsn,
        base_img: Option<Bytes>,
-        records: Vec<(Lsn, WALRecord)>,
+        records: Vec<WALRecord>,
    ) -> Result<Bytes, WalRedoError>;
 }

@@ -102,7 +99,7 @@ impl crate::walredo::WalRedoManager for DummyRedoManager {
        _blknum: u32,
        _lsn: Lsn,
        _base_img: Option<Bytes>,
-        _records: Vec<(Lsn, WALRecord)>,
+        _records: Vec<WALRecord>,
    ) -> Result<Bytes, WalRedoError> {
        Err(WalRedoError::InvalidState)
    }
@@ -142,8 +139,8 @@ pub struct PostgresRedoManager {
    tenantid: ZTenantId,
    conf: &'static PageServerConf,

-    round_robin: AtomicUsize,
-    processes: [Mutex<Option<PostgresRedoProcess>>; N_WAL_REDO_PROCS],
+    runtime: tokio::runtime::Runtime,
+    process: Mutex<Option<PostgresRedoProcess>>,
 }

 #[derive(Debug)]
@@ -153,16 +150,9 @@ struct WalRedoRequest {
    lsn: Lsn,

    base_img: Option<Bytes>,
-    records: Vec<(Lsn, WALRecord)>,
+    records: Vec<WALRecord>,
 }

-impl WalRedoRequest {
-    // Can this request be served by zenith redo funcitons
-    // or we need to pass it to wal-redo postgres process?
-    fn can_apply_in_zenith(&self) -> bool {
-        !matches!(self.rel, RelishTag::Relation(_))
-    }
-}
 /// An error happened in WAL redo
 #[derive(Debug, thiserror::Error)]
 pub enum WalRedoError {
@@ -171,8 +161,6 @@ pub enum WalRedoError {

    #[error("cannot perform WAL redo now")]
    InvalidState,
-    #[error("cannot perform WAL redo for this request")]
-    InvalidRequest,
 }

 ///
@@ -191,9 +179,10 @@ impl WalRedoManager for PostgresRedoManager {
        blknum: u32,
        lsn: Lsn,
        base_img: Option<Bytes>,
-        records: Vec<(Lsn, WALRecord)>,
+        records: Vec<WALRecord>,
    ) -> Result<Bytes, WalRedoError> {
        let start_time;
+        let lock_time;
        let end_time;

        let request = WalRedoRequest {
@@ -205,38 +194,26 @@ impl WalRedoManager for PostgresRedoManager {
        };

        start_time = Instant::now();
-        let result;
-
-        if request.can_apply_in_zenith() {
-            result = self.handle_apply_request_zenith(&request);
-
-            end_time = Instant::now();
-            WAL_REDO_TIME.observe(end_time.duration_since(start_time).as_secs_f64());
-        } else {
-            let rr = self.round_robin.fetch_add(1, Ordering::Relaxed) % N_WAL_REDO_PROCS;
-            let mut process_guard = self.processes[rr].lock().unwrap();
-            let lock_time = Instant::now();
+        let result = {
+            let mut process_guard = self.process.lock().unwrap();
+            lock_time = Instant::now();

            // launch the WAL redo process on first use
            if process_guard.is_none() {
-                let p = PostgresRedoProcess::launch(self.conf, &self.tenantid, rr)?;
+                let p = self
+                    .runtime
+                    .block_on(PostgresRedoProcess::launch(self.conf, &self.tenantid))?;
                *process_guard = Some(p);
            }
            let process = process_guard.as_mut().unwrap();

-            result = self.handle_apply_request_postgres(process, &request);
+            self.runtime
+                .block_on(self.handle_apply_request(process, &request))
+        };
+        end_time = Instant::now();

-            WAL_REDO_WAIT_TIME.observe(lock_time.duration_since(start_time).as_secs_f64());
-            end_time = Instant::now();
-            WAL_REDO_TIME.observe(end_time.duration_since(lock_time).as_secs_f64());
-
-            // If something went wrong, don't try to reuse the process. Kill it, and
-            // next request will launch a new one.
-            if result.is_err() {
-                let process = process_guard.take().unwrap();
-                process.kill();
-            }
-        }
+        WAL_REDO_WAIT_TIME.observe(lock_time.duration_since(start_time).as_secs_f64());
+        WAL_REDO_TIME.observe(end_time.duration_since(lock_time).as_secs_f64());

        result
    }
@@ -247,57 +224,31 @@ impl PostgresRedoManager {
    /// Create a new PostgresRedoManager.
    ///
    pub fn new(conf: &'static PageServerConf, tenantid: ZTenantId) -> PostgresRedoManager {
+        // We block on waiting for requests on the walredo request channel, but
+        // use async I/O to communicate with the child process. Initialize the
+        // runtime for the async part.
+        let runtime = tokio::runtime::Builder::new_current_thread()
+            .enable_all()
+            .build()
+            .unwrap();
+
        // The actual process is launched lazily, on first request.
        PostgresRedoManager {
+            runtime,
            tenantid,
            conf,
-            round_robin: AtomicUsize::new(0),
-            processes: [(); N_WAL_REDO_PROCS].map(|_| Mutex::new(None)),
+            process: Mutex::new(None),
        }
    }

    ///
-    /// Process one request for WAL redo using wal-redo postgres
+    /// Process one request for WAL redo.
    ///
-    fn handle_apply_request_postgres(
+    async fn handle_apply_request(
        &self,
        process: &mut PostgresRedoProcess,
        request: &WalRedoRequest,
    ) -> Result<Bytes, WalRedoError> {
-        let blknum = request.blknum;
-        let lsn = request.lsn;
-        let base_img = request.base_img.clone();
-        let records = &request.records;
-        let nrecords = records.len();
-
-        let start = Instant::now();
-
-        let apply_result: Result<Bytes, Error>;
-
-        if let RelishTag::Relation(rel) = request.rel {
-            // Relational WAL records are applied using wal-redo-postgres
-            let buf_tag = BufferTag { rel, blknum };
-            apply_result = process.apply_wal_records(buf_tag, base_img, records);
-
-            let duration = start.elapsed();
-
-            debug!(
-                "postgres applied {} WAL records in {} us to reconstruct page image at LSN {}",
-                nrecords,
-                duration.as_micros(),
-                lsn
-            );
-
-            apply_result.map_err(WalRedoError::IoError)
-        } else {
-            Err(WalRedoError::InvalidRequest)
-        }
-    }
-
-    ///
-    /// Process one request for WAL redo using custom zenith code
-    ///
-    fn handle_apply_request_zenith(&self, request: &WalRedoRequest) -> Result<Bytes, WalRedoError> {
        let rel = request.rel;
        let blknum = request.blknum;
        let lsn = request.lsn;
@@ -309,158 +260,176 @@ impl PostgresRedoManager {
        let start = Instant::now();

        let apply_result: Result<Bytes, Error>;
-
-        // Non-relational WAL records are handled here, with custom code that has the
-        // same effects as the corresponding Postgres WAL redo function.
-        const ZERO_PAGE: [u8; 8192] = [0u8; 8192];
-        let mut page = BytesMut::new();
-        if let Some(fpi) = base_img {
-            // If full-page image is provided, then use it...
-            page.extend_from_slice(&fpi[..]);
+        if let RelishTag::Relation(rel) = rel {
+            // Relational WAL records are applied using wal-redo-postgres
+            let buf_tag = BufferTag { rel, blknum };
+            apply_result = process.apply_wal_records(buf_tag, base_img, records).await;
        } else {
-            // otherwise initialize page with zeros
-            page.extend_from_slice(&ZERO_PAGE);
-        }
-        // Apply all collected WAL records
-        for (_lsn, record) in records {
-            let mut buf = record.rec.clone();
-
-            WAL_REDO_RECORD_COUNTER.inc();
-
-            // 1. Parse XLogRecord struct
-            // FIXME: refactor to avoid code duplication.
-            let xlogrec = XLogRecord::from_bytes(&mut buf);
-
-            //move to main data
-            // TODO probably, we should store some records in our special format
-            // to avoid this weird parsing on replay
-            let skip = (record.main_data_offset - pg_constants::SIZEOF_XLOGRECORD) as usize;
-            if buf.remaining() > skip {
-                buf.advance(skip);
+            // Non-relational WAL records are handled here, with custom code that has the
+            // same effects as the corresponding Postgres WAL redo function.
+            const ZERO_PAGE: [u8; 8192] = [0u8; 8192];
+            let mut page = BytesMut::new();
+            if let Some(fpi) = base_img {
+                // If full-page image is provided, then use it...
+                page.extend_from_slice(&fpi[..]);
+            } else {
+                // otherwise initialize page with zeros
+                page.extend_from_slice(&ZERO_PAGE);
            }
+            // Apply all collected WAL records
+            for record in records {
+                let mut buf = record.rec.clone();

-            if xlogrec.xl_rmid == pg_constants::RM_XACT_ID {
-                // Transaction manager stuff
-                let rec_segno = match rel {
-                    RelishTag::Slru { slru, segno } => {
-                        assert!(
-                            slru == SlruKind::Clog,
-                            "Not valid XACT relish tag {:?}",
-                            rel
-                        );
-                        segno
-                    }
-                    _ => panic!("Not valid XACT relish tag {:?}", rel),
-                };
-                let parsed_xact =
-                    XlXactParsedRecord::decode(&mut buf, xlogrec.xl_xid, xlogrec.xl_info);
-                if parsed_xact.info == pg_constants::XLOG_XACT_COMMIT
-                    || parsed_xact.info == pg_constants::XLOG_XACT_COMMIT_PREPARED
-                {
-                    transaction_id_set_status(
-                        parsed_xact.xid,
-                        pg_constants::TRANSACTION_STATUS_COMMITTED,
-                        &mut page,
-                    );
-                    for subxact in &parsed_xact.subxacts {
-                        let pageno = *subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
-                        let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
-                        let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
-                        // only update xids on the requested page
-                        if rec_segno == segno && blknum == rpageno {
-                            transaction_id_set_status(
-                                *subxact,
-                                pg_constants::TRANSACTION_STATUS_COMMITTED,
-                                &mut page,
-                            );
-                        }
-                    }
-                } else if parsed_xact.info == pg_constants::XLOG_XACT_ABORT
-                    || parsed_xact.info == pg_constants::XLOG_XACT_ABORT_PREPARED
-                {
-                    transaction_id_set_status(
-                        parsed_xact.xid,
-                        pg_constants::TRANSACTION_STATUS_ABORTED,
-                        &mut page,
-                    );
-                    for subxact in &parsed_xact.subxacts {
-                        let pageno = *subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
-                        let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
-                        let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
-                        // only update xids on the requested page
-                        if rec_segno == segno && blknum == rpageno {
-                            transaction_id_set_status(
-                                *subxact,
-                                pg_constants::TRANSACTION_STATUS_ABORTED,
-                                &mut page,
-                            );
-                        }
-                    }
+                WAL_REDO_RECORD_COUNTER.inc();
+
+                // 1. Parse XLogRecord struct
+                // FIXME: refactor to avoid code duplication.
+                let xlogrec = XLogRecord::from_bytes(&mut buf);
+
+                //move to main data
+                // TODO probably, we should store some records in our special format
+                // to avoid this weird parsing on replay
+                let skip = (record.main_data_offset - pg_constants::SIZEOF_XLOGRECORD) as usize;
+                if buf.remaining() > skip {
+                    buf.advance(skip);
                }
-            } else if xlogrec.xl_rmid == pg_constants::RM_MULTIXACT_ID {
-                // Multixact operations
-                let info = xlogrec.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
-                if info == pg_constants::XLOG_MULTIXACT_CREATE_ID {
-                    let xlrec = XlMultiXactCreate::decode(&mut buf);
-                    if let RelishTag::Slru {
-                        slru,
-                        segno: rec_segno,
-                    } = rel
+
+                if xlogrec.xl_rmid == pg_constants::RM_XACT_ID {
+                    // Transaction manager stuff
+                    let rec_segno = match rel {
+                        RelishTag::Slru { slru, segno } => {
+                            if slru != SlruKind::Clog {
+                                panic!("Not valid XACT relish tag {:?}", rel);
+                            }
+                            segno
+                        }
+                        _ => panic!("Not valid XACT relish tag {:?}", rel),
+                    };
+                    let parsed_xact =
+                        XlXactParsedRecord::decode(&mut buf, xlogrec.xl_xid, xlogrec.xl_info);
+                    if parsed_xact.info == pg_constants::XLOG_XACT_COMMIT
+                        || parsed_xact.info == pg_constants::XLOG_XACT_COMMIT_PREPARED
                    {
-                        if slru == SlruKind::MultiXactMembers {
-                            for i in 0..xlrec.nmembers {
-                                let pageno = i / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
-                                let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
-                                let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
-                                if segno == rec_segno && rpageno == blknum {
-                                    // update only target block
-                                    let offset = xlrec.moff + i;
-                                    let memberoff = mx_offset_to_member_offset(offset);
-                                    let flagsoff = mx_offset_to_flags_offset(offset);
-                                    let bshift = mx_offset_to_flags_bitshift(offset);
-                                    let mut flagsval =
-                                        LittleEndian::read_u32(&page[flagsoff..flagsoff + 4]);
-                                    flagsval &=
-                                        !(((1 << pg_constants::MXACT_MEMBER_BITS_PER_XACT) - 1)
+                        transaction_id_set_status(
+                            parsed_xact.xid,
+                            pg_constants::TRANSACTION_STATUS_COMMITTED,
+                            &mut page,
+                        );
+                        for subxact in &parsed_xact.subxacts {
+                            let pageno = *subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
+                            let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
+                            let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
+                            // only update xids on the requested page
+                            if rec_segno == segno && blknum == rpageno {
+                                transaction_id_set_status(
+                                    *subxact,
+                                    pg_constants::TRANSACTION_STATUS_COMMITTED,
+                                    &mut page,
+                                );
+                            }
+                        }
+                    } else if parsed_xact.info == pg_constants::XLOG_XACT_ABORT
+                        || parsed_xact.info == pg_constants::XLOG_XACT_ABORT_PREPARED
+                    {
+                        transaction_id_set_status(
+                            parsed_xact.xid,
+                            pg_constants::TRANSACTION_STATUS_ABORTED,
+                            &mut page,
+                        );
+                        for subxact in &parsed_xact.subxacts {
+                            let pageno = *subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
+                            let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
+                            let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
+                            // only update xids on the requested page
+                            if rec_segno == segno && blknum == rpageno {
+                                transaction_id_set_status(
+                                    *subxact,
+                                    pg_constants::TRANSACTION_STATUS_ABORTED,
+                                    &mut page,
+                                );
+                            }
+                        }
+                    }
+                } else if xlogrec.xl_rmid == pg_constants::RM_MULTIXACT_ID {
+                    // Multixact operations
+                    let info = xlogrec.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
+                    if info == pg_constants::XLOG_MULTIXACT_CREATE_ID {
+                        let xlrec = XlMultiXactCreate::decode(&mut buf);
+                        if let RelishTag::Slru {
+                            slru,
+                            segno: rec_segno,
+                        } = rel
+                        {
+                            if slru == SlruKind::MultiXactMembers {
+                                for i in 0..xlrec.nmembers {
+                                    let pageno =
+                                        i / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
+                                    let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
+                                    let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
+                                    if segno == rec_segno && rpageno == blknum {
+                                        // update only target block
+                                        let offset = xlrec.moff + i;
+                                        let memberoff = mx_offset_to_member_offset(offset);
+                                        let flagsoff = mx_offset_to_flags_offset(offset);
+                                        let bshift = mx_offset_to_flags_bitshift(offset);
+                                        let mut flagsval =
+                                            LittleEndian::read_u32(&page[flagsoff..flagsoff + 4]);
+                                        flagsval &= !(((1
+                                            << pg_constants::MXACT_MEMBER_BITS_PER_XACT)
+                                            - 1)
                                            << bshift);
-                                    flagsval |= xlrec.members[i as usize].status << bshift;
-                                    LittleEndian::write_u32(
-                                        &mut page[flagsoff..flagsoff + 4],
-                                        flagsval,
-                                    );
-                                    LittleEndian::write_u32(
-                                        &mut page[memberoff..memberoff + 4],
-                                        xlrec.members[i as usize].xid,
-                                    );
+                                        flagsval |= xlrec.members[i as usize].status << bshift;
+                                        LittleEndian::write_u32(
+                                            &mut page[flagsoff..flagsoff + 4],
+                                            flagsval,
+                                        );
+                                        LittleEndian::write_u32(
+                                            &mut page[memberoff..memberoff + 4],
+                                            xlrec.members[i as usize].xid,
+                                        );
+                                    }
                                }
+                            } else {
+                                // Multixact offsets SLRU
+                                let offs = (xlrec.mid
+                                    % pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32
+                                    * 4) as usize;
+                                LittleEndian::write_u32(&mut page[offs..offs + 4], xlrec.moff);
                            }
                        } else {
-                            // Multixact offsets SLRU
-                            let offs = (xlrec.mid % pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32
-                                * 4) as usize;
-                            LittleEndian::write_u32(&mut page[offs..offs + 4], xlrec.moff);
+                            panic!();
                        }
                    } else {
                        panic!();
                    }
-                } else {
-                    panic!();
                }
            }
-        }

-        apply_result = Ok::<Bytes, Error>(page.freeze());
+            apply_result = Ok::<Bytes, Error>(page.freeze());
+        }

        let duration = start.elapsed();

+        let result: Result<Bytes, WalRedoError>;
+
        debug!(
-            "zenith applied {} WAL records in {} ms to reconstruct page image at LSN {}",
+            "applied {} WAL records in {} ms to reconstruct page image at LSN {}",
            nrecords,
            duration.as_millis(),
            lsn
        );

-        apply_result.map_err(WalRedoError::IoError)
+        result = match apply_result {
+            Ok(img) => Ok(img),
+            Err(e) => {
+                // Log the failure here; the caller only gets the wrapped error.
+                error!("could not apply WAL records: {}", e);
+                Err(WalRedoError::IoError(e))
+            }
+        };
+
+        // The caller is responsible for sending the response
+        result
    }
 }

@@ -468,33 +437,28 @@ impl PostgresRedoManager {
 /// Handle to the Postgres WAL redo process
 ///
 struct PostgresRedoProcess {
-    child: Child,
    stdin: ChildStdin,
    stdout: ChildStdout,
-    stderr: ChildStderr,
 }

 impl PostgresRedoProcess {
    //
    // Start postgres binary in special WAL redo mode.
    //
-    fn launch(
+    async fn launch(
        conf: &PageServerConf,
        tenantid: &ZTenantId,
-        id: usize,
    ) -> Result<PostgresRedoProcess, Error> {
        // FIXME: We need a dummy Postgres cluster to run the process in. Currently, we
        // just create one with constant name. That fails if you try to launch more than
        // one WAL redo manager concurrently.
-        let datadir = conf
-            .tenant_path(tenantid)
-            .join(format! {"wal-redo-datadir-{}", id});
+        let datadir = conf.tenant_path(tenantid).join("wal-redo-datadir");

        // Create empty data directory for wal-redo postgres, deleting old one first.
        if datadir.exists() {
            info!("directory {:?} exists, removing", &datadir);
            if let Err(e) = fs::remove_dir_all(&datadir) {
-                error!("could not remove old wal-redo-datadir: {:#}", e);
+                error!("could not remove old wal-redo-datadir: {:?}", e);
            }
        }
        info!("running initdb in {:?}", datadir.display());
@@ -505,6 +469,7 @@ impl PostgresRedoProcess {
            .env("LD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap())
            .env("DYLD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap())
            .output()
+            .await
            .expect("failed to execute initdb");

        if !initdb.status.success() {
@@ -541,139 +506,108 @@ impl PostgresRedoProcess {
            datadir.display()
        );

-        let stdin = child.stdin.take().unwrap();
-        let stdout = child.stdout.take().unwrap();
-        let stderr = child.stderr.take().unwrap();
+        let stdin = child.stdin.take().expect("failed to open child's stdin");
+        let stderr = child.stderr.take().expect("failed to open child's stderr");
+        let stdout = child.stdout.take().expect("failed to open child's stdout");

-        set_nonblock(stdin.as_raw_fd())?;
-        set_nonblock(stdout.as_raw_fd())?;
-        set_nonblock(stderr.as_raw_fd())?;
+        // This async block reads the child's stderr, and forwards it to the logger
+        let f_stderr = async {
+            let mut stderr_buffered = tokio::io::BufReader::new(stderr);

-        Ok(PostgresRedoProcess {
-            child,
-            stdin,
-            stdout,
-            stderr,
-        })
-    }
+            let mut line = String::new();
+            loop {
+                match stderr_buffered.read_line(&mut line).await {
+                    Ok(0) => break, // EOF on the child's stderr
+                    Ok(_) => error!("wal-redo-postgres: {}", line.trim()),
+                    Err(e) if e.kind() == std::io::ErrorKind::InvalidData => {
+                        debug!("could not convert line to utf-8");
+                    }
+                    // Other I/O errors will not go away on retry; stop instead of spinning.
+                    Err(e) => return Err(e),
+                }
+                line.clear();
+            }
+            Ok::<(), Error>(())
+        };
+        tokio::spawn(f_stderr);

-    fn kill(mut self) {
-        let _ = self.child.kill();
-        if let Ok(exit_status) = self.child.wait() {
-            error!("wal-redo-postgres exited with code {}", exit_status);
-        }
-        drop(self);
+        Ok(PostgresRedoProcess { stdin, stdout })
    }

    //
    // Apply given WAL records ('records') over an old page image. Returns
    // new page image.
    //
-    fn apply_wal_records(
+    async fn apply_wal_records(
        &mut self,
        tag: BufferTag,
        base_img: Option<Bytes>,
-        records: &[(Lsn, WALRecord)],
+        records: &[WALRecord],
    ) -> Result<Bytes, std::io::Error> {
-        // Serialize all the messages to send the WAL redo process first.
-        //
-        // This could be problematic if there are millions of records to replay,
-        // but in practice the number of records is usually so small that it doesn't
-        // matter, and it's better to keep this code simple.
-        let mut writebuf: Vec<u8> = Vec::new();
-        build_begin_redo_for_block_msg(tag, &mut writebuf);
-        if let Some(img) = base_img {
-            build_push_page_msg(tag, &img, &mut writebuf);
-        }
-        for (lsn, rec) in records.iter() {
-            build_apply_record_msg(*lsn, &rec.rec, &mut writebuf);
-        }
-        build_get_page_msg(tag, &mut writebuf);
-        WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64);
-
-        // The input is now in 'writebuf'. Do a blind write first, writing as much as
-        // we can, before calling poll(). That skips one call to poll() if the stdin is
-        // already available for writing, which it almost certainly is because the
-        // process is idle.
-        let mut nwrite = self.stdin.write(&writebuf)?;
-
-        // We expect the WAL redo process to respond with an 8k page image. We read it
-        // into this buffer.
-        let mut resultbuf = vec![0; pg_constants::BLCKSZ.into()];
-        let mut nresult: usize = 0; // # of bytes read into 'resultbuf' so far
-
-        // Prepare for calling poll()
-        let mut pollfds = [
-            PollFd::new(self.stdout.as_raw_fd(), PollFlags::POLLIN),
-            PollFd::new(self.stderr.as_raw_fd(), PollFlags::POLLIN),
-            PollFd::new(self.stdin.as_raw_fd(), PollFlags::POLLOUT),
-        ];
+        let stdout = &mut self.stdout;
+        // Buffer the writes to avoid a lot of small syscalls.
+        let mut stdin = tokio::io::BufWriter::new(&mut self.stdin);

        // We do three things simultaneously: send the old base image and WAL records to
        // the child process's stdin, read the result from child's stdout, and forward any logging
        // information that the child writes to its stderr to the page server's log.
-        while nresult < pg_constants::BLCKSZ.into() {
-            // If we have more data to write, wake up if 'stdin' becomes writeable or
-            // we have data to read. Otherwise only wake up if there's data to read.
-            let nfds = if nwrite < writebuf.len() { 3 } else { 2 };
-            let n = nix::poll::poll(&mut pollfds[0..nfds], TIMEOUT.as_millis() as i32)?;
-
-            if n == 0 {
-                return Err(Error::new(ErrorKind::Other, "WAL redo timed out"));
+        //
+        // 'f_stdin' handles writing the base image and WAL records to the child process.
+        // 'f_stdout' below reads the result back. And 'f_stderr', which was spawned into the
+        // tokio runtime in the 'launch' function already, forwards the logging.
+        let f_stdin = async {
+            // Begin redo for the block, then send the base image, if any. (If the record
+            // initializes the page, the previous page version is not needed.)
+            timeout(
+                TIMEOUT,
+                stdin.write_all(&build_begin_redo_for_block_msg(tag)),
+            )
+            .await??;
+            if let Some(img) = base_img {
+                timeout(
+                    TIMEOUT,
+                    stdin.write_all(&build_push_page_msg(tag, img)),
+                )
+                .await??;
             }

-            // If we have some messages in stderr, forward them to the log.
-            let err_revents = pollfds[1].revents().unwrap();
-            if err_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() {
-                let mut errbuf: [u8; 16384] = [0; 16384];
-                let n = self.stderr.read(&mut errbuf)?;
+            // Send WAL records.
+            for rec in records.iter() {
+                let r = rec.clone();

-                // The message might not be split correctly into lines here. But this is
-                // good enough, the important thing is to get the message to the log.
-                if n > 0 {
-                    error!(
-                        "wal-redo-postgres: {}",
-                        String::from_utf8_lossy(&errbuf[0..n])
-                    );
+                WAL_REDO_RECORD_COUNTER.inc();

-                    // To make sure we capture all log from the process if it fails, keep
-                    // reading from the stderr, before checking the stdout.
-                    continue;
-                }
-            } else if err_revents.contains(PollFlags::POLLHUP) {
-                return Err(Error::new(
-                    ErrorKind::BrokenPipe,
-                    "WAL redo process closed its stderr unexpectedly",
-                ));
+                stdin
+                    .write_all(&build_apply_record_msg(r.lsn, r.rec))
+                    .await?;
+
+                //debug!("sent WAL record to wal redo postgres process ({:X}/{:X}",
+                //       r.lsn >> 32, r.lsn & 0xffff_ffff);
            }
+            //debug!("sent {} WAL records to wal redo postgres process ({:X}/{:X}",
+            //       records.len(), lsn >> 32, lsn & 0xffff_ffff);

-            // If we have more data to write and 'stdin' is writeable, do write.
-            if nwrite < writebuf.len() {
-                let in_revents = pollfds[2].revents().unwrap();
-                if in_revents & (PollFlags::POLLERR | PollFlags::POLLOUT) != PollFlags::empty() {
-                    nwrite += self.stdin.write(&writebuf[nwrite..])?;
-                } else if in_revents.contains(PollFlags::POLLHUP) {
-                    // We still have more data to write, but the process closed the pipe.
-                    return Err(Error::new(
-                        ErrorKind::BrokenPipe,
-                        "WAL redo process closed its stdin unexpectedly",
-                    ));
-                }
-            }
+            // Send GetPage command to get the result back
+            timeout(TIMEOUT, stdin.write_all(&build_get_page_msg(tag))).await??;
+            timeout(TIMEOUT, stdin.flush()).await??;
+            //debug!("sent GetPage for {}", tag.blknum);
+            Ok::<(), Error>(())
+        };

-            // If we have some data in stdout, read it to the result buffer.
-            let out_revents = pollfds[0].revents().unwrap();
-            if out_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() {
-                nresult += self.stdout.read(&mut resultbuf[nresult..])?;
-            } else if out_revents.contains(PollFlags::POLLHUP) {
-                return Err(Error::new(
-                    ErrorKind::BrokenPipe,
-                    "WAL redo process closed its stdout unexpectedly",
-                ));
-            }
-        }
+        // Read back new page image
+        let f_stdout = async {
+            let mut buf = [0u8; 8192];

-        Ok(Bytes::from(resultbuf))
+            timeout(TIMEOUT, stdout.read_exact(&mut buf)).await??;
+            //debug!("got response for {}", tag.blknum);
+            Ok::<[u8; 8192], Error>(buf)
+        };
+
+        let res = tokio::try_join!(f_stdout, f_stdin)?;
+
+        let buf = res.0;
+
+        Ok::<Bytes, Error>(Bytes::from(std::vec::Vec::from(buf)))
    }
 }

@@ -681,42 +615,81 @@ impl PostgresRedoProcess {
 // process. See vendor/postgres/src/backend/tcop/zenith_wal_redo.c for
 // explanation of the protocol.

-fn build_begin_redo_for_block_msg(tag: BufferTag, buf: &mut Vec<u8>) {
+fn build_begin_redo_for_block_msg(tag: BufferTag) -> Bytes {
    let len = 4 + 1 + 4 * 4;
+    let mut buf = BytesMut::with_capacity(1 + len);

    buf.put_u8(b'B');
    buf.put_u32(len as u32);

-    tag.ser_into(buf)
+    // FIXME: this is a temporary hack that should go away when we refactor
+    // the postgres protocol serialization + handlers.
+    //
+    // BytesMut is a dynamic growable buffer, used a lot in tokio code but
+    // not in the std library. To write to a BytesMut from a serde serializer,
+    // we need to either:
+    // - pre-allocate the required buffer space. This is annoying because we
+    //   shouldn't care what the exact serialized size is-- that's the
+    //   serializer's job.
+    // - Or, we need to create a temporary "writer" (which implements the
+    //   `Write` trait). It's a bit awkward, because the writer consumes the
+    //   underlying BytesMut, and we need to extract it later with
+    //   `into_inner`.
+    let mut writer = buf.writer();
+    tag.ser_into(&mut writer)
        .expect("serialize BufferTag should always succeed");
+    let buf = writer.into_inner();
+
+    debug_assert!(buf.len() == 1 + len);
+
+    buf.freeze()
 }

-fn build_push_page_msg(tag: BufferTag, base_img: &[u8], buf: &mut Vec<u8>) {
+fn build_push_page_msg(tag: BufferTag, base_img: Bytes) -> Bytes {
    assert!(base_img.len() == 8192);

    let len = 4 + 1 + 4 * 4 + base_img.len();
+    let mut buf = BytesMut::with_capacity(1 + len);

    buf.put_u8(b'P');
    buf.put_u32(len as u32);
-    tag.ser_into(buf)
+    let mut writer = buf.writer();
+    tag.ser_into(&mut writer)
        .expect("serialize BufferTag should always succeed");
+    let mut buf = writer.into_inner();
    buf.put(base_img);
+
+    debug_assert!(buf.len() == 1 + len);
+
+    buf.freeze()
 }

-fn build_apply_record_msg(endlsn: Lsn, rec: &[u8], buf: &mut Vec<u8>) {
+fn build_apply_record_msg(endlsn: Lsn, rec: Bytes) -> Bytes {
    let len = 4 + 8 + rec.len();
+    let mut buf = BytesMut::with_capacity(1 + len);

    buf.put_u8(b'A');
    buf.put_u32(len as u32);
    buf.put_u64(endlsn.0);
    buf.put(rec);
+
+    debug_assert!(buf.len() == 1 + len);
+
+    buf.freeze()
 }

-fn build_get_page_msg(tag: BufferTag, buf: &mut Vec<u8>) {
+fn build_get_page_msg(tag: BufferTag) -> Bytes {
    let len = 4 + 1 + 4 * 4;
+    let mut buf = BytesMut::with_capacity(1 + len);

    buf.put_u8(b'G');
    buf.put_u32(len as u32);
-    tag.ser_into(buf)
+    let mut writer = buf.writer();
+    tag.ser_into(&mut writer)
        .expect("serialize BufferTag should always succeed");
+    let buf = writer.into_inner();
+
+    debug_assert!(buf.len() == 1 + len);
+
+    buf.freeze()
 }
--- a/postgres_ffi/src/xlog_utils.rs
+++ b/postgres_ffi/src/xlog_utils.rs
@@ -9,6 +9,7 @@

 use crate::pg_constants;
 use crate::CheckPoint;
+use crate::ControlFileData;
 use crate::FullTransactionId;
 use crate::XLogLongPageHeaderData;
 use crate::XLogPageHeaderData;
@@ -17,8 +18,8 @@ use crate::XLOG_PAGE_MAGIC;

 use anyhow::{bail, Result};
 use byteorder::{ByteOrder, LittleEndian};
-use bytes::BytesMut;
 use bytes::{Buf, Bytes};
+use bytes::{BufMut, BytesMut};
 use crc32c::*;
 use log::*;
 use std::cmp::max;
@@ -43,9 +44,6 @@ pub const XLOG_SIZE_OF_XLOG_RECORD: usize = std::mem::size_of::<XLogRecord>();
 #[allow(clippy::identity_op)]
 pub const SIZE_OF_XLOG_RECORD_DATA_HEADER_SHORT: usize = 1 * 2;

-// PG timeline is always 1, changing it doesn't have useful meaning in Zenith.
-pub const PG_TLI: u32 = 1;
-
 pub type XLogRecPtr = u64;
 pub type TimeLineID = u32;
 pub type TimestampTz = i64;
@@ -187,13 +185,8 @@ fn find_end_of_wal_segment(
            let xl_tot_len = LittleEndian::read_u32(&buf[page_offs..page_offs + 4]) as usize;
            if xl_tot_len == 0 {
                info!(
-                    "find_end_of_wal_segment reached zeros at {:?}, last records ends at {:?}",
-                    Lsn(XLogSegNoOffsetToRecPtr(segno, offs as u32, wal_seg_size)),
-                    Lsn(XLogSegNoOffsetToRecPtr(
-                        segno,
-                        last_valid_rec_pos as u32,
-                        wal_seg_size
-                    ))
+                    "find_end_of_wal_segment reached zeros at {:?}",
+                    Lsn(XLogSegNoOffsetToRecPtr(segno, offs as u32, wal_seg_size))
                );
                break; // zeros, reached the end
            }
@@ -308,17 +301,12 @@ pub fn find_end_of_wal(
                    high_segno,
                );
            }
-            let start_offset = if start_lsn.segment_number(wal_seg_size) == high_segno {
-                start_lsn.segment_offset(wal_seg_size)
-            } else {
-                0
-            };
            high_offs = find_end_of_wal_segment(
                data_dir,
                high_segno,
                high_tli,
                wal_seg_size,
-                start_offset,
+                start_lsn.segment_offset(wal_seg_size),
            )?;
        }
        let high_ptr = XLogSegNoOffsetToRecPtr(high_segno, high_offs, wal_seg_size);
@@ -341,12 +329,7 @@ pub fn main() {
 }

 impl XLogRecord {
-    pub fn from_slice(buf: &[u8]) -> XLogRecord {
-        use zenith_utils::bin_ser::LeSer;
-        XLogRecord::des(buf).unwrap()
-    }
-
-    pub fn from_bytes<B: Buf>(buf: &mut B) -> XLogRecord {
+    pub fn from_bytes(buf: &mut Bytes) -> XLogRecord {
        use zenith_utils::bin_ser::LeSer;
        XLogRecord::des_from(&mut buf.reader()).unwrap()
    }
@@ -394,12 +377,10 @@ impl CheckPoint {
        Ok(CheckPoint::des(buf)?)
    }

-    /// Update next XID based on provided new_xid and stored epoch.
-    /// Next XID should be greater than new_xid. This handles 32-bit
-    /// XID wraparound correctly.
-    ///
-    /// Returns 'true' if the XID was updated.
-    pub fn update_next_xid(&mut self, xid: u32) -> bool {
+    // Update next XID based on provided new_xid and stored epoch.
+    // Next XID should be greater than new_xid.
+    // Also take into account 32-bit wrap-around.
+    pub fn update_next_xid(&mut self, xid: u32) {
        let xid = xid.wrapping_add(XID_CHECKPOINT_INTERVAL - 1) & !(XID_CHECKPOINT_INTERVAL - 1);
        let full_xid = self.nextXid.value;
        let new_xid = std::cmp::max(xid + 1, pg_constants::FIRST_NORMAL_TRANSACTION_ID);
@@ -410,37 +391,35 @@ impl CheckPoint {
                // wrap-around
                epoch += 1;
            }
-            let nextXid = (epoch << 32) | new_xid as u64;
-
-            if nextXid != self.nextXid.value {
-                self.nextXid = FullTransactionId { value: nextXid };
-                return true;
-            }
+            self.nextXid = FullTransactionId {
+                value: (epoch << 32) | new_xid as u64,
+            };
        }
-        false
    }
 }

 //
-// Generate new, empty WAL segment.
+// Generate new WAL segment with single XLOG_CHECKPOINT_SHUTDOWN record.
 // We need this segment to start compute node.
+// In order to minimize changes in Postgres core, we prefer to
+// provide a WAL segment from which it can extract the checkpoint record in the standard way,
+// rather than implement some alternative mechanism.
 //
-pub fn generate_wal_segment(segno: u64, system_id: u64) -> Bytes {
+pub fn generate_wal_segment(pg_control: &ControlFileData) -> Bytes {
    let mut seg_buf = BytesMut::with_capacity(pg_constants::WAL_SEGMENT_SIZE as usize);

-    let pageaddr = XLogSegNoOffsetToRecPtr(segno, 0, pg_constants::WAL_SEGMENT_SIZE);
    let hdr = XLogLongPageHeaderData {
        std: {
            XLogPageHeaderData {
                xlp_magic: XLOG_PAGE_MAGIC as u16,
                xlp_info: pg_constants::XLP_LONG_HEADER,
-                xlp_tli: PG_TLI,
-                xlp_pageaddr: pageaddr,
+                xlp_tli: 1, // FIXME: always use Postgres timeline 1
+                xlp_pageaddr: pg_control.checkPoint - XLOG_SIZE_OF_XLOG_LONG_PHD as u64,
                xlp_rem_len: 0,
                ..Default::default() // Put 0 in padding fields.
            }
        },
-        xlp_sysid: system_id,
+        xlp_sysid: pg_control.system_identifier,
        xlp_seg_size: pg_constants::WAL_SEGMENT_SIZE as u32,
        xlp_xlog_blcksz: XLOG_BLCKSZ as u32,
    };
@@ -448,6 +427,36 @@ pub fn generate_wal_segment(segno: u64, system_id: u64) -> Bytes {
    let hdr_bytes = hdr.encode();
    seg_buf.extend_from_slice(&hdr_bytes);

+    let rec_hdr = XLogRecord {
+        xl_tot_len: (XLOG_SIZE_OF_XLOG_RECORD
+            + SIZE_OF_XLOG_RECORD_DATA_HEADER_SHORT
+            + SIZEOF_CHECKPOINT) as u32,
+        xl_xid: 0, //0 is for InvalidTransactionId
+        xl_prev: 0,
+        xl_info: pg_constants::XLOG_CHECKPOINT_SHUTDOWN,
+        xl_rmid: pg_constants::RM_XLOG_ID,
+        xl_crc: 0,
+        ..Default::default() // Put 0 in padding fields.
+    };
+
+    let mut rec_shord_hdr_bytes = BytesMut::new();
+    rec_shord_hdr_bytes.put_u8(pg_constants::XLR_BLOCK_ID_DATA_SHORT);
+    rec_shord_hdr_bytes.put_u8(SIZEOF_CHECKPOINT as u8);
+
+    let rec_bytes = rec_hdr.encode();
+    let checkpoint_bytes = pg_control.checkPointCopy.encode();
+
+    //calculate record checksum
+    let mut crc = 0;
+    crc = crc32c_append(crc, &rec_shord_hdr_bytes[..]);
+    crc = crc32c_append(crc, &checkpoint_bytes[..]);
+    crc = crc32c_append(crc, &rec_bytes[0..XLOG_RECORD_CRC_OFFS]);
+
+    seg_buf.extend_from_slice(&rec_bytes[0..XLOG_RECORD_CRC_OFFS]);
+    seg_buf.put_u32_le(crc);
+    seg_buf.extend_from_slice(&rec_shord_hdr_bytes);
+    seg_buf.extend_from_slice(&checkpoint_bytes);
+
    //zero out the rest of the file
    seg_buf.resize(pg_constants::WAL_SEGMENT_SIZE, 0);
    seg_buf.freeze()
--- a/pre-commit.py
+++ b/pre-commit.py
@@ -37,27 +37,20 @@ def rustfmt(fix_inplace: bool = False, no_color: bool = False) -> str:
    return cmd


-def yapf(fix_inplace: bool) -> str:
-    cmd = "pipenv run yapf --recursive"
-    if fix_inplace:
-        cmd += " --in-place"
-    else:
-        cmd += " --diff"
-    return cmd
-
-
-def mypy() -> str:
-    return "pipenv run mypy"
-
-
 def get_commit_files() -> List[str]:
-    files = subprocess.check_output("git diff --cached --name-only --diff-filter=ACM".split())
+    files = subprocess.check_output(
+        "git diff --cached --name-only --diff-filter=ACM".split()
+    )
    return files.decode().splitlines()


-def check(name: str, suffix: str, cmd: str, changed_files: List[str], no_color: bool = False):
+def check(
+    name: str, suffix: str, cmd: str, changed_files: List[str], no_color: bool = False
+):
    print(f"Checking: {name} ", end="")
-    applicable_files = list(filter(lambda fname: fname.strip().endswith(suffix), changed_files))
+    applicable_files = list(
+        filter(lambda fname: fname.strip().endswith(suffix), changed_files)
+    )
    if not applicable_files:
        print(colorify("[NOT APPLICABLE]", Color.CYAN, no_color))
        return
@@ -66,14 +59,7 @@ def check(name: str, suffix: str, cmd: str, changed_files: List[str], no_color:
    res = subprocess.run(cmd.split(), capture_output=True)
    if res.returncode != 0:
        print(colorify("[FAILED]", Color.RED, no_color))
-        if name == "mypy":
-            print("Please inspect the output below and fix type mismatches.")
-        else:
-            print("Please inspect the output below and run make fmt to fix automatically.")
-        if suffix == ".py":
-            print("If the output is empty, ensure that you've installed Python tooling by\n"
-                  "running 'pipenv install --dev' in the current directory (no root needed)")
-        print()
+        print("Please inspect the output below and run make fmt to fix automatically\n")
        print(res.stdout.decode())
        exit(1)

@@ -82,11 +68,12 @@ def check(name: str, suffix: str, cmd: str, changed_files: List[str], no_color:

 if __name__ == "__main__":
    parser = argparse.ArgumentParser()
-    parser.add_argument("--fix-inplace", action="store_true", help="apply fixes inplace")
-    parser.add_argument("--no-color",
-                        action="store_true",
-                        help="disable colored output",
-                        default=not sys.stdout.isatty())
+    parser.add_argument(
+        "--fix-inplace", action="store_true", help="apply fixes inplace"
+    )
+    parser.add_argument(
+        "--no-color", action="store_true", help="disable colored output", default=not sys.stdout.isatty()
+    )
    args = parser.parse_args()

    files = get_commit_files()
@@ -100,17 +87,3 @@ if __name__ == "__main__":
        changed_files=files,
        no_color=args.no_color,
    )
-    check(
-        name="yapf",
-        suffix=".py",
-        cmd=yapf(fix_inplace=args.fix_inplace),
-        changed_files=files,
-        no_color=args.no_color,
-    )
-    check(
-        name="mypy",
-        suffix=".py",
-        cmd=mypy(),
-        changed_files=files,
-        no_color=args.no_color,
-    )
--- a/proxy/src/cplane_api.rs
+++ b/proxy/src/cplane_api.rs
@@ -1,139 +1,60 @@
-use anyhow::{anyhow, bail, Context};
+use anyhow::{bail, Result};
 use serde::{Deserialize, Serialize};
-use std::net::{SocketAddr, ToSocketAddrs};
+use std::net::{IpAddr, SocketAddr};

-use crate::state::ProxyWaiters;
-
-#[derive(Serialize, Deserialize, Debug, Default)]
-pub struct DatabaseInfo {
-    pub host: String,
-    pub port: u16,
-    pub dbname: String,
-    pub user: String,
-    pub password: Option<String>,
+pub struct CPlaneApi {
+    auth_endpoint: &'static str,
 }

 #[derive(Serialize, Deserialize, Debug)]
-#[serde(untagged)]
-enum ProxyAuthResponse {
-    Ready { conn_info: DatabaseInfo },
-    Error { error: String },
-    NotReady { ready: bool }, // TODO: get rid of `ready`
+pub struct DatabaseInfo {
+    pub host: IpAddr, // TODO: allow host name here too
+    pub port: u16,
+    pub dbname: String,
+    pub user: String,
+    pub password: String,
 }

 impl DatabaseInfo {
-    pub fn socket_addr(&self) -> anyhow::Result<SocketAddr> {
-        let host_port = format!("{}:{}", self.host, self.port);
-        host_port
-            .to_socket_addrs()
-            .with_context(|| format!("cannot resolve {} to SocketAddr", host_port))?
-            .next()
-            .ok_or_else(|| anyhow!("cannot resolve at least one SocketAddr"))
+    pub fn socket_addr(&self) -> SocketAddr {
+        SocketAddr::new(self.host, self.port)
+    }
+
+    pub fn conn_string(&self) -> String {
+        format!(
+            "dbname={} user={} password={}",
+            self.dbname, self.user, self.password
+        )
    }
 }

-impl From<DatabaseInfo> for tokio_postgres::Config {
-    fn from(db_info: DatabaseInfo) -> Self {
-        let mut config = tokio_postgres::Config::new();
-
-        config
-            .host(&db_info.host)
-            .port(db_info.port)
-            .dbname(&db_info.dbname)
-            .user(&db_info.user);
-
-        if let Some(password) = db_info.password {
-            config.password(password);
-        }
-
-        config
+impl CPlaneApi {
+    pub fn new(auth_endpoint: &'static str) -> CPlaneApi {
+        CPlaneApi { auth_endpoint }
    }
-}

-pub struct CPlaneApi<'a> {
-    auth_endpoint: &'a str,
-    waiters: &'a ProxyWaiters,
-}
-
-impl<'a> CPlaneApi<'a> {
-    pub fn new(auth_endpoint: &'a str, waiters: &'a ProxyWaiters) -> Self {
-        Self {
-            auth_endpoint,
-            waiters,
-        }
-    }
-}
-
-impl CPlaneApi<'_> {
    pub fn authenticate_proxy_request(
        &self,
        user: &str,
-        database: &str,
        md5_response: &[u8],
        salt: &[u8; 4],
-        psql_session_id: &str,
-    ) -> anyhow::Result<DatabaseInfo> {
+    ) -> Result<DatabaseInfo> {
        let mut url = reqwest::Url::parse(self.auth_endpoint)?;
        url.query_pairs_mut()
            .append_pair("login", user)
-            .append_pair("database", database)
            .append_pair("md5response", std::str::from_utf8(md5_response)?)
-            .append_pair("salt", &hex::encode(salt))
-            .append_pair("psql_session_id", psql_session_id);
+            .append_pair("salt", &hex::encode(salt));

-        let waiter = self.waiters.register(psql_session_id.to_owned());
+        println!("cplane request: {}", url.as_str());

-        println!("cplane request: {}", url);
        let resp = reqwest::blocking::get(url)?;
-        if !resp.status().is_success() {
-            bail!("Auth failed: {}", resp.status())
-        }

-        let auth_info: ProxyAuthResponse = serde_json::from_str(resp.text()?.as_str())?;
-        println!("got auth info: #{:?}", auth_info);
-
-        use ProxyAuthResponse::*;
-        match auth_info {
-            Ready { conn_info } => Ok(conn_info),
-            Error { error } => bail!(error),
-            NotReady { .. } => waiter.wait()?.map_err(|e| anyhow!(e)),
+        if resp.status().is_success() {
+            let conn_info: DatabaseInfo = serde_json::from_str(resp.text()?.as_str())?;
+            println!("got conn info: #{:?}", conn_info);
+            Ok(conn_info)
+        } else {
+            bail!("Auth failed")
        }
    }
 }
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use serde_json::json;
-
-    #[test]
-    fn test_proxy_auth_response() {
-        // Ready
-        let auth: ProxyAuthResponse = serde_json::from_value(json!({
-            "ready": true,
-            "conn_info": DatabaseInfo::default(),
-        }))
-        .unwrap();
-        assert!(matches!(
-            auth,
-            ProxyAuthResponse::Ready {
-                conn_info: DatabaseInfo { .. }
-            }
-        ));
-
-        // Error
-        let auth: ProxyAuthResponse = serde_json::from_value(json!({
-            "ready": false,
-            "error": "too bad, so sad",
-        }))
-        .unwrap();
-        assert!(matches!(auth, ProxyAuthResponse::Error { .. }));
-
-        // NotReady
-        let auth: ProxyAuthResponse = serde_json::from_value(json!({
-            "ready": false,
-        }))
-        .unwrap();
-        assert!(matches!(auth, ProxyAuthResponse::NotReady { .. }));
-    }
-}
--- a/proxy/src/main.rs
+++ b/proxy/src/main.rs
@@ -5,21 +5,78 @@
 /// (control plane API in our case) and can create new databases and accounts
 /// in somewhat transparent manner (again via communication with control plane API).
 ///
-use anyhow::bail;
-use clap::{App, Arg};
-use state::{ProxyConfig, ProxyState};
-use std::thread;
-use zenith_utils::{tcp_listener, GIT_VERSION};
+use std::{
+    collections::HashMap,
+    net::{SocketAddr, TcpListener},
+    sync::{mpsc, Arc, Mutex},
+    thread,
+};
+
+use anyhow::{anyhow, bail, ensure, Context};
+use clap::{App, Arg, ArgMatches};
+
+use cplane_api::DatabaseInfo;
+use rustls::{internal::pemfile, NoClientAuth, ProtocolVersion, ServerConfig};

 mod cplane_api;
 mod mgmt;
 mod proxy;
-mod state;
-mod waiters;
+
+pub struct ProxyConf {
+    /// main entrypoint for users to connect to
+    pub proxy_address: SocketAddr,
+
+    /// http management endpoint. Upon user account creation control plane
+    /// will notify us here, so that we can 'unfreeze' user session.
+    pub mgmt_address: SocketAddr,
+
+    /// send unauthenticated users to this URI
+    pub redirect_uri: String,
+
+    /// control plane address where we would check auth.
+    pub auth_endpoint: String,
+
+    pub ssl_config: Option<Arc<ServerConfig>>,
+}
+
+pub struct ProxyState {
+    pub conf: ProxyConf,
+    pub waiters: Mutex<HashMap<String, mpsc::Sender<anyhow::Result<DatabaseInfo>>>>,
+}
+
+fn configure_ssl(arg_matches: &ArgMatches) -> anyhow::Result<Option<Arc<ServerConfig>>> {
+    let (key_path, cert_path) = match (
+        arg_matches.value_of("ssl-key"),
+        arg_matches.value_of("ssl-cert"),
+    ) {
+        (Some(key_path), Some(cert_path)) => (key_path, cert_path),
+        (None, None) => return Ok(None),
+        _ => bail!("either both or neither ssl-key and ssl-cert must be specified"),
+    };
+
+    let key = {
+        let key_bytes = std::fs::read(key_path).context("SSL key file")?;
+        let mut keys = pemfile::pkcs8_private_keys(&mut &key_bytes[..])
+            .map_err(|_| anyhow!("couldn't read TLS keys"))?;
+        ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len());
+        keys.pop().unwrap()
+    };
+
+    let cert_chain = {
+        let cert_chain_bytes = std::fs::read(cert_path).context("SSL cert file")?;
+        pemfile::certs(&mut &cert_chain_bytes[..])
+            .map_err(|_| anyhow!("couldn't read TLS certificates"))?
+    };
+
+    let mut config = ServerConfig::new(NoClientAuth::new());
+    config.set_single_cert(cert_chain, key)?;
+    config.versions = vec![ProtocolVersion::TLSv1_3];
+
+    Ok(Some(Arc::new(config)))
+}

 fn main() -> anyhow::Result<()> {
    let arg_matches = App::new("Zenith proxy/router")
-        .version(GIT_VERSION)
        .arg(
            Arg::with_name("proxy")
                .short("p")
@@ -68,47 +125,38 @@ fn main() -> anyhow::Result<()> {
        )
        .get_matches();

-    let ssl_config = match (
-        arg_matches.value_of("ssl-key"),
-        arg_matches.value_of("ssl-cert"),
-    ) {
-        (Some(key_path), Some(cert_path)) => {
-            Some(crate::state::configure_ssl(key_path, cert_path)?)
-        }
-        (None, None) => None,
-        _ => bail!("either both or neither ssl-key and ssl-cert must be specified"),
-    };
-
-    let config = ProxyConfig {
+    let conf = ProxyConf {
        proxy_address: arg_matches.value_of("proxy").unwrap().parse()?,
        mgmt_address: arg_matches.value_of("mgmt").unwrap().parse()?,
        redirect_uri: arg_matches.value_of("uri").unwrap().parse()?,
        auth_endpoint: arg_matches.value_of("auth-endpoint").unwrap().parse()?,
-        ssl_config,
+        ssl_config: configure_ssl(&arg_matches)?,
    };
-    let state: &ProxyState = Box::leak(Box::new(ProxyState::new(config)));
-
-    println!("Version: {}", GIT_VERSION);
+    let state = ProxyState {
+        conf,
+        waiters: Mutex::new(HashMap::new()),
+    };
+    let state: &'static ProxyState = Box::leak(Box::new(state));

    // Check that we can bind to address before further initialization
    println!("Starting proxy on {}", state.conf.proxy_address);
-    let pageserver_listener = tcp_listener::bind(state.conf.proxy_address)?;
+    let pageserver_listener = TcpListener::bind(state.conf.proxy_address)?;

    println!("Starting mgmt on {}", state.conf.mgmt_address);
-    let mgmt_listener = tcp_listener::bind(state.conf.mgmt_address)?;
+    let mgmt_listener = TcpListener::bind(state.conf.mgmt_address)?;

-    let threads = [
+    let threads = vec![
        // Spawn a thread to listen for connections. It will spawn further threads
        // for each connection.
        thread::Builder::new()
-            .name("Listener thread".into())
+            .name("Proxy thread".into())
            .spawn(move || proxy::thread_main(state, pageserver_listener))?,
        thread::Builder::new()
            .name("Mgmt thread".into())
            .spawn(move || mgmt::thread_main(state, mgmt_listener))?,
    ];

-    for t in threads {
+    for t in threads.into_iter() {
        t.join().unwrap()?;
    }

--- a/proxy/src/mgmt.rs
+++ b/proxy/src/mgmt.rs
@@ -3,6 +3,7 @@ use std::{
    thread,
 };

+use anyhow::bail;
 use bytes::Bytes;
 use serde::Deserialize;
 use zenith_utils::{
@@ -24,23 +25,22 @@ pub fn thread_main(state: &'static ProxyState, listener: TcpListener) -> anyhow:
        socket.set_nodelay(true).unwrap();

        thread::spawn(move || {
-            if let Err(err) = handle_connection(state, socket) {
+            if let Err(err) = mgmt_conn_main(state, socket) {
                println!("error: {}", err);
            }
        });
    }
 }

-fn handle_connection(state: &ProxyState, socket: TcpStream) -> anyhow::Result<()> {
+pub fn mgmt_conn_main(state: &'static ProxyState, socket: TcpStream) -> anyhow::Result<()> {
    let mut conn_handler = MgmtHandler { state };
-    let pgbackend = PostgresBackend::new(socket, AuthType::Trust, None, true)?;
+    let pgbackend = PostgresBackend::new(socket, AuthType::Trust, None)?;
    pgbackend.run(&mut conn_handler)
 }

-struct MgmtHandler<'a> {
-    state: &'a ProxyState,
+struct MgmtHandler {
+    state: &'static ProxyState,
 }
-
 /// Serialized examples:
 // {
 //     "session_id": "71d6d03e6d93d99a",
@@ -64,18 +64,18 @@ struct MgmtHandler<'a> {
 // // to test manually by sending a query to mgmt interface:
 // psql -h 127.0.0.1 -p 9999 -c '{"session_id":"4f10dde522e14739","result":{"Success":{"host":"127.0.0.1","port":5432,"dbname":"stas","user":"stas","password":"stas"}}}'
 #[derive(Deserialize)]
-struct PsqlSessionResponse {
+pub struct PsqlSessionResponse {
    session_id: String,
    result: PsqlSessionResult,
 }

 #[derive(Deserialize)]
-enum PsqlSessionResult {
+pub enum PsqlSessionResult {
    Success(DatabaseInfo),
    Failure(String),
 }

-impl postgres_backend::Handler for MgmtHandler<'_> {
+impl postgres_backend::Handler for MgmtHandler {
    fn process_query(
        &mut self,
        pgb: &mut PostgresBackend,
@@ -96,26 +96,32 @@ fn try_process_query(
    query_string: Bytes,
 ) -> anyhow::Result<()> {
    let query_string = query_from_cstring(query_string);
+
    println!("Got mgmt query: '{}'", std::str::from_utf8(&query_string)?);

    let resp: PsqlSessionResponse = serde_json::from_slice(&query_string)?;

-    use PsqlSessionResult::*;
-    let msg = match resp.result {
-        Success(db_info) => Ok(db_info),
-        Failure(message) => Err(message),
-    };
+    let waiters = mgmt.state.waiters.lock().unwrap();
+
+    let sender = waiters
+        .get(&resp.session_id)
+        .ok_or_else(|| anyhow::Error::msg("psql_session_id is not found"))?;
+
+    match resp.result {
+        PsqlSessionResult::Success(db_info) => {
+            sender.send(Ok(db_info))?;

-    match mgmt.state.waiters.notify(&resp.session_id, msg) {
-        Ok(()) => {
            pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?
                .write_message_noflush(&BeMessage::DataRow(&[Some(b"ok")]))?
-                .write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
+                .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
+            pgb.flush()?;
+            Ok(())
        }
-        Err(e) => {
-            pgb.write_message(&BeMessage::ErrorResponse(e.to_string()))?;
+
+        PsqlSessionResult::Failure(message) => {
+            sender.send(Err(anyhow::Error::msg(message.clone())))?;
+
+            bail!("psql session request failed: {}", message)
        }
    }
-
-    Ok(())
 }
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -1,12 +1,18 @@
-use crate::cplane_api::{CPlaneApi, DatabaseInfo};
+use crate::cplane_api::CPlaneApi;
+use crate::cplane_api::DatabaseInfo;
 use crate::ProxyState;
-use anyhow::{anyhow, bail};
-use std::net::TcpStream;
-use std::{io, thread};
+
+use anyhow::bail;
 use tokio_postgres::NoTls;
-use zenith_utils::postgres_backend::{self, PostgresBackend, ProtoState, Stream};
-use zenith_utils::pq_proto::{BeMessage as Be, FeMessage as Fe, *};
+
+use rand::Rng;
+use std::io::Write;
+use std::{io, sync::mpsc::channel, thread};
+use zenith_utils::postgres_backend::Stream;
+use zenith_utils::postgres_backend::{PostgresBackend, ProtoState};
+use zenith_utils::pq_proto::*;
 use zenith_utils::sock_split::{ReadStream, WriteStream};
+use zenith_utils::{postgres_backend, pq_proto::BeMessage};

 ///
 /// Main proxy listener loop.
@@ -22,259 +28,271 @@ pub fn thread_main(
        println!("accepted connection from {}", peer_addr);
        socket.set_nodelay(true).unwrap();

-        thread::Builder::new()
-            .name("Proxy thread".into())
-            .spawn(move || {
-                if let Err(err) = proxy_conn_main(state, socket) {
-                    println!("error: {}", err);
-                }
-            })?;
+        thread::spawn(move || {
+            if let Err(err) = proxy_conn_main(state, socket) {
+                println!("error: {}", err);
+            }
+        });
    }
 }

-// TODO: clean up fields
+// XXX: clean up fields
 struct ProxyConnection {
    state: &'static ProxyState,
-    psql_session_id: String,
+
+    cplane: CPlaneApi,
+
+    user: String,
+    database: String,
+
    pgb: PostgresBackend,
+    md5_salt: [u8; 4],
+
+    psql_session_id: String,
 }

-pub fn proxy_conn_main(state: &'static ProxyState, socket: TcpStream) -> anyhow::Result<()> {
-    let conn = ProxyConnection {
+pub fn proxy_conn_main(
+    state: &'static ProxyState,
+    socket: std::net::TcpStream,
+) -> anyhow::Result<()> {
+    let mut conn = ProxyConnection {
        state,
-        psql_session_id: hex::encode(rand::random::<[u8; 8]>()),
+        cplane: CPlaneApi::new(&state.conf.auth_endpoint),
+        user: "".into(),
+        database: "".into(),
        pgb: PostgresBackend::new(
            socket,
            postgres_backend::AuthType::MD5,
            state.conf.ssl_config.clone(),
-            false,
        )?,
+        md5_salt: [0u8; 4],
+        psql_session_id: "".into(),
    };

-    let (client, server) = conn.handle_client()?;
+    // Check StartupMessage
+    // This will set conn.user and conn.database so we can decide on next actions
+    conn.handle_startup()?;

-    let server = zenith_utils::sock_split::BidiStream::from_tcp(server);
-
-    let client = match client {
-        Stream::Bidirectional(bidi_stream) => bidi_stream,
-        _ => panic!("invalid stream type"),
+    // both scenarios here should end up producing database connection string
+    let db_info = if conn.is_existing_user() {
+        conn.handle_existing_user()?
+    } else {
+        conn.handle_new_user()?
    };

-    proxy(client.split(), server.split())
+    // XXX: move that inside handle_new_user/handle_existing_user to be able to
+    // report wrong connection error.
+    proxy_pass(conn.pgb, db_info)
 }

 impl ProxyConnection {
-    fn handle_client(mut self) -> anyhow::Result<(Stream, TcpStream)> {
-        let mut authenticate = || {
-            let (username, dbname) = self.handle_startup()?;
-
-            // Both scenarios here should end up producing database credentials
-            if username.ends_with("@zenith") {
-                self.handle_existing_user(&username, &dbname)
-            } else {
-                self.handle_new_user()
-            }
-        };
-
-        let conn = match authenticate() {
-            Ok(db_info) => connect_to_db(db_info),
-            Err(e) => {
-                // Report the error to the client
-                self.pgb.write_message(&Be::ErrorResponse(e.to_string()))?;
-                bail!("failed to handle client: {:?}", e);
-            }
-        };
-
-        // We'll get rid of this once migration to async is complete
-        let (pg_version, db_stream) = {
-            let runtime = tokio::runtime::Builder::new_current_thread()
-                .enable_all()
-                .build()?;
-
-            let (pg_version, stream) = runtime.block_on(conn)?;
-            let stream = stream.into_std()?;
-            stream.set_nonblocking(false)?;
-
-            (pg_version, stream)
-        };
-
-        // Let the client send new requests
-        self.pgb
-            .write_message_noflush(&BeMessage::ParameterStatus(
-                BeParameterStatusMessage::ServerVersion(&pg_version),
-            ))?
-            .write_message(&Be::ReadyForQuery)?;
-
-        Ok((self.pgb.into_stream(), db_stream))
+    fn is_existing_user(&self) -> bool {
+        self.user.ends_with("@zenith")
    }

-    fn handle_startup(&mut self) -> anyhow::Result<(String, String)> {
-        let have_tls = self.pgb.tls_config.is_some();
+    fn handle_startup(&mut self) -> anyhow::Result<()> {
        let mut encrypted = false;
-
        loop {
-            let mut msg = match self.pgb.read_message()? {
-                Some(Fe::StartupMessage(msg)) => msg,
-                None => bail!("connection is lost"),
-                bad => bail!("unexpected message type: {:?}", bad),
-            };
-            println!("got message: {:?}", msg);
+            let msg = self.pgb.read_message()?;
+            println!("got message {:?}", msg);
+            match msg {
+                Some(FeMessage::StartupMessage(m)) => {
+                    println!("got startup message {:?}", m);

-            match msg.kind {
-                StartupRequestCode::NegotiateGss => {
-                    self.pgb.write_message(&Be::EncryptionResponse(false))?;
-                }
-                StartupRequestCode::NegotiateSsl => {
-                    self.pgb.write_message(&Be::EncryptionResponse(have_tls))?;
-                    if have_tls {
-                        self.pgb.start_tls()?;
-                        encrypted = true;
+                    match m.kind {
+                        StartupRequestCode::NegotiateGss => {
+                            self.pgb
+                                .write_message(&BeMessage::EncryptionResponse(false))?;
+                        }
+                        StartupRequestCode::NegotiateSsl => {
+                            println!("SSL requested");
+                            if self.pgb.tls_config.is_some() {
+                                self.pgb
+                                    .write_message(&BeMessage::EncryptionResponse(true))?;
+                                self.pgb.start_tls()?;
+                                encrypted = true;
+                            } else {
+                                self.pgb
+                                    .write_message(&BeMessage::EncryptionResponse(false))?;
+                            }
+                        }
+                        StartupRequestCode::Normal => {
+                            if self.state.conf.ssl_config.is_some() && !encrypted {
+                                self.pgb.write_message(&BeMessage::ErrorResponse(
+                                    "must connect with TLS".to_string(),
+                                ))?;
+                                bail!("client did not connect with TLS");
+                            }
+                            self.user = m
+                                .params
+                                .get("user")
+                                .ok_or_else(|| {
+                                    anyhow::Error::msg("user is required in startup packet")
+                                })?
+                                .into();
+                            self.database = m
+                                .params
+                                .get("database")
+                                .ok_or_else(|| {
+                                    anyhow::Error::msg("database is required in startup packet")
+                                })?
+                                .into();
+
+                            break;
+                        }
+                        StartupRequestCode::Cancel => break,
                    }
                }
-                StartupRequestCode::Normal => {
-                    if have_tls && !encrypted {
-                        bail!("must connect with TLS");
-                    }
-
-                    let mut get_param = |key| {
-                        msg.params
-                            .remove(key)
-                            .ok_or_else(|| anyhow!("{} is missing in startup packet", key))
-                    };
-
-                    return Ok((get_param("user")?, get_param("database")?));
+                None => {
+                    bail!("connection closed")
+                }
+                unexpected => {
+                    bail!("unexpected message type : {:?}", unexpected)
                }
-                // TODO: implement proper stmt cancellation
-                StartupRequestCode::Cancel => bail!("query cancellation is not supported"),
            }
        }
+        Ok(())
+    }
+
+    fn handle_existing_user(&mut self) -> anyhow::Result<DatabaseInfo> {
+        // ask password
+        rand::thread_rng().fill(&mut self.md5_salt);
+        self.pgb
+            .write_message(&BeMessage::AuthenticationMD5Password(&self.md5_salt))?;
+        self.pgb.state = ProtoState::Authentication; // XXX
+
+        // check password
+        println!("handle_existing_user");
+        let msg = self.pgb.read_message()?;
+        println!("got message {:?}", msg);
+        if let Some(FeMessage::PasswordMessage(m)) = msg {
+            println!("got password message '{:?}'", m);
+
+            assert!(self.is_existing_user());
+
+            let (_trailing_null, md5_response) = m
+                .split_last()
+                .ok_or_else(|| anyhow::Error::msg("unexpected password message"))?;
+
+            match self.cplane.authenticate_proxy_request(
+                self.user.as_str(),
+                md5_response,
+                &self.md5_salt,
+            ) {
+                Err(e) => {
+                    self.pgb
+                        .write_message(&BeMessage::ErrorResponse(format!("{}", e)))?;
+
+                    bail!("auth failed: {}", e);
+                }
+                Ok(conn_info) => {
+                    self.pgb
+                        .write_message_noflush(&BeMessage::AuthenticationOk)?;
+                    self.pgb
+                        .write_message_noflush(&BeMessage::ParameterStatus)?;
+                    self.pgb.write_message(&BeMessage::ReadyForQuery)?;
+
+                    Ok(conn_info)
+                }
+            }
+        } else {
+            bail!("protocol violation");
+        }
    }

-    fn handle_existing_user(&mut self, user: &str, db: &str) -> anyhow::Result<DatabaseInfo> {
-        let md5_salt = rand::random::<[u8; 4]>();
-
-        // Ask password
-        self.pgb
-            .write_message(&Be::AuthenticationMD5Password(&md5_salt))?;
-        self.pgb.state = ProtoState::Authentication; // XXX
-
-        // Check password
-        let msg = match self.pgb.read_message()? {
-            Some(Fe::PasswordMessage(msg)) => msg,
-            None => bail!("connection is lost"),
-            bad => bail!("unexpected message type: {:?}", bad),
-        };
-        println!("got message: {:?}", msg);
-
-        let (_trailing_null, md5_response) = msg
-            .split_last()
-            .ok_or_else(|| anyhow!("unexpected password message"))?;
-
-        let cplane = CPlaneApi::new(&self.state.conf.auth_endpoint, &self.state.waiters);
-        let db_info = cplane.authenticate_proxy_request(
-            user,
-            db,
-            md5_response,
-            &md5_salt,
-            &self.psql_session_id,
-        )?;
-
-        self.pgb
-            .write_message_noflush(&Be::AuthenticationOk)?
-            .write_message_noflush(&BeParameterStatusMessage::encoding())?;
-
-        Ok(db_info)
-    }
-
    fn handle_new_user(&mut self) -> anyhow::Result<DatabaseInfo> {
-        let greeting = hello_message(&self.state.conf.redirect_uri, &self.psql_session_id);
+        let mut psql_session_id_buf = [0u8; 8];
+        rand::thread_rng().fill(&mut psql_session_id_buf);
+        self.psql_session_id = hex::encode(psql_session_id_buf);

-        // First, register this session
-        let waiter = self.state.waiters.register(self.psql_session_id.clone());
+        let hello_message = format!("☀️  Welcome to Zenith!
+
+To proceed with database creation, open the following link:
+
+    {redirect_uri}{sess_id}
+
+It needs to be done once and we will send you '.pgpass' file, which will allow you to access or create
+databases without opening the browser.
+
+", redirect_uri = self.state.conf.redirect_uri, sess_id = self.psql_session_id);

-        // Give user a URL to spawn a new database
        self.pgb
-            .write_message_noflush(&Be::AuthenticationOk)?
-            .write_message_noflush(&BeParameterStatusMessage::encoding())?
-            .write_message(&Be::NoticeResponse(greeting))?;
+            .write_message_noflush(&BeMessage::AuthenticationOk)?;
+        self.pgb
+            .write_message_noflush(&BeMessage::ParameterStatus)?;
+        self.pgb
+            .write_message(&BeMessage::NoticeResponse(hello_message))?;
+
+        // await for database creation
+        let (tx, rx) = channel::<anyhow::Result<DatabaseInfo>>();
+        let _ = self
+            .state
+            .waiters
+            .lock()
+            .unwrap()
+            .insert(self.psql_session_id.clone(), tx);

        // Wait for web console response
-        let db_info = waiter.wait()?.map_err(|e| anyhow!(e))?;
+        // XXX: respond with error to client
+        let dbinfo = rx.recv()??;

-        self.pgb
-            .write_message_noflush(&Be::NoticeResponse("Connecting to database.".into()))?;
+        self.pgb.write_message_noflush(&BeMessage::NoticeResponse(
+            "Connecting to database.".to_string(),
+        ))?;
+        self.pgb.write_message(&BeMessage::ReadyForQuery)?;

-        Ok(db_info)
+        Ok(dbinfo)
    }
 }

-fn hello_message(redirect_uri: &str, session_id: &str) -> String {
-    format!(
-        concat![
-            "☀️  Welcome to Zenith!\n",
-            "To proceed with database creation, open the following link:\n\n",
-            "    {redirect_uri}{session_id}\n\n",
-            "It needs to be done once and we will send you '.pgpass' file,\n",
-            "which will allow you to access or create ",
-            "databases without opening your web browser."
-        ],
-        redirect_uri = redirect_uri,
-        session_id = session_id,
-    )
-}
-
 /// Create a TCP connection to a postgres database, authenticate with it, and receive the ReadyForQuery message
-async fn connect_to_db(db_info: DatabaseInfo) -> anyhow::Result<(String, tokio::net::TcpStream)> {
-    let mut socket = tokio::net::TcpStream::connect(db_info.socket_addr()?).await?;
-    let config = tokio_postgres::Config::from(db_info);
-    let (client, conn) = config.connect_raw(&mut socket, NoTls).await?;
-
-    let query = client.query_one("select current_setting('server_version')", &[]);
-
-    tokio::pin!(query, conn);
-
-    let version = tokio::select!(
-        x = query => x?.try_get(0)?,
-        _ = conn => bail!("connection closed too early"),
-    );
-
-    Ok((version, socket))
+async fn connect_to_db(db_info: DatabaseInfo) -> anyhow::Result<tokio::net::TcpStream> {
+    let mut socket = tokio::net::TcpStream::connect(db_info.socket_addr()).await?;
+    let config = db_info.conn_string().parse::<tokio_postgres::Config>()?;
+    let _ = config.connect_raw(&mut socket, NoTls).await?;
+    Ok(socket)
 }

 /// Concurrently proxy both directions of the client and server connections
 fn proxy(
-    (client_read, client_write): (ReadStream, WriteStream),
-    (server_read, server_write): (ReadStream, WriteStream),
+    client_read: ReadStream,
+    client_write: WriteStream,
+    server_read: ReadStream,
+    server_write: WriteStream,
 ) -> anyhow::Result<()> {
-    fn do_proxy(mut reader: impl io::Read, mut writer: WriteStream) -> io::Result<u64> {
-        /// FlushWriter will make sure that every message is sent as soon as possible
-        struct FlushWriter<W>(W);
-
-        impl<W: io::Write> io::Write for FlushWriter<W> {
-            fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
-                // `std::io::copy` is guaranteed to exit if we return an error,
-                // so we can afford to lose `res` in case `flush` fails
-                let res = self.0.write(buf);
-                if res.is_ok() {
-                    self.flush()?;
-                }
-                res
-            }
-
-            fn flush(&mut self) -> io::Result<()> {
-                self.0.flush()
-            }
-        }
-
-        let res = std::io::copy(&mut reader, &mut FlushWriter(&mut writer));
-        writer.shutdown(std::net::Shutdown::Both)?;
-        res
+    fn do_proxy(mut reader: ReadStream, mut writer: WriteStream) -> io::Result<()> {
+        std::io::copy(&mut reader, &mut writer)?;
+        writer.flush()?;
+        writer.shutdown(std::net::Shutdown::Both)
    }

    let client_to_server_jh = thread::spawn(move || do_proxy(client_read, server_write));

-    do_proxy(server_read, client_write)?;
-    client_to_server_jh.join().unwrap()?;
+    let res1 = do_proxy(server_read, client_write);
+    let res2 = client_to_server_jh.join().unwrap();
+    res1?;
+    res2?;

    Ok(())
 }
+
+/// Proxy a client connection to a postgres database
+fn proxy_pass(pgb: PostgresBackend, db_info: DatabaseInfo) -> anyhow::Result<()> {
+    let runtime = tokio::runtime::Builder::new_current_thread()
+        .enable_all()
+        .build()?;
+    let db_stream = runtime.block_on(connect_to_db(db_info))?;
+    let db_stream = db_stream.into_std()?;
+    db_stream.set_nonblocking(false)?;
+
+    let db_stream = zenith_utils::sock_split::BidiStream::from_tcp(db_stream);
+    let (db_read, db_write) = db_stream.split();
+
+    let stream = match pgb.into_stream() {
+        Stream::Bidirectional(bidi_stream) => bidi_stream,
+        _ => bail!("invalid stream"),
+    };
+
+    let (client_read, client_write) = stream.split();
+    proxy(client_read, client_write, db_read, db_write)
+}
--- a/proxy/src/state.rs
+++ b/proxy/src/state.rs
@@ -1,62 +0,0 @@
-use crate::cplane_api::DatabaseInfo;
-use anyhow::{anyhow, ensure, Context};
-use rustls::{internal::pemfile, NoClientAuth, ProtocolVersion, ServerConfig};
-use std::net::SocketAddr;
-use std::sync::Arc;
-
-pub type SslConfig = Arc<ServerConfig>;
-
-pub struct ProxyConfig {
-    /// main entrypoint for users to connect to
-    pub proxy_address: SocketAddr,
-
-    /// http management endpoint. Upon user account creation control plane
-    /// will notify us here, so that we can 'unfreeze' user session.
-    pub mgmt_address: SocketAddr,
-
-    /// send unauthenticated users to this URI
-    pub redirect_uri: String,
-
-    /// control plane address where we would check auth.
-    pub auth_endpoint: String,
-
-    pub ssl_config: Option<SslConfig>,
-}
-
-pub type ProxyWaiters = crate::waiters::Waiters<Result<DatabaseInfo, String>>;
-
-pub struct ProxyState {
-    pub conf: ProxyConfig,
-    pub waiters: ProxyWaiters,
-}
-
-impl ProxyState {
-    pub fn new(conf: ProxyConfig) -> Self {
-        Self {
-            conf,
-            waiters: ProxyWaiters::default(),
-        }
-    }
-}
-
-pub fn configure_ssl(key_path: &str, cert_path: &str) -> anyhow::Result<SslConfig> {
-    let key = {
-        let key_bytes = std::fs::read(key_path).context("SSL key file")?;
-        let mut keys = pemfile::pkcs8_private_keys(&mut &key_bytes[..])
-            .map_err(|_| anyhow!("couldn't read TLS keys"))?;
-        ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len());
-        keys.pop().unwrap()
-    };
-
-    let cert_chain = {
-        let cert_chain_bytes = std::fs::read(cert_path).context("SSL cert file")?;
-        pemfile::certs(&mut &cert_chain_bytes[..])
-            .map_err(|_| anyhow!("couldn't read TLS certificates"))?
-    };
-
-    let mut config = ServerConfig::new(NoClientAuth::new());
-    config.set_single_cert(cert_chain, key)?;
-    config.versions = vec![ProtocolVersion::TLSv1_3];
-
-    Ok(config.into())
-}
--- a/proxy/src/waiters.rs
+++ b/proxy/src/waiters.rs
@@ -1,58 +0,0 @@
-use anyhow::{anyhow, Context};
-use std::collections::HashMap;
-use std::sync::{mpsc, Mutex};
-
-pub struct Waiters<T>(pub(self) Mutex<HashMap<String, mpsc::Sender<T>>>);
-
-impl<T> Default for Waiters<T> {
-    fn default() -> Self {
-        Waiters(Default::default())
-    }
-}
-
-impl<T> Waiters<T> {
-    pub fn register(&self, key: String) -> Waiter<T> {
-        let (tx, rx) = mpsc::channel();
-
-        // TODO: use `try_insert` (unstable)
-        let prev = self.0.lock().unwrap().insert(key.clone(), tx);
-        assert!(matches!(prev, None)); // assert_matches! is nightly-only
-
-        Waiter {
-            receiver: rx,
-            registry: self,
-            key,
-        }
-    }
-
-    pub fn notify(&self, key: &str, value: T) -> anyhow::Result<()>
-    where
-        T: Send + Sync + 'static,
-    {
-        let tx = self
-            .0
-            .lock()
-            .unwrap()
-            .remove(key)
-            .ok_or_else(|| anyhow!("key {} not found", key))?;
-        tx.send(value).context("channel hangup")
-    }
-}
-
-pub struct Waiter<'a, T> {
-    receiver: mpsc::Receiver<T>,
-    registry: &'a Waiters<T>,
-    key: String,
-}
-
-impl<T> Waiter<'_, T> {
-    pub fn wait(self) -> anyhow::Result<T> {
-        self.receiver.recv().context("channel hangup")
-    }
-}
-
-impl<T> Drop for Waiter<'_, T> {
-    fn drop(&mut self) {
-        self.registry.0.lock().unwrap().remove(&self.key);
-    }
-}
--- a/pytest.ini
+++ b/pytest.ini
@@ -1,9 +0,0 @@
-[pytest]
-addopts =
-    -m 'not remote_cluster'
-markers =
-    remote_cluster
-minversion = 6.0
-log_format = %(asctime)s.%(msecs)-3d %(levelname)s [%(filename)s:%(lineno)d] %(message)s
-log_date_format = %Y-%m-%d %H:%M:%S
-log_cli = true
--- a/scripts/coverage
+++ b/scripts/coverage
@@ -1,510 +0,0 @@
-#!/usr/bin/env python3
-
-# Here'a good link in case you're interested in learning more
-# about current deficiencies of rust code coverage story:
-# https://github.com/rust-lang/rust/issues?q=is%3Aissue+is%3Aopen+instrument-coverage+label%3AA-code-coverage
-#
-# Also a couple of inspirational tools which I deliberately ended up not using:
-#  * https://github.com/mozilla/grcov
-#  * https://github.com/taiki-e/cargo-llvm-cov
-#  * https://github.com/llvm/llvm-project/tree/main/llvm/test/tools/llvm-cov
-
-from abc import ABC, abstractmethod
-from dataclasses import dataclass
-from pathlib import Path
-from tempfile import TemporaryDirectory
-from textwrap import dedent
-from typing import Any, Iterable, List, Optional
-
-import argparse
-import json
-import os
-import shutil
-import subprocess
-import sys
-
-
-def intersperse(sep: Any, iterable: Iterable[Any]):
-    fst = True
-    for item in iterable:
-        if not fst:
-            yield sep
-        fst = False
-        yield item
-
-
-def find_demangler(demangler=None):
-    known_tools = ['c++filt', 'rustfilt', 'llvm-cxxfilt']
-
-    if demangler:
-        # Explicit argument has precedence over `known_tools`
-        demanglers = [demangler]
-    else:
-        demanglers = known_tools
-
-    for demangler in demanglers:
-        if shutil.which(demangler):
-            return demangler
-
-    raise Exception(' '.join([
-        'Failed to find symbol demangler.',
-        'Please install it or provide another tool',
-        f"(e.g. {', '.join(known_tools)})",
-    ]))
-
-
-class Cargo:
-    def __init__(self, cwd: Path):
-        self.cwd = cwd
-        self.target_dir = Path(os.environ.get('CARGO_TARGET_DIR', cwd / 'target')).resolve()
-        self._rustlib_dir = None
-
-    @property
-    def rustlib_dir(self):
-        if not self._rustlib_dir:
-            cmd = [
-                'cargo',
-                '-Zunstable-options',
-                'rustc',
-                '--print=target-libdir',
-            ]
-            self._rustlib_dir = Path(subprocess.check_output(cmd, cwd=self.cwd, text=True)).parent
-
-        return self._rustlib_dir
-
-    def binaries(self, profile: str) -> List[str]:
-        executables = []
-
-        # This will emit json messages containing test binaries names
-        cmd = [
-            'cargo',
-            'test',
-            '--no-run',
-            '--message-format=json',
-        ]
-        env = dict(os.environ, PROFILE=profile)
-        output = subprocess.check_output(cmd, cwd=self.cwd, env=env, text=True)
-
-        for line in output.splitlines(keepends=False):
-            meta = json.loads(line)
-            exe = meta.get('executable')
-            if exe:
-                executables.append(exe)
-
-        # Metadata contains crate names, which can be used
-        # to recover names of executables, e.g. `pageserver`
-        cmd = [
-            'cargo',
-            'metadata',
-            '--format-version=1',
-            '--no-deps',
-        ]
-        meta = json.loads(subprocess.check_output(cmd, cwd=self.cwd))
-
-        for pkg in meta.get('packages', []):
-            for target in pkg.get('targets', []):
-                if 'bin' in target['kind']:
-                    exe = self.target_dir / profile / target['name']
-                    if exe.exists():
-                        executables.append(str(exe))
-
-        return executables
-
-
-@dataclass
-class LLVM:
-    cargo: Cargo
-
-    def resolve_tool(self, name: str) -> str:
-        exe = self.cargo.rustlib_dir / 'bin' / name
-        if exe.exists():
-            return str(exe)
-
-        if not shutil.which(name):
-            # Show a user-friendly warning
-            raise Exception(' '.join([
-                f"It appears that you don't have `{name}` installed.",
-                "Please execute `rustup component add llvm-tools-preview`,",
-                "or install it via your package manager of choice.",
-                "LLVM tools should be the same version as LLVM in `rustc --version --verbose`.",
-            ]))
-
-        return name
-
-    def profdata(self, input_dir: Path, output_profdata: Path):
-        profraws = [f for f in input_dir.iterdir() if f.suffix == '.profraw']
-        if not profraws:
-            raise Exception(f'No profraw files found at {input_dir}')
-
-        with open(input_dir / 'profraw.list', 'w') as input_files:
-            profraw_mtime = 0
-            for profraw in profraws:
-                profraw_mtime = max(profraw_mtime, profraw.stat().st_mtime_ns)
-                print(profraw, file=input_files)
-            input_files.flush()
-
-            try:
-                profdata_mtime = output_profdata.stat().st_mtime_ns
-            except FileNotFoundError:
-                profdata_mtime = 0
-
-            # An obvious make-ish optimization
-            if profraw_mtime >= profdata_mtime:
-                subprocess.check_call([
-                    self.resolve_tool('llvm-profdata'),
-                    'merge',
-                    '-sparse',
-                    f'-input-files={input_files.name}',
-                    f'-output={output_profdata}',
-                ])
-
-    def _cov(self,
-             *extras,
-             subcommand: str,
-             profdata: Path,
-             objects: List[str],
-             sources: List[str],
-             demangler: Optional[str] = None) -> None:
-
-        cwd = self.cargo.cwd
-        objects = list(intersperse('-object', objects))
-        extras = list(extras)
-
-        # For some reason `rustc` produces relative paths to src files,
-        # so we force it to cut the $PWD prefix.
-        # see: https://github.com/rust-lang/rust/issues/34701#issuecomment-739809584
-        if sources:
-            extras.append(f'-path-equivalence=.,{cwd.resolve()}')
-
-        if demangler:
-            extras.append(f'-Xdemangler={demangler}')
-
-        cmd = [
-            self.resolve_tool('llvm-cov'),
-            subcommand,  # '-dump-collected-paths',  # classified debug flag
-            '-instr-profile',
-            str(profdata),
-            *extras,
-            *objects,
-            *sources,
-        ]
-        subprocess.check_call(cmd, cwd=cwd)
-
-    def cov_report(self, **kwargs) -> None:
-        self._cov(subcommand='report', **kwargs)
-
-    def cov_export(self, *, kind: str, **kwargs) -> None:
-        extras = [f'-format={kind}']
-        self._cov(subcommand='export', *extras, **kwargs)
-
-    def cov_show(self, *, kind: str, output_dir: Optional[Path] = None, **kwargs) -> None:
-        extras = [f'-format={kind}']
-        if output_dir:
-            extras.append(f'-output-dir={output_dir}')
-
-        self._cov(subcommand='show', *extras, **kwargs)
-
-
-@dataclass
-class Report(ABC):
-    """ Common properties of a coverage report """
-
-    llvm: LLVM
-    demangler: str
-    profdata: Path
-    objects: List[str]
-    sources: List[str]
-
-    def _common_kwargs(self):
-        return dict(profdata=self.profdata,
-                    objects=self.objects,
-                    sources=self.sources,
-                    demangler=self.demangler)
-
-    @abstractmethod
-    def generate(self):
-        pass
-
-    def open(self):
-        # Do nothing by default
-        pass
-
-
-class SummaryReport(Report):
-    def generate(self):
-        self.llvm.cov_report(**self._common_kwargs())
-
-
-class TextReport(Report):
-    def generate(self):
-        self.llvm.cov_show(kind='text', **self._common_kwargs())
-
-
-class LcovReport(Report):
-    def generate(self):
-        self.llvm.cov_export(kind='lcov', **self._common_kwargs())
-
-
-@dataclass
-class HtmlReport(Report):
-    output_dir: Path
-
-    def generate(self):
-        self.llvm.cov_show(kind='html', output_dir=self.output_dir, **self._common_kwargs())
-        print(f'HTML report is located at `{self.output_dir}`')
-
-    def open(self):
-        tool = dict(linux='xdg-open', darwin='open').get(sys.platform)
-        if not tool:
-            raise Exception(f'Unknown platform {sys.platform}')
-
-        subprocess.check_call([tool, self.output_dir / 'index.html'],
-                              stdout=subprocess.DEVNULL,
-                              stderr=subprocess.DEVNULL)
-
-
-@dataclass
-class GithubPagesReport(HtmlReport):
-    output_dir: Path
-    commit_url: str
-
-    def generate(self):
-        def index_path(path):
-            return path / 'index.html'
-
-        common = self._common_kwargs()
-        # Provide default sources if there's none
-        common.setdefault('sources', ['.'])
-
-        self.llvm.cov_show(kind='html', output_dir=self.output_dir, **common)
-        shutil.copy(index_path(self.output_dir), self.output_dir / 'local.html')
-
-        with TemporaryDirectory() as tmp:
-            output_dir = Path(tmp)
-            args = dict(common, sources=[])
-            self.llvm.cov_show(kind='html', output_dir=output_dir, **args)
-            shutil.copy(index_path(output_dir), self.output_dir / 'all.html')
-
-        with open(index_path(self.output_dir), 'w') as index:
-            commit_sha = self.commit_url.rsplit('/', maxsplit=1)[-1][:10]
-
-            html = f"""
-                <!DOCTYPE html>
-                <html>
-                    <head>
-                        <title>Coverage ({commit_sha})</title>
-                    </head>
-                    <body>
-                        <h1>
-                            Coverage report for commit
-                                <a href="{self.commit_url}">
-                                    {commit_sha}
-                                </a>
-                        </h1>
-
-                        <p>
-                            <a href="./local.html">
-                                <b>Show only local sources</b>
-                            </a>
-                        </p>
-
-                        <p>
-                            <a href="./all.html">
-                                Show all sources (including dependencies)
-                            </a>
-                        </p>
-                    </body>
-                </html>
-            """
-            index.write(dedent(html))
-
-        print(f'HTML report is located at `{self.output_dir}`')
-
-
-class State:
-    def __init__(self, cwd: Path, top_dir: Optional[Path], profraw_prefix: Optional[str]):
-        # Use hostname by default
-        profraw_prefix = profraw_prefix or '%h'
-
-        self.cwd = cwd
-        self.cargo = Cargo(self.cwd)
-        self.llvm = LLVM(self.cargo)
-
-        self.top_dir = top_dir or self.cargo.target_dir / 'coverage'
-        self.report_dir = self.top_dir / 'report'
-
-        # Directory for raw coverage data emitted by executables
-        self.profraw_dir = self.top_dir / 'profraw'
-        self.profraw_dir.mkdir(parents=True, exist_ok=True)
-
-        # Aggregated coverage data
-        self.profdata_file = self.top_dir / 'coverage.profdata'
-
-        # Dump all coverage data files into a dedicated directory.
-        # Each filename is parameterized by PID & executable's signature.
-        os.environ['LLVM_PROFILE_FILE'] = str(self.profraw_dir /
-                                              f'cov-{profraw_prefix}-%p-%m.profraw')
-
-        os.environ['RUSTFLAGS'] = ' '.join([
-            os.environ.get('RUSTFLAGS', ''),
-            # Enable LLVM's source-based coverage
-            # see: https://clang.llvm.org/docs/SourceBasedCodeCoverage.html
-            # see: https://blog.rust-lang.org/inside-rust/2020/11/12/source-based-code-coverage.html
-            '-Zinstrument-coverage',
-            # Link every bit of code to prevent "holes" in coverage report
-            # see: https://doc.rust-lang.org/rustc/codegen-options/index.html#link-dead-code
-            '-Clink-dead-code',
-            # Some of the paths that `rustc` embeds into binaries are absolute, others are relative.
-            # The point is, we can't have both, because depending on `-path-equivalence`, `llvm-cov`
-            # either will cripple absolute paths or won't be able to show relative paths at all.
-            # There's no way to turn relative paths into absolute, so we strip $PWD prefix.
-            # Only source files of deps (e.g. `$HOME/.cargo`) will keep their absolute paths,
-            # but we won't include them in report by default (but see `--all`).
-            f'--remap-path-prefix {self.cwd}=',
-        ])
-
-        # XXX: God, have mercy on our souls...
-        # see: https://github.com/rust-lang/rust/pull/90132
-        os.environ['RUSTC_BOOTSTRAP'] = '1'
-
-    def do_run(self, args):
-        subprocess.check_call([*args.command, *args.args])
-
-    def do_report(self, args):
-        if args.all and args.sources:
-            raise Exception('--all should not be used with sources')
-
-        # see man for `llvm-cov show [sources]`
-        if args.all:
-            sources = []
-        elif not args.sources:
-            sources = ['.']
-        else:
-            sources = args.sources
-
-        print('* Merging profraw files')
-        self.llvm.profdata(self.profraw_dir, self.profdata_file)
-
-        objects = []
-        if args.input_objects:
-            print('* Collecting object files using --input-objects')
-            with open(args.input_objects) as f:
-                objects.extend(f.read().splitlines(keepends=False))
-        if args.cargo_objects == 'true' or (args.cargo_objects == 'auto'
-                                            and not args.input_objects):
-            print('* Collecting object files using cargo')
-            objects.extend(self.cargo.binaries(args.profile))
-
-        params = dict(llvm=self.llvm,
-                      demangler=find_demangler(args.demangler),
-                      profdata=self.profdata_file,
-                      objects=objects,
-                      sources=sources)
-
-        formats = {
-            'html':
-            lambda: HtmlReport(**params, output_dir=self.report_dir),
-            'text':
-            lambda: TextReport(**params),
-            'lcov':
-            lambda: LcovReport(**params),
-            'summary':
-            lambda: SummaryReport(**params),
-            'github':
-            lambda: GithubPagesReport(
-                **params, output_dir=self.report_dir, commit_url=args.commit_url),
-        }
-
-        report = formats.get(args.format)()
-        if not report:
-            raise Exception('Format `{args.format}` is not supported')
-
-        print(f'* Rendering coverage report ({args.format})')
-        report.generate()
-
-        if args.open:
-            print('* Opening the report')
-            report.open()
-
-    def do_clean(self, args):
-        # Wipe everything if no filters have been provided
-        if not (args.report or args.prof):
-            shutil.rmtree(self.top_dir, ignore_errors=True)
-        else:
-            if args.report:
-                shutil.rmtree(self.report_dir, ignore_errors=True)
-            if args.prof:
-                self.profdata_file.unlink(missing_ok=True)
-
-
-def main():
-    app = sys.argv[0]
-    example = f"""
-prerequisites:
-    # alternatively, install a system package for `llvm-tools`
-    rustup component add llvm-tools-preview
-
-self-contained example:
-    {app} run make
-    {app} run pipenv run pytest test_runner
-    {app} run cargo test
-    {app} report --open
-    """
-
-    parser = argparse.ArgumentParser(description='Coverage report builder',
-                                     formatter_class=argparse.RawDescriptionHelpFormatter,
-                                     epilog=example)
-    parser.add_argument('--dir', type=Path, help='output directory')
-    parser.add_argument('--profraw-prefix', metavar='STRING', type=str)
-
-    commands = parser.add_subparsers(title='commands', dest='subparser_name')
-
-    p_run = commands.add_parser('run', help='run a command with magic env')
-    p_run.add_argument('command', nargs=1)
-    p_run.add_argument('args', nargs=argparse.REMAINDER)
-
-    p_report = commands.add_parser('report', help='generate a coverage report')
-    p_report.add_argument('--profile',
-                          default='debug',
-                          choices=('debug', 'release'),
-                          help='cargo build profile')
-    p_report.add_argument('--format',
-                          default='html',
-                          choices=('html', 'text', 'summary', 'lcov', 'github'),
-                          help='report format')
-    p_report.add_argument('--input-objects',
-                          metavar='FILE',
-                          type=Path,
-                          help='file containing list of binaries')
-    p_report.add_argument('--cargo-objects',
-                          default='auto',
-                          choices=('auto', 'true', 'false'),
-                          help='use cargo for auto discovery of binaries')
-    p_report.add_argument('--commit-url', type=str, help='required for --format=github')
-    p_report.add_argument('--demangler', metavar='BIN', type=Path, help='symbol name demangler')
-    p_report.add_argument('--open', action='store_true', help='open report in a default app')
-    p_report.add_argument('--all', action='store_true', help='show everything, e.g. deps')
-    p_report.add_argument('sources', nargs='*', type=Path, help='source file or directory')
-
-    p_clean = commands.add_parser('clean', help='wipe coverage artifacts')
-    p_clean.add_argument('--report', action='store_true', help='pick generated report')
-    p_clean.add_argument('--prof', action='store_true', help='pick *.profdata & *.profraw')
-
-    args = parser.parse_args()
-    state = State(cwd=Path.cwd(), top_dir=args.dir, profraw_prefix=args.profraw_prefix)
-
-    commands = {
-        'run': state.do_run,
-        'report': state.do_report,
-        'clean': state.do_clean,
-    }
-
-    action = commands.get(args.subparser_name)
-    if action:
-        action(args)
-    else:
-        parser.print_help()
-
-
-if __name__ == '__main__':
-    main()
--- a/scripts/generate_and_push_perf_report.sh
+++ b/scripts/generate_and_push_perf_report.sh
@@ -1,27 +0,0 @@
-#!/bin/bash
-
-# this is a shortcut script to avoid duplication in CI
-
-set -eux -o pipefail
-
-SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
-
-git clone https://$VIP_VAP_ACCESS_TOKEN@github.com/zenithdb/zenith-perf-data.git
-cd zenith-perf-data
-mkdir -p reports/
-mkdir -p data/$REPORT_TO
-
-cp $REPORT_FROM/* data/$REPORT_TO
-
-echo "Generating report"
-pipenv run python $SCRIPT_DIR/generate_perf_report_page.py --input-dir data/$REPORT_TO --out reports/$REPORT_TO.html 
-echo "Uploading perf result"
-git add data reports
-git \
-    -c "user.name=vipvap" \
-    -c "user.email=vipvap@zenith.tech" \
-    commit \
-    --author="vipvap <vipvap@zenith.tech>" \
-    -m "add performance test result for $GITHUB_SHA zenith revision"
-
-git push https://$VIP_VAP_ACCESS_TOKEN@github.com/zenithdb/zenith-perf-data.git master
--- a/scripts/generate_perf_report_page.py
+++ b/scripts/generate_perf_report_page.py
@@ -1,207 +0,0 @@
-#!/usr/bin/env python3
-import argparse
-from dataclasses import dataclass
-from pathlib import Path
-import json
-from typing import Any, Dict, List, Optional, Tuple, cast
-from jinja2 import Template
-
-# skip 'input' columns. They are included in the header and just blow the table
-EXCLUDE_COLUMNS = frozenset({
-    'scale',
-    'duration',
-    'number_of_clients',
-    'number_of_threads',
-    'init_start_timestamp',
-    'init_end_timestamp',
-    'run_start_timestamp',
-    'run_end_timestamp',
-})
-
-KEY_EXCLUDE_FIELDS = frozenset({
-    'init_start_timestamp',
-    'init_end_timestamp',
-    'run_start_timestamp',
-    'run_end_timestamp',
-})
-NEGATIVE_COLOR = 'negative'
-POSITIVE_COLOR = 'positive'
-
-
-@dataclass
-class SuitRun:
-    revision: str
-    values: Dict[str, Any]
-
-
-@dataclass
-class SuitRuns:
-    platform: str
-    suit: str
-    common_columns: List[Tuple[str, str]]
-    value_columns: List[str]
-    runs: List[SuitRun]
-
-
-@dataclass
-class RowValue:
-    value: str
-    color: str
-    ratio: str
-
-
-def get_columns(values: List[Dict[Any, Any]]) -> Tuple[List[Tuple[str, str]], List[str]]:
-    value_columns = []
-    common_columns = []
-    for item in values:
-        if item['name'] in KEY_EXCLUDE_FIELDS:
-            continue
-        if item['report'] != 'test_param':
-            value_columns.append(cast(str, item['name']))
-        else:
-            common_columns.append((cast(str, item['name']), cast(str, item['value'])))
-    value_columns.sort()
-    common_columns.sort(key=lambda x: x[0])  # sort by name
-    return common_columns, value_columns
-
-
-def format_ratio(ratio: float, report: str) -> Tuple[str, str]:
-    color = ''
-    sign = '+' if ratio > 0 else ''
-    if abs(ratio) < 0.05:
-        return f'&nbsp({sign}{ratio:.2f})', color
-
-    if report not in {'test_param', 'higher_is_better', 'lower_is_better'}:
-        raise ValueError(f'Unknown report type: {report}')
-
-    if report == 'test_param':
-        return f'{ratio:.2f}', color
-
-    if ratio > 0:
-        if report == 'higher_is_better':
-            color = POSITIVE_COLOR
-        elif report == 'lower_is_better':
-            color = NEGATIVE_COLOR
-    elif ratio < 0:
-        if report == 'higher_is_better':
-            color = NEGATIVE_COLOR
-        elif report == 'lower_is_better':
-            color = POSITIVE_COLOR
-
-    return f'&nbsp({sign}{ratio:.2f})', color
-
-
-def extract_value(name: str, suit_run: SuitRun) -> Optional[Dict[str, Any]]:
-    for item in suit_run.values['data']:
-        if item['name'] == name:
-            return cast(Dict[str, Any], item)
-    return None
-
-
-def get_row_values(columns: List[str], run_result: SuitRun,
-                   prev_result: Optional[SuitRun]) -> List[RowValue]:
-    row_values = []
-    for column in columns:
-        current_value = extract_value(column, run_result)
-        if current_value is None:
-            # should never happen
-            raise ValueError(f'{column} not found in {run_result.values}')
-
-        value = current_value["value"]
-        if isinstance(value, float):
-            value = f'{value:.2f}'
-
-        if prev_result is None:
-            row_values.append(RowValue(value, '', ''))
-            continue
-
-        prev_value = extract_value(column, prev_result)
-        if prev_value is None:
-            # this might happen when new metric is added and there is no value for it in previous run
-            # let this be here, TODO add proper handling when this actually happens
-            raise ValueError(f'{column} not found in previous result')
-        ratio = float(value) / float(prev_value['value']) - 1
-        ratio_display, color = format_ratio(ratio, current_value['report'])
-        row_values.append(RowValue(value, color, ratio_display))
-    return row_values
-
-
-@dataclass
-class SuiteRunTableRow:
-    revision: str
-    values: List[RowValue]
-
-
-def prepare_rows_from_runs(value_columns: List[str], runs: List[SuitRun]) -> List[SuiteRunTableRow]:
-    rows = []
-    prev_run = None
-    for run in runs:
-        rows.append(
-            SuiteRunTableRow(revision=run.revision,
-                             values=get_row_values(value_columns, run, prev_run)))
-        prev_run = run
-
-    return rows
-
-
-def main(args: argparse.Namespace) -> None:
-    input_dir = Path(args.input_dir)
-    grouped_runs: Dict[str, SuitRuns] = {}
-    # we have files in form: <ctr>_<rev>.json
-    # fill them in the hashmap so we have grouped items for the
-    # same run configuration (scale, duration etc.) ordered by counter.
-    for item in sorted(input_dir.iterdir(), key=lambda x: int(x.name.split('_')[0])):
-        run_data = json.loads(item.read_text())
-        revision = run_data['revision']
-
-        for suit_result in run_data['result']:
-            key = "{}{}".format(run_data['platform'], suit_result['suit'])
-            # pack total duration as a synthetic value
-            total_duration = suit_result['total_duration']
-            suit_result['data'].append({
-                'name': 'total_duration',
-                'value': total_duration,
-                'unit': 's',
-                'report': 'lower_is_better',
-            })
-            common_columns, value_columns = get_columns(suit_result['data'])
-
-            grouped_runs.setdefault(
-                key,
-                SuitRuns(
-                    platform=run_data['platform'],
-                    suit=suit_result['suit'],
-                    common_columns=common_columns,
-                    value_columns=value_columns,
-                    runs=[],
-                ),
-            )
-
-            grouped_runs[key].runs.append(SuitRun(revision=revision, values=suit_result))
-    context = {}
-    for result in grouped_runs.values():
-        suit = result.suit
-        context[suit] = {
-            'common_columns': result.common_columns,
-            'value_columns': result.value_columns,
-            'platform': result.platform,
-            # reverse the order so newest results are on top of the table
-            'rows': reversed(prepare_rows_from_runs(result.value_columns, result.runs)),
-        }
-
-    template = Template((Path(__file__).parent / 'perf_report_template.html').read_text())
-
-    Path(args.out).write_text(template.render(context=context))
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        '--input-dir',
-        dest='input_dir',
-        required=True,
-        help='Directory with jsons generated by the test suite',
-    )
-    parser.add_argument('--out', required=True, help='Output html file path')
-    args = parser.parse_args()
-    main(args)
--- a/scripts/git-upload
+++ b/scripts/git-upload
@@ -1,136 +0,0 @@
-#!/usr/bin/env python3
-
-from contextlib import contextmanager
-from tempfile import TemporaryDirectory
-from pathlib import Path
-
-import argparse
-import os
-import shutil
-import subprocess
-import sys
-
-
-def absolute_path(path):
-    return Path(path).resolve()
-
-
-def relative_path(path):
-    path = Path(path)
-    if path.is_absolute():
-        raise Exception(f'path `{path}` must be relative!')
-    return path
-
-
-@contextmanager
-def chdir(cwd: Path):
-    old = os.getcwd()
-    os.chdir(cwd)
-    try:
-        yield cwd
-    finally:
-        os.chdir(old)
-
-
-def run(cmd, *args, **kwargs):
-    print('$', ' '.join(cmd))
-    subprocess.check_call(cmd, *args, **kwargs)
-
-
-class GitRepo:
-    def __init__(self, url):
-        self.url = url
-        self.cwd = TemporaryDirectory()
-
-        subprocess.check_call([
-            'git',
-            'clone',
-            str(url),
-            self.cwd.name,
-        ])
-
-    def is_dirty(self):
-        res = subprocess.check_output(['git', 'status', '--porcelain'], text=True).strip()
-        return bool(res)
-
-    def update(self, message, action, branch=None):
-        with chdir(self.cwd.name):
-            if not branch:
-                cmd = ['git', 'branch', '--show-current']
-                branch = subprocess.check_output(cmd, text=True).strip()
-
-            # Run action in repo's directory
-            action()
-
-            run(['git', 'add', '.'])
-
-            if not self.is_dirty():
-                print('No changes detected, quitting')
-                return
-
-            run([
-                'git',
-                '-c',
-                'user.name=vipvap',
-                '-c',
-                'user.email=vipvap@zenith.tech',
-                'commit',
-                '--author="vipvap <vipvap@zenith.tech>"',
-                f'--message={message}',
-            ])
-
-            for _ in range(5):
-                try:
-                    run(['git', 'fetch', 'origin', branch])
-                    run(['git', 'rebase', f'origin/{branch}'])
-                    run(['git', 'push', 'origin', branch])
-                    return
-
-                except subprocess.CalledProcessError as e:
-                    print(f'failed to update branch `{branch}`: {e}', file=sys.stderr)
-
-            raise Exception(f'failed to update branch `{branch}`')
-
-
-def do_copy(args):
-    src = args.src
-    dst = args.dst
-
-    try:
-        if src.is_dir():
-            shutil.copytree(src, dst)
-        else:
-            shutil.copy(src, dst)
-    except FileExistsError:
-        if args.forbid_overwrite:
-            raise
-
-
-def main():
-    parser = argparse.ArgumentParser(description='Git upload tool')
-    parser.add_argument('--repo', type=str, metavar='URL', required=True, help='git repo url')
-    parser.add_argument('--message', type=str, metavar='TEXT', help='commit message')
-
-    commands = parser.add_subparsers(title='commands', dest='subparser_name')
-
-    p_copy = commands.add_parser('copy', help='copy file into the repo')
-    p_copy.add_argument('src', type=absolute_path, help='source path')
-    p_copy.add_argument('dst', type=relative_path, help='relative dest path')
-    p_copy.add_argument('--forbid-overwrite', action='store_true', help='do not allow overwrites')
-
-    args = parser.parse_args()
-
-    commands = {
-        'copy': do_copy,
-    }
-
-    action = commands.get(args.subparser_name)
-    if action:
-        message = args.message or 'update'
-        GitRepo(args.repo).update(message, lambda: action(args))
-    else:
-        parser.print_usage()
-
-
-if __name__ == '__main__':
-    main()
--- a/scripts/perf_report_template.html
+++ b/scripts/perf_report_template.html
@@ -1,52 +0,0 @@
-<!DOCTYPE html>
-<html>
-
-<body>
-    <style>
-        table,
-        th,
-        td {
-            border: 1px solid black;
-            border-collapse: collapse;
-        }
-
-        .positive {
-            background-color: rgba(0, 255, 0, 0.8)
-        }
-
-        .negative {
-            background-color: rgba(255, 0, 0, 0.65)
-        }
-    </style>
-
-    <h2>Zenith Performance Tests</h2>
-
-    {% for suit_name, suit_data in context.items() %}
-    <h3>Runs for {{ suit_name }} </h3>
-    <b>platform:</b> {{ suit_data.platform }}<br>
-    {% for common_column_name, common_column_value in suit_data.common_columns %}
-    <b>{{ common_column_name }}</b>: {{ common_column_value }}<br>
-    {% endfor %}
-    <br>
-
-    <table>
-        <tr>
-            <th>revision</th>
-            {% for column_name in suit_data.value_columns %}
-            <th>{{ column_name }}</th>
-            {% endfor %}
-        </tr>
-        {% for row in suit_data.rows %}
-        <tr>
-            <td><a href=https://github.com/zenithdb/zenith/commit/{{ row.revision }}>{{ row.revision[:6] }}</a></td>
-            {% for column_value in row.values %}
-            <td class="{{ column_value.color }}">{{ column_value.value }}{{column_value.ratio}}</td>
-            {% endfor %}
-        </tr>
-        {% endfor %}
-    </table>
-    {% endfor %}
-
-</body>
-
-</html>
--- a/test_runner/Pipfile
+++ b/test_runner/Pipfile
@@ -0,0 +1,22 @@
+[[source]]
+url = "https://pypi.python.org/simple"
+verify_ssl = true
+name = "pypi"
+
+[packages]
+pytest = ">=6.0.0"
+psycopg2 = "*"
+typing-extensions = "*"
+pyjwt = {extras = ["crypto"], version = "*"}
+requests = "*"
+pytest-xdist = "*"
+asyncpg = "*"
+
+[dev-packages]
+yapf = "*"
+flake8 = "*"
+mypy = "*"
+
+[requires]
+# we need at least Python 3.6, but pipenv has no way to express a minimum version directly
+python_version = "3"
--- a/test_runner/Pipfile.lock
+++ b/test_runner/Pipfile.lock
@@ -0,0 +1,376 @@
+{
+    "_meta": {
+        "hash": {
+            "sha256": "3cdc048691824d0b93912b6b78a0aa01dc98f278212c1badb0cc2edbd2103c3a"
+        },
+        "pipfile-spec": 6,
+        "requires": {
+            "python_version": "3"
+        },
+        "sources": [
+            {
+                "name": "pypi",
+                "url": "https://pypi.python.org/simple",
+                "verify_ssl": true
+            }
+        ]
+    },
+    "default": {
+        "asyncpg": {
+            "hashes": [
+                "sha256:129d501f3d30616afd51eb8d3142ef51ba05374256bd5834cec3ef4956a9b317",
+                "sha256:29ef6ae0a617fc13cc2ac5dc8e9b367bb83cba220614b437af9b67766f4b6b20",
+                "sha256:41704c561d354bef01353835a7846e5606faabbeb846214dfcf666cf53319f18",
+                "sha256:556b0e92e2b75dc028b3c4bc9bd5162ddf0053b856437cf1f04c97f9c6837d03",
+                "sha256:8ff5073d4b654e34bd5eaadc01dc4d68b8a9609084d835acd364cd934190a08d",
+                "sha256:a458fc69051fbb67d995fdda46d75a012b5d6200f91e17d23d4751482640ed4c",
+                "sha256:a7095890c96ba36f9f668eb552bb020dddb44f8e73e932f8573efc613ee83843",
+                "sha256:a738f4807c853623d3f93f0fea11f61be6b0e5ca16ea8aeb42c2c7ee742aa853",
+                "sha256:c4fc0205fe4ddd5aeb3dfdc0f7bafd43411181e1f5650189608e5971cceacff1",
+                "sha256:dd2fa063c3344823487d9ddccb40802f02622ddf8bf8a6cc53885ee7a2c1c0c6",
+                "sha256:ddffcb85227bf39cd1bedd4603e0082b243cf3b14ced64dce506a15b05232b83",
+                "sha256:e36c6806883786b19551bb70a4882561f31135dc8105a59662e0376cf5b2cbc5",
+                "sha256:eed43abc6ccf1dc02e0d0efc06ce46a411362f3358847c6b0ec9a43426f91ece"
+            ],
+            "index": "pypi",
+            "version": "==0.24.0"
+        },
+        "attrs": {
+            "hashes": [
+                "sha256:149e90d6d8ac20db7a955ad60cf0e6881a3f20d37096140088356da6c716b0b1",
+                "sha256:ef6aaac3ca6cd92904cdd0d83f629a15f18053ec84e6432106f7a4d04ae4f5fb"
+            ],
+            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
+            "version": "==21.2.0"
+        },
+        "certifi": {
+            "hashes": [
+                "sha256:2bbf76fd432960138b3ef6dda3dde0544f27cbf8546c458e60baf371917ba9ee",
+                "sha256:50b1e4f8446b06f41be7dd6338db18e0990601dce795c2b1686458aa7e8fa7d8"
+            ],
+            "version": "==2021.5.30"
+        },
+        "cffi": {
+            "hashes": [
+                "sha256:06c54a68935738d206570b20da5ef2b6b6d92b38ef3ec45c5422c0ebaf338d4d",
+                "sha256:0c0591bee64e438883b0c92a7bed78f6290d40bf02e54c5bf0978eaf36061771",
+                "sha256:19ca0dbdeda3b2615421d54bef8985f72af6e0c47082a8d26122adac81a95872",
+                "sha256:22b9c3c320171c108e903d61a3723b51e37aaa8c81255b5e7ce102775bd01e2c",
+                "sha256:26bb2549b72708c833f5abe62b756176022a7b9a7f689b571e74c8478ead51dc",
+                "sha256:33791e8a2dc2953f28b8d8d300dde42dd929ac28f974c4b4c6272cb2955cb762",
+                "sha256:3c8d896becff2fa653dc4438b54a5a25a971d1f4110b32bd3068db3722c80202",
+                "sha256:4373612d59c404baeb7cbd788a18b2b2a8331abcc84c3ba40051fcd18b17a4d5",
+                "sha256:487d63e1454627c8e47dd230025780e91869cfba4c753a74fda196a1f6ad6548",
+                "sha256:48916e459c54c4a70e52745639f1db524542140433599e13911b2f329834276a",
+                "sha256:4922cd707b25e623b902c86188aca466d3620892db76c0bdd7b99a3d5e61d35f",
+                "sha256:55af55e32ae468e9946f741a5d51f9896da6b9bf0bbdd326843fec05c730eb20",
+                "sha256:57e555a9feb4a8460415f1aac331a2dc833b1115284f7ded7278b54afc5bd218",
+                "sha256:5d4b68e216fc65e9fe4f524c177b54964af043dde734807586cf5435af84045c",
+                "sha256:64fda793737bc4037521d4899be780534b9aea552eb673b9833b01f945904c2e",
+                "sha256:6d6169cb3c6c2ad50db5b868db6491a790300ade1ed5d1da29289d73bbe40b56",
+                "sha256:7bcac9a2b4fdbed2c16fa5681356d7121ecabf041f18d97ed5b8e0dd38a80224",
+                "sha256:80b06212075346b5546b0417b9f2bf467fea3bfe7352f781ffc05a8ab24ba14a",
+                "sha256:818014c754cd3dba7229c0f5884396264d51ffb87ec86e927ef0be140bfdb0d2",
+                "sha256:8eb687582ed7cd8c4bdbff3df6c0da443eb89c3c72e6e5dcdd9c81729712791a",
+                "sha256:99f27fefe34c37ba9875f224a8f36e31d744d8083e00f520f133cab79ad5e819",
+                "sha256:9f3e33c28cd39d1b655ed1ba7247133b6f7fc16fa16887b120c0c670e35ce346",
+                "sha256:a8661b2ce9694ca01c529bfa204dbb144b275a31685a075ce123f12331be790b",
+                "sha256:a9da7010cec5a12193d1af9872a00888f396aba3dc79186604a09ea3ee7c029e",
+                "sha256:aedb15f0a5a5949ecb129a82b72b19df97bbbca024081ed2ef88bd5c0a610534",
+                "sha256:b315d709717a99f4b27b59b021e6207c64620790ca3e0bde636a6c7f14618abb",
+                "sha256:ba6f2b3f452e150945d58f4badd92310449876c4c954836cfb1803bdd7b422f0",
+                "sha256:c33d18eb6e6bc36f09d793c0dc58b0211fccc6ae5149b808da4a62660678b156",
+                "sha256:c9a875ce9d7fe32887784274dd533c57909b7b1dcadcc128a2ac21331a9765dd",
+                "sha256:c9e005e9bd57bc987764c32a1bee4364c44fdc11a3cc20a40b93b444984f2b87",
+                "sha256:d2ad4d668a5c0645d281dcd17aff2be3212bc109b33814bbb15c4939f44181cc",
+                "sha256:d950695ae4381ecd856bcaf2b1e866720e4ab9a1498cba61c602e56630ca7195",
+                "sha256:e22dcb48709fc51a7b58a927391b23ab37eb3737a98ac4338e2448bef8559b33",
+                "sha256:e8c6a99be100371dbb046880e7a282152aa5d6127ae01783e37662ef73850d8f",
+                "sha256:e9dc245e3ac69c92ee4c167fbdd7428ec1956d4e754223124991ef29eb57a09d",
+                "sha256:eb687a11f0a7a1839719edd80f41e459cc5366857ecbed383ff376c4e3cc6afd",
+                "sha256:eb9e2a346c5238a30a746893f23a9535e700f8192a68c07c0258e7ece6ff3728",
+                "sha256:ed38b924ce794e505647f7c331b22a693bee1538fdf46b0222c4717b42f744e7",
+                "sha256:f0010c6f9d1a4011e429109fda55a225921e3206e7f62a0c22a35344bfd13cca",
+                "sha256:f0c5d1acbfca6ebdd6b1e3eded8d261affb6ddcf2186205518f1428b8569bb99",
+                "sha256:f10afb1004f102c7868ebfe91c28f4a712227fe4cb24974350ace1f90e1febbf",
+                "sha256:f174135f5609428cc6e1b9090f9268f5c8935fddb1b25ccb8255a2d50de6789e",
+                "sha256:f3ebe6e73c319340830a9b2825d32eb6d8475c1dac020b4f0aa774ee3b898d1c",
+                "sha256:f627688813d0a4140153ff532537fbe4afea5a3dffce1f9deb7f91f848a832b5",
+                "sha256:fd4305f86f53dfd8cd3522269ed7fc34856a8ee3709a5e28b2836b2db9d4cd69"
+            ],
+            "version": "==1.14.6"
+        },
+        "charset-normalizer": {
+            "hashes": [
+                "sha256:5d209c0a931f215cee683b6445e2d77677e7e75e159f78def0db09d68fafcaa6",
+                "sha256:5ec46d183433dcbd0ab716f2d7f29d8dee50505b3fdb40c6b985c7c4f5a3591f"
+            ],
+            "markers": "python_version >= '3'",
+            "version": "==2.0.6"
+        },
+        "cryptography": {
+            "hashes": [
+                "sha256:0a7dcbcd3f1913f664aca35d47c1331fce738d44ec34b7be8b9d332151b0b01e",
+                "sha256:1eb7bb0df6f6f583dd8e054689def236255161ebbcf62b226454ab9ec663746b",
+                "sha256:21ca464b3a4b8d8e86ba0ee5045e103a1fcfac3b39319727bc0fc58c09c6aff7",
+                "sha256:34dae04a0dce5730d8eb7894eab617d8a70d0c97da76b905de9efb7128ad7085",
+                "sha256:3520667fda779eb788ea00080124875be18f2d8f0848ec00733c0ec3bb8219fc",
+                "sha256:3c4129fc3fdc0fa8e40861b5ac0c673315b3c902bbdc05fc176764815b43dd1d",
+                "sha256:3fa3a7ccf96e826affdf1a0a9432be74dc73423125c8f96a909e3835a5ef194a",
+                "sha256:5b0fbfae7ff7febdb74b574055c7466da334a5371f253732d7e2e7525d570498",
+                "sha256:695104a9223a7239d155d7627ad912953b540929ef97ae0c34c7b8bf30857e89",
+                "sha256:8695456444f277af73a4877db9fc979849cd3ee74c198d04fc0776ebc3db52b9",
+                "sha256:94cc5ed4ceaefcbe5bf38c8fba6a21fc1d365bb8fb826ea1688e3370b2e24a1c",
+                "sha256:94fff993ee9bc1b2440d3b7243d488c6a3d9724cc2b09cdb297f6a886d040ef7",
+                "sha256:9965c46c674ba8cc572bc09a03f4c649292ee73e1b683adb1ce81e82e9a6a0fb",
+                "sha256:a00cf305f07b26c351d8d4e1af84ad7501eca8a342dedf24a7acb0e7b7406e14",
+                "sha256:a305600e7a6b7b855cd798e00278161b681ad6e9b7eca94c721d5f588ab212af",
+                "sha256:cd65b60cfe004790c795cc35f272e41a3df4631e2fb6b35aa7ac6ef2859d554e",
+                "sha256:d2a6e5ef66503da51d2110edf6c403dc6b494cc0082f85db12f54e9c5d4c3ec5",
+                "sha256:d9ec0e67a14f9d1d48dd87a2531009a9b251c02ea42851c060b25c782516ff06",
+                "sha256:f44d141b8c4ea5eb4dbc9b3ad992d45580c1d22bf5e24363f2fbf50c2d7ae8a7"
+            ],
+            "version": "==3.4.8"
+        },
+        "execnet": {
+            "hashes": [
+                "sha256:8f694f3ba9cc92cab508b152dcfe322153975c29bda272e2fd7f3f00f36e47c5",
+                "sha256:a295f7cc774947aac58dde7fdc85f4aa00c42adf5d8f5468fc630c1acf30a142"
+            ],
+            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
+            "version": "==1.9.0"
+        },
+        "idna": {
+            "hashes": [
+                "sha256:14475042e284991034cb48e06f6851428fb14c4dc953acd9be9a5e95c7b6dd7a",
+                "sha256:467fbad99067910785144ce333826c71fb0e63a425657295239737f7ecd125f3"
+            ],
+            "markers": "python_version >= '3'",
+            "version": "==3.2"
+        },
+        "iniconfig": {
+            "hashes": [
+                "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3",
+                "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"
+            ],
+            "version": "==1.1.1"
+        },
+        "packaging": {
+            "hashes": [
+                "sha256:7dc96269f53a4ccec5c0670940a4281106dd0bb343f47b7471f779df49c2fbe7",
+                "sha256:c86254f9220d55e31cc94d69bade760f0847da8000def4dfe1c6b872fd14ff14"
+            ],
+            "markers": "python_version >= '3.6'",
+            "version": "==21.0"
+        },
+        "pluggy": {
+            "hashes": [
+                "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159",
+                "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"
+            ],
+            "markers": "python_version >= '3.6'",
+            "version": "==1.0.0"
+        },
+        "psycopg2": {
+            "hashes": [
+                "sha256:079d97fc22de90da1d370c90583659a9f9a6ee4007355f5825e5f1c70dffc1fa",
+                "sha256:2087013c159a73e09713294a44d0c8008204d06326006b7f652bef5ace66eebb",
+                "sha256:2c992196719fadda59f72d44603ee1a2fdcc67de097eea38d41c7ad9ad246e62",
+                "sha256:7640e1e4d72444ef012e275e7b53204d7fab341fb22bc76057ede22fe6860b25",
+                "sha256:7f91312f065df517187134cce8e395ab37f5b601a42446bdc0f0d51773621854",
+                "sha256:830c8e8dddab6b6716a4bf73a09910c7954a92f40cf1d1e702fb93c8a919cc56",
+                "sha256:89409d369f4882c47f7ea20c42c5046879ce22c1e4ea20ef3b00a4dfc0a7f188",
+                "sha256:bf35a25f1aaa8a3781195595577fcbb59934856ee46b4f252f56ad12b8043bcf",
+                "sha256:de5303a6f1d0a7a34b9d40e4d3bef684ccc44a49bbe3eb85e3c0bffb4a131b7c"
+            ],
+            "index": "pypi",
+            "version": "==2.9.1"
+        },
+        "py": {
+            "hashes": [
+                "sha256:21b81bda15b66ef5e1a777a21c4dcd9c20ad3efd0b3f817e7a809035269e1bd3",
+                "sha256:3b80836aa6d1feeaa108e046da6423ab8f6ceda6468545ae8d02d9d58d18818a"
+            ],
+            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
+            "version": "==1.10.0"
+        },
+        "pycparser": {
+            "hashes": [
+                "sha256:2d475327684562c3a96cc71adf7dc8c4f0565175cf86b6d7a404ff4c771f15f0",
+                "sha256:7582ad22678f0fcd81102833f60ef8d0e57288b6b5fb00323d101be910e35705"
+            ],
+            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
+            "version": "==2.20"
+        },
+        "pyjwt": {
+            "extras": [
+                "crypto"
+            ],
+            "hashes": [
+                "sha256:934d73fbba91b0483d3857d1aff50e96b2a892384ee2c17417ed3203f173fca1",
+                "sha256:fba44e7898bbca160a2b2b501f492824fc8382485d3a6f11ba5d0c1937ce6130"
+            ],
+            "index": "pypi",
+            "version": "==2.1.0"
+        },
+        "pyparsing": {
+            "hashes": [
+                "sha256:c203ec8783bf771a155b207279b9bccb8dea02d8f0c9e5f8ead507bc3246ecc1",
+                "sha256:ef9d7589ef3c200abe66653d3f1ab1033c3c419ae9b9bdb1240a85b024efc88b"
+            ],
+            "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
+            "version": "==2.4.7"
+        },
+        "pytest": {
+            "hashes": [
+                "sha256:131b36680866a76e6781d13f101efb86cf674ebb9762eb70d3082b6f29889e89",
+                "sha256:7310f8d27bc79ced999e760ca304d69f6ba6c6649c0b60fb0e04a4a77cacc134"
+            ],
+            "index": "pypi",
+            "version": "==6.2.5"
+        },
+        "pytest-forked": {
+            "hashes": [
+                "sha256:6aa9ac7e00ad1a539c41bec6d21011332de671e938c7637378ec9710204e37ca",
+                "sha256:dc4147784048e70ef5d437951728825a131b81714b398d5d52f17c7c144d8815"
+            ],
+            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
+            "version": "==1.3.0"
+        },
+        "pytest-xdist": {
+            "hashes": [
+                "sha256:7b61ebb46997a0820a263553179d6d1e25a8c50d8a8620cd1aa1e20e3be99168",
+                "sha256:89b330316f7fc475f999c81b577c2b926c9569f3d397ae432c0c2e2496d61ff9"
+            ],
+            "index": "pypi",
+            "version": "==2.4.0"
+        },
+        "requests": {
+            "hashes": [
+                "sha256:6c1246513ecd5ecd4528a0906f910e8f0f9c6b8ec72030dc9fd154dc1a6efd24",
+                "sha256:b8aa58f8cf793ffd8782d3d8cb19e66ef36f7aba4353eec859e74678b01b07a7"
+            ],
+            "index": "pypi",
+            "version": "==2.26.0"
+        },
+        "toml": {
+            "hashes": [
+                "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b",
+                "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"
+            ],
+            "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
+            "version": "==0.10.2"
+        },
+        "typing-extensions": {
+            "hashes": [
+                "sha256:49f75d16ff11f1cd258e1b988ccff82a3ca5570217d7ad8c5f48205dd99a677e",
+                "sha256:d8226d10bc02a29bcc81df19a26e56a9647f8b0a6d4a83924139f4a8b01f17b7",
+                "sha256:f1d25edafde516b146ecd0613dabcc61409817af4766fbbcfb8d1ad4ec441a34"
+            ],
+            "index": "pypi",
+            "version": "==3.10.0.2"
+        },
+        "urllib3": {
+            "hashes": [
+                "sha256:39fb8672126159acb139a7718dd10806104dec1e2f0f6c88aab05d17df10c8d4",
+                "sha256:f57b4c16c62fa2760b7e3d97c35b255512fb6b59a259730f36ba32ce9f8e342f"
+            ],
+            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'",
+            "version": "==1.26.6"
+        }
+    },
+    "develop": {
+        "flake8": {
+            "hashes": [
+                "sha256:07528381786f2a6237b061f6e96610a4167b226cb926e2aa2b6b1d78057c576b",
+                "sha256:bf8fd333346d844f616e8d47905ef3a3384edae6b4e9beb0c5101e25e3110907"
+            ],
+            "index": "pypi",
+            "version": "==3.9.2"
+        },
+        "mccabe": {
+            "hashes": [
+                "sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42",
+                "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"
+            ],
+            "version": "==0.6.1"
+        },
+        "mypy": {
+            "hashes": [
+                "sha256:088cd9c7904b4ad80bec811053272986611b84221835e079be5bcad029e79dd9",
+                "sha256:0aadfb2d3935988ec3815952e44058a3100499f5be5b28c34ac9d79f002a4a9a",
+                "sha256:119bed3832d961f3a880787bf621634ba042cb8dc850a7429f643508eeac97b9",
+                "sha256:1a85e280d4d217150ce8cb1a6dddffd14e753a4e0c3cf90baabb32cefa41b59e",
+                "sha256:3c4b8ca36877fc75339253721f69603a9c7fdb5d4d5a95a1a1b899d8b86a4de2",
+                "sha256:3e382b29f8e0ccf19a2df2b29a167591245df90c0b5a2542249873b5c1d78212",
+                "sha256:42c266ced41b65ed40a282c575705325fa7991af370036d3f134518336636f5b",
+                "sha256:53fd2eb27a8ee2892614370896956af2ff61254c275aaee4c230ae771cadd885",
+                "sha256:704098302473cb31a218f1775a873b376b30b4c18229421e9e9dc8916fd16150",
+                "sha256:7df1ead20c81371ccd6091fa3e2878559b5c4d4caadaf1a484cf88d93ca06703",
+                "sha256:866c41f28cee548475f146aa4d39a51cf3b6a84246969f3759cb3e9c742fc072",
+                "sha256:a155d80ea6cee511a3694b108c4494a39f42de11ee4e61e72bc424c490e46457",
+                "sha256:adaeee09bfde366d2c13fe6093a7df5df83c9a2ba98638c7d76b010694db760e",
+                "sha256:b6fb13123aeef4a3abbcfd7e71773ff3ff1526a7d3dc538f3929a49b42be03f0",
+                "sha256:b94e4b785e304a04ea0828759172a15add27088520dc7e49ceade7834275bedb",
+                "sha256:c0df2d30ed496a08de5daed2a9ea807d07c21ae0ab23acf541ab88c24b26ab97",
+                "sha256:c6c2602dffb74867498f86e6129fd52a2770c48b7cd3ece77ada4fa38f94eba8",
+                "sha256:ceb6e0a6e27fb364fb3853389607cf7eb3a126ad335790fa1e14ed02fba50811",
+                "sha256:d9dd839eb0dc1bbe866a288ba3c1afc33a202015d2ad83b31e875b5905a079b6",
+                "sha256:e4dab234478e3bd3ce83bac4193b2ecd9cf94e720ddd95ce69840273bf44f6de",
+                "sha256:ec4e0cd079db280b6bdabdc807047ff3e199f334050db5cbb91ba3e959a67504",
+                "sha256:ecd2c3fe726758037234c93df7e98deb257fd15c24c9180dacf1ef829da5f921",
+                "sha256:ef565033fa5a958e62796867b1df10c40263ea9ded87164d67572834e57a174d"
+            ],
+            "index": "pypi",
+            "version": "==0.910"
+        },
+        "mypy-extensions": {
+            "hashes": [
+                "sha256:090fedd75945a69ae91ce1303b5824f428daf5a028d2f6ab8a299250a846f15d",
+                "sha256:2d82818f5bb3e369420cb3c4060a7970edba416647068eb4c5343488a6c604a8"
+            ],
+            "version": "==0.4.3"
+        },
+        "pycodestyle": {
+            "hashes": [
+                "sha256:514f76d918fcc0b55c6680472f0a37970994e07bbb80725808c17089be302068",
+                "sha256:c389c1d06bf7904078ca03399a4816f974a1d590090fecea0c63ec26ebaf1cef"
+            ],
+            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
+            "version": "==2.7.0"
+        },
+        "pyflakes": {
+            "hashes": [
+                "sha256:7893783d01b8a89811dd72d7dfd4d84ff098e5eed95cfa8905b22bbffe52efc3",
+                "sha256:f5bc8ecabc05bb9d291eb5203d6810b49040f6ff446a756326104746cc00c1db"
+            ],
+            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
+            "version": "==2.3.1"
+        },
+        "toml": {
+            "hashes": [
+                "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b",
+                "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"
+            ],
+            "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
+            "version": "==0.10.2"
+        },
+        "typing-extensions": {
+            "hashes": [
+                "sha256:49f75d16ff11f1cd258e1b988ccff82a3ca5570217d7ad8c5f48205dd99a677e",
+                "sha256:d8226d10bc02a29bcc81df19a26e56a9647f8b0a6d4a83924139f4a8b01f17b7",
+                "sha256:f1d25edafde516b146ecd0613dabcc61409817af4766fbbcfb8d1ad4ec441a34"
+            ],
+            "index": "pypi",
+            "version": "==3.10.0.2"
+        },
+        "yapf": {
+            "hashes": [
+                "sha256:408fb9a2b254c302f49db83c59f9aa0b4b0fd0ec25be3a5c51181327922ff63d",
+                "sha256:e3a234ba8455fe201eaa649cdac872d590089a18b661e39bbac7020978dd9c2e"
+            ],
+            "index": "pypi",
+            "version": "==0.31.0"
+        }
+    }
+}
--- a/test_runner/README.md
+++ b/test_runner/README.md
@@ -3,13 +3,18 @@
 This directory contains integration tests.

 Prerequisites:
- Correctly configured Python, see [`/docs/sourcetree.md`](/docs/sourcetree.md#using-python)
+- Python 3.6 or later
+- Dependencies: install them via `pipenv install`. Note that the Debian/Ubuntu
+  packages are often stale, so installing the dependencies manually from the
+  distribution is not recommended.
+  Run `pipenv shell` to activate the venv, or use `pipenv run` to run a single
+  command in the venv, e.g. `pipenv run pytest`.
 - Zenith and Postgres binaries
-    - See the root [README.md](/README.md) for build directions
+    - See the root README.md for build directions
    - Tests can be run from the git tree; or see the environment variables
      below to run from other directories.
 - The zenith git repo, including the postgres submodule
-  (for some tests, e.g. `pg_regress`)
+  (for some tests, e.g. pg_regress)

 ### Test Organization

@@ -30,15 +35,15 @@ be stored under a directory `test_output`.

 You can run all the tests with:

-`pipenv run pytest`
+`pytest`

 If you want to run all the tests in a particular file:

-`pipenv run pytest test_pgbench.py`
+`pytest test_pgbench.py`

 If you want to run all tests that have the string "bench" in their names:

-`pipenv run pytest -k bench`
+`pytest -k bench`

 Useful environment variables:

@@ -48,8 +53,8 @@ Useful environment variables:
 should go.
 `TEST_SHARED_FIXTURES`: Try to re-use a single pageserver for all the tests.

-Let stdout, stderr and `INFO` log messages go to the terminal instead of capturing them:
-`pytest -s --log-cli-level=INFO ...`
+Let stdout and stderr go to the terminal instead of capturing them:
+`pytest -s ...`
 (Note many tests capture subprocess outputs separately, so this may not
 show much.)

@@ -57,51 +62,44 @@ Exit after the first test failure:
 `pytest -x ...`
 (there are many more pytest options; run `pytest -h` to see them.)

-### Writing a test

-Every test needs a Zenith Environment, or ZenithEnv to operate in. A Zenith Environment
-is like a little cloud-in-a-box, and consists of a Pageserver, 0-N Safekeepers, and
-compute Postgres nodes. The connections between them can be configured to use JWT
-authentication tokens, and some other configuration options can be tweaked too.
+### Building new tests

-The easiest way to get access to a Zenith Environment is by using the `zenith_simple_env`
-fixture. The 'simple' env may be shared across multiple tests, so don't shut down the nodes
-or make other destructive changes in that environment. Also don't assume that
-there are no tenants or branches or data in the cluster. For convenience, there is a
-branch called `empty`, though. The convention is to create a test-specific branch of
-that and load any test data there, instead of the 'main' branch.
+The tests make heavy use of pytest fixtures. You can read about how they work here: https://docs.pytest.org/en/stable/fixture.html

-For more complicated cases, you can build a custom Zenith Environment, with the `zenith_env`
-fixture:
+Essentially, this means that each time a test names a fixture as an input parameter, the fixture function with that name is run and its result is passed to the test as that parameter.
+
+So this code:

 ```python
-def test_foobar(zenith_env_builder: ZenithEnvBuilder):
-    # Prescribe the environment.
-    # We want to have 3 safekeeper nodes, and use JWT authentication in the
-    # connections to the page server
-    zenith_env_builder.num_safekeepers = 3
-    zenith_env_builder.set_pageserver_auth(True)
-
-    # Now create the environment. This initializes the repository, and starts
-    # up the page server and the safekeepers
-    env = zenith_env_builder.init()
-
-    # Run the test
-    ...
+def test_something(zenith_cli, pg_bin):
+    pass
 ```

-For more information about pytest fixtures, see https://docs.pytest.org/en/stable/fixture.html
+... will run the fixtures called `zenith_cli` and `pg_bin` and deliver those results to the test function.

-At the end of a test, all the nodes in the environment are automatically stopped, so you
-don't need to worry about cleaning up. Logs and test data are preserved for the analysis,
-in a directory under `../test_output/<testname>`
+Fixtures can't be imported using the normal python syntax. Instead, use this:

-### Before submitting a patch
-Ensure that you pass all [obligatory checks](/docs/sourcetree.md#obligatory-checks).
+```python
+pytest_plugins = ("fixtures.something")
+```

-Also consider:
+That will make all the fixtures in the `fixtures/something.py` file available.
+
+Anything that's likely to be used in multiple tests should be built into a fixture.
+
+Note that fixtures can clean up after themselves if they use the `yield` syntax.
+Cleanup will happen even if the test fails (raises an unhandled exception).
+Python destructors, e.g. `__del__()`, aren't recommended for cleanup.
+
+
+### Code quality
+
+Before submitting a patch, please consider:

 * Writing a couple of docstrings to clarify the reasoning behind a new test.
-* Adding more type hints to your code to avoid `Any`, especially:
-  * For fixture parameters, they are not automatically deduced.
-  * For function arguments and return values.
+* Running `flake8` (or a linter of your choice, e.g. `pycodestyle`) and fixing possible defects, if any.
+* Formatting the code with `yapf -r -i .` (TODO: implement an opt-in pre-commit hook for that).
+* (Optional) Typechecking the code with `mypy .`. Currently this mostly affects `fixtures/zenith_fixtures.py`.
+
+The tools can be installed with `pipenv install --dev`.
--- a/test_runner/batch_others/test_auth.py
+++ b/test_runner/batch_others/test_auth.py
@@ -1,22 +1,21 @@
+
 from contextlib import closing
 from typing import Iterator
 from uuid import uuid4
 import psycopg2
-from fixtures.zenith_fixtures import ZenithEnvBuilder
+from fixtures.zenith_fixtures import PortDistributor, Postgres, ZenithCli, ZenithPageserver, PgBin
 import pytest

+
 pytest_plugins = ("fixtures.zenith_fixtures")


-def test_pageserver_auth(zenith_env_builder: ZenithEnvBuilder):
-    zenith_env_builder.pageserver_auth_enabled = True
-    env = zenith_env_builder.init()
+def test_pageserver_auth(pageserver_auth_enabled: ZenithPageserver):
+    ps = pageserver_auth_enabled

-    ps = env.pageserver
-
-    tenant_token = env.auth_keys.generate_tenant_token(env.initial_tenant)
-    invalid_tenant_token = env.auth_keys.generate_tenant_token(uuid4().hex)
-    management_token = env.auth_keys.generate_management_token()
+    tenant_token = ps.auth_keys.generate_tenant_token(ps.initial_tenant)
+    invalid_tenant_token = ps.auth_keys.generate_tenant_token(uuid4().hex)
+    management_token = ps.auth_keys.generate_management_token()

    # this does not invoke auth check and only decodes jwt and checks it for validity
    # check both tokens
@@ -24,41 +23,57 @@ def test_pageserver_auth(zenith_env_builder: ZenithEnvBuilder):
    ps.safe_psql("status", password=management_token)

    # tenant can create branches
-    ps.safe_psql(f"branch_create {env.initial_tenant} new1 main", password=tenant_token)
+    ps.safe_psql(f"branch_create {ps.initial_tenant} new1 main", password=tenant_token)
    # console can create branches for tenant
-    ps.safe_psql(f"branch_create {env.initial_tenant} new2 main", password=management_token)
+    ps.safe_psql(f"branch_create {ps.initial_tenant} new2 main", password=management_token)

    # fail to create branch using token with different tenantid
    with pytest.raises(psycopg2.DatabaseError, match='Tenant id mismatch. Permission denied'):
-        ps.safe_psql(f"branch_create {env.initial_tenant} new2 main", password=invalid_tenant_token)
+        ps.safe_psql(f"branch_create {ps.initial_tenant} new2 main", password=invalid_tenant_token)

    # create tenant using management token
    ps.safe_psql(f"tenant_create {uuid4().hex}", password=management_token)

    # fail to create tenant using tenant token
-    with pytest.raises(
-            psycopg2.DatabaseError,
-            match='Attempt to access management api with tenant scope. Permission denied'):
+    with pytest.raises(psycopg2.DatabaseError, match='Attempt to access management api with tenant scope. Permission denied'):
        ps.safe_psql(f"tenant_create {uuid4().hex}", password=tenant_token)


@pytest.mark.parametrize('with_wal_acceptors', [False, True])
-def test_compute_auth_to_pageserver(zenith_env_builder: ZenithEnvBuilder, with_wal_acceptors: bool):
-    zenith_env_builder.pageserver_auth_enabled = True
-    if with_wal_acceptors:
-        zenith_env_builder.num_safekeepers = 3
-    env = zenith_env_builder.init()
+def test_compute_auth_to_pageserver(
+    zenith_cli: ZenithCli,
+    wa_factory,
+    pageserver_auth_enabled: ZenithPageserver,
+    repo_dir: str,
+    with_wal_acceptors: bool,
+    pg_bin: PgBin,
+    port_distributor: PortDistributor,
+):
+    ps = pageserver_auth_enabled
+    # since we are in the process of refactoring the protocols between compute,
+    # safekeeper and page server, use a hardcoded management token in the safekeeper
+    management_token = ps.auth_keys.generate_management_token()

    branch = f"test_compute_auth_to_pageserver{with_wal_acceptors}"
-    env.zenith_cli(["branch", branch, "main"])
+    zenith_cli.run(["branch", branch, "empty"])
+    if with_wal_acceptors:
+        wa_factory.start_n_new(3, management_token)

-    pg = env.postgres.create_start(branch)
-
-    with closing(pg.connect()) as conn:
-        with conn.cursor() as cur:
-            # we rely upon autocommit after each statement
-            # as waiting for acceptors happens there
-            cur.execute('CREATE TABLE t(key int primary key, value text)')
-            cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'")
-            cur.execute('SELECT sum(key) FROM t')
-            assert cur.fetchone() == (5000050000, )
+    with Postgres(
+        zenith_cli=zenith_cli,
+        repo_dir=repo_dir,
+        pg_bin=pg_bin,
+        tenant_id=ps.initial_tenant,
+        port=port_distributor.get_port(),
+    ).create_start(
+        branch,
+        wal_acceptors=wa_factory.get_connstrs() if with_wal_acceptors else None,
+    ) as pg:
+        with closing(pg.connect()) as conn:
+            with conn.cursor() as cur:
+                # we rely upon autocommit after each statement
+                # as waiting for acceptors happens there
+                cur.execute('CREATE TABLE t(key int primary key, value text)')
+                cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'")
+                cur.execute('SELECT sum(key) FROM t')
+                assert cur.fetchone() == (5000050000, )
--- a/test_runner/batch_others/test_branch_behind.py
+++ b/test_runner/batch_others/test_branch_behind.py
@@ -1,11 +1,9 @@
 import subprocess
-from contextlib import closing
+from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver

-import psycopg2.extras
-import pytest
-from fixtures.log_helper import log
-from fixtures.utils import print_gc_result
-from fixtures.zenith_fixtures import ZenithEnv
+import logging
+import fixtures.log_helper  # configures loggers
+log = logging.getLogger('root')

 pytest_plugins = ("fixtures.zenith_fixtures")

@@ -13,27 +11,18 @@ pytest_plugins = ("fixtures.zenith_fixtures")
 #
 # Create a couple of branches off the main branch, at a historical point in time.
 #
-def test_branch_behind(zenith_simple_env: ZenithEnv):
-    env = zenith_simple_env
+def test_branch_behind(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, pg_bin):
    # Branch at the point where only 100 rows were inserted
-    env.zenith_cli(["branch", "test_branch_behind", "empty"])
+    zenith_cli.run(["branch", "test_branch_behind", "empty"])

-    pgmain = env.postgres.create_start('test_branch_behind')
+    pgmain = postgres.create_start('test_branch_behind')
    log.info("postgres is running on 'test_branch_behind' branch")

    main_pg_conn = pgmain.connect()
    main_cur = main_pg_conn.cursor()

-    main_cur.execute("SHOW zenith.zenith_timeline")
-    timeline = main_cur.fetchone()[0]
-
    # Create table, and insert the first 100 rows
    main_cur.execute('CREATE TABLE foo (t text)')
-
-    # keep some early lsn to test branch creation on out of date lsn
-    main_cur.execute('SELECT pg_current_wal_insert_lsn()')
-    gced_lsn = main_cur.fetchone()[0]
-
    main_cur.execute('''
        INSERT INTO foo
            SELECT 'long string to consume some space' || g
@@ -47,32 +36,32 @@ def test_branch_behind(zenith_simple_env: ZenithEnv):
    main_cur.execute('''
        INSERT INTO foo
            SELECT 'long string to consume some space' || g
-            FROM generate_series(1, 200000) g
+            FROM generate_series(1, 100000) g
    ''')
    main_cur.execute('SELECT pg_current_wal_insert_lsn()')
    lsn_b = main_cur.fetchone()[0]
-    log.info(f'LSN after 200100 rows: {lsn_b}')
+    log.info(f'LSN after 100100 rows: {lsn_b}')

    # Branch at the point where only 100 rows were inserted
-    env.zenith_cli(["branch", "test_branch_behind_hundred", "test_branch_behind@" + lsn_a])
+    zenith_cli.run(["branch", "test_branch_behind_hundred", "test_branch_behind@" + lsn_a])

    # Insert many more rows. This generates enough WAL to fill a few segments.
    main_cur.execute('''
        INSERT INTO foo
            SELECT 'long string to consume some space' || g
-            FROM generate_series(1, 200000) g
+            FROM generate_series(1, 100000) g
    ''')
    main_cur.execute('SELECT pg_current_wal_insert_lsn()')

    main_cur.execute('SELECT pg_current_wal_insert_lsn()')
    lsn_c = main_cur.fetchone()[0]
-    log.info(f'LSN after 400100 rows: {lsn_c}')
+    log.info(f'LSN after 200100 rows: {lsn_c}')

-    # Branch at the point where only 200100 rows were inserted
-    env.zenith_cli(["branch", "test_branch_behind_more", "test_branch_behind@" + lsn_b])
+    # Branch at the point where only 100100 rows were inserted
+    zenith_cli.run(["branch", "test_branch_behind_more", "test_branch_behind@" + lsn_b])

-    pg_hundred = env.postgres.create_start("test_branch_behind_hundred")
-    pg_more = env.postgres.create_start("test_branch_behind_more")
+    pg_hundred = postgres.create_start("test_branch_behind_hundred")
+    pg_more = postgres.create_start("test_branch_behind_more")

    # On the 'hundred' branch, we should see only 100 rows
    hundred_pg_conn = pg_hundred.connect()
@@ -84,43 +73,23 @@ def test_branch_behind(zenith_simple_env: ZenithEnv):
    more_pg_conn = pg_more.connect()
    more_cur = more_pg_conn.cursor()
    more_cur.execute('SELECT count(*) FROM foo')
-    assert more_cur.fetchone() == (200100, )
+    assert more_cur.fetchone() == (100100, )

    # All the rows are visible on the main branch
    main_cur.execute('SELECT count(*) FROM foo')
-    assert main_cur.fetchone() == (400100, )
+    assert main_cur.fetchone() == (200100, )

    # Check bad lsn's for branching

    # branch at segment boundary
-    env.zenith_cli(["branch", "test_branch_segment_boundary", "test_branch_behind@0/3000000"])
-    pg = env.postgres.create_start("test_branch_segment_boundary")
+    zenith_cli.run(["branch", "test_branch_segment_boundary", "test_branch_behind@0/3000000"])
+    pg = postgres.create_start("test_branch_segment_boundary")
    cur = pg.connect().cursor()
    cur.execute('SELECT 1')
    assert cur.fetchone() == (1, )

    # branch at pre-initdb lsn
-    with pytest.raises(Exception, match="invalid branch start lsn"):
-        env.zenith_cli(["branch", "test_branch_preinitdb", "test_branch_behind@0/42"])
-
-    # check that we cannot create branch based on garbage collected data
-    with closing(env.pageserver.connect()) as psconn:
-        with psconn.cursor(cursor_factory=psycopg2.extras.DictCursor) as pscur:
-            # call gc to advace latest_gc_cutoff_lsn
-            pscur.execute(f"do_gc {env.initial_tenant} {timeline} 0")
-            row = pscur.fetchone()
-            print_gc_result(row)
-
-    with pytest.raises(Exception, match="invalid branch start lsn"):
-        # this gced_lsn is pretty random, so if gc is disabled this woudln't fail
-        env.zenith_cli(["branch", "test_branch_create_fail", f"test_branch_behind@{gced_lsn}"])
-
-    # check that after gc everything is still there
-    hundred_cur.execute('SELECT count(*) FROM foo')
-    assert hundred_cur.fetchone() == (100, )
-
-    more_cur.execute('SELECT count(*) FROM foo')
-    assert more_cur.fetchone() == (200100, )
-
-    main_cur.execute('SELECT count(*) FROM foo')
-    assert main_cur.fetchone() == (400100, )
+    try:
+        zenith_cli.run(["branch", "test_branch_preinitdb", "test_branch_behind@0/42"])
+    except subprocess.CalledProcessError:
+        log.info("Branch creation with pre-initdb LSN failed (as expected)")
--- a/test_runner/batch_others/test_clog_truncate.py
+++ b/test_runner/batch_others/test_clog_truncate.py
@@ -3,8 +3,11 @@ import os

 from contextlib import closing

-from fixtures.zenith_fixtures import ZenithEnv
-from fixtures.log_helper import log
+from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
+
+import logging
+import fixtures.log_helper  # configures loggers
+log = logging.getLogger('root')

 pytest_plugins = ("fixtures.zenith_fixtures")

@@ -12,23 +15,19 @@ pytest_plugins = ("fixtures.zenith_fixtures")
 #
 # Test compute node start after clog truncation
 #
-def test_clog_truncate(zenith_simple_env: ZenithEnv):
-    env = zenith_simple_env
+def test_clog_truncate(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, pg_bin):
    # Create a branch for us
-    env.zenith_cli(["branch", "test_clog_truncate", "empty"])
+    zenith_cli.run(["branch", "test_clog_truncate", "empty"])

    # set agressive autovacuum to make sure that truncation will happen
    config = [
-        'autovacuum_max_workers=10',
-        'autovacuum_vacuum_threshold=0',
-        'autovacuum_vacuum_insert_threshold=0',
-        'autovacuum_vacuum_cost_delay=0',
-        'autovacuum_vacuum_cost_limit=10000',
-        'autovacuum_naptime =1s',
+        'autovacuum_max_workers=10', 'autovacuum_vacuum_threshold=0',
+        'autovacuum_vacuum_insert_threshold=0', 'autovacuum_vacuum_cost_delay=0',
+        'autovacuum_vacuum_cost_limit=10000', 'autovacuum_naptime =1s',
        'autovacuum_freeze_max_age=100000'
    ]

-    pg = env.postgres.create_start('test_clog_truncate', config_lines=config)
+    pg = postgres.create_start('test_clog_truncate', config_lines=config)
    log.info('postgres is running on test_clog_truncate branch')

    # Install extension containing function needed for test
@@ -65,13 +64,13 @@ def test_clog_truncate(zenith_simple_env: ZenithEnv):

    # create new branch after clog truncation and start a compute node on it
    log.info(f'create branch at lsn_after_truncation {lsn_after_truncation}')
-    env.zenith_cli(
+    zenith_cli.run(
        ["branch", "test_clog_truncate_new", "test_clog_truncate@" + lsn_after_truncation])

-    pg2 = env.postgres.create_start('test_clog_truncate_new')
+    pg2 = postgres.create_start('test_clog_truncate_new')
    log.info('postgres is running on test_clog_truncate_new branch')

    # check that new node doesn't contain truncated segment
    pg_xact_0000_path_new = os.path.join(pg2.pg_xact_dir_path(), '0000')
-    log.info(f"pg_xact_0000_path_new = {pg_xact_0000_path_new}")
+    log.info("pg_xact_0000_path_new = " + pg_xact_0000_path_new)
    assert os.path.isfile(pg_xact_0000_path_new) is False
--- a/test_runner/batch_others/test_config.py
+++ b/test_runner/batch_others/test_config.py
@@ -1,7 +1,10 @@
 from contextlib import closing

-from fixtures.zenith_fixtures import ZenithEnv
-from fixtures.log_helper import log
+from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
+
+import logging
+import fixtures.log_helper  # configures loggers
+log = logging.getLogger('root')

 pytest_plugins = ("fixtures.zenith_fixtures")

@@ -9,13 +12,12 @@ pytest_plugins = ("fixtures.zenith_fixtures")
 #
 # Test starting Postgres with custom options
 #
-def test_config(zenith_simple_env: ZenithEnv):
-    env = zenith_simple_env
+def test_config(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, pg_bin):
    # Create a branch for us
-    env.zenith_cli(["branch", "test_config", "empty"])
+    zenith_cli.run(["branch", "test_config", "empty"])

    # change config
-    pg = env.postgres.create_start('test_config', config_lines=['log_min_messages=debug1'])
+    pg = postgres.create_start('test_config', config_lines=['log_min_messages=debug1'])
    log.info('postgres is running on test_config branch')

    with closing(pg.connect()) as conn:
--- a/test_runner/batch_others/test_createdropdb.py
+++ b/test_runner/batch_others/test_createdropdb.py
@@ -2,8 +2,11 @@ import os
 import pathlib

 from contextlib import closing
-from fixtures.zenith_fixtures import ZenithEnv, check_restored_datadir_content
-from fixtures.log_helper import log
+from fixtures.zenith_fixtures import ZenithPageserver, PostgresFactory, ZenithCli, check_restored_datadir_content
+
+import logging
+import fixtures.log_helper  # configures loggers
+log = logging.getLogger('root')

 pytest_plugins = ("fixtures.zenith_fixtures")

@@ -11,11 +14,15 @@ pytest_plugins = ("fixtures.zenith_fixtures")
 #
 # Test CREATE DATABASE when there have been relmapper changes
 #
-def test_createdb(zenith_simple_env: ZenithEnv):
-    env = zenith_simple_env
-    env.zenith_cli(["branch", "test_createdb", "empty"])
+def test_createdb(
+    zenith_cli: ZenithCli,
+    pageserver: ZenithPageserver,
+    postgres: PostgresFactory,
+    pg_bin,
+):
+    zenith_cli.run(["branch", "test_createdb", "empty"])

-    pg = env.postgres.create_start('test_createdb')
+    pg = postgres.create_start('test_createdb')
    log.info("postgres is running on 'test_createdb' branch")

    with closing(pg.connect()) as conn:
@@ -29,23 +36,27 @@ def test_createdb(zenith_simple_env: ZenithEnv):
            lsn = cur.fetchone()[0]

    # Create a branch
-    env.zenith_cli(["branch", "test_createdb2", "test_createdb@" + lsn])
+    zenith_cli.run(["branch", "test_createdb2", "test_createdb@" + lsn])

-    pg2 = env.postgres.create_start('test_createdb2')
+    pg2 = postgres.create_start('test_createdb2')

    # Test that you can connect to the new database on both branches
    for db in (pg, pg2):
        db.connect(dbname='foodb').close()

-
 #
 # Test DROP DATABASE
 #
-def test_dropdb(zenith_simple_env: ZenithEnv, test_output_dir):
-    env = zenith_simple_env
-    env.zenith_cli(["branch", "test_dropdb", "empty"])
+def test_dropdb(
+    zenith_cli: ZenithCli,
+    pageserver: ZenithPageserver,
+    postgres: PostgresFactory,
+    pg_bin,
+    test_output_dir
+):
+    zenith_cli.run(["branch", "test_dropdb", "empty"])

-    pg = env.postgres.create_start('test_dropdb')
+    pg = postgres.create_start('test_dropdb')
    log.info("postgres is running on 'test_dropdb' branch")

    with closing(pg.connect()) as conn:
@@ -58,6 +69,7 @@ def test_dropdb(zenith_simple_env: ZenithEnv, test_output_dir):
            cur.execute("SELECT oid FROM pg_database WHERE datname='foodb';")
            dboid = cur.fetchone()[0]

+
    with closing(pg.connect()) as conn:
        with conn.cursor() as cur:
            cur.execute('DROP DATABASE foodb')
@@ -67,29 +79,28 @@ def test_dropdb(zenith_simple_env: ZenithEnv, test_output_dir):
            cur.execute('SELECT pg_current_wal_insert_lsn()')
            lsn_after_drop = cur.fetchone()[0]

-    # Create two branches before and after database drop.
-    env.zenith_cli(["branch", "test_before_dropdb", "test_dropdb@" + lsn_before_drop])
-    pg_before = env.postgres.create_start('test_before_dropdb')

-    env.zenith_cli(["branch", "test_after_dropdb", "test_dropdb@" + lsn_after_drop])
-    pg_after = env.postgres.create_start('test_after_dropdb')
+    # Create two branches before and after database drop.
+    zenith_cli.run(["branch", "test_before_dropdb", "test_dropdb@" + lsn_before_drop])
+    pg_before = postgres.create_start('test_before_dropdb')
+
+    zenith_cli.run(["branch", "test_after_dropdb", "test_dropdb@" + lsn_after_drop])
+    pg_after = postgres.create_start('test_after_dropdb')

    # Test that database exists on the branch before drop
    pg_before.connect(dbname='foodb').close()

    # Test that database subdir exists on the branch before drop
-    assert pg_before.pgdata_dir
    dbpath = pathlib.Path(pg_before.pgdata_dir) / 'base' / str(dboid)
    log.info(dbpath)

    assert os.path.isdir(dbpath) == True

    # Test that database subdir doesn't exist on the branch after drop
-    assert pg_after.pgdata_dir
    dbpath = pathlib.Path(pg_after.pgdata_dir) / 'base' / str(dboid)
    log.info(dbpath)

    assert os.path.isdir(dbpath) == False

    # Check that we restore the content of the datadir correctly
-    check_restored_datadir_content(test_output_dir, env, pg)
+    check_restored_datadir_content(zenith_cli, test_output_dir, pg, pageserver.service_port.pg)
--- a/test_runner/batch_others/test_createuser.py
+++ b/test_runner/batch_others/test_createuser.py
@@ -1,7 +1,10 @@
 from contextlib import closing

-from fixtures.zenith_fixtures import ZenithEnv
-from fixtures.log_helper import log
+from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
+
+import logging
+import fixtures.log_helper  # configures loggers
+log = logging.getLogger('root')

 pytest_plugins = ("fixtures.zenith_fixtures")

@@ -9,11 +12,10 @@ pytest_plugins = ("fixtures.zenith_fixtures")
 #
 # Test CREATE USER to check shared catalog restore
 #
-def test_createuser(zenith_simple_env: ZenithEnv):
-    env = zenith_simple_env
-    env.zenith_cli(["branch", "test_createuser", "empty"])
+def test_createuser(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, pg_bin):
+    zenith_cli.run(["branch", "test_createuser", "empty"])

-    pg = env.postgres.create_start('test_createuser')
+    pg = postgres.create_start('test_createuser')
    log.info("postgres is running on 'test_createuser' branch")

    with closing(pg.connect()) as conn:
@@ -27,9 +29,9 @@ def test_createuser(zenith_simple_env: ZenithEnv):
            lsn = cur.fetchone()[0]

    # Create a branch
-    env.zenith_cli(["branch", "test_createuser2", "test_createuser@" + lsn])
+    zenith_cli.run(["branch", "test_createuser2", "test_createuser@" + lsn])

-    pg2 = env.postgres.create_start('test_createuser2')
+    pg2 = postgres.create_start('test_createuser2')

    # Test that you can connect to new branch as a new user
    assert pg2.safe_psql('select current_user', username='testuser') == [('testuser', )]
--- a/test_runner/batch_others/test_multixact.py
+++ b/test_runner/batch_others/test_multixact.py
@@ -1,5 +1,8 @@
-from fixtures.zenith_fixtures import ZenithEnv, check_restored_datadir_content
-from fixtures.log_helper import log
+from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver, check_restored_datadir_content
+
+import logging
+import fixtures.log_helper  # configures loggers
+log = logging.getLogger('root')

 pytest_plugins = ("fixtures.zenith_fixtures")

@@ -10,11 +13,11 @@ pytest_plugins = ("fixtures.zenith_fixtures")
 # it only checks next_multixact_id field in restored pg_control,
 # since we don't have functions to check multixact internals.
 #
-def test_multixact(zenith_simple_env: ZenithEnv, test_output_dir):
-    env = zenith_simple_env
+def test_multixact(pageserver: ZenithPageserver, postgres: PostgresFactory,
+                    pg_bin, zenith_cli, base_dir, test_output_dir):
    # Create a branch for us
-    env.zenith_cli(["branch", "test_multixact", "empty"])
-    pg = env.postgres.create_start('test_multixact')
+    zenith_cli.run(["branch", "test_multixact", "empty"])
+    pg = postgres.create_start('test_multixact')

    log.info("postgres is running on 'test_multixact' branch")
    pg_conn = pg.connect()
@@ -53,8 +56,8 @@ def test_multixact(zenith_simple_env: ZenithEnv, test_output_dir):
    assert int(next_multixact_id) > int(next_multixact_id_old)

    # Branch at this point
-    env.zenith_cli(["branch", "test_multixact_new", "test_multixact@" + lsn])
-    pg_new = env.postgres.create_start('test_multixact_new')
+    zenith_cli.run(["branch", "test_multixact_new", "test_multixact@" + lsn])
+    pg_new = postgres.create_start('test_multixact_new')

    log.info("postgres is running on 'test_multixact_new' branch")
    pg_new_conn = pg_new.connect()
@@ -67,4 +70,4 @@ def test_multixact(zenith_simple_env: ZenithEnv, test_output_dir):
    assert next_multixact_id_new == next_multixact_id

    # Check that we restore the content of the datadir correctly
-    check_restored_datadir_content(test_output_dir, env, pg_new)
+    check_restored_datadir_content(zenith_cli, test_output_dir, pg_new, pageserver.service_port.pg)
--- a/test_runner/batch_others/test_old_request_lsn.py
+++ b/test_runner/batch_others/test_old_request_lsn.py
@@ -1,11 +1,13 @@
 from contextlib import closing

-from fixtures.zenith_fixtures import ZenithEnv
-from fixtures.log_helper import log
+from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
+
+import logging
+import fixtures.log_helper  # configures loggers
+log = logging.getLogger('root')

 pytest_plugins = ("fixtures.zenith_fixtures")

-
 #
 # Test where Postgres generates a lot of WAL, and it's garbage collected away, but
 # no pages are evicted so that Postgres uses an old LSN in a GetPage request.
@@ -16,11 +18,10 @@ pytest_plugins = ("fixtures.zenith_fixtures")
 # just a hint that the page hasn't been modified since that LSN, and the page
 # server should return the latest page version regardless of the LSN.
 #
-def test_old_request_lsn(zenith_simple_env: ZenithEnv):
-    env = zenith_simple_env
+def test_old_request_lsn(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, pg_bin):
    # Create a branch for us
-    env.zenith_cli(["branch", "test_old_request_lsn", "empty"])
-    pg = env.postgres.create_start('test_old_request_lsn')
+    zenith_cli.run(["branch", "test_old_request_lsn", "empty"])
+    pg = postgres.create_start('test_old_request_lsn')
    log.info('postgres is running on test_old_request_lsn branch')

    pg_conn = pg.connect()
@@ -30,7 +31,7 @@ def test_old_request_lsn(zenith_simple_env: ZenithEnv):
    cur.execute("SHOW zenith.zenith_timeline")
    timeline = cur.fetchone()[0]

-    psconn = env.pageserver.connect()
+    psconn = pageserver.connect()
    pscur = psconn.cursor()

    # Create table, and insert some rows. Make it big enough that it doesn't fit in
@@ -49,20 +50,20 @@ def test_old_request_lsn(zenith_simple_env: ZenithEnv):
        from pg_settings where name = 'shared_buffers'
    ''')
    row = cur.fetchone()
-    log.info(f'shared_buffers is {row[0]}, table size {row[1]}')
+    log.info(f'shared_buffers is {row[0]}, table size {row[1]}');
    assert int(row[0]) < int(row[1])

-    cur.execute('VACUUM foo')
+    cur.execute('VACUUM foo');

    # Make a lot of updates on a single row, generating a lot of WAL. Trigger
    # garbage collections so that the page server will remove old page versions.
    for i in range(10):
-        pscur.execute(f"do_gc {env.initial_tenant} {timeline} 0")
+        pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
        for j in range(100):
-            cur.execute('UPDATE foo SET val = val + 1 WHERE id = 1;')
+            cur.execute('UPDATE foo SET val = val + 1 WHERE id = 1;');

    # All (or at least most of) the updates should've been on the same page, so
    # that we haven't had to evict any dirty pages for a long time. Now run
    # a query that sends GetPage@LSN requests with the old LSN.
-    cur.execute("SELECT COUNT(*), SUM(val) FROM foo")
+    cur.execute("SELECT COUNT(*), SUM(val) FROM foo");
    assert cur.fetchone() == (100000, 101000)
--- a/test_runner/batch_others/test_pageserver_api.py
+++ b/test_runner/batch_others/test_pageserver_api.py
@@ -3,28 +3,25 @@ from uuid import uuid4
 import pytest
 import psycopg2
 import requests
-from fixtures.zenith_fixtures import ZenithEnv, ZenithEnvBuilder, ZenithPageserverHttpClient
-from typing import cast
+from fixtures.zenith_fixtures import ZenithPageserver, ZenithPageserverHttpClient

 pytest_plugins = ("fixtures.zenith_fixtures")


-def test_status_psql(zenith_simple_env: ZenithEnv):
-    env = zenith_simple_env
-    assert env.pageserver.safe_psql('status') == [
+def test_status_psql(pageserver):
+    assert pageserver.safe_psql('status') == [
        ('hello world', ),
    ]


-def test_branch_list_psql(zenith_simple_env: ZenithEnv):
-    env = zenith_simple_env
+def test_branch_list_psql(pageserver: ZenithPageserver, zenith_cli):
    # Create a branch for us
-    env.zenith_cli(["branch", "test_branch_list_main", "empty"])
+    zenith_cli.run(["branch", "test_branch_list_main", "empty"])

-    conn = env.pageserver.connect()
+    conn = pageserver.connect()
    cur = conn.cursor()

-    cur.execute(f'branch_list {env.initial_tenant}')
+    cur.execute(f'branch_list {pageserver.initial_tenant}')
    branches = json.loads(cur.fetchone()[0])
    # Filter out branches created by other tests
    branches = [x for x in branches if x['name'].startswith('test_branch_list')]
@@ -37,10 +34,10 @@ def test_branch_list_psql(zenith_simple_env: ZenithEnv):
    assert 'ancestor_lsn' in branches[0]

    # Create another branch, and start Postgres on it
-    env.zenith_cli(['branch', 'test_branch_list_experimental', 'test_branch_list_main'])
-    env.zenith_cli(['pg', 'create', 'test_branch_list_experimental'])
+    zenith_cli.run(['branch', 'test_branch_list_experimental', 'test_branch_list_main'])
+    zenith_cli.run(['pg', 'create', 'test_branch_list_experimental'])

-    cur.execute(f'branch_list {env.initial_tenant}')
+    cur.execute(f'branch_list {pageserver.initial_tenant}')
    new_branches = json.loads(cur.fetchone()[0])
    # Filter out branches created by other tests
    new_branches = [x for x in new_branches if x['name'].startswith('test_branch_list')]
@@ -56,22 +53,18 @@ def test_branch_list_psql(zenith_simple_env: ZenithEnv):
    conn.close()


-def test_tenant_list_psql(zenith_env_builder: ZenithEnvBuilder):
-    # don't use zenith_simple_env, because there might be other tenants there,
-    # left over from other tests.
-    env = zenith_env_builder.init()
-
-    res = env.zenith_cli(["tenant", "list"])
+def test_tenant_list_psql(pageserver: ZenithPageserver, zenith_cli):
+    res = zenith_cli.run(["tenant", "list"])
    res.check_returncode()
-    tenants = sorted(map(lambda t: t.split()[0], res.stdout.splitlines()))
-    assert tenants == [env.initial_tenant]
+    tenants = res.stdout.splitlines()
+    assert tenants == [pageserver.initial_tenant]

-    conn = env.pageserver.connect()
+    conn = pageserver.connect()
    cur = conn.cursor()

    # check same tenant cannot be created twice
-    with pytest.raises(psycopg2.DatabaseError, match=f'tenant {env.initial_tenant} already exists'):
-        cur.execute(f'tenant_create {env.initial_tenant}')
+    with pytest.raises(psycopg2.DatabaseError, match=f'tenant {pageserver.initial_tenant} already exists'):
+        cur.execute(f'tenant_create {pageserver.initial_tenant}')

    # create one more tenant
    tenant1 = uuid4().hex
@@ -80,20 +73,20 @@ def test_tenant_list_psql(zenith_env_builder: ZenithEnvBuilder):
    cur.execute('tenant_list')

    # compare tenants list
-    new_tenants = sorted(map(lambda t: cast(str, t['id']), json.loads(cur.fetchone()[0])))
-    assert sorted([env.initial_tenant, tenant1]) == new_tenants
+    new_tenants = sorted(json.loads(cur.fetchone()[0]))
+    assert sorted([pageserver.initial_tenant, tenant1]) == new_tenants


 def check_client(client: ZenithPageserverHttpClient, initial_tenant: str):
    client.check_status()

    # check initial tenant is there
-    assert initial_tenant in {t['id'] for t in client.tenant_list()}
+    assert initial_tenant in set(client.tenant_list())

    # create new tenant and check it is also there
    tenant_id = uuid4()
    client.tenant_create(tenant_id)
-    assert tenant_id.hex in {t['id'] for t in client.tenant_list()}
+    assert tenant_id.hex in set(client.tenant_list())

    # create branch
    branch_name = uuid4().hex
@@ -103,17 +96,11 @@ def check_client(client: ZenithPageserverHttpClient, initial_tenant: str):
    assert branch_name in {b['name'] for b in client.branch_list(tenant_id)}


-def test_pageserver_http_api_client(zenith_simple_env: ZenithEnv):
-    env = zenith_simple_env
-    client = env.pageserver.http_client()
-    check_client(client, env.initial_tenant)
+def test_pageserver_http_api_client(pageserver: ZenithPageserver):
+    client = pageserver.http_client()
+    check_client(client, pageserver.initial_tenant)


-def test_pageserver_http_api_client_auth_enabled(zenith_env_builder: ZenithEnvBuilder):
-    zenith_env_builder.pageserver_auth_enabled = True
-    env = zenith_env_builder.init()
-
-    management_token = env.auth_keys.generate_management_token()
-
-    client = env.pageserver.http_client(auth_token=management_token)
-    check_client(client, env.initial_tenant)
+def test_pageserver_http_api_client_auth_enabled(pageserver_auth_enabled: ZenithPageserver):
+    client = pageserver_auth_enabled.http_client(auth_token=pageserver_auth_enabled.auth_keys.generate_management_token())
+    check_client(client, pageserver_auth_enabled.initial_tenant)
--- a/test_runner/batch_others/test_pageserver_restart.py
+++ b/test_runner/batch_others/test_pageserver_restart.py
@@ -4,22 +4,25 @@ import time

 from contextlib import closing
 from multiprocessing import Process, Value
-from fixtures.zenith_fixtures import ZenithEnvBuilder
-from fixtures.log_helper import log
+from fixtures.zenith_fixtures import WalAcceptorFactory, ZenithPageserver, PostgresFactory
+
+import logging
+import fixtures.log_helper  # configures loggers
+log = logging.getLogger('root')

 pytest_plugins = ("fixtures.zenith_fixtures")

-
 # Check that dead minority doesn't prevent the commits: execute insert n_inserts
 # times, with fault_probability chance of getting a wal acceptor down or up
 # along the way. 2 of 3 are always alive, so the work keeps going.
-def test_pageserver_restart(zenith_env_builder: ZenithEnvBuilder):
-    # One safekeeper is enough for this test.
-    zenith_env_builder.num_safekeepers = 1
-    env = zenith_env_builder.init()
+def test_pageserver_restart(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, wa_factory: WalAcceptorFactory):

-    env.zenith_cli(["branch", "test_pageserver_restart", "main"])
-    pg = env.postgres.create_start('test_pageserver_restart')
+    # One safekeeper is enough for this test.
+    wa_factory.start_n_new(1)
+
+    zenith_cli.run(["branch", "test_pageserver_restart", "empty"])
+    pg = postgres.create_start('test_pageserver_restart',
+                               wal_acceptors=wa_factory.get_connstrs())

    pg_conn = pg.connect()
    cur = pg_conn.cursor()
@@ -41,14 +44,14 @@ def test_pageserver_restart(zenith_env_builder: ZenithEnvBuilder):
        from pg_settings where name = 'shared_buffers'
    ''')
    row = cur.fetchone()
-    log.info(f"shared_buffers is {row[0]}, table size {row[1]}")
+    log.info(f"shared_buffers is {row[0]}, table size {row[1]}");
    assert int(row[0]) < int(row[1])

    # Stop and restart pageserver. This is a more or less graceful shutdown, although
    # the page server doesn't currently have a shutdown routine so there's no difference
    # between stopping and crashing.
-    env.pageserver.stop()
-    env.pageserver.start()
+    pageserver.stop();
+    pageserver.start();

    # Stopping the pageserver breaks the connection from the postgres backend to
    # the page server, and causes the next query on the connection to fail. Start a new
@@ -62,5 +65,6 @@ def test_pageserver_restart(zenith_env_builder: ZenithEnvBuilder):
    assert cur.fetchone() == (100000, )

    # Stop the page server by force, and restart it
-    env.pageserver.stop()
-    env.pageserver.start()
+    pageserver.stop();
+    pageserver.start();
+
--- a/test_runner/batch_others/test_parallel_copy.py
+++ b/test_runner/batch_others/test_parallel_copy.py
@@ -1,54 +0,0 @@
-from io import BytesIO
-import asyncio
-import asyncpg
-import subprocess
-from fixtures.zenith_fixtures import ZenithEnv, Postgres
-from fixtures.log_helper import log
-
-pytest_plugins = ("fixtures.zenith_fixtures")
-
-
-async def repeat_bytes(buf, repetitions: int):
-    for i in range(repetitions):
-        yield buf
-
-
-async def copy_test_data_to_table(pg: Postgres, worker_id: int, table_name: str):
-    buf = BytesIO()
-    for i in range(1000):
-        buf.write(
-            f"{i}\tLoaded by worker {worker_id}. Long string to consume some space.\n".encode())
-    buf.seek(0)
-
-    copy_input = repeat_bytes(buf.read(), 5000)
-
-    pg_conn = await pg.connect_async()
-    await pg_conn.copy_to_table(table_name, source=copy_input)
-
-
-async def parallel_load_same_table(pg: Postgres, n_parallel: int):
-    workers = []
-    for worker_id in range(n_parallel):
-        worker = copy_test_data_to_table(pg, worker_id, f'copytest')
-        workers.append(asyncio.create_task(worker))
-
-    # await all workers
-    await asyncio.gather(*workers)
-
-
-# Load data into one table with COPY TO from 5 parallel connections
-def test_parallel_copy(zenith_simple_env: ZenithEnv, n_parallel=5):
-    env = zenith_simple_env
-    # Create a branch for us
-    env.zenith_cli(["branch", "test_parallel_copy", "empty"])
-
-    pg = env.postgres.create_start('test_parallel_copy')
-    log.info("postgres is running on 'test_parallel_copy' branch")
-
-    # Create test table
-    conn = pg.connect()
-    cur = conn.cursor()
-    cur.execute(f'CREATE TABLE copytest (i int, t text)')
-
-    # Run COPY TO to load the table with parallel connections.
-    asyncio.run(parallel_load_same_table(pg, n_parallel))
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Arthur Petukhovsky	8a44796905	Increase parallel workers to trigger more errors	2021-09-29 12:23:49 +03:00
Arthur Petukhovsky	ed521e05e7	Hide debug logs in test_wal_acceptor_async	2021-09-29 11:47:53 +03:00
Arthur Petukhovsky	6a13500da4	Fix print in last test	2021-09-29 11:47:53 +03:00
Arthur Petukhovsky	9b8168ebde	Don't log test output while running	2021-09-29 11:47:53 +03:00
Arthur Petukhovsky	f9bb4dbf08	Use f-strings for logs	2021-09-29 11:47:52 +03:00
Arthur Petukhovsky	20ee204c27	Fix string formatting	2021-09-29 11:47:52 +03:00
Arthur Petukhovsky	3fdd85bcb8	Use logging in python tests	2021-09-29 11:47:52 +03:00