Add in-memory storage engine

Add in-memory storage engine-a
Do not update relation metadata when materializing page in get_page_at_lsn
2026-03-21 09:10:37 +00:00 · 2021-07-22 10:59:30 +03:00 · 2021-07-22 10:59:28 +03:00 · 2021-07-20 23:03:35 +03:00 · 2021-07-20 22:23:24 +03:00 · 2021-07-20 19:29:28 +03:00
153 changed files with 16019 additions and 19945 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -0,0 +1,267 @@
+version: 2.1
+
+# The python orb supplies the 'python/default' executor used by the run-pytest job.
+orbs:
+  python: circleci/python@1.4.0
+
+# Executor shared by the two build jobs (build-postgres and build-zenith).
+executors:
+  zenith-build-executor:
+    resource_class: xlarge
+    docker:
+      - image: cimg/rust:1.51.0
+
+jobs:
+
+  # A job to build postgres
+  #
+  # The result (tmp_install) is cached under a key derived from the revision of
+  # the vendor/postgres submodule, so postgres is rebuilt only when that
+  # revision changes.
+  build-postgres:
+    executor: zenith-build-executor
+    steps:
+      # Checkout the git repo (circleci doesn't have a flag to enable submodules here)
+      - checkout
+
+      # Grab the postgres git revision to build a cache key.
+      # Note this works even though the submodule hasn't been checked out yet.
+      - run:
+          name: Get postgres cache key
+          command: |
+            git rev-parse HEAD:vendor/postgres > /tmp/cache-key-postgres
+
+      - restore_cache:
+          name: Restore postgres cache
+          keys:
+            # Restore ONLY if the rev key matches exactly
+            - v03-postgres-cache-{{ checksum "/tmp/cache-key-postgres" }}
+
+      # FIXME We could cache our own docker container, instead of installing packages every time.
+      #
+      # The packages are only needed when postgres really has to be built, i.e.
+      # when the cache restore above did not provide tmp_install/bin/postgres.
+      # apt-get is used instead of apt because it is the interface intended for
+      # scripts, and -y keeps the install from waiting on a confirmation prompt.
+      - run:
+          name: apt install dependencies
+          command: |
+            if [ ! -e tmp_install/bin/postgres ]; then
+              sudo apt-get update
+              sudo apt-get install -y build-essential libreadline-dev zlib1g-dev flex bison libseccomp-dev
+            fi
+
+      # Build postgres if the restore_cache didn't find a build.
+      # `make` can't figure out whether the cache is valid, since
+      # it only compares file timestamps.
+      - run:
+          name: build postgres
+          command: |
+            if [ ! -e tmp_install/bin/postgres ]; then
+              # "depth 1" saves some time by not cloning the whole repo
+              git submodule update --init --depth 1
+              make postgres
+            fi
+
+      - save_cache:
+          name: Save postgres cache
+          key: v03-postgres-cache-{{ checksum "/tmp/cache-key-postgres" }}
+          paths:
+            - tmp_install
+
+  # A job to build zenith rust code
+  #
+  # Parameters:
+  #   build_type - "debug" or "release"; selects the cargo profile used for
+  #                building, testing and installing the binaries.
+  #
+  # The installed binaries and the cached postgres install are persisted to the
+  # workspace (rooted at /tmp/zenith) for use by the test jobs.
+  build-zenith:
+    executor: zenith-build-executor
+    parameters:
+      build_type:
+        type: enum
+        enum: ["debug", "release"]
+    steps:
+      # apt-get is the interface intended for scripts, and -y keeps the install
+      # from waiting on a confirmation prompt.
+      - run:
+          name: apt install dependencies
+          command: |
+            sudo apt-get update
+            sudo apt-get install -y libssl-dev clang
+
+      # Checkout the git repo (without submodules)
+      - checkout
+
+      # Grab the postgres git revision to build a cache key.
+      # Note this works even though the submodule hasn't been checked out yet.
+      - run:
+          name: Get postgres cache key
+          command: |
+            git rev-parse HEAD:vendor/postgres > /tmp/cache-key-postgres
+
+      - restore_cache:
+          name: Restore postgres cache
+          keys:
+            # Restore ONLY if the rev key matches exactly
+            - v03-postgres-cache-{{ checksum "/tmp/cache-key-postgres" }}
+
+      - restore_cache:
+          name: Restore rust cache
+          keys:
+            # Require an exact match. While an out of date cache might speed up the build,
+            # there's no way to clean out old packages, so the cache grows every time something
+            # changes.
+            - v03-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }}
+
+      # Build the rust code, including test binaries
+      - run:
+          name: Rust build << parameters.build_type >>
+          command: |
+            export CARGO_INCREMENTAL=0
+            BUILD_TYPE="<< parameters.build_type >>"
+            if [[ $BUILD_TYPE == "debug" ]]; then
+              echo "Build in debug mode"
+              cargo build --bins --tests
+            elif [[ $BUILD_TYPE == "release" ]]; then
+              echo "Build in release mode"
+              cargo build --release --bins --tests
+            fi
+
+      - save_cache:
+          name: Save rust cache
+          key: v03-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }}
+          paths:
+            - ~/.cargo/registry
+            - ~/.cargo/git
+            - target
+
+      # Run rust unit tests in the same profile that was built above. A plain
+      # `cargo test` always uses the debug profile, so the release job would
+      # rebuild everything in debug mode and never test the release build.
+      - run:
+          name: cargo test << parameters.build_type >>
+          command: |
+            # Same setting as the build step, so the test run uses the same
+            # build configuration.
+            export CARGO_INCREMENTAL=0
+            BUILD_TYPE="<< parameters.build_type >>"
+            if [[ $BUILD_TYPE == "debug" ]]; then
+              CARGO_FLAGS=""
+            elif [[ $BUILD_TYPE == "release" ]]; then
+              CARGO_FLAGS="--release"
+            fi
+            cargo test $CARGO_FLAGS
+
+      # Install the rust binaries, for use by test jobs
+      # `--locked` is required; otherwise, `cargo install` will ignore Cargo.lock.
+      # FIXME: this is a really silly way to install; maybe we should just output
+      # a tarball as an artifact? Or a .deb package?
+      - run:
+          name: cargo install
+          command: |
+            export CARGO_INCREMENTAL=0
+            BUILD_TYPE="<< parameters.build_type >>"
+            if [[ $BUILD_TYPE == "debug" ]]; then
+              echo "Install debug mode"
+              CARGO_FLAGS="--debug"
+            elif [[ $BUILD_TYPE == "release" ]]; then
+              echo "Install release mode"
+              # The default is release mode; there is no --release flag.
+              CARGO_FLAGS=""
+            fi
+            cargo install $CARGO_FLAGS --locked --root /tmp/zenith --path pageserver
+            cargo install $CARGO_FLAGS --locked --root /tmp/zenith --path walkeeper
+            cargo install $CARGO_FLAGS --locked --root /tmp/zenith --path zenith
+
+      # Install the postgres binaries, for use by test jobs
+      # FIXME: this is a silly way to do "install"; maybe just output a standard
+      # postgres package, whatever the favored form is (tarball? .deb package?)
+      # Note that pg_regress needs some build artifacts that probably aren't
+      # in the usual package...?
+      - run:
+          name: postgres install
+          command: |
+            cp -a tmp_install /tmp/zenith/pg_install
+
+      # Save the rust output binaries for other jobs in this workflow.
+      - persist_to_workspace:
+          root: /tmp/zenith
+          paths:
+            - "*"
+
+  # Run a selection of the Python tests in test_runner with pytest, against the
+  # binaries attached from the workspace at /tmp/zenith.
+  run-pytest:
+    #description: "Run pytest"
+    executor: python/default
+    parameters:
+      # Select the type of Rust build. Must be "release" or "debug".
+      # NOTE(review): no step of this job references build_type; it appears to
+      # exist so that workflows can matrix over it -- confirm.
+      build_type:
+        type: string
+        default: "debug"
+      # pytest args to specify the tests to run.
+      #
+      # This can be a test file name, e.g. 'test_pgbench.py', or a subdirectory,
+      # or '-k foobar' to run tests containing string 'foobar'. See pytest man page
+      # section SPECIFYING TESTS / SELECTING TESTS for details.
+      #
+      # This parameter is required, to prevent the mistake of running all tests in one job.
+      # (The "Run pytest" step exits with an error when it is left empty.)
+      test_selection:
+        type: string
+        default: ""
+      # Arbitrary parameters to pytest. For example "-s" to prevent capturing stdout/stderr
+      extra_params:
+        type: string
+        default: ""
+      # When true, the git submodules are checked out before the tests run.
+      needs_postgres_source:
+        type: boolean
+        default: false
+    steps:
+      # Bring in what the build job persisted under /tmp/zenith.
+      - attach_workspace:
+          at: /tmp/zenith
+      - checkout
+      - when:
+          condition: << parameters.needs_postgres_source >>
+          steps:
+            - run: git submodule update --init --depth 1
+      - run:
+          name: Install pipenv & deps
+          working_directory: test_runner
+          command: |
+            pip install pipenv
+            pipenv install
+      - run:
+          name: Run pytest
+          working_directory: test_runner
+          environment:
+            - ZENITH_BIN: /tmp/zenith/bin
+            - POSTGRES_DISTRIB_DIR: /tmp/zenith/pg_install
+            - TEST_OUTPUT: /tmp/test_output
+          command: |
+            TEST_SELECTION="<< parameters.test_selection >>"
+            EXTRA_PARAMS="<< parameters.extra_params >>"
+            if [ -z "$TEST_SELECTION" ]; then
+              echo "test_selection must be set"
+              exit 1
+            fi
+            # Run the tests.
+            #
+            # The junit.xml file allows CircleCI to display more fine-grained test information
+            # in its "Tests" tab in the results page.
+            # -s prevents pytest from capturing output, which helps to see
+            # what's going on if the test hangs
+            # --verbose prints name of each test (helpful when there are
+            # multiple tests in one file)
+            # -rA prints summary in the end
+            pipenv run pytest --junitxml=$TEST_OUTPUT/junit.xml --tb=short -s --verbose -rA $TEST_SELECTION $EXTRA_PARAMS
+      - run:
+          # CircleCI artifacts are preserved one file at a time, so skipping
+          # this step isn't a good idea. If you want to extract the
+          # pageserver state, perhaps a tarball would be a better idea.
+          #
+          # Runs even when an earlier step failed ('when: always'). Every file
+          # except pg.log, pageserver.log and wal_acceptor.log is deleted; the
+          # directory size is printed before and after.
+          name: Delete all data but logs
+          when: always
+          command: |
+            du -sh /tmp/test_output/*
+            find /tmp/test_output -type f ! -name "pg.log" ! -name "pageserver.log" ! -name "wal_acceptor.log" -delete
+            du -sh /tmp/test_output/*
+      - store_artifacts:
+          path: /tmp/test_output
+      # The store_test_results step tells CircleCI where to find the junit.xml file.
+      - store_test_results:
+          path: /tmp/test_output
+
+# Pipeline: build postgres once, build zenith in debug and release mode on top
+# of it, then run two pytest batches against each of the two zenith builds.
+workflows:
+  build_and_test:
+    jobs:
+      - build-postgres
+      # One build-zenith job per build type, named build-zenith-debug and
+      # build-zenith-release so that the test jobs below can require them.
+      - build-zenith:
+          name: build-zenith-<< matrix.build_type >>
+          matrix:
+            parameters:
+              build_type: ["debug", "release"]
+          requires:
+            - build-postgres
+      # The pg_regress batch is the only one that asks for the git submodules
+      # to be checked out (needs_postgres_source).
+      - run-pytest:
+          name: pg_regress tests << matrix.build_type >>
+          matrix:
+            parameters:
+              build_type: ["debug", "release"]
+          test_selection: batch_pg_regress
+          needs_postgres_source: true
+          requires:
+            - build-zenith-<< matrix.build_type >>
+      - run-pytest:
+          name: other tests << matrix.build_type >>
+          matrix:
+            parameters:
+              build_type: ["debug", "release"]
+          test_selection: batch_others
+          requires:
+            - build-zenith-<< matrix.build_type >>
--- a/.dockerignore
+++ b/.dockerignore
@@ -0,0 +1,13 @@
+**/.git/
+**/__pycache__
+**/.pytest_cache
+
+/target
+/tmp_check
+/tmp_install
+/tmp_check_cli
+/test_output
+/.vscode
+/.zenith
+/integration_tests/.zenith
+/Dockerfile
--- a/.github/workflows/notifications.yml
+++ b/.github/workflows/notifications.yml
@@ -0,0 +1,45 @@
+# Sends a Telegram message describing the pushed commit for every push to main.
+name: Send Notifications
+
+on:
+  push:
+    branches: [ main ]
+
+jobs:
+  send-notifications:
+    timeout-minutes: 30
+    name: send commit notifications
+    runs-on: ubuntu-latest
+
+    steps:
+
+      - name: Checkout
+        uses: actions/checkout@v2
+        with:
+          submodules: true
+          fetch-depth: 2
+
+      # Exposes three step outputs used by the notification below:
+      #   git_stat   - escaped `git show --stat=50` output of the checked-out commit
+      #   sha_short  - abbreviated commit hash
+      #   git_branch - GITHUB_REF with the leading 'refs/heads/' removed
+      - name: Form variables for notification message
+        id: git_info_grab
+        run: |
+          git_stat=$(git show --stat=50)
+          # Percent-encode '%', newline and carriage return so that the
+          # multi-line text can be passed through a single set-output command.
+          git_stat="${git_stat//'%'/'%25'}"
+          git_stat="${git_stat//$'\n'/'%0A'}"
+          git_stat="${git_stat//$'\r'/'%0D'}"
+          # space -> EN SPACE (U+2002), as github tends to eat ordinary spaces.
+          # The character is spelled out as its UTF-8 bytes so that it cannot be
+          # mistaken for, or silently normalized to, an ordinary space.
+          en_space=$'\xe2\x80\x82'
+          git_stat="${git_stat// /$en_space}"
+          echo "::set-output name=git_stat::$git_stat"
+          echo "::set-output name=sha_short::$(git rev-parse --short HEAD)"
+          echo "::set-output name=git_branch::${GITHUB_REF#refs/heads/}"
+
+      - name: Send notification
+        uses: appleboy/telegram-action@master
+        with:
+          to: ${{ secrets.TELEGRAM_TO }}
+          token: ${{ secrets.TELEGRAM_TOKEN }}
+          format: markdown
+          args: |
+            *@${{ github.actor }} pushed to* [${{ github.repository }}:${{steps.git_info_grab.outputs.git_branch}}](github.com/${{ github.repository }}/commit/${{steps.git_info_grab.outputs.sha_short }})
+
+            ```
+            ${{ steps.git_info_grab.outputs.git_stat }}
+            ```
+
--- a/.github/workflows/testing.yml
+++ b/.github/workflows/testing.yml
@@ -1,50 +1,41 @@
-name: regression check
+name: Build and Test

-on: [push]
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]

 jobs:
  regression-check:
+    strategy:
+      matrix:
+        # If we want to duplicate this job for different
+        # Rust toolchains (e.g. nightly or 1.37.0), add them here.
+        rust_toolchain: [stable]
+        os: [ubuntu-latest]
    timeout-minutes: 30
    name: run regression test suite
-    runs-on: ubuntu-latest
+    runs-on: ${{ matrix.os }}

    steps:
-
      - name: Checkout
        uses: actions/checkout@v2
        with:
          submodules: true
          fetch-depth: 2

-      - name: Form variables for notification message
-        id: git_info_grab
-        run: |
-          git_stat=$(git show --stat=50)
-          git_stat="${git_stat//'%'/'%25'}"
-          git_stat="${git_stat//$'\n'/'%0A'}"
-          git_stat="${git_stat//$'\r'/'%0D'}"
-          git_stat="${git_stat// / }" # space -> 'Space En', as github tends to eat ordinary spaces
-          echo "::set-output name=git_stat::$git_stat"
-          echo "::set-output name=sha_short::$(git rev-parse --short HEAD)"
-          echo "##[set-output name=git_branch;]$(echo ${GITHUB_REF#refs/heads/})"
-
-      - name: Send notification
-        uses: appleboy/telegram-action@master
+      - name: install rust toolchain ${{ matrix.rust_toolchain }}
+        uses: actions-rs/toolchain@v1
        with:
-          to: ${{ secrets.TELEGRAM_TO }}
-          token: ${{ secrets.TELEGRAM_TOKEN }}
-          format: markdown
-          args: |
-            *@${{ github.actor }} pushed to* [${{ github.repository }}:${{steps.git_info_grab.outputs.git_branch}}](github.com/${{ github.repository }}/commit/${{steps.git_info_grab.outputs.sha_short }})
-
-            ```
-            ${{ steps.git_info_grab.outputs.git_stat }}
-            ```
+          profile: minimal
+          toolchain: ${{ matrix.rust_toolchain }}
+          override: true

      - name: Install postgres dependencies
        run: |
          sudo apt update
-          sudo apt install build-essential libreadline-dev zlib1g-dev flex bison libxml2-dev libcurl4-openssl-dev
+          sudo apt install build-essential libreadline-dev zlib1g-dev flex bison libseccomp-dev

      - name: Set pg revision for caching
        id: pg_ver
@@ -61,11 +52,7 @@ jobs:
      - name: Build postgres
        if: steps.cache_pg.outputs.cache-hit != 'true'
        run: |
-          ./pgbuild.sh
-
-      - name: Install rust
-        run: |
-          sudo apt install -y cargo
+          make postgres

      - name: Cache cargo deps
        id: cache_cargo
@@ -77,10 +64,10 @@ jobs:
            target
          key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }}

-      - name: Build
+      - name: Run cargo build
        run: |
-          cargo build
+          cargo build --workspace --bins --examples --tests

-      - name: Run test
+      - name: Run cargo test
        run: |
-          cargo test --test test_pageserver -- --nocapture --test-threads=1
+          cargo test -- --nocapture --test-threads=1
--- a/.gitignore
+++ b/.gitignore
@@ -2,4 +2,8 @@
 /tmp_check
 /tmp_install
 /tmp_check_cli
+__pycache__/
+test_output/
 .vscode
+/.zenith
+/integration_tests/.zenith
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -0,0 +1,31 @@
+# How to contribute
+
+Howdy! Usual good software engineering practices apply. Write
+tests. Write comments. Follow standard Rust coding practices where
+possible. Use 'cargo fmt' and 'clippy' to tidy up formatting.
+
+There are soft spots in the code, which could use cleanup,
+refactoring, additional comments, and so forth. Let's try to raise the
+bar, and clean things up as we go. Try to leave code in a better shape
+than it was before.
+
+## Submitting changes
+
+1. Make a PR for every change.
+
+   Even seemingly trivial patches can break things in surprising ways.
+Use of common sense is OK. If you're only fixing a typo in a comment,
+it's probably fine to just push it. But if in doubt, open a PR.
+
+2. Get at least one +1 on your PR before you push.
+
+   For simple patches, it will only take a minute for someone to review
+it.
+
+3. Always keep the CI green.
+
+   Do not push if the CI failed on your PR, even if you think it's not
+your patch's fault. If something else has broken the CI, help to fix
+the root cause before pushing.
+
+*Happy Hacking!*
--- a/20
+++ b/20
@@ -0,0 +1,20 @@
+This software is licensed under the Apache 2.0 License:
+
+----------------------------------------------------------------------------
+Copyright 2021 Zenith Labs, Inc
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+----------------------------------------------------------------------------
+
+The PostgreSQL submodule in vendor/postgres is licensed under the
+PostgreSQL license. See vendor/postgres/COPYRIGHT.
--- a/Cargo.lock
+++ b/Cargo.lock
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,10 +1,16 @@
 [workspace]
 members = [
-    "integration_tests",
    "pageserver",
    "walkeeper",
    "zenith",
    "control_plane",
    "postgres_ffi",
    "zenith_utils",
+    "workspace_hack",
+    "proxy"
 ]
+
+[profile.release]
+# This is useful for profiling and, to some extent, debug.
+# Besides, debug info should not affect the performance.
+debug = true
--- a/95
+++ b/95
@@ -0,0 +1,95 @@
+#
+# Docker image for console integration testing.
+#
+# We may also reuse it in CI to unify installation process and as a general binaries building
+# tool for production servers.
+#
+# Dynamic linking is used for librocksdb and libstdc++ because librocksdb-sys calls
+# bindgen with the "dynamic" feature flag. This also prevents usage of dockerhub alpine-rust
+# images, which are statically linked and have guards against any dlopen. I would
+# prefer all static binaries, so we may change the way librocksdb-sys builds or wait until
+# we have our own storage and drop the rocksdb dependency.
+#
+# Cargo-chef is used to separate building the dependencies from building the main binaries.
+# This way `docker build` will download and install dependencies only if there are changes
+# to our Cargo.toml files.
+#
+
+
+#
+# Build postgres separately -- this layer will be rebuilt only if one of the
+# paths copied below (vendor/postgres, Makefile) changes.
+#
+FROM alpine:3.13 as pg-build
+RUN apk add --update clang llvm compiler-rt compiler-rt-static lld musl-dev binutils \
+                     make bison flex readline-dev zlib-dev perl linux-headers
+# Absolute path: other stages copy from /zenith/... in this stage, so do not
+# depend on the base image's default working directory.
+WORKDIR /zenith
+COPY ./vendor/postgres vendor/postgres
+COPY ./Makefile Makefile
+# Build using clang and lld
+RUN CC='clang' LD='lld' CFLAGS='-fuse-ld=lld --rtlib=compiler-rt' make postgres -j4
+
+#
+# Calculate cargo dependencies.
+# This will always run, but only generates recipe.json with the list of
+# dependencies, without installing them.
+#
+FROM alpine:20210212 as cargo-deps-inspect
+RUN apk add --update rust cargo
+RUN cargo install cargo-chef
+# Absolute path: the next stage copies /zenith/recipe.json from this stage.
+WORKDIR /zenith
+COPY . .
+RUN cargo chef prepare --recipe-path recipe.json
+
+#
+# Build cargo dependencies.
+# This temporary container will be rebuilt only if recipe.json has changed.
+#
+FROM alpine:20210212 as deps-build
+RUN apk add --update rust cargo openssl-dev clang build-base
+# rust-rocksdb can be built against system-wide rocksdb -- that saves about
+# 10 minutes during build. Rocksdb apk package is in testing now, but use it
+# anyway. In case of any troubles we can download and build rocksdb here manually
+# (to cache it as a docker layer).
+RUN apk --no-cache --update --repository https://dl-cdn.alpinelinux.org/alpine/edge/testing add rocksdb-dev
+# Absolute path: the build stage copies /zenith/target from this stage.
+WORKDIR /zenith
+COPY --from=pg-build /zenith/tmp_install/include/postgresql/server tmp_install/include/postgresql/server
+COPY --from=cargo-deps-inspect /root/.cargo/bin/cargo-chef /root/.cargo/bin/
+COPY --from=cargo-deps-inspect /zenith/recipe.json recipe.json
+RUN ROCKSDB_LIB_DIR=/usr/lib/ cargo chef cook --release --recipe-path recipe.json
+
+#
+# Build zenith binaries
+#
+FROM alpine:20210212 as build
+RUN apk add --update rust cargo openssl-dev clang build-base
+RUN apk --no-cache --update --repository https://dl-cdn.alpinelinux.org/alpine/edge/testing add rocksdb-dev
+# Absolute path: the final stage copies /zenith/target/release/* from this stage.
+WORKDIR /zenith
+COPY . .
+# Copy cached dependencies
+COPY --from=pg-build /zenith/tmp_install/include/postgresql/server tmp_install/include/postgresql/server
+COPY --from=deps-build /zenith/target target
+COPY --from=deps-build /root/.cargo /root/.cargo
+RUN cargo build --release
+
+#
+# Copy binaries to resulting image.
+# build-base is here to provide libstdc++ (it will also bring gcc, but leave it this way until we figure
+# out how to statically link rocksdb or avoid it at all).
+#
+FROM alpine:3.13
+# --no-cache fetches a fresh package index without storing it in the image,
+# which keeps the final image free of apk cache files.
+RUN apk add --no-cache openssl build-base
+RUN apk --no-cache --update --repository https://dl-cdn.alpinelinux.org/alpine/edge/testing add rocksdb
+COPY --from=build /zenith/target/release/pageserver /usr/local/bin
+COPY --from=build /zenith/target/release/wal_acceptor /usr/local/bin
+COPY --from=build /zenith/target/release/proxy /usr/local/bin
+COPY --from=pg-build /zenith/tmp_install /usr/local
+COPY docker-entrypoint.sh /docker-entrypoint.sh
+
+# Run as an unprivileged 'zenith' user whose home directory is the /data volume.
+RUN addgroup zenith && adduser -h /data -D -G zenith zenith
+VOLUME ["/data"]
+WORKDIR /data
+USER zenith
+EXPOSE 6400
+ENTRYPOINT ["/docker-entrypoint.sh"]
+CMD ["pageserver"]
--- a/202
+++ b/202
@@ -0,0 +1,202 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
--- a/68
+++ b/68
@@ -0,0 +1,68 @@
+# Seccomp BPF is only available for Linux
+UNAME_S := $(shell uname -s)
+ifeq ($(UNAME_S),Linux)
+	SECCOMP = --with-libseccomp
+else
+	SECCOMP =
+endif
+
+#
+# Top level Makefile to build Zenith and PostgreSQL
+#
+all: zenith postgres
+
+# We don't want to run 'cargo build' in parallel with the postgres build,
+# because interleaving cargo build output with postgres build output looks
+# confusing. Also, 'cargo build' is parallel on its own, so it would be too
+# much parallelism. (Recursive invocation of postgres target still gets any
+# '-j' flag from the command line, so 'make -j' is still useful.)
+.NOTPARALLEL:
+
+### Zenith Rust bits
+#
+# The 'postgres_ffi' depends on the Postgres headers.
+zenith: postgres-headers
+	cargo build
+
+### PostgreSQL parts
+tmp_install/build/config.status:
+	+@echo "Configuring postgres build"
+	mkdir -p tmp_install/build
+	(cd tmp_install/build && \
+	../../vendor/postgres/configure CFLAGS='-O0 -g3 $(CFLAGS)' \
+		--enable-cassert \
+		--enable-debug \
+		--enable-depend \
+		$(SECCOMP) \
+		--prefix=$(abspath tmp_install) > configure.log)
+
+# nicer alias for running 'configure'
+postgres-configure: tmp_install/build/config.status
+
+# Install the PostgreSQL header files into tmp_install/include
+postgres-headers: postgres-configure
+	+@echo "Installing PostgreSQL headers"
+	$(MAKE) -C tmp_install/build/src/include MAKELEVEL=0 install
+
+
+# Compile and install PostgreSQL and contrib/zenith
+postgres: postgres-configure
+	+@echo "Compiling PostgreSQL"
+	$(MAKE) -C tmp_install/build MAKELEVEL=0 install
+	+@echo "Compiling contrib/zenith"
+	$(MAKE) -C tmp_install/build/contrib/zenith install
+
+postgres-clean:
+	$(MAKE) -C tmp_install/build MAKELEVEL=0 clean
+
+# Remove build products. This doesn't remove the effects of 'configure'.
+clean:
+	$(MAKE) -C tmp_install/build clean
+	cargo clean
+
+# This removes everything, including the configured postgres build tree.
+distclean:
+	rm -rf tmp_install
+	cargo clean
+
+.PHONY: all zenith postgres-configure postgres-headers postgres postgres-clean clean distclean
--- a/1
+++ b/1
@@ -0,0 +1 @@
+./test_runner/Pipfile
--- a/Pipfile.lock
+++ b/Pipfile.lock
@@ -0,0 +1 @@
+./test_runner/Pipfile.lock
--- a/README.md
+++ b/README.md
@@ -4,89 +4,164 @@ Zenith substitutes PostgreSQL storage layer and redistributes data across a clus

 ## Running local installation

-1. Build zenith and patched postgres
+1. Install build dependencies and other useful packages
+
+On Ubuntu or Debian this set of packages should be sufficient to build the code:
+```text
+apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \
+libssl-dev clang
+```
+
+[Rust] 1.48 or later is also required.
+
+To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `tmp_install/bin` and `tmp_install/lib`, respectively.
+
+To run the integration tests (not required to use the code), install
+Python (3.6 or higher), and install python3 packages with `pip` (called `pip3` on some systems):
+```
+pip install pytest psycopg2
+```
+
+2. Build zenith and patched postgres
 ```sh
 git clone --recursive https://github.com/libzenith/zenith.git
 cd zenith
-./pgbuild.sh # builds postgres and installs it to ./tmp_install
-cargo build
+make -j5
 ```

-2. Start pageserver and postggres on top of it (should be called from repo root):
+3. Start pageserver and postgres on top of it (should be called from repo root):
 ```sh
-# Create ~/.zenith with proper paths to binaries and data
+# Create repository in .zenith with proper paths to binaries and data
 # Later that would be responsibility of a package install script
->./target/debug/zenith init
+> ./target/debug/zenith init
+<...>
+new zenith repository was created in .zenith

 # start pageserver
-> ./target/debug/zenith pageserver start
-Starting pageserver at '127.0.0.1:64000'
+> ./target/debug/zenith start
+Starting pageserver at '127.0.0.1:64000' in .zenith
+Pageserver started

-# create and configure postgres data dir
-> ./target/debug/zenith pg create
-Creating new postgres: path=/Users/user/code/zenith/tmp_check_cli/compute/pg1 port=55432
-Database initialized
+# start postgres on top of the pageserver
+> ./target/debug/zenith pg start main
+Starting postgres node at 'host=127.0.0.1 port=55432 user=stas'
+waiting for server to start.... done

-# start it
-> ./target/debug/zenith pg start pg1
-
-# look up status and connection info
-> ./target/debug/zenith pg list     
-NODE		ADDRESS				STATUS
-pg1			127.0.0.1:55432		running
+# check list of running postgres instances
+> ./target/debug/zenith pg list
+BRANCH	ADDRESS		LSN		STATUS
+main	127.0.0.1:55432	0/1609610	running
 ```

-3. Now it is possible to connect to postgres and run some queries:
-```
+4. Now it is possible to connect to postgres and run some queries:
+```text
 > psql -p55432 -h 127.0.0.1 postgres
 postgres=# CREATE TABLE t(key int primary key, value text);
 CREATE TABLE
 postgres=# insert into t values(1,1);
 INSERT 0 1
+postgres=# select * from t;
+ key | value
+-----+-------
+   1 | 1
+(1 row)
+```
+
+5. You can also create branches and run postgres on them:
+```sh
+# create branch named migration_check
+> ./target/debug/zenith branch migration_check main
+Created branch 'migration_check' at 0/1609610
+
+# check branches tree
+> ./target/debug/zenith branch
+ main
+ ┗━ @0/1609610: migration_check
+
+# start postgres on that branch
+> ./target/debug/zenith pg start migration_check
+Starting postgres node at 'host=127.0.0.1 port=55433 user=stas'
+waiting for server to start.... done
+
+# this new postgres instance will have all the data from the 'main' postgres,
+# but modifications made here will not affect the data in the original postgres
+> psql -p55433 -h 127.0.0.1 postgres
 postgres=# select * from t;
 key | value 
 -----+-------
   1 | 1
 (1 row)
+
+postgres=# insert into t values(2,2);
+INSERT 0 1
 ```

 ## Running tests

 ```sh
 git clone --recursive https://github.com/libzenith/zenith.git
-./pgbuild.sh # builds postgres and installs it to ./tmp_install
-cargo test -- --test-threads=1
+make # also builds postgres and installs it to ./tmp_install
+cd test_runner
+pytest
 ```

+## Documentation
+
+We currently use README files to cover the design ideas and overall architecture of each module,
+together with rustdoc-style documentation comments.
+
+To view your documentation in a browser, try running `cargo doc --no-deps --open`
+
 ## Source tree layout

-/walkeeper:
+`/control_plane`:

-WAL safekeeper. Written in Rust.
+Local control plane.
+Functions to start, configure and stop pageserver and postgres instances running as local processes.
+Intended to be used in integration tests and in CLI tools for local installations.

-/pageserver:
+`/zenith`:
+
+Main entry point for the 'zenith' CLI utility.
+TODO: Doesn't it belong to control_plane?
+
+`/postgres_ffi`:
+
+Utility functions for interacting with PostgreSQL file formats.
+Misc constants, copied from PostgreSQL headers.
+
+`/zenith_utils`:
+
+Helpers that are shared between other crates in this repository.
+
+`/walkeeper`:
+
+WAL safekeeper (also known as WAL acceptor). Written in Rust.
+
+`/pageserver`:

 Page Server. Written in Rust.

 Depends on the modified 'postgres' binary for WAL redo.

-/integration_tests:
-
-Tests with different combinations of a Postgres compute node, WAL safekeeper and Page Server.
-
-/mgmt-console:
-
-Web UI to launch (modified) Postgres servers, using S3 as the backing store. Written in Python.
-This is somewhat outdated, as it doesn't use the WAL safekeeper or Page Servers.
-
-/vendor/postgres:
+`/vendor/postgres`:

 PostgreSQL source tree, with the modifications needed for Zenith.

-/vendor/postgres/src/bin/safekeeper:
+`/vendor/postgres/contrib/zenith`:

-Extension (safekeeper_proxy) that runs in the compute node, and connects to the WAL safekeepers
-and streams the WAL
+PostgreSQL extension that implements storage manager API and network communications with remote page server.

+`/test_runner`:

+Integration tests, written in Python using the `pytest` framework.

+`/test_runner/zenith_regress`:
+
+A quick way to add a new SQL regression test to the integration test set.
+
+`/integration_tests`:
+
+Another pack of integration tests. Written in Rust.
+
+[Rust]: https://www.rust-lang.org/learn/get-started
--- a/control_plane/Cargo.toml
+++ b/control_plane/Cargo.toml
@@ -9,19 +9,19 @@ edition = "2018"
 [dependencies]
 rand = "0.8.3"
 tar = "0.4.33"
-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
-tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
-
-serde = ""
-serde_derive = ""
-toml = ""
-lazy_static = ""
+postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
+serde = { version = "1.0", features = ["derive"] }
+serde_json = "1"
+toml = "0.5"
+lazy_static = "1.4"
 regex = "1"
 anyhow = "1.0"
-hex = "0.4.3"
 bytes = "1.0.1"
-fs_extra = "1.2.0"
+nix = "0.20"
+url = "2.2.2"

 pageserver = { path = "../pageserver" }
 walkeeper = { path = "../walkeeper" }
 postgres_ffi = { path = "../postgres_ffi" }
+zenith_utils = { path = "../zenith_utils" }
+workspace_hack = { path = "../workspace_hack" }
--- a/control_plane/src/compute.rs
+++ b/control_plane/src/compute.rs
@@ -1,23 +1,25 @@
-use std::fs::{self, File, OpenOptions};
-use std::io::{Read, Write};
+use std::io::Write;
 use std::net::SocketAddr;
 use std::net::TcpStream;
 use std::os::unix::fs::PermissionsExt;
-use std::path::Path;
 use std::process::Command;
 use std::sync::Arc;
 use std::time::Duration;
 use std::{collections::BTreeMap, path::PathBuf};
+use std::{
+    fs::{self, OpenOptions},
+    io::Read,
+};

 use anyhow::{Context, Result};
 use lazy_static::lazy_static;
 use regex::Regex;
-
-use postgres::{Client, NoTls};
+use zenith_utils::connstring::connection_host_port;

 use crate::local_env::LocalEnv;
-use crate::storage::{PageServerNode, WalProposerNode};
-use pageserver::{zenith_repo_dir, ZTimelineId};
+use pageserver::ZTimelineId;
+
+use crate::storage::PageServerNode;

 //
 // ComputeControlPlane
@@ -36,7 +38,7 @@ impl ComputeControlPlane {
        // it is running on default port. Change that when pageserver will have config.
        let pageserver = Arc::new(PageServerNode::from_env(&env));

-        let pgdatadirspath = env.repo_path.join("pgdatadirs");
+        let pgdatadirspath = &env.pg_data_dirs_path();
        let nodes: Result<BTreeMap<_, _>> = fs::read_dir(&pgdatadirspath)
            .with_context(|| format!("failed to list {}", pgdatadirspath.display()))?
            .into_iter()
@@ -79,11 +81,10 @@ impl ComputeControlPlane {
        &mut self,
        is_test: bool,
        timelineid: ZTimelineId,
+        name: &str,
    ) -> Result<Arc<PostgresNode>> {
-        let node_id = self.nodes.len() as u32 + 1;
-
        let node = Arc::new(PostgresNode {
-            name: format!("pg{}", node_id),
+            name: name.to_owned(),
            address: SocketAddr::new("127.0.0.1".parse().unwrap(), self.get_port()),
            env: self.env.clone(),
            pageserver: Arc::clone(&self.pageserver),
@@ -97,47 +98,24 @@ impl ComputeControlPlane {
        Ok(node)
    }

-    pub fn new_test_node(&mut self, timelineid: ZTimelineId) -> Arc<PostgresNode> {
-        let node = self.new_from_page_server(true, timelineid);
-        assert!(node.is_ok());
-        let node = node.unwrap();
+    pub fn new_node(&mut self, branch_name: &str) -> Result<Arc<PostgresNode>> {
+        let timeline_id = self.pageserver.branch_get_by_name(branch_name)?.timeline_id;
+
+        let node = self.new_from_page_server(false, timeline_id, branch_name)?;

        // Configure the node to stream WAL directly to the pageserver
        node.append_conf(
            "postgresql.conf",
            format!(
-                "callmemaybe_connstring = '{}'\n", // FIXME escaping
+                concat!(
+                    "shared_preload_libraries = zenith\n",
+                    "synchronous_standby_names = 'pageserver'\n", // TODO: add a new function arg?
+                    "zenith.callmemaybe_connstring = '{}'\n",     // FIXME escaping
+                ),
                node.connstr()
            )
            .as_str(),
-        );
-
-        node
-    }
-
-    pub fn new_test_master_node(&mut self, timelineid: ZTimelineId) -> Arc<PostgresNode> {
-        let node = self.new_from_page_server(true, timelineid).unwrap();
-
-        node.append_conf(
-            "postgresql.conf",
-            "synchronous_standby_names = 'safekeeper_proxy'\n",
-        );
-
-        node
-    }
-
-    pub fn new_node(&mut self, timelineid: ZTimelineId) -> Result<Arc<PostgresNode>> {
-        let node = self.new_from_page_server(false, timelineid).unwrap();
-
-        // Configure the node to stream WAL directly to the pageserver
-        node.append_conf(
-            "postgresql.conf",
-            format!(
-                "callmemaybe_connstring = '{}'\n", // FIXME escaping
-                node.connstr()
-            )
-            .as_str(),
-        );
+        )?;

        Ok(node)
    }
@@ -151,7 +129,7 @@ pub struct PostgresNode {
    pub env: LocalEnv,
    pageserver: Arc<PageServerNode>,
    is_test: bool,
-    timelineid: ZTimelineId,
+    pub timelineid: ZTimelineId,
 }

 impl PostgresNode {
@@ -169,6 +147,8 @@ impl PostgresNode {

        lazy_static! {
            static ref CONF_PORT_RE: Regex = Regex::new(r"(?m)^\s*port\s*=\s*(\d+)\s*$").unwrap();
+            static ref CONF_TIMELINE_RE: Regex =
+                Regex::new(r"(?m)^\s*zenith.zenith_timeline\s*=\s*'(\w+)'\s*$").unwrap();
        }

        // parse data directory name
@@ -184,6 +164,7 @@ impl PostgresNode {
            )
        })?;

+        // parse port
        let err_msg = format!(
            "failed to find port definition in config file {}",
            cfg_path.to_str().unwrap()
@@ -199,11 +180,21 @@ impl PostgresNode {
            .parse()
            .with_context(|| err_msg)?;

-        // FIXME: What timeline is this server on? Would have to parse the postgresql.conf
-        // file for that, too. It's currently not needed for anything, but it would be
-        // nice to list the timeline in "zenith pg list"
-        let timelineid_buf = [0u8; 16];
-        let timelineid = ZTimelineId::from(timelineid_buf);
+        // parse timeline
+        let err_msg = format!(
+            "failed to find timeline definition in config file {}",
+            cfg_path.to_str().unwrap()
+        );
+        let timelineid: ZTimelineId = CONF_TIMELINE_RE
+            .captures(config.as_str())
+            .ok_or_else(|| anyhow::Error::msg(err_msg.clone() + " 1"))?
+            .iter()
+            .last()
+            .ok_or_else(|| anyhow::Error::msg(err_msg.clone() + " 2"))?
+            .ok_or_else(|| anyhow::Error::msg(err_msg.clone() + " 3"))?
+            .as_str()
+            .parse()
+            .with_context(|| err_msg)?;

        // ok now
        Ok(PostgresNode {
@@ -269,50 +260,54 @@ impl PostgresNode {
        ar.unpack(&pgdata)
            .with_context(|| "extracting page backup failed")?;

-        // listen for selected port
+        // wal_log_hints is mandatory when running against pageserver (see gh issue#192)
+        // TODO: is it possible to check wal_log_hints at pageserver side via XLOG_PARAMETER_CHANGE?
        self.append_conf(
            "postgresql.conf",
            &format!(
                "max_wal_senders = 10\n\
+                 wal_log_hints = on\n\
                 max_replication_slots = 10\n\
                 hot_standby = on\n\
                 shared_buffers = 1MB\n\
-				 fsync = off\n\
+                 fsync = off\n\
                 max_connections = 100\n\
-				 wal_sender_timeout = 0\n\
+                 wal_sender_timeout = 0\n\
                 wal_level = replica\n\
                 listen_addresses = '{address}'\n\
                 port = {port}\n",
                address = self.address.ip(),
                port = self.address.port()
            ),
-        );
+        )?;

        // Never clean up old WAL. TODO: We should use a replication
        // slot or something proper, to prevent the compute node
        // from removing WAL that hasn't been streamed to the safekeepr or
        // page server yet. But this will do for now.
-        self.append_conf("postgresql.conf", "wal_keep_size='10TB'\n");
+        self.append_conf("postgresql.conf", "wal_keep_size='10TB'\n")?;

        // Connect it to the page server.

        // Configure that node to take pages from pageserver
+        let (host, port) = connection_host_port(&self.pageserver.connection_config());
        self.append_conf(
            "postgresql.conf",
            &format!(
-                "page_server_connstring = 'host={} port={}'\n\
-                      zenith_timeline='{}'\n",
-                self.pageserver.address().ip(),
-                self.pageserver.address().port(),
-                self.timelineid
+                "shared_preload_libraries = zenith \n\
+                 zenith.page_server_connstring = 'host={} port={}'\n\
+                 zenith.zenith_timeline='{}'\n",
+                host, port, self.timelineid
            ),
-        );
+        )?;

+        fs::create_dir_all(self.pgdata().join("pg_wal"))?;
+        fs::create_dir_all(self.pgdata().join("pg_wal").join("archive_status"))?;
        Ok(())
    }

-    fn pgdata(&self) -> PathBuf {
-        self.env.repo_path.join("pgdatadirs").join(&self.name)
+    pub fn pgdata(&self) -> PathBuf {
+        self.env.pg_data_dir(&self.name)
    }

    pub fn status(&self) -> &str {
@@ -328,13 +323,12 @@ impl PostgresNode {
        }
    }

-    pub fn append_conf(&self, config: &str, opts: &str) {
+    pub fn append_conf(&self, config: &str, opts: &str) -> Result<()> {
        OpenOptions::new()
            .append(true)
-            .open(self.pgdata().join(config).to_str().unwrap())
-            .unwrap()
-            .write_all(opts.as_bytes())
-            .unwrap();
+            .open(self.pgdata().join(config).to_str().unwrap())?
+            .write_all(opts.as_bytes())?;
+        Ok(())
    }

    fn pg_ctl(&self, args: &[&str]) -> Result<()> {
@@ -347,7 +341,8 @@ impl PostgresNode {
                        "-D",
                        self.pgdata().to_str().unwrap(),
                        "-l",
-                        self.pgdata().join("log").to_str().unwrap(),
+                        self.pgdata().join("pg.log").to_str().unwrap(),
+                        "-w", //wait till pg_ctl actually does what was asked
                    ],
                    args,
                ]
@@ -373,8 +368,16 @@ impl PostgresNode {
        self.pg_ctl(&["restart"])
    }

-    pub fn stop(&self) -> Result<()> {
-        self.pg_ctl(&["-m", "immediate", "stop"])
+    pub fn stop(&self, destroy: bool) -> Result<()> {
+        self.pg_ctl(&["-m", "immediate", "stop"])?;
+        if destroy {
+            println!(
+                "Destroying postgres data directory '{}'",
+                self.pgdata().to_str().unwrap()
+            );
+            fs::remove_dir_all(&self.pgdata())?;
+        }
+        Ok(())
    }

    pub fn connstr(&self) -> String {
@@ -398,131 +401,6 @@ impl PostgresNode {

        String::from_utf8(output.stdout).unwrap().trim().to_string()
    }
-
-    fn dump_log_file(&self) {
-        if let Ok(mut file) = File::open(self.env.repo_path.join("pageserver.log")) {
-            let mut buffer = String::new();
-            file.read_to_string(&mut buffer).unwrap();
-            println!("--------------- Dump pageserver.log:\n{}", buffer);
-        }
-    }
-
-    pub fn safe_psql(&self, db: &str, sql: &str) -> Vec<tokio_postgres::Row> {
-        let connstring = format!(
-            "host={} port={} dbname={} user={}",
-            self.address.ip(),
-            self.address.port(),
-            db,
-            self.whoami()
-        );
-        let mut client = Client::connect(connstring.as_str(), NoTls).unwrap();
-
-        println!("Running {}", sql);
-        let result = client.query(sql, &[]);
-        if result.is_err() {
-            self.dump_log_file();
-        }
-        result.unwrap()
-    }
-
-    pub fn open_psql(&self, db: &str) -> Client {
-        let connstring = format!(
-            "host={} port={} dbname={} user={}",
-            self.address.ip(),
-            self.address.port(),
-            db,
-            self.whoami()
-        );
-        Client::connect(connstring.as_str(), NoTls).unwrap()
-    }
-
-    pub fn start_proxy(&self, wal_acceptors: &str) -> WalProposerNode {
-        let proxy_path = self.env.pg_bin_dir().join("safekeeper_proxy");
-        match Command::new(proxy_path.as_path())
-            .args(&["--ztimelineid", &self.timelineid.to_string()])
-            .args(&["-s", wal_acceptors])
-            .args(&["-h", &self.address.ip().to_string()])
-            .args(&["-p", &self.address.port().to_string()])
-            .arg("-v")
-            .stderr(
-                OpenOptions::new()
-                    .create(true)
-                    .append(true)
-                    .open(self.pgdata().join("safekeeper_proxy.log"))
-                    .unwrap(),
-            )
-            .spawn()
-        {
-            Ok(child) => WalProposerNode { pid: child.id() },
-            Err(e) => panic!("Failed to launch {:?}: {}", proxy_path, e),
-        }
-    }
-
-    pub fn pg_regress(&self) {
-        self.safe_psql("postgres", "CREATE DATABASE regression");
-        let data_dir = zenith_repo_dir();
-        let regress_run_path = data_dir.join("regress");
-        fs::create_dir_all(&regress_run_path).unwrap();
-        fs::create_dir_all(regress_run_path.join("testtablespace")).unwrap();
-        std::env::set_current_dir(regress_run_path).unwrap();
-
-        let regress_build_path =
-            Path::new(env!("CARGO_MANIFEST_DIR")).join("../tmp_install/build/src/test/regress");
-        let regress_src_path =
-            Path::new(env!("CARGO_MANIFEST_DIR")).join("../vendor/postgres/src/test/regress");
-
-        let _regress_check = Command::new(regress_build_path.join("pg_regress"))
-            .args(&[
-                "--bindir=''",
-                "--use-existing",
-                format!("--bindir={}", self.env.pg_bin_dir().to_str().unwrap()).as_str(),
-                format!("--dlpath={}", regress_build_path.to_str().unwrap()).as_str(),
-                format!(
-                    "--schedule={}",
-                    regress_src_path.join("parallel_schedule").to_str().unwrap()
-                )
-                .as_str(),
-                format!("--inputdir={}", regress_src_path.to_str().unwrap()).as_str(),
-            ])
-            .env_clear()
-            .env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
-            .env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
-            .env("PGPORT", self.address.port().to_string())
-            .env("PGUSER", self.whoami())
-            .env("PGHOST", self.address.ip().to_string())
-            .status()
-            .expect("pg_regress failed");
-    }
-
-    pub fn pg_bench(&self, clients: u32, seconds: u32) {
-        let port = self.address.port().to_string();
-        let clients = clients.to_string();
-        let seconds = seconds.to_string();
-        let _pg_bench_init = Command::new(self.env.pg_bin_dir().join("pgbench"))
-            .args(&["-i", "-p", port.as_str(), "postgres"])
-            .env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
-            .env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
-            .status()
-            .expect("pgbench -i");
-        let _pg_bench_run = Command::new(self.env.pg_bin_dir().join("pgbench"))
-            .args(&[
-                "-p",
-                port.as_str(),
-                "-T",
-                seconds.as_str(),
-                "-P",
-                "1",
-                "-c",
-                clients.as_str(),
-                "-M",
-                "prepared",
-                "postgres",
-            ])
-            .env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
-            .env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
-            .status()
-            .expect("pgbench run");
-    }
 }

 impl Drop for PostgresNode {
@@ -531,7 +409,7 @@ impl Drop for PostgresNode {
    // and checking it here. But let just clean datadirs on start.
    fn drop(&mut self) {
        if self.is_test {
-            let _ = self.stop();
+            let _ = self.stop(true);
        }
    }
 }
--- a/control_plane/src/lib.rs
+++ b/control_plane/src/lib.rs
@@ -1,12 +1,31 @@
 //
 // Local control plane.
 //
-// Can start, cofigure and stop postgres instances running as a local processes.
+// Can start, configure and stop postgres instances running as local processes.
 //
 // Intended to be used in integration tests and in CLI tools for
 // local installations.
 //
+use anyhow::{anyhow, bail, Context, Result};
+use std::fs;
+use std::path::Path;

 pub mod compute;
 pub mod local_env;
 pub mod storage;
+
+/// Read a PID file
+///
+/// We expect a file that contains a single integer; surrounding whitespace
+/// (e.g. a trailing newline) is ignored. Returns i32 for libc/nix compatibility.
+pub fn read_pidfile(pidfile: &Path) -> Result<i32> {
+    let pid_str = fs::read_to_string(pidfile)
+        .with_context(|| format!("failed to read pidfile {:?}", pidfile))?;
+    // Trim first: `str::parse` rejects any whitespace, including a final "\n".
+    let parsed = pid_str.trim().parse::<i32>();
+    let pid = parsed.map_err(|_| anyhow!("failed to parse pidfile {:?}", pidfile))?;
+    if pid < 1 {
+        bail!("pidfile {:?} contained bad value '{}'", pidfile, pid);
+    }
+    Ok(pid)
+}
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -4,34 +4,25 @@
 // Now it also provides init method which acts like a stub for proper installation
 // script which will use local paths.
 //
-use anyhow::Context;
-use bytes::Bytes;
-use rand::Rng;
-use std::env;
+use anyhow::{anyhow, Result};
+use serde::{Deserialize, Serialize};
 use std::fs;
-use std::path::{Path, PathBuf};
-use std::process::{Command, Stdio};
+use std::path::PathBuf;
+use std::{collections::BTreeMap, env};
+use url::Url;

-use anyhow::Result;
-use serde_derive::{Deserialize, Serialize};
-
-use pageserver::zenith_repo_dir;
-use pageserver::ZTimelineId;
-use postgres_ffi::xlog_utils;
+pub type Remotes = BTreeMap<String, String>;

 //
-// This data structure represents deserialized zenith config, which should be
-// located in ~/.zenith
-//
-// TODO: should we also support ZENITH_CONF env var?
+// These data structures represent the deserialized zenith CLI config
 //
 #[derive(Serialize, Deserialize, Clone)]
 pub struct LocalEnv {
-    // Path to the Repository. Here page server and compute nodes will create and store their data.
-    pub repo_path: PathBuf,
+    // Pageserver connection string
+    pub pageserver_connstring: String,

-    // System identifier, from the PostgreSQL control file
-    pub systemid: u64,
+    // Base directory for both pageserver and compute nodes
+    pub base_data_dir: PathBuf,

    // Path to postgres distribution. It's expected that "bin", "include",
    // "lib", "share" from postgres distribution are there. If at some point
@@ -39,191 +30,118 @@ pub struct LocalEnv {
    // to four separate paths and match OS-specific installation layout.
    pub pg_distrib_dir: PathBuf,

-    // Path to pageserver binary.
-    pub zenith_distrib_dir: PathBuf,
+    // Directory containing the pageserver binary. None for a remote pageserver.
+    pub zenith_distrib_dir: Option<PathBuf>,
+
+    pub remotes: Remotes,
 }

 impl LocalEnv {
-    // postgres installation
+    // postgres installation paths
    pub fn pg_bin_dir(&self) -> PathBuf {
        self.pg_distrib_dir.join("bin")
    }
    pub fn pg_lib_dir(&self) -> PathBuf {
        self.pg_distrib_dir.join("lib")
    }
+
+    pub fn pageserver_bin(&self) -> Result<PathBuf> {
+        Ok(self
+            .zenith_distrib_dir
+            .as_ref()
+            .ok_or_else(|| anyhow!("Can not manage remote pageserver"))?
+            .join("pageserver"))
+    }
+
+    pub fn pg_data_dirs_path(&self) -> PathBuf {
+        self.base_data_dir.join("pgdatadirs")
+    }
+
+    pub fn pg_data_dir(&self, name: &str) -> PathBuf {
+        self.pg_data_dirs_path().join(name)
+    }
+
+    // TODO: move pageserver files into ./pageserver
+    pub fn pageserver_data_dir(&self) -> PathBuf {
+        self.base_data_dir.clone()
+    }
+}
+
+// Base directory of the repository: $ZENITH_REPO_DIR if set, else ".zenith".
+fn base_path() -> PathBuf {
+    // Convert the OsString directly so a non-UTF-8 value cannot cause a panic.
+    let from_env = std::env::var_os("ZENITH_REPO_DIR").map(PathBuf::from);
+    from_env.unwrap_or_else(|| PathBuf::from(".zenith"))
+}

 //
 // Initialize a new Zenith repository
 //
-pub fn init() -> Result<()> {
+pub fn init(remote_pageserver: Option<&str>) -> Result<()> {
    // check if config already exists
-    let repo_path = zenith_repo_dir();
-    if repo_path.exists() {
+    let base_path = base_path();
+    if base_path.exists() {
        anyhow::bail!(
            "{} already exists. Perhaps already initialized?",
-            repo_path.to_str().unwrap()
-        );
-    }
-
-    // Now we can run init only from crate directory, so check that current dir is our crate.
-    // Use 'pageserver/Cargo.toml' existence as evidendce.
-    let cargo_path = env::current_dir()?;
-    if !cargo_path.join("pageserver/Cargo.toml").exists() {
-        anyhow::bail!(
-            "Current dirrectory does not look like a zenith repo. \
-            Please, run 'init' from zenith repo root."
+            base_path.to_str().unwrap()
        );
    }

    // ok, now check that expected binaries are present

-    // check postgres
-    let pg_distrib_dir = cargo_path.join("tmp_install");
-    let pg_path = pg_distrib_dir.join("bin/postgres");
-    if !pg_path.exists() {
-        anyhow::bail!(
-            "Can't find postres binary at {}. \
-                       Perhaps './pgbuild.sh' is needed to build it first.",
-            pg_path.to_str().unwrap()
-        );
-    }
-
-    // check pageserver
-    let zenith_distrib_dir = cargo_path.join("target/debug/");
-    let pageserver_path = zenith_distrib_dir.join("pageserver");
-    if !pageserver_path.exists() {
-        anyhow::bail!(
-            "Can't find pageserver binary at {}. Please build it.",
-            pageserver_path.to_str().unwrap()
-        );
-    }
-
-    // ok, we are good to go
-    let mut conf = LocalEnv {
-        repo_path,
-        pg_distrib_dir,
-        zenith_distrib_dir,
-        systemid: 0,
+    // Find postgres binaries. Follow POSTGRES_DISTRIB_DIR if set, otherwise look in "tmp_install".
+    let pg_distrib_dir: PathBuf = {
+        if let Some(postgres_bin) = env::var_os("POSTGRES_DISTRIB_DIR") {
+            postgres_bin.into()
+        } else {
+            let cwd = env::current_dir()?;
+            cwd.join("tmp_install")
+        }
    };
-    init_repo(&mut conf)?;
-
-    Ok(())
-}
-
-pub fn init_repo(local_env: &mut LocalEnv) -> Result<()> {
-    let repopath = &local_env.repo_path;
-    fs::create_dir(&repopath)
-        .with_context(|| format!("could not create directory {}", repopath.display()))?;
-    fs::create_dir(repopath.join("pgdatadirs"))?;
-    fs::create_dir(repopath.join("timelines"))?;
-    fs::create_dir(repopath.join("refs"))?;
-    fs::create_dir(repopath.join("refs").join("branches"))?;
-    fs::create_dir(repopath.join("refs").join("tags"))?;
-    println!("created directory structure in {}", repopath.display());
-
-    // Create initial timeline
-    let tli = create_timeline(&local_env, None)?;
-    let timelinedir = repopath.join("timelines").join(tli.to_string());
-    println!("created initial timeline {}", timelinedir.display());
-
-    // Run initdb
-    //
-    // FIXME: we create it temporarily in "tmp" directory, and move it into
-    // the repository. Use "tempdir()" or something? Or just create it directly
-    // in the repo?
-    let initdb_path = local_env.pg_bin_dir().join("initdb");
-    let initdb = Command::new(initdb_path)
-        .args(&["-D", "tmp"])
-        .arg("--no-instructions")
-        .env_clear()
-        .env("LD_LIBRARY_PATH", local_env.pg_lib_dir().to_str().unwrap())
-        .env(
-            "DYLD_LIBRARY_PATH",
-            local_env.pg_lib_dir().to_str().unwrap(),
-        )
-        .stdout(Stdio::null())
-        .status()
-        .with_context(|| "failed to execute initdb")?;
-    if !initdb.success() {
-        anyhow::bail!("initdb failed");
+    if !pg_distrib_dir.join("bin/postgres").exists() {
+        anyhow::bail!("Can't find postgres binary at {:?}", pg_distrib_dir);
    }
-    println!("initdb succeeded");

-    // Read control file to extract the LSN and system id
-    let controlfile =
-        postgres_ffi::decode_pg_control(Bytes::from(fs::read("tmp/global/pg_control")?))?;
-    let systemid = controlfile.system_identifier;
-    let lsn = controlfile.checkPoint;
-    let lsnstr = format!("{:016X}", lsn);
+    fs::create_dir(&base_path)?;
+    fs::create_dir(base_path.join("pgdatadirs"))?;

-    // Move the initial WAL file
-    fs::rename(
-        "tmp/pg_wal/000000010000000000000001",
-        timelinedir
-            .join("wal")
-            .join("000000010000000000000001.partial"),
-    )?;
-    println!("moved initial WAL file");
+    let conf = if let Some(addr) = remote_pageserver {
+        // check that addr is parsable
+        let _uri = Url::parse(addr).map_err(|e| anyhow!("{}: {}", addr, e))?;

-    // Remove pg_wal
-    fs::remove_dir_all("tmp/pg_wal")?;
-    println!("removed tmp/pg_wal");
+        LocalEnv {
+            pageserver_connstring: format!("postgresql://{}/", addr),
+            pg_distrib_dir,
+            zenith_distrib_dir: None,
+            base_data_dir: base_path,
+            remotes: BTreeMap::default(),
+        }
+    } else {
+        // Find zenith binaries.
+        let zenith_distrib_dir = env::current_exe()?.parent().unwrap().to_owned();
+        if !zenith_distrib_dir.join("pageserver").exists() {
+            anyhow::bail!("Can't find pageserver binary.",);
+        }

-    force_crash_recovery(&PathBuf::from("tmp"))?;
-    println!("updated pg_control");
+        LocalEnv {
+            pageserver_connstring: "postgresql://127.0.0.1:6400".to_string(),
+            pg_distrib_dir,
+            zenith_distrib_dir: Some(zenith_distrib_dir),
+            base_data_dir: base_path,
+            remotes: BTreeMap::default(),
+        }
+    };

-    let target = timelinedir.join("snapshots").join(&lsnstr);
-    fs::rename("tmp", &target)?;
-    println!("moved 'tmp' to {}", target.display());
-
-    // Create 'main' branch to refer to the initial timeline
-    let data = tli.to_string();
-    fs::write(repopath.join("refs").join("branches").join("main"), data)?;
-    println!("created main branch");
-
-    // Also update the system id in the LocalEnv
-    local_env.systemid = systemid;
-
-    // write config
-    let toml = toml::to_string(&local_env)?;
-    fs::write(repopath.join("config"), toml)?;
-
-    println!(
-        "new zenith repository was created in {}",
-        repopath.display()
-    );
+    let toml = toml::to_string_pretty(&conf)?;
+    fs::write(conf.base_data_dir.join("config"), toml)?;

    Ok(())
 }

-// If control file says the cluster was shut down cleanly, modify it, to mark
-// it as crashed. That forces crash recovery when you start the cluster.
-//
-// FIXME:
-// We currently do this to the initial snapshot in "zenith init". It would
-// be more natural to do this when the snapshot is restored instead, but we
-// currently don't have any code to create new snapshots, so it doesn't matter
-// Or better yet, use a less hacky way of putting the cluster into recovery.
-// Perhaps create a backup label file in the data directory when it's restored.
-fn force_crash_recovery(datadir: &Path) -> Result<()> {
-    // Read in the control file
-    let controlfilepath = datadir.to_path_buf().join("global").join("pg_control");
-    let mut controlfile =
-        postgres_ffi::decode_pg_control(Bytes::from(fs::read(controlfilepath.as_path())?))?;
+// Locate and load config
+pub fn load_config() -> Result<LocalEnv> {
+    let repopath = base_path();

-    controlfile.state = postgres_ffi::DBState_DB_IN_PRODUCTION;
-
-    fs::write(
-        controlfilepath.as_path(),
-        postgres_ffi::encode_pg_control(controlfile),
-    )?;
-
-    Ok(())
-}
-
-// check that config file is present
-pub fn load_config(repopath: &Path) -> Result<LocalEnv> {
    if !repopath.exists() {
        anyhow::bail!(
            "Zenith config is not found in {}. You need to run 'zenith init' first",
@@ -231,159 +149,18 @@ pub fn load_config(repopath: &Path) -> Result<LocalEnv> {
        );
    }

+    // TODO: check that it looks like a zenith repository
+
    // load and parse file
    let config = fs::read_to_string(repopath.join("config"))?;
    toml::from_str(config.as_str()).map_err(|e| e.into())
 }

-// local env for tests
-pub fn test_env(testname: &str) -> LocalEnv {
-    fs::create_dir_all("../tmp_check").expect("could not create directory ../tmp_check");
-
-    let repo_path = Path::new(env!("CARGO_MANIFEST_DIR"))
-        .join("../tmp_check/")
-        .join(testname);
-
-    // Remove remnants of old test repo
-    let _ = fs::remove_dir_all(&repo_path);
-
-    let mut local_env = LocalEnv {
-        repo_path,
-        pg_distrib_dir: Path::new(env!("CARGO_MANIFEST_DIR")).join("../tmp_install"),
-        zenith_distrib_dir: cargo_bin_dir(),
-        systemid: 0,
-    };
-    init_repo(&mut local_env).expect("could not initialize zenith repository");
-    local_env
-}
-
-// Find the directory where the binaries were put (i.e. target/debug/)
-pub fn cargo_bin_dir() -> PathBuf {
-    let mut pathbuf = std::env::current_exe().unwrap();
-
-    pathbuf.pop();
-    if pathbuf.ends_with("deps") {
-        pathbuf.pop();
-    }
-
-    pathbuf
-}
-
-#[derive(Debug, Clone, Copy)]
-pub struct PointInTime {
-    pub timelineid: ZTimelineId,
-    pub lsn: u64,
-}
-
-fn create_timeline(local_env: &LocalEnv, ancestor: Option<PointInTime>) -> Result<ZTimelineId> {
-    let repopath = &local_env.repo_path;
-
-    // Create initial timeline
-    let mut tli_buf = [0u8; 16];
-    rand::thread_rng().fill(&mut tli_buf);
-    let timelineid = ZTimelineId::from(tli_buf);
-
-    let timelinedir = repopath.join("timelines").join(timelineid.to_string());
-
-    fs::create_dir(&timelinedir)?;
-    fs::create_dir(&timelinedir.join("snapshots"))?;
-    fs::create_dir(&timelinedir.join("wal"))?;
-
-    if let Some(ancestor) = ancestor {
-        let data = format!(
-            "{}@{:X}/{:X}",
-            ancestor.timelineid,
-            ancestor.lsn >> 32,
-            ancestor.lsn & 0xffffffff
-        );
-        fs::write(timelinedir.join("ancestor"), data)?;
-    }
-
-    Ok(timelineid)
-}
-
-// Parse an LSN in the format used in filenames
-//
-// For example: 00000000015D3DD8
-//
-fn parse_lsn(s: &str) -> std::result::Result<u64, std::num::ParseIntError> {
-    u64::from_str_radix(s, 16)
-}
-
-// Create a new branch in the repository (for the "zenith branch" subcommand)
-pub fn create_branch(
-    local_env: &LocalEnv,
-    branchname: &str,
-    startpoint: PointInTime,
-) -> Result<()> {
-    let repopath = &local_env.repo_path;
-
-    // create a new timeline for it
-    let newtli = create_timeline(local_env, Some(startpoint))?;
-    let newtimelinedir = repopath.join("timelines").join(newtli.to_string());
-
-    let data = newtli.to_string();
-    fs::write(
-        repopath.join("refs").join("branches").join(branchname),
-        data,
-    )?;
-
-    // Copy the latest snapshot (TODO: before the startpoint) and all WAL
-    // TODO: be smarter and avoid the copying...
-    let (_maxsnapshot, oldsnapshotdir) = find_latest_snapshot(local_env, startpoint.timelineid)?;
-    let copy_opts = fs_extra::dir::CopyOptions::new();
-    fs_extra::dir::copy(oldsnapshotdir, newtimelinedir.join("snapshots"), &copy_opts)?;
-
-    let oldtimelinedir = repopath
-        .join("timelines")
-        .join(startpoint.timelineid.to_string());
-    let mut copy_opts = fs_extra::dir::CopyOptions::new();
-    copy_opts.content_only = true;
-    fs_extra::dir::copy(
-        oldtimelinedir.join("wal"),
-        newtimelinedir.join("wal"),
-        &copy_opts,
-    )?;
+// Save config. The CLI uses this to persist changes to the set of remotes.
+pub fn save_config(conf: &LocalEnv) -> Result<()> {
+    let config_path = base_path().join("config");
+    let conf_str = toml::to_string_pretty(conf)?;

+    fs::write(config_path, conf_str)?;
    Ok(())
 }
-
-// Find the end of valid WAL in a wal directory
-pub fn find_end_of_wal(local_env: &LocalEnv, timeline: ZTimelineId) -> Result<u64> {
-    let repopath = &local_env.repo_path;
-    let waldir = repopath
-        .join("timelines")
-        .join(timeline.to_string())
-        .join("wal");
-
-    let (lsn, _tli) = xlog_utils::find_end_of_wal(&waldir, 16 * 1024 * 1024, true);
-
-    Ok(lsn)
-}
-
-// Find the latest snapshot for a timeline
-fn find_latest_snapshot(local_env: &LocalEnv, timeline: ZTimelineId) -> Result<(u64, PathBuf)> {
-    let repopath = &local_env.repo_path;
-
-    let snapshotsdir = repopath
-        .join("timelines")
-        .join(timeline.to_string())
-        .join("snapshots");
-    let paths = fs::read_dir(&snapshotsdir)?;
-    let mut maxsnapshot: u64 = 0;
-    let mut snapshotdir: Option<PathBuf> = None;
-    for path in paths {
-        let path = path?;
-        let filename = path.file_name().to_str().unwrap().to_owned();
-        if let Ok(lsn) = parse_lsn(&filename) {
-            maxsnapshot = std::cmp::max(lsn, maxsnapshot);
-            snapshotdir = Some(path.path());
-        }
-    }
-    if maxsnapshot == 0 {
-        // TODO: check ancestor timeline
-        anyhow::bail!("no snapshot found in {}", snapshotsdir.display());
-    }
-
-    Ok((maxsnapshot, snapshotdir.unwrap()))
-}
--- a/control_plane/src/storage.rs
+++ b/control_plane/src/storage.rs
@@ -1,139 +1,19 @@
-use anyhow::Result;
-use std::fs;
-use std::io;
-use std::net::SocketAddr;
+use std::collections::HashMap;
 use std::net::TcpStream;
-use std::path::{Path, PathBuf};
+use std::path::PathBuf;
 use std::process::Command;
-use std::str::FromStr;
-use std::sync::atomic::{AtomicBool, Ordering};
-use std::sync::Arc;
 use std::thread;
 use std::time::Duration;

-use postgres::{Client, NoTls};
+use anyhow::{anyhow, bail, Result};
+use nix::sys::signal::{kill, Signal};
+use nix::unistd::Pid;
+use postgres::{Config, NoTls};

 use crate::local_env::LocalEnv;
-use pageserver::ZTimelineId;
-
-//
-// Collection of several example deployments useful for tests.
-//
-// I'm intendedly modelling storage and compute control planes as a separate entities
-// as it is closer to the actual setup.
-//
-pub struct TestStorageControlPlane {
-    pub wal_acceptors: Vec<WalAcceptorNode>,
-    pub pageserver: Arc<PageServerNode>,
-    pub test_done: AtomicBool,
-    pub repopath: PathBuf,
-}
-
-impl TestStorageControlPlane {
-    // Peek into the repository, to grab the timeline ID of given branch
-    pub fn get_branch_timeline(&self, branchname: &str) -> ZTimelineId {
-        let branchpath = self.repopath.join("refs/branches/".to_owned() + branchname);
-
-        ZTimelineId::from_str(&(fs::read_to_string(&branchpath).unwrap())).unwrap()
-    }
-
-    // postgres <-> page_server
-    //
-    // Initialize a new repository and configure a page server to run in it
-    //
-    pub fn one_page_server(local_env: &LocalEnv) -> TestStorageControlPlane {
-        let repopath = local_env.repo_path.clone();
-
-        let pserver = Arc::new(PageServerNode {
-            env: local_env.clone(),
-            kill_on_exit: true,
-            listen_address: None,
-        });
-        pserver.start().unwrap();
-
-        TestStorageControlPlane {
-            wal_acceptors: Vec::new(),
-            pageserver: pserver,
-            test_done: AtomicBool::new(false),
-            repopath,
-        }
-    }
-
-    pub fn one_page_server_no_start(local_env: &LocalEnv) -> TestStorageControlPlane {
-        let repopath = local_env.repo_path.clone();
-
-        let pserver = Arc::new(PageServerNode {
-            env: local_env.clone(),
-            kill_on_exit: true,
-            listen_address: None,
-        });
-
-        TestStorageControlPlane {
-            wal_acceptors: Vec::new(),
-            pageserver: pserver,
-            test_done: AtomicBool::new(false),
-            repopath,
-        }
-    }
-
-    // postgres <-> {wal_acceptor1, wal_acceptor2, ...}
-    pub fn fault_tolerant(local_env: &LocalEnv, redundancy: usize) -> TestStorageControlPlane {
-        let repopath = local_env.repo_path.clone();
-
-        let mut cplane = TestStorageControlPlane {
-            wal_acceptors: Vec::new(),
-            pageserver: Arc::new(PageServerNode {
-                env: local_env.clone(),
-                kill_on_exit: true,
-                listen_address: None,
-            }),
-            test_done: AtomicBool::new(false),
-            repopath,
-        };
-        cplane.pageserver.start().unwrap();
-
-        const WAL_ACCEPTOR_PORT: usize = 54321;
-
-        for i in 0..redundancy {
-            let wal_acceptor = WalAcceptorNode {
-                listen: format!("127.0.0.1:{}", WAL_ACCEPTOR_PORT + i)
-                    .parse()
-                    .unwrap(),
-                data_dir: local_env.repo_path.join(format!("wal_acceptor_{}", i)),
-                env: local_env.clone(),
-            };
-            wal_acceptor.init();
-            wal_acceptor.start();
-            cplane.wal_acceptors.push(wal_acceptor);
-        }
-        cplane
-    }
-
-    pub fn stop(&self) {
-        for wa in self.wal_acceptors.iter() {
-            let _ = wa.stop();
-        }
-        self.test_done.store(true, Ordering::Relaxed);
-    }
-
-    pub fn get_wal_acceptor_conn_info(&self) -> String {
-        self.wal_acceptors
-            .iter()
-            .map(|wa| wa.listen.to_string())
-            .collect::<Vec<String>>()
-            .join(",")
-    }
-
-    pub fn is_running(&self) -> bool {
-        self.test_done.load(Ordering::Relaxed)
-    }
-}
-
-impl Drop for TestStorageControlPlane {
-    fn drop(&mut self) {
-        self.stop();
-    }
-}
+use crate::read_pidfile;
+use pageserver::branches::BranchInfo;
+use zenith_utils::connstring::connection_address;

 //
 // Control routines for pageserver.
@@ -141,8 +21,8 @@ impl Drop for TestStorageControlPlane {
 // Used in CLI and tests.
 //
 pub struct PageServerNode {
-    kill_on_exit: bool,
-    listen_address: Option<SocketAddr>,
+    pub kill_on_exit: bool,
+    pub connection_config: Option<Config>,
    pub env: LocalEnv,
 }

@@ -150,45 +30,69 @@ impl PageServerNode {
    pub fn from_env(env: &LocalEnv) -> PageServerNode {
        PageServerNode {
            kill_on_exit: false,
-            listen_address: None, // default
+            connection_config: None, // default
            env: env.clone(),
        }
    }

-    pub fn address(&self) -> SocketAddr {
-        match self.listen_address {
-            Some(addr) => addr,
-            None => "127.0.0.1:64000".parse().unwrap(),
+    // Fallback connection settings, used when no explicit config is set.
+    fn default_config() -> Config {
+        let default_url = "postgresql://no_user@localhost:64000/no_db";
+        default_url.parse().unwrap()
+    }
+
+    // Connection settings for this pageserver: the explicit override if one
+    // was given, otherwise the built-in default.
+    pub fn connection_config(&self) -> Config {
+        let configured = self.connection_config.clone();
+        configured.unwrap_or_else(Self::default_config)
+    }
+
+    // Run `pageserver --init -D <base_data_dir> --postgres-distrib <pg_distrib_dir>`.
+    // Returns an error if the binary cannot be run or exits unsuccessfully.
+    pub fn init(&self) -> Result<()> {
+        let status = Command::new(self.env.pageserver_bin()?)
+            .arg("--init")
+            .arg("-D")
+            .arg(&self.env.base_data_dir)
+            .arg("--postgres-distrib")
+            .arg(&self.env.pg_distrib_dir)
+            .env_clear()
+            .env("RUST_BACKTRACE", "1")
+            .status()
+            // Report spawn failures to the caller instead of panicking.
+            .map_err(|e| anyhow!("failed to execute pageserver init: {}", e))?;
+
+        if status.success() {
+            Ok(())
+        } else {
+            Err(anyhow!("pageserver init failed"))
        }
    }

    pub fn repo_path(&self) -> PathBuf {
-        self.env.repo_path.clone()
+        self.env.pageserver_data_dir()
    }

    pub fn pid_file(&self) -> PathBuf {
-        self.env.repo_path.join("pageserver.pid")
+        self.repo_path().join("pageserver.pid")
    }

    pub fn start(&self) -> Result<()> {
        println!(
            "Starting pageserver at '{}' in {}",
-            self.address(),
+            connection_address(&self.connection_config()),
            self.repo_path().display()
        );

-        let mut cmd = Command::new(self.env.zenith_distrib_dir.join("pageserver"));
-        cmd.args(&["-l", self.address().to_string().as_str()])
+        let mut cmd = Command::new(self.env.pageserver_bin()?);
+        cmd.args(&["-D", self.repo_path().to_str().unwrap()])
            .arg("-d")
            .env_clear()
-            .env("RUST_BACKTRACE", "1")
-            .env("ZENITH_REPO_DIR", self.repo_path())
-            .env("PATH", self.env.pg_bin_dir().to_str().unwrap()) // needs postres-wal-redo binary
-            .env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
-            .env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap());
+            .env("RUST_BACKTRACE", "1");

        if !cmd.status()?.success() {
-            anyhow::bail!(
+            bail!(
                "Pageserver failed to start. See '{}' for details.",
                self.repo_path().join("pageserver.log").display()
            );
@@ -201,70 +105,104 @@ impl PageServerNode {
            if client.is_ok() {
                break;
            } else {
-                println!("page server not responding yet, retrying ({})...", retries);
+                println!("Pageserver not responding yet, retrying ({})...", retries);
                thread::sleep(Duration::from_secs(1));
            }
        }
+
+        println!("Pageserver started");
+
        Ok(())
    }

    pub fn stop(&self) -> Result<()> {
-        let pidfile = self.pid_file();
-        let pid = read_pidfile(&pidfile)?;
-
-        let status = Command::new("kill")
-            .arg(&pid)
-            .env_clear()
-            .status()
-            .expect("failed to execute kill");
-
-        if !status.success() {
-            anyhow::bail!("Failed to kill pageserver with pid {}", pid);
+        let pid = read_pidfile(&self.pid_file())?;
+        let pid = Pid::from_raw(pid);
+        if kill(pid, Signal::SIGTERM).is_err() {
+            bail!("Failed to kill pageserver with pid {}", pid);
        }

-        // await for pageserver stop
+        // wait for pageserver stop
+        let address = connection_address(&self.connection_config());
        for _ in 0..5 {
-            let stream = TcpStream::connect(self.address());
+            let stream = TcpStream::connect(&address);
+            thread::sleep(Duration::from_secs(1));
            if let Err(_e) = stream {
+                println!("Pageserver stopped");
                return Ok(());
            }
-            println!("Stopping pageserver on {}", self.address());
-            thread::sleep(Duration::from_secs(1));
+            println!("Stopping pageserver on {}", address);
        }

-        // ok, we failed to stop pageserver, let's panic
-        if !status.success() {
-            anyhow::bail!("Failed to stop pageserver with pid {}", pid);
-        } else {
-            Ok(())
-        }
+        bail!("Failed to stop pageserver with pid {}", pid);
    }

    pub fn page_server_psql(&self, sql: &str) -> Vec<postgres::SimpleQueryMessage> {
-        let connstring = format!(
-            "host={} port={} dbname={} user={}",
-            self.address().ip(),
-            self.address().port(),
-            "no_db",
-            "no_user",
-        );
-        let mut client = Client::connect(connstring.as_str(), NoTls).unwrap();
+        let mut client = self.connection_config().connect(NoTls).unwrap();

        println!("Pageserver query: '{}'", sql);
        client.simple_query(sql).unwrap()
    }

-    pub fn page_server_psql_client(
-        &self,
-    ) -> std::result::Result<postgres::Client, postgres::Error> {
-        let connstring = format!(
-            "host={} port={} dbname={} user={}",
-            self.address().ip(),
-            self.address().port(),
-            "no_db",
-            "no_user",
-        );
-        Client::connect(connstring.as_str(), NoTls)
+    pub fn page_server_psql_client(&self) -> Result<postgres::Client, postgres::Error> {
+        self.connection_config().connect(NoTls)
+    }
+
+    // Send "branch_list" to the pageserver and parse the first row's JSON reply.
+    pub fn branches_list(&self) -> Result<Vec<BranchInfo>> {
+        let mut client = self.page_server_psql_client()?;
+        let query_result = client.simple_query("branch_list")?;
+        let branches_json = query_result
+            .first()
+            .and_then(|msg| match msg {
+                postgres::SimpleQueryMessage::Row(row) => row.get(0),
+                _ => None,
+            })
+            .ok_or_else(|| anyhow!("missing branches"))?;
+
+        let res: Vec<BranchInfo> = serde_json::from_str(branches_json)?;
+        Ok(res)
+    }
+
+    // Send "branch_create <name> <startpoint>" and parse the first row's JSON reply.
+    pub fn branch_create(&self, name: &str, startpoint: &str) -> Result<BranchInfo> {
+        let mut client = self.page_server_psql_client()?;
+        let query_result =
+            client.simple_query(format!("branch_create {} {}", name, startpoint).as_str())?;
+
+        let branch_json = query_result
+            .first()
+            .and_then(|msg| match msg {
+                postgres::SimpleQueryMessage::Row(row) => row.get(0),
+                _ => None,
+            })
+            .ok_or_else(|| anyhow!("missing branch"))?;
+
+        let res: BranchInfo = serde_json::from_str(branch_json).map_err(|e| {
+            anyhow!(
+                "failed to parse branch_create response: {}: {}",
+                branch_json,
+                e
+            )
+        })?;
+
+        Ok(res)
+    }
+
+    // Fetch all branches from the pageserver and return the one called `name`.
+    // TODO: make this a separate request type and avoid loading all the branches
+    pub fn branch_get_by_name(&self, name: &str) -> Result<BranchInfo> {
+        // Index the listing by branch name.
+        let branches_by_name: HashMap<String, BranchInfo> = self
+            .branches_list()?
+            .into_iter()
+            .map(|info| (info.name.clone(), info))
+            .collect();
+
+        branches_by_name
+            .get(name)
+            .cloned()
+            .ok_or_else(|| anyhow!("Branch {} not found", name))
    }
 }

@@ -275,106 +213,3 @@ impl Drop for PageServerNode {
        }
    }
 }
-
-//
-// Control routines for WalAcceptor.
-//
-// Now used only in test setups.
-//
-pub struct WalAcceptorNode {
-    listen: SocketAddr,
-    data_dir: PathBuf,
-    env: LocalEnv,
-}
-
-impl WalAcceptorNode {
-    pub fn init(&self) {
-        if self.data_dir.exists() {
-            fs::remove_dir_all(self.data_dir.clone()).unwrap();
-        }
-        fs::create_dir_all(self.data_dir.clone()).unwrap();
-    }
-
-    pub fn start(&self) {
-        println!(
-            "Starting wal_acceptor in {} listening '{}'",
-            self.data_dir.to_str().unwrap(),
-            self.listen
-        );
-
-        let status = Command::new(self.env.zenith_distrib_dir.join("wal_acceptor"))
-            .args(&["-D", self.data_dir.to_str().unwrap()])
-            .args(&["-l", self.listen.to_string().as_str()])
-            .args(&["--systemid", &self.env.systemid.to_string()])
-            // Tell page server it can receive WAL from this WAL safekeeper
-            // FIXME: If there are multiple safekeepers, they will all inform
-            // the page server. Only the last "notification" will stay in effect.
-            // So it's pretty random which safekeeper the page server will connect to
-            .args(&["--pageserver", "127.0.0.1:64000"])
-            .arg("-d")
-            .arg("-n")
-            .status()
-            .expect("failed to start wal_acceptor");
-
-        if !status.success() {
-            panic!("wal_acceptor start failed");
-        }
-    }
-
-    pub fn stop(&self) -> std::result::Result<(), io::Error> {
-        println!("Stopping wal acceptor on {}", self.listen);
-        let pidfile = self.data_dir.join("wal_acceptor.pid");
-        let pid = read_pidfile(&pidfile)?;
-        // Ignores any failures when running this command
-        let _status = Command::new("kill")
-            .arg(pid)
-            .env_clear()
-            .status()
-            .expect("failed to execute kill");
-
-        Ok(())
-    }
-}
-
-impl Drop for WalAcceptorNode {
-    fn drop(&mut self) {
-        self.stop().unwrap();
-    }
-}
-
-///////////////////////////////////////////////////////////////////////////////
-
-pub struct WalProposerNode {
-    pub pid: u32,
-}
-
-impl WalProposerNode {
-    pub fn stop(&self) {
-        let status = Command::new("kill")
-            .arg(self.pid.to_string())
-            .env_clear()
-            .status()
-            .expect("failed to execute kill");
-
-        if !status.success() {
-            panic!("kill start failed");
-        }
-    }
-}
-
-impl Drop for WalProposerNode {
-    fn drop(&mut self) {
-        self.stop();
-    }
-}
-
-/// Read a PID file
-///
-/// This should contain an unsigned integer, but we return it as a String
-/// because our callers only want to pass it back into a subcommand.
-fn read_pidfile(pidfile: &Path) -> std::result::Result<String, io::Error> {
-    fs::read_to_string(pidfile).map_err(|err| {
-        eprintln!("failed to read pidfile {:?}: {:?}", pidfile, err);
-        err
-    })
-}
--- a/docker-entrypoint.sh
+++ b/docker-entrypoint.sh
@@ -0,0 +1,18 @@
+#!/bin/sh
+# Container entrypoint: run the pageserver (initializing /data on first start)
+# when the first argument is 'pageserver'; otherwise run the given command.
+
+# Abort if initialization fails instead of starting on a broken data directory.
+set -e
+
+if [ "$1" = 'pageserver' ]; then
+    if [ ! -d "/data/timelines" ]; then
+        echo "Initializing pageserver data directory"
+        pageserver --init -D /data --postgres-distrib /usr/local
+    fi
+    echo "Starting pageserver at 0.0.0.0:6400"
+    # exec so the server replaces the shell and receives container signals.
+    exec pageserver -l 0.0.0.0:6400 -D /data
+else
+    exec "$@"
+fi
--- a/integration_tests/.gitignore
+++ b/integration_tests/.gitignore
@@ -1 +0,0 @@
-tmp_check/
--- a/integration_tests/Cargo.toml
+++ b/integration_tests/Cargo.toml
@@ -1,17 +0,0 @@
-[package]
-name = "integration_tests"
-version = "0.1.0"
-authors = ["Stas Kelvich <stas@zenith.tech>"]
-edition = "2018"
-
-# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
-
-[dependencies]
-lazy_static = "1.4.0"
-rand = "0.8.3"
-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
-tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
-
-pageserver = { path = "../pageserver" }
-walkeeper = { path = "../walkeeper" }
-control_plane = { path = "../control_plane" }
--- a/integration_tests/tests/test_compute.rs
+++ b/integration_tests/tests/test_compute.rs
@@ -1,11 +0,0 @@
-// test node resettlement to an empty datadir
-
-// TODO
-/*
-#[test]
-fn test_resettlement() {}
-
-// test seq scan of everythin after restart
-#[test]
-fn test_cold_seqscan() {}
-*/
--- a/integration_tests/tests/test_control_plane.rs
+++ b/integration_tests/tests/test_control_plane.rs
@@ -1,8 +0,0 @@
-// TODO
-/*
-#[test]
-fn test_actions() {}
-
-#[test]
-fn test_regress() {}
-*/
--- a/integration_tests/tests/test_pageserver.rs
+++ b/integration_tests/tests/test_pageserver.rs
@@ -1,148 +0,0 @@
-// mod control_plane;
-use control_plane::compute::ComputeControlPlane;
-use control_plane::local_env;
-use control_plane::local_env::PointInTime;
-use control_plane::storage::TestStorageControlPlane;
-
-// XXX: force all redo at the end
-// -- restart + seqscan won't read deleted stuff
-// -- pageserver api endpoint to check all rels
-#[test]
-fn test_redo_cases() {
-    let local_env = local_env::test_env("test_redo_cases");
-
-    // Start pageserver that reads WAL directly from that postgres
-    let storage_cplane = TestStorageControlPlane::one_page_server(&local_env);
-    let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver);
-
-    // start postgres
-    let maintli = storage_cplane.get_branch_timeline("main");
-    let node = compute_cplane.new_test_node(maintli);
-    node.start().unwrap();
-
-    // check basic work with table
-    node.safe_psql(
-        "postgres",
-        "CREATE TABLE t(key int primary key, value text)",
-    );
-    node.safe_psql(
-        "postgres",
-        "INSERT INTO t SELECT generate_series(1,100000), 'payload'",
-    );
-    let count: i64 = node
-        .safe_psql("postgres", "SELECT sum(key) FROM t")
-        .first()
-        .unwrap()
-        .get(0);
-    println!("sum = {}", count);
-    assert_eq!(count, 5000050000);
-
-    // check 'create table as'
-    node.safe_psql("postgres", "CREATE TABLE t2 AS SELECT * FROM t");
-    let count: i64 = node
-        .safe_psql("postgres", "SELECT sum(key) FROM t")
-        .first()
-        .unwrap()
-        .get(0);
-    println!("sum = {}", count);
-    assert_eq!(count, 5000050000);
-}
-
-// Runs pg_regress on a compute node
-#[test]
-fn test_regress() {
-    let local_env = local_env::test_env("test_regress");
-
-    // Start pageserver that reads WAL directly from that postgres
-    let storage_cplane = TestStorageControlPlane::one_page_server(&local_env);
-    let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver);
-
-    // start postgres
-    let maintli = storage_cplane.get_branch_timeline("main");
-    let node = compute_cplane.new_test_node(maintli);
-    node.start().unwrap();
-
-    node.pg_regress();
-}
-
-// Runs pg_bench on a compute node
-#[test]
-fn pgbench() {
-    let local_env = local_env::test_env("pgbench");
-
-    // Start pageserver that reads WAL directly from that postgres
-    let storage_cplane = TestStorageControlPlane::one_page_server(&local_env);
-    let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver);
-
-    // start postgres
-    let maintli = storage_cplane.get_branch_timeline("main");
-    let node = compute_cplane.new_test_node(maintli);
-    node.start().unwrap();
-
-    node.pg_bench(10, 100);
-}
-
-// Run two postgres instances on one pageserver, on different timelines
-#[test]
-fn test_pageserver_two_timelines() {
-    let local_env = local_env::test_env("test_pageserver_two_timelines");
-
-    // Start pageserver that reads WAL directly from that postgres
-    let storage_cplane = TestStorageControlPlane::one_page_server(&local_env);
-    let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver);
-
-    let maintli = storage_cplane.get_branch_timeline("main");
-
-    // Create new branch at the end of 'main'
-    let startpoint = local_env::find_end_of_wal(&local_env, maintli).unwrap();
-    local_env::create_branch(
-        &local_env,
-        "experimental",
-        PointInTime {
-            timelineid: maintli,
-            lsn: startpoint,
-        },
-    )
-    .unwrap();
-    let experimentaltli = storage_cplane.get_branch_timeline("experimental");
-
-    // Launch postgres instances on both branches
-    let node1 = compute_cplane.new_test_node(maintli);
-    let node2 = compute_cplane.new_test_node(experimentaltli);
-    node1.start().unwrap();
-    node2.start().unwrap();
-
-    // check node1
-    node1.safe_psql(
-        "postgres",
-        "CREATE TABLE t(key int primary key, value text)",
-    );
-    node1.safe_psql(
-        "postgres",
-        "INSERT INTO t SELECT generate_series(1,100000), 'payload'",
-    );
-    let count: i64 = node1
-        .safe_psql("postgres", "SELECT sum(key) FROM t")
-        .first()
-        .unwrap()
-        .get(0);
-    println!("sum = {}", count);
-    assert_eq!(count, 5000050000);
-
-    // check node2
-    node2.safe_psql(
-        "postgres",
-        "CREATE TABLE t(key int primary key, value text)",
-    );
-    node2.safe_psql(
-        "postgres",
-        "INSERT INTO t SELECT generate_series(100000,200000), 'payload'",
-    );
-    let count: i64 = node2
-        .safe_psql("postgres", "SELECT sum(key) FROM t")
-        .first()
-        .unwrap()
-        .get(0);
-    println!("sum = {}", count);
-    assert_eq!(count, 15000150000);
-}
--- a/integration_tests/tests/test_wal_acceptor.rs
+++ b/integration_tests/tests/test_wal_acceptor.rs
@@ -1,308 +0,0 @@
-// Restart acceptors one by one while compute is under the load.
-use control_plane::compute::ComputeControlPlane;
-use control_plane::local_env;
-use control_plane::local_env::PointInTime;
-use control_plane::storage::TestStorageControlPlane;
-use pageserver::ZTimelineId;
-
-use rand::Rng;
-use std::sync::Arc;
-use std::time::SystemTime;
-use std::{thread, time};
-
-// Basic check with no injected failures: start one compute node on the
-// "main" timeline, start a proxy for it with the WAL acceptor connection
-// info, load a table and verify its contents.
-#[test]
-fn test_acceptors_normal_work() {
-    let local_env = local_env::test_env("test_acceptors_normal_work");
-
-    // Value passed to TestStorageControlPlane::fault_tolerant() below.
-    const REDUNDANCY: usize = 3;
-    let storage_cplane = TestStorageControlPlane::fault_tolerant(&local_env, REDUNDANCY);
-    let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver);
-    let wal_acceptors = storage_cplane.get_wal_acceptor_conn_info();
-
-    // start postgres
-    let maintli = storage_cplane.get_branch_timeline("main");
-    let node = compute_cplane.new_test_master_node(maintli);
-    node.start().unwrap();
-
-    // start proxy
-    // The returned value is kept in `_proxy` until the end of the test.
-    let _proxy = node.start_proxy(&wal_acceptors);
-
-    // check basic work with table
-    node.safe_psql(
-        "postgres",
-        "CREATE TABLE t(key int primary key, value text)",
-    );
-    node.safe_psql(
-        "postgres",
-        "INSERT INTO t SELECT generate_series(1,100000), 'payload'",
-    );
-    let count: i64 = node
-        .safe_psql("postgres", "SELECT sum(key) FROM t")
-        .first()
-        .unwrap()
-        .get(0);
-    println!("sum = {}", count);
-    // 1 + 2 + ... + 100000 = 5000050000
-    assert_eq!(count, 5000050000);
-    // TODO: check wal files equality (nothing in this test does that yet)
-}
-
-// Run page server and multiple safekeepers, and multiple compute nodes running
-// against different timelines.
-//
-// All extra branches are created from the same point: the end of WAL on
-// "main" as found before any compute node is started.
-#[test]
-fn test_many_timelines() {
-    // Initialize a new repository, and set up WAL safekeepers and page server.
-    const REDUNDANCY: usize = 3;
-    // Total number of timelines, including "main".
-    const N_TIMELINES: usize = 5;
-    let local_env = local_env::test_env("test_many_timelines");
-    let storage_cplane = TestStorageControlPlane::fault_tolerant(&local_env, REDUNDANCY);
-    let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver);
-    let wal_acceptors = storage_cplane.get_wal_acceptor_conn_info();
-
-    // Create branches
-    let mut timelines: Vec<ZTimelineId> = Vec::new();
-    let maintli = storage_cplane.get_branch_timeline("main"); // main branch
-    timelines.push(maintli);
-    let startpoint = local_env::find_end_of_wal(&local_env, maintli).unwrap();
-    // The loop starts at 1 because "main" already occupies the first slot,
-    // so N_TIMELINES - 1 branches named "experimental<i>" are created.
-    for i in 1..N_TIMELINES {
-        // additional branches
-        let branchname = format!("experimental{}", i);
-        local_env::create_branch(
-            &local_env,
-            &branchname,
-            PointInTime {
-                timelineid: maintli,
-                lsn: startpoint,
-            },
-        )
-        .unwrap();
-        let tli = storage_cplane.get_branch_timeline(&branchname);
-        timelines.push(tli);
-    }
-
-    // start postgres on each timeline
-    let mut nodes = Vec::new();
-    for tli in timelines {
-        let node = compute_cplane.new_test_node(tli);
-        nodes.push(node.clone());
-        node.start().unwrap();
-        // NOTE(review): the value returned by start_proxy() is dropped right
-        // away here, while the other tests in this file bind it to `_proxy`.
-        // Confirm the proxy does not depend on that value staying alive.
-        node.start_proxy(&wal_acceptors);
-    }
-
-    // create schema
-    for node in &nodes {
-        node.safe_psql(
-            "postgres",
-            "CREATE TABLE t(key int primary key, value text)",
-        );
-    }
-
-    // Populate data
-    for node in &nodes {
-        node.safe_psql(
-            "postgres",
-            "INSERT INTO t SELECT generate_series(1,100000), 'payload'",
-        );
-    }
-
-    // Check data
-    for node in &nodes {
-        let count: i64 = node
-            .safe_psql("postgres", "SELECT sum(key) FROM t")
-            .first()
-            .unwrap()
-            .get(0);
-        println!("sum = {}", count);
-        // 1 + 2 + ... + 100000 = 5000050000
-        assert_eq!(count, 5000050000);
-    }
-}
-
-// Majority is always alive
-//
-// Inserts 1000 rows one by one. After each insert, with probability
-// FAULT_PROBABILITY, either restarts the acceptor that is currently stopped
-// or stops a randomly chosen one. At most one acceptor is stopped at a time.
-#[test]
-fn test_acceptors_restarts() {
-    let local_env = local_env::test_env("test_acceptors_restarts");
-
-    // Test parameters: value passed to fault_tolerant(), and the per-insert
-    // chance of toggling an acceptor.
-    const REDUNDANCY: usize = 3;
-    const FAULT_PROBABILITY: f32 = 0.01;
-
-    let storage_cplane = TestStorageControlPlane::fault_tolerant(&local_env, REDUNDANCY);
-    let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver);
-    let wal_acceptors = storage_cplane.get_wal_acceptor_conn_info();
-    let mut rng = rand::thread_rng();
-
-    // start postgres
-    let maintli = storage_cplane.get_branch_timeline("main");
-    let node = compute_cplane.new_test_master_node(maintli);
-    node.start().unwrap();
-
-    // start proxy
-    let _proxy = node.start_proxy(&wal_acceptors);
-    // Index of the acceptor that is currently stopped, if any.
-    let mut failed_node: Option<usize> = None;
-
-    // check basic work with table
-    node.safe_psql(
-        "postgres",
-        "CREATE TABLE t(key int primary key, value text)",
-    );
-    let mut psql = node.open_psql("postgres");
-    for i in 1..=1000 {
-        psql.execute("INSERT INTO t values ($1, 'payload')", &[&i])
-            .unwrap();
-        let prob: f32 = rng.gen();
-        if prob <= FAULT_PROBABILITY {
-            // Note: `node` below shadows the compute node with an acceptor
-            // index inside these branches only.
-            if let Some(node) = failed_node {
-                storage_cplane.wal_acceptors[node].start();
-                failed_node = None;
-            } else {
-                let node: usize = rng.gen_range(0..REDUNDANCY);
-                failed_node = Some(node);
-                storage_cplane.wal_acceptors[node].stop().unwrap();
-            }
-        }
-    }
-    // An acceptor stopped during the loop is not restarted here, so the final
-    // query may run with one acceptor still down.
-    let count: i64 = node
-        .safe_psql("postgres", "SELECT sum(key) FROM t")
-        .first()
-        .unwrap()
-        .get(0);
-    println!("sum = {}", count);
-    // 1 + 2 + ... + 1000 = 500500
-    assert_eq!(count, 500500);
-}
-
-// Spawn a detached thread that sleeps for one second and then calls
-// start() on acceptor number `no`. Returns immediately; the thread is
-// never joined.
-fn start_acceptor(cplane: &Arc<TestStorageControlPlane>, no: usize) {
-    let restart_delay = time::Duration::from_secs(1);
-    let shared_cplane = Arc::clone(cplane);
-    thread::spawn(move || {
-        thread::sleep(restart_delay);
-        shared_cplane.wal_acceptors[no].start();
-    });
-}
-
-// Stop majority of acceptors while compute is under the load. Boot
-// them again and check that nothing was lost. Repeat.
-// NOTE(review): an "N_CRASHES env var" was mentioned here, but this test
-// does not read any environment variable.
-#[test]
-fn test_acceptors_unavailability() {
-    let local_env = local_env::test_env("test_acceptors_unavailability");
-
-    // Value passed to fault_tolerant(); acceptors 0 and 1 are the ones
-    // stopped and restarted below.
-    const REDUNDANCY: usize = 2;
-
-    let storage_cplane = TestStorageControlPlane::fault_tolerant(&local_env, REDUNDANCY);
-    let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver);
-    let wal_acceptors = storage_cplane.get_wal_acceptor_conn_info();
-
-    // start postgres
-    let maintli = storage_cplane.get_branch_timeline("main");
-    let node = compute_cplane.new_test_master_node(maintli);
-    node.start().unwrap();
-
-    // start proxy
-    let _proxy = node.start_proxy(&wal_acceptors);
-
-    // check basic work with table
-    node.safe_psql(
-        "postgres",
-        "CREATE TABLE t(key int primary key, value text)",
-    );
-    let mut psql = node.open_psql("postgres");
-    psql.execute("INSERT INTO t values (1, 'payload')", &[])
-        .unwrap();
-
-    // Stop acceptor 0 and schedule its restart about one second from now
-    // (see start_acceptor). The control plane is moved into an Arc so the
-    // restart thread can share it.
-    storage_cplane.wal_acceptors[0].stop().unwrap();
-    let cp = Arc::new(storage_cplane);
-    start_acceptor(&cp, 0);
-    let now = SystemTime::now();
-    psql.execute("INSERT INTO t values (2, 'payload')", &[])
-        .unwrap();
-    // NOTE(review): as_secs() returns whole seconds, so `> 1` only holds once
-    // at least 2 seconds have elapsed -- confirm that is the intended bound.
-    assert!(now.elapsed().unwrap().as_secs() > 1);
-    psql.execute("INSERT INTO t values (3, 'payload')", &[])
-        .unwrap();
-
-    // Same again with acceptor 1. `now` is not reset, so the bound below is
-    // measured from the first outage.
-    cp.wal_acceptors[1].stop().unwrap();
-    start_acceptor(&cp, 1);
-    psql.execute("INSERT INTO t values (4, 'payload')", &[])
-        .unwrap();
-    assert!(now.elapsed().unwrap().as_secs() > 2);
-
-    psql.execute("INSERT INTO t values (5, 'payload')", &[])
-        .unwrap();
-
-    let count: i64 = node
-        .safe_psql("postgres", "SELECT sum(key) FROM t")
-        .first()
-        .unwrap()
-        .get(0);
-    println!("sum = {}", count);
-    // 1 + 2 + 3 + 4 + 5 = 15
-    assert_eq!(count, 15);
-}
-
-// Failure injector, meant to run on its own thread. While the control plane
-// reports is_running(), each round sleeps one second, stops a random subset
-// of the acceptors (possibly none, possibly all), sleeps another second and
-// starts that same subset again.
-fn simulate_failures(cplane: Arc<TestStorageControlPlane>) {
-    let pause = time::Duration::from_secs(1);
-    let acceptor_count = cplane.wal_acceptors.len();
-    let mut rng = rand::thread_rng();
-
-    while cplane.is_running() {
-        thread::sleep(pause);
-
-        // Bit i of the mask selects acceptor i for this round.
-        let mask: u32 = rng.gen_range(0..(1 << acceptor_count));
-        let victims: Vec<usize> = (0..acceptor_count)
-            .filter(|&idx| (mask & (1 << idx)) != 0)
-            .collect();
-
-        for &idx in &victims {
-            cplane.wal_acceptors[idx].stop().unwrap();
-        }
-        thread::sleep(pause);
-        for &idx in &victims {
-            cplane.wal_acceptors[idx].start();
-        }
-    }
-}
-
-// Race condition test
-//
-// Inserts 1000 rows one by one while a background thread runs
-// simulate_failures() against the acceptors, then checks that every row is
-// present.
-#[test]
-fn test_race_conditions() {
-    let local_env = local_env::test_env("test_race_conditions");
-
-    // Value passed to fault_tolerant().
-    const REDUNDANCY: usize = 3;
-
-    // Wrapped in an Arc so the failure-injection thread can share it.
-    let storage_cplane = Arc::new(TestStorageControlPlane::fault_tolerant(
-        &local_env, REDUNDANCY,
-    ));
-    let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver);
-    let wal_acceptors = storage_cplane.get_wal_acceptor_conn_info();
-
-    // start postgres
-    let maintli = storage_cplane.get_branch_timeline("main");
-    let node = compute_cplane.new_test_master_node(maintli);
-    node.start().unwrap();
-
-    // start proxy
-    let _proxy = node.start_proxy(&wal_acceptors);
-
-    // check basic work with table
-    node.safe_psql(
-        "postgres",
-        "CREATE TABLE t(key int primary key, value text)",
-    );
-
-    // Start injecting failures in the background.
-    let cp = storage_cplane.clone();
-    let failures_thread = thread::spawn(move || {
-        simulate_failures(cp);
-    });
-
-    let mut psql = node.open_psql("postgres");
-    for i in 1..=1000 {
-        psql.execute("INSERT INTO t values ($1, 'payload')", &[&i])
-            .unwrap();
-    }
-    let count: i64 = node
-        .safe_psql("postgres", "SELECT sum(key) FROM t")
-        .first()
-        .unwrap()
-        .get(0);
-    println!("sum = {}", count);
-    // 1 + 2 + ... + 1000 = 500500
-    assert_eq!(count, 500500);
-
-    // simulate_failures() loops while is_running() is true; stop the control
-    // plane first, then wait for the thread to finish.
-    storage_cplane.stop();
-    failures_thread.join().unwrap();
-}
--- a/mgmt-console/.gitignore
+++ b/mgmt-console/.gitignore
@@ -1,23 +0,0 @@
-# See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
-
-# dependencies
-/node_modules
-/.pnp
-.pnp.js
-
-# testing
-/coverage
-
-# production
-/build
-
-# misc
-.DS_Store
-.env.local
-.env.development.local
-.env.test.local
-.env.production.local
-
-npm-debug.log*
-yarn-debug.log*
-yarn-error.log*
--- a/mgmt-console/README
+++ b/mgmt-console/README
@@ -1,55 +0,0 @@
-Mock implementation of a management console.
-
-See demo-howto.txt for usage.
-
-Building and Installation
-------------------------
-
-To compile Postgres:
-  sudo apt build-dep postgresql
-  sudo apt install bison flex libz-dev libssl-dev
-  sudo apt install ccache
-  sudo apt install libcurl4-openssl-dev libxml2-dev
-
-For the webapp:
-  # NOTE: This requires at least version 1.1.0 of python3-flask. That's not
-  # available in Debian Buster, need at least Bullseye.
-
-  sudo apt install python3 python3-flask python3-pip npm webpack
-  pip3 install Flask-BasicAuth
-  pip3 install boto3
-
-Clone, compile, and install the patched version of Postgres:
-
-  git clone https://github.com/libzenith/postgres.git
-  cd postgres
-  git checkout zenith-experiments
-  ./configure --enable-debug --enable-cassert --with-openssl --prefix=/home/heikki/pgsql-install --with-libxml CC="ccache gcc" CFLAGS="-O0"
-  make -j4 -s install
-
-Get the webapp:
-  cd ~
-  git clone https://github.com/libzenith/zenith-mgmt-console.git
-  cd zenith-mgmt-console
-  mkdir pgdatadirs
-
-
-  openssl req -new -x509 -days 365 -nodes -text -out server.crt \
-    -keyout server.key -subj "/CN=zenith-demo"
-
-For Mock S3 server (unless you want to test against a real cloud service):
-  sudo apt install python3-tornado
-
-  cd ~/zenith-mgmt-console
-  git clone https://github.com/hlinnaka/ms3.git
-
-Compile & run it:
-  npm install
-  webpack # compile React app
-
-  BASIC_AUTH_PASSWORD=<password> ./launch-local.sh
-
-
-You can view the contents of the S3 bucket with a browser:
-
-http://<server>/api/list_bucket
--- a/mgmt-console/app.py
+++ b/mgmt-console/app.py
@@ -1,340 +0,0 @@
-from flask import request
-from flask_basicauth import BasicAuth
-from flask import render_template
-from subprocess import PIPE, STDOUT, run, Popen
-import html
-import os
-import re
-import shutil
-import logging
-import time
-
-import boto3
-from boto3.session import Session
-from botocore.client import Config
-from botocore.handlers import set_list_objects_encoding_type_url
-
-from flask import Flask
-
-import waldump
-
-
-app = Flask(__name__)
-
-# HTTP basic auth: fixed user name, password taken from the
-# BASIC_AUTH_PASSWORD environment variable (None if it is not set).
-app.config['BASIC_AUTH_USERNAME'] = 'zenith'
-app.config['BASIC_AUTH_PASSWORD'] = os.getenv('BASIC_AUTH_PASSWORD')
-# NOTE(review): per Flask-BasicAuth, FORCE protects every route -- confirm.
-app.config['BASIC_AUTH_FORCE'] = True
-
-basic_auth = BasicAuth(app)
-
-# S3 configuration:
-# Each setting comes from the environment, with the defaults shown here.
-# The default secret is the empty string.
-
-ENDPOINT = os.getenv('S3_ENDPOINT', 'https://localhost:9000')
-ACCESS_KEY = os.getenv('S3_ACCESSKEY', 'minioadmin')
-SECRET = os.getenv('S3_SECRET', '')
-BUCKET = os.getenv('S3_BUCKET', 'foobucket')
-
-print("Using bucket at " + ENDPOINT);
-
-#boto3.set_stream_logger('botocore', logging.DEBUG)
-
-session = Session(aws_access_key_id=ACCESS_KEY,
-                  aws_secret_access_key=SECRET,
-                  region_name=os.getenv('S3_REGION', 'auto'))
-
-# needed for google cloud?
-session.events.unregister('before-parameter-build.s3.ListObjects',
-                          set_list_objects_encoding_type_url)
-
-# Resource-style handle, used for listing and reading objects in BUCKET.
-# NOTE(review): verify=False turns off TLS certificate verification for the
-# endpoint -- confirm this is acceptable outside of a local demo.
-s3resource = session.resource('s3',
-                              endpoint_url=ENDPOINT,
-                              verify=False,
-                              config=Config(signature_version='s3v4'))
-s3bucket = s3resource.Bucket(BUCKET)
-
-# Separate low-level client with the same endpoint and credentials; it is
-# created via boto3.client(), so it does not go through `session` above.
-s3_client = boto3.client('s3',
-                         endpoint_url=ENDPOINT,
-                         verify=False,
-                         config=Config(signature_version='s3v4'),
-                         aws_access_key_id=ACCESS_KEY,
-                         aws_secret_access_key=SECRET)
-
-
-@app.route("/")
-def index():
-    """Serve the main page by rendering the index.html template."""
-    return render_template("index.html")
-
-
-@app.route("/api/waldump")
-def render_waldump():
-    """Serve the WAL dump page by rendering the waldump.html template."""
-    return render_template("waldump.html")
-
-@app.route('/api/fetch_wal')
-def fetch_wal():
-    return waldump.fetch_wal(request, s3bucket);
-
-@app.route("/api/server_status")
-def server_status():
-    dirs = os.listdir("pgdatadirs")
-    dirs.sort()
-
-    primary = None
-    standbys = []
-
-    for dirname in dirs:
-        
-        result = run("pg_ctl status -D pgdatadirs/" + dirname, stdout=PIPE, stderr=STDOUT, universal_newlines=True, shell=True)
-
-        srv = {
-            'datadir': dirname,
-            'status': result.stdout,
-            'port': None
-        }
-
-        if dirname == 'primary':
-            primary = srv;
-            primary['port'] = 5432;
-        else:
-            standby_match = re.search('standby_([0-9]+)', dirname)
-            if standby_match:
-                srv['port'] = int(standby_match.group(1))
-
-            standbys.append(srv);
-
-    return {'primary': primary, 'standbys': standbys}
-
-@app.route('/api/list_bucket')
-def list_bucket():
-
-    response = 'cloud bucket contents:<br>\n'
-
-    for file in s3bucket.objects.all():
-        response = response + html.escape(file.key) + '<br>\n'
-
-    return response
-
-def walpos_str(walpos):
-    return '{:X}/{:X}'.format(walpos >> 32, walpos & 0xFFFFFFFF)
-
-@app.route('/api/bucket_summary')
-def bucket_summary():
-
-    nonrelimages = []
-    minwal = int(0)
-    maxwal = int(0)
-    minseqwal = int(0)
-    maxseqwal = int(0)
-
-    for file in s3bucket.objects.all():
-        path = file.key
-        match = re.search('nonreldata/nonrel_([0-9A-F]+).tar', path)
-        if match:
-            walpos = int(match.group(1), 16)
-            nonrelimages.append(walpos_str(walpos))
-
-        match = re.search('nonreldata/nonrel_([0-9A-F]+)-([0-9A-F]+)', path)
-        if match:
-            endwal = int(match.group(2), 16)
-            if endwal > maxwal:
-                maxwal = endwal
-
-        match = re.search('walarchive/([0-9A-F]{8})([0-9A-F]{8})([0-9A-F]{8})', path)
-        if match:
-            tli = int(match.group(1), 16)
-            logno = int(match.group(2), 16)
-            segno = int(match.group(3), 16)
-            # FIXME: this assumes default 16 MB wal segment size
-            logsegno = logno * (0x100000000 / (16*1024*1024)) + segno
-
-            seqwal = int((logsegno + 1) * (16*1024*1024))
-
-            if seqwal > maxseqwal:
-                maxseqwal = seqwal;
-            if minseqwal == 0 or seqwal < minseqwal:
-                minseqwal = seqwal;
-
-    return {
-        'nonrelimages': nonrelimages,
-        'minwal': walpos_str(minwal),
-        'maxwal': walpos_str(maxwal),
-        'minseqwal': walpos_str(minseqwal),
-        'maxseqwal': walpos_str(maxseqwal)
-        }
-
-def print_cmd_result(cmd_result):
-    """Format a finished command as text for an HTTP response.
-
-    Reads the .args, .returncode and .stdout attributes of `cmd_result` and
-    passes them to print_cmd_result_ex().
-    """
-    return print_cmd_result_ex(cmd_result.args, cmd_result.returncode, cmd_result.stdout)
-
-def print_cmd_result_ex(cmd, returncode, stdout):
-    res = ''
-    res += 'ran command:\n' + str(cmd) + '\n'
-    res += 'It returned code ' + str(returncode) + '\n'
-    res += '\n'
-    res += 'stdout/stderr:\n'
-    res += stdout
-
-    return res
-
-@app.route('/api/init_primary', methods=['GET', 'POST'])
-def init_primary():
-    
-    initdb_result = run("initdb -D pgdatadirs/primary --username=zenith --pwfile=pg-password.txt", stdout=PIPE, stderr=STDOUT, universal_newlines=True, shell=True)
-    if initdb_result.returncode != 0:
-        return print_cmd_result(initdb_result)
-    
-    # Append archive_mode and archive_command and port to postgresql.conf
-    f=open("pgdatadirs/primary/postgresql.conf", "a+")
-    f.write("listen_addresses='*'\n")
-    f.write("archive_mode=on\n")
-    f.write("archive_command='zenith_push --archive-wal-path=%p --archive-wal-fname=%f'\n")
-    f.write("ssl=on\n")
-    f.close()
-
-    f=open("pgdatadirs/primary/pg_hba.conf", "a+")
-    f.write("# allow SSL connections with password from anywhere\n")
-    f.write("hostssl    all             all             0.0.0.0/0           md5\n")
-    f.write("hostssl    all             all             ::0/0               md5\n")
-    f.close()
-
-    shutil.copyfile("server.crt", "pgdatadirs/primary/server.crt")
-    shutil.copyfile("server.key", "pgdatadirs/primary/server.key")
-    os.chmod("pgdatadirs/primary/server.key", 0o0600)
-    
-    start_proc = Popen(args=["pg_ctl", "start", "-D", "pgdatadirs/primary", "-l", "pgdatadirs/primary/log"], stdout=PIPE, stderr=STDOUT, universal_newlines=True, shell=False, start_new_session=True, close_fds=True)
-    start_rc = start_proc.wait()
-    start_stdout, start_stderr = start_proc.communicate()
-
-    responsestr = print_cmd_result(initdb_result) + '\n'
-    responsestr += print_cmd_result_ex(start_proc.args, start_rc, start_stdout)
-
-    return responsestr
-
-@app.route('/api/zenith_push', methods=['GET', 'POST'])
-def zenith_push():
-    # Stop the primary if it's running
-    stop_result = run(args=["pg_ctl", "stop", "-D", "pgdatadirs/primary"], stdout=PIPE, stderr=STDOUT, universal_newlines=True, shell=False, start_new_session=True, close_fds=True)
-    
-    # Call zenith_push
-    push_result = run("zenith_push -D pgdatadirs/primary", stdout=PIPE, stderr=STDOUT, universal_newlines=True, shell=True)
-
-    # Restart the primary
-    start_proc = Popen(args=["pg_ctl", "start", "-D", "pgdatadirs/primary", "-l", "pgdatadirs/primary/log"], stdout=PIPE, stderr=STDOUT, universal_newlines=True, shell=False, start_new_session=True, close_fds=True)
-    start_rc = start_proc.wait()
-    start_stdout, start_stderr = start_proc.communicate()
-    
-    responsestr = print_cmd_result(stop_result) + '\n'
-    responsestr += print_cmd_result(push_result) + '\n'
-    responsestr += print_cmd_result_ex(start_proc.args, start_rc, start_stdout) + '\n'
-
-    return responsestr
-
-@app.route('/api/create_standby', methods=['GET', 'POST'])
-def create_standby():
-
-    walpos = request.form.get('walpos')
-    if not walpos:
-        return 'no walpos'
-    
-    dirs = os.listdir("pgdatadirs")
-
-    last_port = 5432
-
-    for dirname in dirs:
-
-        standby_match = re.search('standby_([0-9]+)', dirname)
-        if standby_match:
-            port = int(standby_match.group(1))
-            if port > last_port:
-                last_port = port
-
-    standby_port = last_port + 1
-
-    standby_dir = "pgdatadirs/standby_" + str(standby_port)
-
-    # Call zenith_restore
-    restore_result = run(["zenith_restore", "--end=" + walpos, "-D", standby_dir], stdout=PIPE, stderr=STDOUT, encoding='latin1')
-    responsestr = print_cmd_result(restore_result)
-
-    if restore_result.returncode == 0:
-        # Append hot_standby and port to postgresql.conf
-        f=open(standby_dir + "/postgresql.conf", "a+")
-        f.write("hot_standby=on\n")
-        f.write("port=" + str(standby_port) + "\n")
-        f.close()
-
-        start_proc = Popen(args=["pg_ctl", "start", "-D", standby_dir, "-l", standby_dir + "/log"], stdout=PIPE, stderr=STDOUT, universal_newlines=True, shell=False, start_new_session=True, close_fds=True)
-        start_rc = start_proc.wait()
-        start_stdout, start_stderr = start_proc.communicate()
-        responsestr += '\n\n' + print_cmd_result_ex(start_proc.args, start_rc, start_stdout)
-
-    return responsestr
-
-@app.route('/api/destroy_server', methods=['GET', 'POST'])
-def destroy_primary():
-
-    datadir = request.form.get('datadir')
-
-    # Check that the datadir parameter doesn't contain anything funny.
-    if not re.match("^[A-Za-z0-9_-]+$", datadir):
-        raise Exception('invalid datadir: ' + datadir)
-    
-    # Stop the server if it's running
-    stop_result = run(args=["pg_ctl", "stop", "-m", "immediate", "-D", "pgdatadirs/" + datadir], stdout=PIPE, stderr=STDOUT, universal_newlines=True, shell=False, start_new_session=True, close_fds=True)
-
-    shutil.rmtree('pgdatadirs/' + datadir, ignore_errors=True)
-
-    responsestr = print_cmd_result(stop_result) + '\n'
-    responsestr += 'Deleted datadir ' + datadir + '.\n'
-
-    return responsestr
-
-@app.route('/api/restore_primary', methods=['GET', 'POST'])
-def restore_primary():
-
-    # Call zenith_restore
-    restore_result = run(["zenith_restore", "-D", "pgdatadirs/primary"], stdout=PIPE, stderr=STDOUT, encoding='latin1')
-    responsestr = print_cmd_result(restore_result)
-
-    # Append restore_command to postgresql.conf, so that it can find the last raw WAL segments
-    f=open("pgdatadirs/primary/postgresql.conf", "a+")
-    f.write("listen_addresses='*'\n")
-    f.write("restore_command='zenith_restore --archive-wal-path=%p --archive-wal-fname=%f'\n")
-    f.write("ssl=on\n")
-    f.close()
-    
-    if restore_result.returncode == 0:
-        start_proc = Popen(args=["pg_ctl", "start", "-D", "pgdatadirs/primary", "-l", "pgdatadirs/primary/log"], stdout=PIPE, stderr=STDOUT, universal_newlines=True, shell=False, start_new_session=True, close_fds=True)
-        start_rc = start_proc.wait()
-        start_stdout, start_stderr = start_proc.communicate()
-        responsestr += print_cmd_result_ex(start_proc.args, start_rc, start_stdout)
-
-    return responsestr
-
-@app.route('/api/slicedice', methods=['GET', 'POST'])
-def run_slicedice():
-    result = run("zenith_slicedice", stdout=PIPE, stderr=STDOUT, universal_newlines=True, shell=True)
-    
-    responsestr = print_cmd_result(result)
-
-    return responsestr
-
-@app.route('/api/reset_demo', methods=['POST'])
-def reset_all():
-    result = run("pkill -9 postgres", stdout=PIPE, stderr=STDOUT, universal_newlines=True, shell=True)
-
-    dirs = os.listdir("pgdatadirs")
-    for dirname in dirs:
-        shutil.rmtree('pgdatadirs/' + dirname)
-        
-    for file in s3bucket.objects.all():
-        s3_client.delete_object(Bucket = BUCKET, Key = file.key)
-
-    responsestr = print_cmd_result(result) + '\n'
-    responsestr += '''
-Deleted all Postgres datadirs.
-Deleted all files in object storage bucket.
-'''
-
-    return responsestr
-
-# When executed directly as a script, start the Flask server with its
-# default settings.
-if __name__ == '__main__':
-    app.run()
--- a/mgmt-console/babel.config.js
+++ b/mgmt-console/babel.config.js
@@ -1,3 +0,0 @@
-// Babel configuration: enable the env and react presets.
-module.exports = {
-    presets: ["@babel/preset-env", "@babel/preset-react"],
-};
--- a/mgmt-console/demo-howto.txt
+++ b/mgmt-console/demo-howto.txt
@@ -1,67 +0,0 @@
-Mock implementation of a management console.
-
-This isn't very different from a "normal" PostgreSQL installation with
-a base backup and WAL archive. The main user-visible difference is
-that when you create a standby server, we don't restore the whole data
-directory, but only the "non-relation" files. Relation files are
-restored on demand, when they're accessed the first time. That makes
-the "create standby" operation very fast, at the cost of some delay
-when you first connect and start running queries. This is most visible
-if you have a large database. (However, see note below about large databases)
-
-Note: lots of things are broken/unsafe. Things will fail if a table is
-larger than 1 GB. Or if there are more than 1000 files in the cloud
-bucket.
-
-How to use this demo:
-
-1. If there are any leftovers from previous runs, reset by clicking
-   the RESET DEMO button.  This kills and deletes all Postgres servers,
-   and empties the cloud storage bucket
-
-2. Create primary server by clicking on the "Init primary" button
-
-3. Push a base image of the primary to cloud storage, by clicking the
-   "push base image" button.  (This takes about 30 seconds, be
-   patient)
-
-4. Connect to primary with psql, and create a test table with a little data.
-
-      psql postgres  -p5432 -U zenith -h<host>
-
-      create table mytable (i int4);
-
-      insert into mytable values (1);
-      select pg_switch_wal();
-
-   The Postgres password is the same as for the management console.
-
-5. Now that there's a new WAL segment in the archive, we can "slice &
-   dice" it. Click on the "Slice & dice" button.
-
-6. Perform more updates on the primary, to generate more WAL.
-
-      insert into mytable values (2); select pg_switch_wal();
-      insert into mytable values (3); select pg_switch_wal();
-      insert into mytable values (4); select pg_switch_wal();
-      insert into mytable values (5); select pg_switch_wal();
-
-7. Slice & Dice the WAL again
-
-8. Now you can create read-only standby servers at any point in the
-   WAL. Type a WAL position in the text box (or use the slider), and
-   click "Create new standby". The first standby is created at port 5433,
-   the second at port 5434, and so forth.
-
-9. Connect to the standby with "psql -p 5433". Note that it takes a
-   few seconds until the connection is established. That's because the
-   standby has to restore the basic system catalogs, like pg_database and
-   pg_authid from the backup. After connecting, you can do "\d" to list
-   tables, this will also take a few seconds, as more catalog tables are
-   restored from backup.  Subsequent commands will be faster.
-
-   Run queries in the standby:
-
-      select * from mytable;
-
-   the result depends on the LSN that you picked when you created the server.
--- a/mgmt-console/js/app.js
+++ b/mgmt-console/js/app.js
@@ -1,463 +0,0 @@
-import React, { useState, useEffect } from 'react';
-import ReactDOM from 'react-dom';
-import Loader from "react-loader-spinner";
-import { Router, Route, Link, IndexRoute, hashHistory, browserHistory } from 'react-router';
-
-function ServerStatus(props) {
-    const datadir = props.server.datadir;
-    const status = props.server.status;
-    const port = props.server.port;
-
-    return (
-	<div>
-	    <h2>{ datadir == 'primary' ? 'Primary' : datadir }</h2>
-	    status: <div className='status'>{status}</div><br/>
-	    to connect: <span className='shellcommand'>psql -h { window.location.hostname } -p { port } -U zenith postgres</span><br/>
-	</div>
-    );
-}
-
-function StandbyList(props) {
-    const bucketSummary = props.bucketSummary;
-    const standbys = props.standbys;
-    const maxwalpos = bucketSummary.maxwal ? walpos_to_int(bucketSummary.maxwal) : 0;
-
-    const [walposInput, setWalposInput] = useState({ src: 'text', value: '0/0'});
-
-    // find earliest base image
-    const minwalpos = bucketSummary.nonrelimages ? bucketSummary.nonrelimages.reduce((minpos, imgpos_str, index, array) => {
-	const imgpos = walpos_to_int(imgpos_str);
-	return (minpos == 0 || imgpos < minpos) ? imgpos : minpos;
-    }, 0) : 0;
-
-    const can_create_standby = minwalpos > 0 && maxwalpos > 0 && maxwalpos >= minwalpos;
-    var walpos_valid = true;
-
-    function create_standby() {
-	const formdata = new FormData();
-	formdata.append("walpos", walposStr);
-
-	props.startOperation('Creating new standby at ' + walposStr + '...',
-			     fetch("/api/create_standby", { method: 'POST', body: formdata }));
-    }
-
-    function destroy_standby(datadir) {
-	const formdata = new FormData();
-	formdata.append("datadir", datadir);
-	props.startOperation('Destroying ' + datadir + '...',
-			     fetch("/api/destroy_server", { method: 'POST', body: formdata }));
-    }
-
-    const handleSliderChange = (event) => {
-	setWalposInput({ src: 'slider', value: event.target.value });
-    }    
-
-    const handleWalposChange = (event) => {
-	setWalposInput({ src: 'text', value: event.target.value });
-    }
-
-    var sliderValue;
-    var walposStr;
-    if (walposInput.src == 'text')
-    {
-	const walpos = walpos_to_int(walposInput.value);
-
-	if (walpos >= minwalpos && walpos <= maxwalpos)
-	    walpos_valid = true;
-	else
-	    walpos_valid = false;
-	
-	sliderValue = Math.round((walpos - minwalpos) / (maxwalpos - minwalpos) * 100);
-	walposStr = walposInput.value;
-    }
-    else
-    {
-	const slider = walposInput.value;
-	const new_walpos = minwalpos + slider / 100 * (maxwalpos - minwalpos);
-
-	console.log('minwalpos: '+ minwalpos);
-	console.log('maxwalpos: '+ maxwalpos);
-
-	walposStr = int_to_walpos(Math.round(new_walpos));
-	walpos_valid = true;
-	console.log(walposStr);
-    }
-
-    var standbystatus = ''
-    if (standbys)
-    {
-	standbystatus = 
-	    <div>
-		{
-		    standbys.length > 0 ? 
- 			standbys.map((server) =>
-			    <>
-				<ServerStatus key={ 'status_' + server.datadir} server={server}/>
-				<button key={ 'destroy_' + server.datadir} onClick={e => destroy_standby(server.datadir)}>Destroy standby</button>
-			    </>
-			) : "no standby servers"
-		}
-	    </div>
-    }
-
-    return (
-	<div>
-	    <h2>Standbys</h2>
-	    <button onClick={create_standby} disabled={!can_create_standby || !walpos_valid}>Create new Standby</button> at LSN 
-            <input type="text" id="walpos_input" value={ walposStr } onChange={handleWalposChange} disabled={!can_create_standby}/>
-	    <input type="range" id="walpos_slider" min="0" max="100" steps="1" value={sliderValue}  onChange={handleSliderChange} disabled={!can_create_standby}/>
-	    <br/>
-	    { standbystatus }
-	</div>
-    );
-}
-
-function ServerList(props) {
-    const primary = props.serverStatus ? props.serverStatus.primary : null;
-    const standbys = props.serverStatus ? props.serverStatus.standbys : [];
-    const bucketSummary = props.bucketSummary;
-
-    var primarystatus = '';
-
-    function destroy_primary() {
-	const formdata = new FormData();
-	formdata.append("datadir", 'primary');
-	props.startOperation('Destroying primary...',
-			     fetch("/api/destroy_server", { method: 'POST', body: formdata }));
-    }    
-
-    function restore_primary() {
-	props.startOperation('Restoring primary...',
-			     fetch("/api/restore_primary", { method: 'POST' }));
-    }    
-    
-    if (primary)
-    {
-	primarystatus =
-	    <div>
-		<ServerStatus server={primary}/>
-		<button onClick={destroy_primary}>Destroy primary</button>
-	    </div>
-    }
-    else
-    {
-	primarystatus =
-	    <div>
-		no primary server<br/>
-		<button onClick={restore_primary}>Restore primary</button>
-	    </div>
-    }
-
-    return (
-	<>
-	    { primarystatus }
-	    <StandbyList standbys={standbys} startOperation={props.startOperation} bucketSummary={props.bucketSummary}/>
-	    <p className="todo">
-		Should we list the WAL safekeeper nodes here? Or are they part of the Storage? Or not visible to users at all?
-	    </p>
-	</>
-    );
-}
-
-function BucketSummary(props) {
-    const bucketSummary = props.bucketSummary;
-    const startOperation = props.startOperation;
-
-    function slicedice() {
-	startOperation('Slicing sequential WAL to per-relation WAL...',
-		       fetch("/api/slicedice", { method: 'POST' }));
-    }
-    
-    if (!bucketSummary.nonrelimages)
-    {
-	return <>loading...</>
-    }
-
-    return (
-	<div>
-	    <div>Base images at following WAL positions:
-		<ul>
-		    {bucketSummary.nonrelimages.map((img) => (
-			<li key={img}>{img}</li>
-		    ))}
-		</ul>
-	    </div>
-            Sliced WAL is available up to { bucketSummary.maxwal }<br/>
-	    Raw WAL is available up to { bucketSummary.maxseqwal }<br/>
-
-	    <br/>
-	    <button onClick={slicedice}>Slice & Dice WAL</button>
-	    <p className="todo">
-		Currently, the slicing or "sharding" of the WAL needs to be triggered manually, by clicking the above button.
-		<br/>
-		TODO: make it a continuous process that runs in the WAL safekeepers, or in the Page Servers, or as a standalone service.
-	    </p>
-	</div>
-    );
-}
-
-function ProgressIndicator()
-{
-    return (
-	<div>
-	    <Loader
-		type="Puff"
-		color="#00BFFF"
-		height={100}
-		width={100}
-	    />
-	</div>
-    )
-}
-
-function walpos_to_int(walpos)
-{
-    const [hi, lo] = walpos.split('/');
-
-    return parseInt(hi, 16) + parseInt(lo, 16);
-}
-
-function int_to_walpos(x)
-{
-    console.log('converting ' + x);
-    return (Math.floor((x / 0x100000000)).toString(16) + '/' + (x % 0x100000000).toString(16)).toUpperCase();
-}
-
-function OperationStatus(props) {
-    const lastOperation = props.lastOperation;
-    const inProgress = props.inProgress;
-    const operationResult = props.operationResult;
-
-    if (lastOperation)
-    {
-	return (
-	    <div><h2>Last operation:</h2>
-		<div>{lastOperation} { (!inProgress && lastOperation) ? 'done!' : '' }</div>
-		<div className='result'>
-		    {inProgress ? <ProgressIndicator/> : <pre>{operationResult}</pre>}
-		</div>
-	    </div>
-	);
-    }
-    else
-	return '';
-}
-
-function ActionButtons(props) {
-
-    const startOperation = props.startOperation;
-    const bucketSummary = props.bucketSummary;
-    
-    function reset_demo() {
-	startOperation('resetting everything...',
-		       fetch("/api/reset_demo", { method: 'POST' }));
-    }
-
-    function init_primary() {
-	startOperation('Initializing new primary...',
-		       fetch("/api/init_primary", { method: 'POST' }));
-    }
-
-    function zenith_push() {
-	startOperation('Pushing new base image...',
-		       fetch("/api/zenith_push", { method: 'POST' }));
-    }
-	
-    return (
-	<div>
-	    <p className="todo">
-		RESET DEMO deletes everything in the storage bucket, and stops and destroys all servers. This resets the whole demo environment to the initial state.
-	    </p>
-	    <button onClick={reset_demo}>RESET DEMO</button>
-	    <p className="todo">
-		Init Primary runs initdb to create a new primary server. Click this after Resetting the demo.
-	    </p>
-
-	    <button onClick={init_primary}>Init primary</button>
-
-	    <p className="todo">
-		Push Base Image stops the primary, copies the current state of the primary to the storage bucket as a new base backup, and restarts the primary.
-		<br/>
-		TODO: This should be handled by a continuous background process, probably running in the storage nodes. And without having to shut down the cluster, of course.
-	    </p>
-
-	    <button onClick={zenith_push}>Push base image</button>
-
-	</div>
-    );
-}
-
-function Sidenav(props)
-{
-    const toPage = (page) => (event) => {
-	//event.preventDefault()
-	props.switchPage(page);
-    };
-    return (
-	<div>
-	    <h3 className="sidenav-item">Menu</h3>
-	    <a href="#servers" onClick={toPage('servers')} className="sidenav-item">Servers</a>
-	    <a href="#storage" onClick={toPage('storage')} className="sidenav-item">Storage</a>
-	    <a href="#snapshots" onClick={toPage('snapshots')} className="sidenav-item">Snapshots</a>
-	    <a href="#demo" onClick={toPage('demo')} className="sidenav-item">Demo</a>
-	    <a href="#import" onClick={toPage('import')}  className="sidenav-item">Import / Export</a>
-	    <a href="#jobs" onClick={toPage('jobs')} className="sidenav-item">Jobs</a>
-	</div>
-    );
-}
-
-function App()
-{
-    const [page, setPage] = useState('servers');
-    const [serverStatus, setServerStatus] = useState({});
-    const [bucketSummary, setBucketSummary] = useState({});
-    const [lastOperation, setLastOperation] = useState('');
-    const [inProgress, setInProgress] = useState('');
-    const [operationResult, setOperationResult] = useState('');
-
-    useEffect(() => {
-	reloadStatus();
-    }, []);
-
-    function startOperation(operation, promise)
-    {
-	promise.then(result => result.text()).then(resultText => {
-	    operationFinished(resultText);
-	});
-	
-	setLastOperation(operation);
-	setInProgress(true);
-	setOperationResult('');
-    }
-
-    function operationFinished(result)
-    {
-	setInProgress(false);
-	setOperationResult(result);
-	reloadStatus();
-    }
-
-    function clearOperation()
-    {
-	setLastOperation('')
-	setInProgress('');
-	setOperationResult('');
-	console.log("cleared");
-    }
-    
-    function reloadStatus()
-    {
-	fetch('/api/server_status').then(res => res.json()).then(data => {
-	    setServerStatus(data);
-	});
-
-	fetch('/api/bucket_summary').then(res => res.json()).then(data => {
-	    setBucketSummary(data);
-	});
-    }
-
-    const content = () => {
-	console.log(page);
-	if (page === 'servers') {
-	    return (
-		<>
-		    <h1>Server status</h1>
-		    <ServerList startOperation={ startOperation }
-				serverStatus={ serverStatus }
-				bucketSummary={ bucketSummary }/>
-		</>
-	    );
-	} else if (page === 'storage') {
-	    return (
-		<>
-		    <h1>Storage bucket status</h1>
-		    <BucketSummary startOperation={ startOperation }
-				   bucketSummary={ bucketSummary }/>
-		</>
-	    );
-	} else if (page === 'snapshots') {
-	    return (
-		<>
-		    <h1>Snapshots</h1>
-		    <p className="todo">
-			In Zenith, snapshots are just specific points (LSNs) in the WAL history, with a label. A snapshot prevents garbage collecting old data that's still needed to reconstruct the database at that LSN.
-		    </p>
-		    <p className="todo">
-			TODO:
-			<ul>
-			    <li>List existing snapshots</li>
-			    <li>Create new snapshot manually, from current state or from a given LSN</li>
-			    <li>Drill into the WAL stream to see what have happened. Provide tools for e.g. finding point where a table was dropped</li>
-			    <li>Create snapshots automatically based on events in the WAL, like if you call pg_create_restore_point(() in the primary</li>
-			    <li>Launch new reader instance at a snapshot</li>
-			    <li>Export snapshot</li>
-			    <li>Rollback cluster to a snapshot</li>
-			</ul>
-		    </p>
-		</>
-	    );
-	} else if (page === 'demo') {
-	    return (
-		<>
-		    <h1>Misc actions</h1>
-		    <ActionButtons startOperation={ startOperation }
-				   bucketSummary={ bucketSummary }/>
-		</>
-	    );
-	} else if (page === 'import') {
-	    return (
-		<>
-		    <h1>Import & Export tools</h1>
-		    <p className="TODO">TODO:
-			<ul>
-			    <li>Initialize database from existing backup (pg_basebackup, WAL-G, pgbackrest)</li>
-			    <li>Initialize from a pg_dump or other SQL script</li>
-			    <li>Launch batch job to import data files from S3</li>
-			    <li>Launch batch job to export database with pg_dump to S3</li>
-			</ul>
-			These jobs can be run in against reader processing nodes. We can even
-			spawn a new reader node dedicated to a job, and destry it when the job is done.
-		    </p>
-		</>
-	    );
-	} else if (page === 'jobs') {
-	    return (
-		<>
-		    <h1>Batch jobs</h1>
-		    <p className="TODO">TODO:
-			<ul>
-			    <li>List running jobs launched from Import & Export tools</li>
-			    <li>List other batch jobs launched by the user</li>
-			    <li>Launch new batch jobs</li>
-			</ul>
-		    </p>
-		</>
-	    );
-	}
-    }
-
-    function switchPage(page)
-    {
-	console.log("topage " + page);
-	setPage(page)
-	clearOperation();
-    };
-
-    return (
-	<div className="row">
-	    <div className="sidenav">
-		<Sidenav switchPage={switchPage} className="column"/>
-	    </div>
-	    <div className="column">
-		<div>
-		    { content() }
-		</div>
-		<OperationStatus lastOperation={ lastOperation }
-				 inProgress = { inProgress }
-				 operationResult = { operationResult }/>
-	    </div>
-	</div>
-    );
-}
-
-ReactDOM.render(<App/>, document.getElementById('reactApp'));
--- a/mgmt-console/js/waldump.js
+++ b/mgmt-console/js/waldump.js
@@ -1,105 +0,0 @@
-import React, { useState, useEffect } from 'react';
-import ReactDOM from 'react-dom';
-import Loader from "react-loader-spinner";
-
-function walpos_to_int(walpos)
-{
-    const [hi, lo] = walpos.split('/');
-
-    return parseInt(hi, 16) + parseInt(lo, 16);
-}
-
-const palette = [
-    "#003f5c",
-    "#2f4b7c",
-    "#665191",
-    "#a05195",
-    "#d45087",
-    "#f95d6a",
-    "#ff7c43",
-    "#ffa600"];
-
-function WalRecord(props)
-{
-    const firstwalpos = props.firstwalpos;
-    const endwalpos = props.endwalpos;
-    const record = props.record;
-    const index = props.index;
-    const xidmap = props.xidmap;
-
-    const startpos = walpos_to_int(record.start)
-    const endpos = walpos_to_int(record.end)
-
-    const scale = 1000 / (16*1024*1024)
-    const startx = (startpos - firstwalpos) * scale;
-    const endx = (endpos - firstwalpos) * scale;
-
-    const xidindex = xidmap[record.xid];
-    const color = palette[index % palette.length];
-
-    const y = 5 + (xidindex) * 20 + (index % 2) * 2;
-    
-    return (
-	<line x1={ startx } y1={y} x2={endx} y2={y} stroke={ color } strokeWidth="5">
-	    <title>
-		start: { record.start } end: { record.end }
-	    </title>
-	</line>
-    )
-}
-
-function WalFile(props)
-{
-    const walContent = props.walContent;
-    const firstwalpos = props.firstwalpos;
-    const xidmap = props.xidmap;
-   
-    return <svg width="1000" height="200">
-	       {
-		   walContent.records ? 
- 		       walContent.records.map((record, index) =>
-			   <WalRecord key={record.start} firstwalpos={firstwalpos} record={record} index={index} xidmap={xidmap}/>
-		       ) : "no records"
-	       }
-	   </svg>
-}
-
-function WalDumpApp()
-{
-    const [walContent, setWalContent] = useState({});
-
-    const filename = '00000001000000000000000C';
-
-    useEffect(() => {
-	fetch('/fetch_wal?filename='+filename).then(res => res.json()).then(data => {
-	    setWalContent(data);
-	});
-    }, []);
-
-    var firstwalpos = 0;
-    var endwalpos = 0;
-    var numxids = 0;
-    var xidmap = {};
-    if (walContent.records && walContent.records.length > 0)
-    {
-	firstwalpos = walpos_to_int(walContent.records[0].start);
-	endwalpos = firstwalpos + 16*1024*1024;
-
-	walContent.records.forEach(rec => {
-	    if (!xidmap[rec.xid])
-	    {
-		xidmap[rec.xid] = ++numxids;
-	    }
-	});
-    }
-
-    return (
-	<>
-	    <h2>{filename}</h2>
-	    <WalFile walContent={walContent} firstwalpos={firstwalpos} endwalpos={endwalpos} xidmap={xidmap}/>
-	</>
-    );
-}
-
-console.log('hey there');
-ReactDOM.render(<WalDumpApp/>, document.getElementById('waldump'));
--- a/mgmt-console/launch-google-cloud.sh
+++ b/mgmt-console/launch-google-cloud.sh
@@ -1,9 +0,0 @@
-#!/bin/bash
-#
-# NOTE: You must set the following environment variables before running this:
-#  BASIC_AUTH_PASSWORD - basic http auth password
-#  S3_ACCESSKEY
-#  S3_SECRET
-
-
-S3_ENDPOINT=https://storage.googleapis.com S3_BUCKET=zenith-testbucket PATH=/home/heikki/pgsql-install/bin:$PATH flask run --host=0.0.0.0
--- a/mgmt-console/launch-local.sh
+++ b/mgmt-console/launch-local.sh
@@ -1,8 +0,0 @@
-#!/bin/bash
-#
-# NOTE: You should set the BASIC_AUTH_PASSWORD environment variable before calling
-
-# Launch S3 server
-(cd ms3 && python3 -m ms3.app --listen-address=localhost) &
-
-FLASK_ENV=development S3_REGION=auto S3_ENDPOINT=http://localhost:9009 S3_BUCKET=zenith-testbucket PATH=/home/heikki/pgsql.fsmfork/bin:$PATH flask run --host=0.0.0.0
--- a/mgmt-console/package-lock.json
+++ b/mgmt-console/package-lock.json
--- a/mgmt-console/package.json
+++ b/mgmt-console/package.json
@@ -1,27 +0,0 @@
-{
-  "name": "starter-kit",
-  "version": "1.1.0",
-  "description": "",
-  "main": "index.js",
-  "scripts": {
-    "test": "echo \"Error: no test specified\" && exit 1",
-    "build": "webpack",
-    "start": "python app.py"
-  },
-  "author": "",
-  "license": "ISC",
-  "dependencies": {
-    "react": "^17.0.1",
-    "react-dom": "^17.0.1",
-    "react-loader-spinner": "^4.0.0",
-    "react-router": "^5.2.0"
-  },
-  "devDependencies": {
-    "@babel/core": "^7.13.1",
-    "@babel/preset-env": "^7.13.5",
-    "@babel/preset-react": "^7.12.13",
-    "babel-loader": "^8.2.2",
-    "webpack": "^5.24.2",
-    "webpack-cli": "^4.5.0"
-  }
-}
--- a/mgmt-console/templates/index.html
+++ b/mgmt-console/templates/index.html
@@ -1,58 +0,0 @@
-<head>
-
-<style>
-  .status {
-      font-family: monospace;
-      background-color: lightgrey;
-  }
-  .shellcommand {
-      font-family: monospace;
-      background-color: lightgrey;
-  }
-  .result {
-      font-family: monospace;
-      background-color: lightgrey;
-      padding: 10px;
-  }
-
-
-  .todo   {font-style: italic;}
-
-
-  h1   {color: blue;}
-
-  .column {
-      float: left;
-      width: 50%;
-      padding: 10px;
-  }
-  /* Clear floats after the columns */
-  .row:after {
-      content: "";
-      display: table;
-      clear: both;
-  }
-
-  .sidenav {
-      float: left;
-      width: 150px;
-      padding: 10px;
-      background-color: pink;
-  }
-
-  .sidenav-item {
-      padding:10px 0px;
-      border:none;
-      display:block;
-  }
-
-</style>
-
-</head>
-
-<body>
-  <div id="reactApp"></div>
-
-  <!-- Attach React components -->
-  <script type="text/javascript" src="{{ url_for('static', filename='app_bundle.js') }}"></script>
-</body>
--- a/mgmt-console/templates/waldump.html
+++ b/mgmt-console/templates/waldump.html
@@ -1,46 +0,0 @@
-<head>
-
-<style>
-  .status {
-      font-family: monospace;
-      background-color: lightgrey;
-  }
-  .shellcommand {
-      font-family: monospace;
-      background-color: lightgrey;
-  }
-  .result {
-      font-family: monospace;
-      background-color: lightgrey;
-      padding: 10px;
-  }
-h1   {color: blue;}
-p    {color: red;}
-
-* {
-  box-sizing: border-box;
-}
-
-.row {
-  display: flex;
-}
-
-/* Create two equal columns that sits next to each other */
-.column1 {
-  flex: 30%;
-  padding: 10px;
-}
-.column2 {
-  flex: 70%;
-  padding: 10px;
-}
-</style>
-
-</head>
-
-<body>
-  <div id="waldump"></div>
-
-  <!-- Attach React components -->
-  <script type="text/javascript" src="{{ url_for('static', filename='waldump_bundle.js') }}"></script>
-</body>
--- a/mgmt-console/waldump.py
+++ b/mgmt-console/waldump.py
@@ -1,25 +0,0 @@
-#
-# This file contains work-in-progress code to visualize WAL contents.
-#
-# This is the API endpoint that calls a 'zenith_wal_to_json' executable,
-# which is a hacked version of pg_waldump that prints information about the
-# records in JSON format. The code in js/waldump.js displays it.
-#
-
-import os
-import re
-from subprocess import PIPE, STDOUT, run, Popen
-
-def fetch_wal(request, s3bucket):
-    filename = request.args.get('filename')
-    if not re.match("^[A-Za-z0-9_]+$", filename):
-        raise Exception('invalid WAL filename: ' + filename)
-
-    # FIXME: this downloads the WAL file to current dir. Use a temp dir? Pipe?
-    s3bucket.download_file('walarchive/' + filename, filename)
-
-    result = run("zenith_wal_to_json " + filename, stdout=PIPE, universal_newlines=True, shell=True)
-
-    os.unlink(filename);
-
-    return result.stdout
--- a/mgmt-console/webpack.config.js
+++ b/mgmt-console/webpack.config.js
@@ -1,27 +0,0 @@
-var webpack = require('webpack');  
-module.exports = {  
-    entry: {
-	app: './js/app.js',
-	waldump: './js/waldump.js'
-    },
-    output: {
-	filename: "[name]_bundle.js",
-	path: __dirname + '/static'
-    },
-    module: {
-	rules: [
-	    {
-		test: /\.js?$/,
-		exclude: /node_modules/,
-		use: {
-		    loader: 'babel-loader',
-		    options: {
-			presets: ['@babel/preset-env']
-		    }
-		}
-	    }
-	]
-    },
-    plugins: [
-    ]
-};
--- a/mgmt-console/zenith.py
+++ b/mgmt-console/zenith.py
@@ -1,179 +0,0 @@
-#zenith.py
-import click
-import testgres
-import os
-
-from testgres import PostgresNode
-from tabulate import tabulate
-
-zenith_base_dir = '/home/anastasia/zenith/basedir'
-
-@click.group()
-def main():
-    """Run the Zenith CLI."""
-
-@click.group()
-def pg():
-    """Db operations
-
-        NOTE: 'database' here means one postgresql node
-    """
-
-@click.command(name='create')
-@click.option('--name', required=True)
-@click.option('-s', '--storage-name', help='Name of the storage',
-                                 default='zenith-local',
-                                 show_default=True)
-@click.option('--snapshot', help='init from the snapshot. Snap is a name or URL')
-@click.option('--no-start', is_flag=True, help='Do not start created node',
-                            default=False, show_default=True)
-def pg_create(name, storage_name, snapshot, no_start):
-    """Initialize the database"""
-    node = PostgresNode()
-    base_dir = os.path.join(zenith_base_dir, 'pg', name)
-    node = testgres.get_new_node(name, base_dir=base_dir)
-    # TODO skip init, instead of that link node with storage or upload it from snapshot
-    node.init()
-    if(no_start==False):
-        node.start()
-
-@click.command(name='start')
-@click.option('--name', required=True)
-@click.option('--snapshot')
-@click.option('--read-only', is_flag=True, help='Start read-only node', show_default=True)
-def pg_start(name, snapshot, read_only):
-    """Start the database"""
-    node = PostgresNode()
-    base_dir = os.path.join(zenith_base_dir, 'pg', name)
-    node = testgres.get_new_node(name, base_dir=base_dir)
-    # TODO pass snapshot as a parameter
-    node.start()
-
-@click.command(name='stop')
-@click.option('--name', required=True)
-def pg_stop(name):
-    """Stop the database"""
-    node = PostgresNode()
-    base_dir = os.path.join(zenith_base_dir, 'pg', name)
-    node = testgres.get_new_node(name, base_dir=base_dir)
-    node.stop()
-
-@click.command(name='destroy')
-@click.option('--name', required=True)
-def pg_destroy(name):
-    """Drop the database"""
-    node = PostgresNode()
-    base_dir = os.path.join(zenith_base_dir, 'pg', name)
-    node = testgres.get_new_node(name, base_dir=base_dir)
-    node.cleanup()
-
-@click.command(name='list')
-def pg_list():
-    """List existing databases"""
-    dirs = os.listdir(os.path.join(zenith_base_dir, 'pg'))
-    path={}
-    status={}
-    data=[]
-
-    for dirname in dirs:
-        path[dirname] = os.path.join(zenith_base_dir, 'pg', dirname)
-        fname = os.path.join( path[dirname], 'data/postmaster.pid')
-        try:
-            f = open(fname,'r')
-            status[dirname] = f.readlines()[-1]
-        except OSError as err:
-            status[dirname]='inactive'
-        data.append([dirname , status[dirname], path[dirname]])
-
-    print(tabulate(data, headers=['Name', 'Status', 'Path']))
-
-pg.add_command(pg_create)
-pg.add_command(pg_destroy)
-pg.add_command(pg_start)   
-pg.add_command(pg_stop)   
-pg.add_command(pg_list)
-
-
-
-@click.group()
-def storage():
-    """Storage operations"""
-
-@click.command(name='attach')
-@click.option('--name')
-def storage_attach(name):
-    """Attach the storage"""
-
-@click.command(name='detach')
-@click.option('--name')
-@click.option('--force', is_flag=True, show_default=True)
-def storage_detach(name):
-    """Detach the storage"""
-
-@click.command(name='list')
-def storage_list():
-    """List existing storages"""
-
-storage.add_command(storage_attach)
-storage.add_command(storage_detach)
-storage.add_command(storage_list)
-
-@click.group()
-def snapshot():
-    """Snapshot operations"""
-
-@click.command(name='create')
-def snapshot_create():
-    """Create new snapshot"""
-
-@click.command(name='destroy')
-def snapshot_destroy():
-    """Destroy the snapshot"""
-
-@click.command(name='pull')
-def snapshot_pull():
-    """Pull remote snapshot"""
-
-@click.command(name='push')
-def snapshot_push():
-    """Push snapshot to remote"""
-
-@click.command(name='import')
-def snapshot_import():
-    """Convert given format to zenith snapshot"""
-
-@click.command(name='export')
-def snapshot_export():
-    """Convert zenith snapshot to PostgreSQL compatible format"""
-
-snapshot.add_command(snapshot_create)
-snapshot.add_command(snapshot_destroy)
-snapshot.add_command(snapshot_pull)
-snapshot.add_command(snapshot_push)
-snapshot.add_command(snapshot_import)
-snapshot.add_command(snapshot_export)
-
-@click.group()
-def wal():
-    """WAL operations"""
-
-@click.command()
-def wallist(name="list"):
-    """List WAL files"""
-
-wal.add_command(wallist)
-
-
-@click.command()
-def console():
-    """Open web console"""
-
-main.add_command(pg)
-main.add_command(storage)
-main.add_command(snapshot)
-main.add_command(wal)
-main.add_command(console)
-
-
-if __name__ == '__main__':
-    main()
--- a/pageserver/Cargo.lock
+++ b/pageserver/Cargo.lock
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -8,10 +8,9 @@ edition = "2018"

 [dependencies]
 chrono = "0.4.19"
-crossbeam-channel = "0.5.0"
 rand = "0.8.3"
 regex = "1.4.5"
-bytes = "1.0.1"
+bytes = { version = "1.0.1", features = ['serde'] }
 byteorder = "1.4.3"
 futures = "0.3.13"
 lazy_static = "1.4.0"
@@ -25,21 +24,26 @@ clap = "2.33.0"
 termion = "1.5.6"
 tui = "0.14.0"
 daemonize = "0.4.1"
-rust-s3 = { git = "https://github.com/hlinnaka/rust-s3", rev="7f15a24ec7daa0a5d9516da706212745f9042818", features = ["no-verify-ssl"] }
+rust-s3 = { version = "0.27.0-rc4", features = ["no-verify-ssl"] }
 tokio = { version = "1.3.0", features = ["full"] }
 tokio-stream = { version = "0.1.4" }
-tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
-postgres-types = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
-postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
-rocksdb = "0.16.0"
+postgres-types = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
+postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
+postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
+# By default rust-rocksdb tries to build many compression algorithms. Use only lz4 for now, as it is the simplest dependency.
+rocksdb = { version = "0.16.0", features = ["lz4"], default-features = false }
 anyhow = "1.0"
 crc32c = "0.6.0"
 walkdir = "2"
 thiserror = "1.0"
 hex = "0.4.3"
 tar = "0.4.33"
-parse_duration = "*"
+humantime = "2.1.0"
+serde = { version = "1.0", features = ["derive"] }
+serde_json = "1"
+fs_extra = "1.2.0"
+toml = "0.5"

 postgres_ffi = { path = "../postgres_ffi" }
 zenith_utils = { path = "../zenith_utils" }
+workspace_hack = { path = "../workspace_hack" }
--- a/pageserver/README
+++ b/pageserver/README
@@ -1,82 +1,4 @@
-Page Server
-===========
-
-
-How to test
-----------
-
-
-1. Compile and install Postgres from this repository (there are
-   modifications, so vanilla Postgres won't do)
-
-    ./configure --prefix=/home/heikki/zenith-install
-
-2. Compile the page server
-
-    cd pageserver
-    cargo build
-
-3. Create another "dummy" cluster that will be used by the page server when it applies
-   the WAL records. (shouldn't really need this, getting rid of it is a TODO):
-
-    /home/heikki/zenith-install/bin/initdb -D /data/zenith-dummy
-
-
-4. Initialize and start a new postgres cluster
-
-    /home/heikki/zenith-install/bin/initdb -D /data/zenith-test-db --username=postgres
-    /home/heikki/zenith-install/bin/postgres -D /data/zenith-test-db
-
-5. In another terminal, start the page server.
-
-    PGDATA=/data/zenith-dummy PATH=/home/heikki/zenith-install/bin:$PATH ./target/debug/pageserver
-
-   It should connect to the postgres instance using streaming replication, and print something
-   like this:
-
-    $ PGDATA=/data/zenith-dummy PATH=/home/heikki/zenith-install/bin:$PATH ./target/debug/pageserver
-    Starting WAL receiver
-    connecting...
-    Starting page server on 127.0.0.1:5430
-    connected!
-    page cache is empty
-
-6. You can now open another terminal and issue DDL commands. Generated WAL records will
-   be streamed to the page servers, and attached to blocks that they apply to in its
-   page cache
-
-    $ psql postgres -U postgres
-    psql (14devel)
-    Type "help" for help.
-    
-    postgres=# create table mydata (i int4);
-    CREATE TABLE
-    postgres=# insert into mydata select g from generate_series(1,100) g;
-    INSERT 0 100
-    postgres=# 
-
-7. The GetPage@LSN interface to the compute nodes isn't working yet, but to simulate
-   that, the page server generates a test GetPage@LSN call every 5 seconds on a random
-   block that's in the page cache. In a few seconds, you should see output from that:
-
-    testing GetPage@LSN for block 0
-    WAL record at LSN 23584576 initializes the page
-    2021-03-19 11:03:13.791 EET [11439] LOG:  applied WAL record at 0/167DF40
-    2021-03-19 11:03:13.791 EET [11439] LOG:  applied WAL record at 0/167DF80
-    2021-03-19 11:03:13.791 EET [11439] LOG:  applied WAL record at 0/167DFC0
-    2021-03-19 11:03:13.791 EET [11439] LOG:  applied WAL record at 0/167E018
-    2021-03-19 11:03:13.791 EET [11439] LOG:  applied WAL record at 0/167E058
-    2021-03-19 11:03:13.791 EET [11439] LOG:  applied WAL record at 0/167E098
-    2021-03-19 11:03:13.791 EET [11439] LOG:  applied WAL record at 0/167E0D8
-    2021-03-19 11:03:13.792 EET [11439] LOG:  applied WAL record at 0/167E118
-    2021-03-19 11:03:13.792 EET [11439] LOG:  applied WAL record at 0/167E158
-    2021-03-19 11:03:13.792 EET [11439] LOG:  applied WAL record at 0/167E198
-    applied 10 WAL records to produce page image at LSN 18446744073709547246
-
-
-
-Architecture
-============
+## Page server architecture

 The Page Server is responsible for all operations on a number of
 "chunks" of relation data. A chunk corresponds to a PostgreSQL
@@ -84,8 +6,10 @@ relation segment (i.e. one max. 1 GB file in the data directory), but
 it holds all the different versions of every page in the segment that
 are still needed by the system.

-Determining which chunk each Page Server holds is handled elsewhere. (TODO:
-currently, there is only one Page Server which holds all chunks)
+Currently we do not specifically organize data in chunks.
+All page images and corresponding WAL records are stored as entries in a key-value storage,
+where StorageKey is a zenith_timeline_id + BufferTag + LSN.
+

 The Page Server has a few different duties:

@@ -154,11 +78,33 @@ and stores them to the page cache.
 Page Cache
 ----------

-The Page Cache is a data structure, to hold all the different page versions.
-It is accessed by all the other threads, to perform their duties.
+The Page Cache is a switchboard to access different Repositories.

-Currently, the page cache is implemented fully in-memory. TODO: Store it
-on disk. Define a file format.
+#### Repository
+Repository corresponds to one .zenith directory.
+Repository is needed to manage Timelines.
+
+#### Timeline
+Timeline is a page cache workhorse that accepts page changes
+and serves get_page_at_lsn() and get_rel_size() requests.
+Note: this has nothing to do with PostgreSQL WAL timeline.
+
+#### Branch
+A branch can be created at a given LSN.
+Each Branch lives in a corresponding timeline and has an ancestor.
+
+To get a full snapshot of the data at a given moment, we need to traverse the timeline and its ancestors.
+
+#### ObjectRepository
+ObjectRepository implements Repository and has associated ObjectStore and WAL redo service.
+
+#### ObjectStore
+ObjectStore is an interface to a key-value store for page images and WAL records.
+Currently it has one implementation - RocksDB.
+
+#### WAL redo service
+The WAL redo service runs PostgreSQL in a special wal_redo mode
+to apply given WAL records over an old page image and return the new page image.


 TODO: Garbage Collection / Compaction
@@ -177,3 +123,7 @@ The backup service is responsible for periodically pushing the chunks to S3.
 TODO: How/when do restore from S3? Whenever we get a GetPage@LSN request for
 a chunk we don't currently have? Or when an external Control Plane tells us?

+TODO: Sharding
+--------------------
+
+We should be able to run multiple Page Servers that handle sharded data.
--- a/pageserver/build.rs
+++ b/pageserver/build.rs
@@ -1,41 +0,0 @@
-//
-//   Triggers postgres build if there is no postgres binary present at
-// 'REPO_ROOT/tmp_install/bin/postgres'.
-//
-//   I can see a lot of disadvantages with such automatization and main
-// advantage here is ability to build everything and run integration tests
-// in a bare repo by running 'cargo test'.
-//
-//   We can interceipt whether it is debug or release build and run
-// corresponding pg build. But it seems like an overkill for now.
-//
-// Problem #1 -- language server in my editor likes calling 'cargo build'
-// by himself. So if I delete tmp_install directory it would magically reappear
-// after some time. During this compilation 'cargo build' may whine about
-// "waiting for file lock on build directory".
-//
-// Problem #2 -- cargo build would run this only if something is changed in
-// the crate.
-//
-//   And generally speaking postgres is not a build dependency for the pageserver,
-// just for integration tests. So let's not mix that. I'll leave this file in
-// place for some time just in case if anybody would start doing the same.
-//
-
-// use std::path::Path;
-// use std::process::{Command};
-
-fn main() {
-    // // build some postgres if it is not done none yet
-    // if !Path::new("../tmp_install/bin/postgres").exists() {
-    //     let make_res = Command::new("make")
-    //         .arg("postgres")
-    //         .env_clear()
-    //         .status()
-    //         .expect("failed to execute 'make postgres'");
-
-    //     if !make_res.success() {
-    //         panic!("postgres build failed");
-    //     }
-    // }
-}
--- a/pageserver/launch.sh
+++ b/pageserver/launch.sh
@@ -1,62 +0,0 @@
-#!/bin/sh
-#
-# Set up a simple Compute Node + Page Server combination locally.
-#
-# NOTE: This doesn't clean up between invocations. You'll need to manually:
-#
-# - Kill any previous 'postgres' and 'pageserver' processes
-# - Clear the S3 bucket
-# - Remove the 'zenith-pgdata' directory
-
-
-set -e
-
-# Set up some config.
-#
-# CHANGE THESE ACCORDING TO YOUR S3 INSTALLATION
-export S3_REGION=auto
-export S3_ENDPOINT=https://localhost:9000
-export S3_ACCESSKEY=minioadmin
-export S3_SECRET=pikkunen
-export S3_BUCKET=zenith-testbucket
-
-
-COMPUTE_NODE_PGDATA=zenith-pgdata
-
-
-# 1. Initialize a cluster.
-initdb -D $COMPUTE_NODE_PGDATA -U zenith
-
-echo "port=65432" >> $COMPUTE_NODE_PGDATA/postgresql.conf
-echo "log_connections=on" >> $COMPUTE_NODE_PGDATA/postgresql.conf
-
-# Use a small shared_buffers, so that we hit the Page Server more
-# easily.
-echo "shared_buffers = 1MB" >> $COMPUTE_NODE_PGDATA/postgresql.conf
-
-# TODO: page server should use a replication slot, or some other mechanism
-# to make sure that the primary doesn't lose data that the page server still
-# needs. (The WAL safekeepers should ensure that)
-echo "wal_keep_size=10GB" >> $COMPUTE_NODE_PGDATA/postgresql.conf
-
-# Tell the Postgres server how to connect to the Page Server
-echo "page_server_connstring='host=localhost port=5430'" >> $COMPUTE_NODE_PGDATA/postgresql.conf
-
-
-# 2. Run zenith_push to push a base backup fo the database to an S3 bucket. The
-# Page Server will read it from there
-zenith_push -D $COMPUTE_NODE_PGDATA
-
-
-# 3. Launch page server
-rm -rf /tmp/pgdata-dummy
-initdb -N -D /tmp/pgdata-dummy
-PGDATA=/tmp/pgdata-dummy ./target/debug/pageserver  &
-
-# 4. Start up the Postgres server
-postgres -D $COMPUTE_NODE_PGDATA &
-
-
-echo "ALL SET! You can now connect to Postgres with something like:"
-echo ""
-echo 'psql "dbname=postgres host=localhost user=zenith port=65432"'
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -1,156 +1,281 @@
+//!
+//! Generate a tarball with files needed to bootstrap ComputeNode.
+//!
+//! TODO: this module has nothing to do with PostgreSQL pg_basebackup.
+//! It could use a better name.
+//!
+//! A stateless Postgres compute node is launched by sending it a tarball that contains non-relational data (multixacts, clog, filenodemaps, twophase files)
+//! plus a generated pg_control file and a dummy WAL segment. This module is responsible for creating such a tarball from the snapshot directory and
+//! the data stored in object storage.
+//!
+use crate::ZTimelineId;
+use bytes::{BufMut, BytesMut};
 use log::*;
-use regex::Regex;
-use std::fmt;
 use std::io::Write;
-use tar::Builder;
+use std::sync::Arc;
+use std::time::SystemTime;
+use tar::{Builder, Header};
 use walkdir::WalkDir;

-use crate::ZTimelineId;
+use crate::object_key::*;
+use crate::repository::Timeline;
+use postgres_ffi::relfile_utils::*;
+use postgres_ffi::xlog_utils::*;
+use postgres_ffi::*;
+use zenith_utils::lsn::Lsn;

-pub fn send_snapshot_tarball(
-    write: &mut dyn Write,
-    timelineid: ZTimelineId,
-    snapshotlsn: u64,
-) -> Result<(), std::io::Error> {
-    let mut ar = Builder::new(write);
+/// This is a short-lived object that exists only while the tarball is being created.
+/// It was introduced mostly to avoid passing a lot of parameters between the various functions
+/// used for constructing the tarball.
+pub struct Basebackup<'a> {
+    ar: Builder<&'a mut dyn Write>,
+    timeline: &'a Arc<dyn Timeline>,
+    lsn: Lsn,
+    prev_record_lsn: Lsn,
+    snappath: String,
+    slru_buf: [u8; pg_constants::SLRU_SEG_SIZE],
+    slru_segno: u32,
+    slru_path: &'static str,
+}

-    let snappath = format!("timelines/{}/snapshots/{:016X}", timelineid, snapshotlsn);
-    let walpath = format!("timelines/{}/wal", timelineid);
-
-    debug!("sending tarball of snapshot in {}", snappath);
-    //ar.append_dir_all("", &snappath)?;
-
-    for entry in WalkDir::new(&snappath) {
-        let entry = entry?;
-        let fullpath = entry.path();
-        let relpath = entry.path().strip_prefix(&snappath).unwrap();
-
-        if relpath.to_str().unwrap() == "" {
-            continue;
+impl<'a> Basebackup<'a> {
+    pub fn new(
+        write: &'a mut dyn Write,
+        timelineid: ZTimelineId,
+        timeline: &'a Arc<dyn Timeline>,
+        lsn: Lsn,
+        prev_record_lsn: Lsn,
+        snapshot_lsn: Lsn,
+    ) -> Basebackup<'a> {
+        Basebackup {
+            ar: Builder::new(write),
+            timeline,
+            lsn,
+            prev_record_lsn,
+            snappath: format!("timelines/{}/snapshots/{:016X}", timelineid, snapshot_lsn.0),
+            slru_path: "",
+            slru_segno: u32::MAX,
+            slru_buf: [0u8; pg_constants::SLRU_SEG_SIZE],
        }
+    }

-        if entry.file_type().is_dir() {
-            trace!(
-                "sending dir {} as {}",
-                fullpath.display(),
-                relpath.display()
-            );
-            ar.append_dir(relpath, fullpath)?;
-        } else if entry.file_type().is_symlink() {
-            error!("ignoring symlink in snapshot dir");
-        } else if entry.file_type().is_file() {
-            // Shared catalogs are exempt
-            if relpath.starts_with("global/") {
-                trace!("sending shared catalog {}", relpath.display());
-                ar.append_path_with_name(fullpath, relpath)?;
-            } else if !is_rel_file_path(relpath.to_str().unwrap()) {
-                trace!("sending {}", relpath.display());
-                ar.append_path_with_name(fullpath, relpath)?;
-            } else {
-                trace!("not sending {}", relpath.display());
-                // FIXME: send all files for now
-                ar.append_path_with_name(fullpath, relpath)?;
+    pub fn send_tarball(&mut self) -> anyhow::Result<()> {
+        debug!("sending tarball of snapshot in {}", self.snappath);
+        for entry in WalkDir::new(&self.snappath) {
+            let entry = entry?;
+            let fullpath = entry.path();
+            let relpath = entry.path().strip_prefix(&self.snappath).unwrap();
+
+            if relpath.to_str().unwrap() == "" {
+                continue;
            }
+
+            if entry.file_type().is_dir() {
+                trace!(
+                    "sending dir {} as {}",
+                    fullpath.display(),
+                    relpath.display()
+                );
+                self.ar.append_dir(relpath, fullpath)?;
+            } else if entry.file_type().is_symlink() {
+                error!("ignoring symlink in snapshot dir");
+            } else if entry.file_type().is_file() {
+                if !is_rel_file_path(relpath.to_str().unwrap()) {
+                    if entry.file_name() != "pg_filenode.map" // this files will be generated from object storage
+                        && !relpath.starts_with("pg_xact/")
+                        && !relpath.starts_with("pg_multixact/")
+                    {
+                        trace!("sending {}", relpath.display());
+                        self.ar.append_path_with_name(fullpath, relpath)?;
+                    }
+                } else {
+                    // relation pages are loaded on demand and should not be included in tarball
+                    trace!("not sending {}", relpath.display());
+                }
+            } else {
+                error!("unknown file type: {}", fullpath.display());
+            }
+        }
+
+        // Generate non-relational files.
+        // Iteration is in sorted order: all objects of the same type are grouped and traversed
+        // in ascending key order. For example, all pg_xact records precede pg_multixact records and are sorted by block number.
+        // This makes it easy to construct SLRU segments (32 blocks).
+        for obj in self.timeline.list_nonrels(self.lsn)? {
+            match obj {
+                ObjectTag::Clog(slru) => self.add_slru_segment("pg_xact", &obj, slru.blknum)?,
+                ObjectTag::MultiXactMembers(slru) => {
+                    self.add_slru_segment("pg_multixact/members", &obj, slru.blknum)?
+                }
+                ObjectTag::MultiXactOffsets(slru) => {
+                    self.add_slru_segment("pg_multixact/offsets", &obj, slru.blknum)?
+                }
+                ObjectTag::FileNodeMap(db) => self.add_relmap_file(&obj, &db)?,
+                ObjectTag::TwoPhase(prepare) => self.add_twophase_file(&obj, prepare.xid)?,
+                _ => {}
+            }
+        }
+        self.finish_slru_segment()?; // write last non-completed SLRU segment (if any)
+        self.add_pgcontrol_file()?;
+        self.ar.finish()?;
+        debug!("all tarred up!");
+        Ok(())
+    }
+
+    //
+    // Generate SLRU segment files from the repository. `path` identifies the SLRU kind (pg_xact, pg_multixact/members, ...).
+    // `self.slru_path` is initially an empty string, meaning that no segment has been started yet.
+    //
+    fn add_slru_segment(
+        &mut self,
+        path: &'static str,
+        tag: &ObjectTag,
+        page: u32,
+    ) -> anyhow::Result<()> {
+        let img = self
+            .timeline
+            .get_page_at_lsn_nowait(*tag, self.lsn, false)?;
+        // Zero length image indicates truncated segment: just skip it
+        if !img.is_empty() {
+            assert!(img.len() == pg_constants::BLCKSZ as usize);
+            let segno = page / pg_constants::SLRU_PAGES_PER_SEGMENT;
+            if self.slru_path != "" && (self.slru_segno != segno || self.slru_path != path) {
+                // Switch to new segment: save old one
+                let segname = format!("{}/{:>04X}", self.slru_path, self.slru_segno);
+                let header = new_tar_header(&segname, pg_constants::SLRU_SEG_SIZE as u64)?;
+                self.ar.append(&header, &self.slru_buf[..])?;
+                self.slru_buf = [0u8; pg_constants::SLRU_SEG_SIZE]; // reinitialize segment buffer
+            }
+            self.slru_segno = segno;
+            self.slru_path = path;
+            let offs_start = (page % pg_constants::SLRU_PAGES_PER_SEGMENT) as usize
+                * pg_constants::BLCKSZ as usize;
+            let offs_end = offs_start + pg_constants::BLCKSZ as usize;
+            self.slru_buf[offs_start..offs_end].copy_from_slice(&img);
+        }
+        Ok(())
+    }
+
+    //
+    // We flush SLRU segments to the tarball once they are completed.
+    // This method is used to flush the last (possibly incomplete) segment.
+    //
+    fn finish_slru_segment(&mut self) -> anyhow::Result<()> {
+        if self.slru_path != "" {
+            // there is a pending (possibly incomplete) segment: write it out
+            let segname = format!("{}/{:>04X}", self.slru_path, self.slru_segno);
+            let header = new_tar_header(&segname, pg_constants::SLRU_SEG_SIZE as u64)?;
+            self.ar.append(&header, &self.slru_buf[..])?;
+        }
+        Ok(())
+    }
+
+    //
+    // Extract pg_filenode.map files from repository
+    //
+    fn add_relmap_file(&mut self, tag: &ObjectTag, db: &DatabaseTag) -> anyhow::Result<()> {
+        let img = self
+            .timeline
+            .get_page_at_lsn_nowait(*tag, self.lsn, false)?;
+        info!("add_relmap_file {:?}", db);
+        let path = if db.spcnode == pg_constants::GLOBALTABLESPACE_OID {
+            String::from("global/pg_filenode.map") // filenode map for global tablespace
        } else {
-            error!("unknown file type: {}", fullpath.display());
-        }
-    }
-
-    // FIXME: also send all the WAL
-    for entry in std::fs::read_dir(&walpath)? {
-        let entry = entry?;
-        let fullpath = &entry.path();
-        let relpath = fullpath.strip_prefix(&walpath).unwrap();
-
-        if !entry.path().is_file() {
-            continue;
-        }
-
-        let archive_fname = relpath.to_str().unwrap();
-        let archive_fname = archive_fname
-            .strip_suffix(".partial")
-            .unwrap_or(&archive_fname);
-        let archive_path = "pg_wal/".to_owned() + archive_fname;
-        ar.append_path_with_name(fullpath, archive_path)?;
-    }
-
-    ar.finish()?;
-    debug!("all tarred up!");
-    Ok(())
-}
-
-// formats:
-// <oid>
-// <oid>_<fork name>
-// <oid>.<segment number>
-// <oid>_<fork name>.<segment number>
-
-#[derive(Debug)]
-struct FilePathError {
-    msg: String,
-}
-
-impl FilePathError {
-    fn new(msg: &str) -> FilePathError {
-        FilePathError {
-            msg: msg.to_string(),
-        }
-    }
-}
-
-impl From<core::num::ParseIntError> for FilePathError {
-    fn from(e: core::num::ParseIntError) -> Self {
-        return FilePathError {
-            msg: format!("invalid filename: {}", e),
+            // User defined tablespaces are not supported
+            assert!(db.spcnode == pg_constants::DEFAULTTABLESPACE_OID);
+            let src_path = format!("{}/base/1/PG_VERSION", self.snappath);
+            let dst_path = format!("base/{}/PG_VERSION", db.dbnode);
+            self.ar.append_path_with_name(&src_path, &dst_path)?;
+            format!("base/{}/pg_filenode.map", db.dbnode)
        };
+        assert!(img.len() == 512);
+        let header = new_tar_header(&path, img.len() as u64)?;
+        self.ar.append(&header, &img[..])?;
+        Ok(())
+    }
+
+    //
+    // Extract twophase state files
+    //
+    fn add_twophase_file(&mut self, tag: &ObjectTag, xid: TransactionId) -> anyhow::Result<()> {
+        // Include two-phase files in the tarball only for in-progress transactions
+        if self.timeline.get_tx_status(xid, self.lsn)?
+            == pg_constants::TRANSACTION_STATUS_IN_PROGRESS
+        {
+            let img = self
+                .timeline
+                .get_page_at_lsn_nowait(*tag, self.lsn, false)?;
+            let mut buf = BytesMut::new();
+            buf.extend_from_slice(&img[..]);
+            let crc = crc32c::crc32c(&img[..]);
+            buf.put_u32_le(crc);
+            let path = format!("pg_twophase/{:>08X}", xid);
+            let header = new_tar_header(&path, buf.len() as u64)?;
+            self.ar.append(&header, &buf[..])?;
+        }
+        Ok(())
+    }
+
+    //
+    // Add generated pg_control file
+    //
+    fn add_pgcontrol_file(&mut self) -> anyhow::Result<()> {
+        let checkpoint_bytes =
+            self.timeline
+                .get_page_at_lsn_nowait(ObjectTag::Checkpoint, self.lsn, false)?;
+        let pg_control_bytes =
+            self.timeline
+                .get_page_at_lsn_nowait(ObjectTag::ControlFile, self.lsn, false)?;
+        let mut pg_control = ControlFileData::decode(&pg_control_bytes)?;
+        let mut checkpoint = CheckPoint::decode(&checkpoint_bytes)?;
+
+        // Generate new pg_control and WAL needed for bootstrap
+        let checkpoint_segno = self.lsn.segment_number(pg_constants::WAL_SEGMENT_SIZE);
+        let checkpoint_lsn = XLogSegNoOffsetToRecPtr(
+            checkpoint_segno,
+            XLOG_SIZE_OF_XLOG_LONG_PHD as u32,
+            pg_constants::WAL_SEGMENT_SIZE,
+        );
+        checkpoint.redo = self.lsn.0 + self.lsn.calc_padding(8u32);
+
+        //reset some fields we don't want to preserve
+        checkpoint.oldestActiveXid = 0;
+
+        //save new values in pg_control
+        pg_control.checkPoint = checkpoint_lsn;
+        pg_control.checkPointCopy = checkpoint;
+        info!("pg_control.state = {}", pg_control.state);
+        pg_control.state = pg_constants::DB_SHUTDOWNED;
+
+        // add zenith.signal file
+        self.ar.append(
+            &new_tar_header("zenith.signal", 8)?,
+            &self.prev_record_lsn.0.to_le_bytes()[..],
+        )?;
+
+        //send pg_control
+        let pg_control_bytes = pg_control.encode();
+        let header = new_tar_header("global/pg_control", pg_control_bytes.len() as u64)?;
+        self.ar.append(&header, &pg_control_bytes[..])?;
+
+        //send wal segment
+        let wal_file_name = XLogFileName(
+            1, // FIXME: always use Postgres timeline 1
+            checkpoint_segno,
+            pg_constants::WAL_SEGMENT_SIZE,
+        );
+        let wal_file_path = format!("pg_wal/{}", wal_file_name);
+        let header = new_tar_header(&wal_file_path, pg_constants::WAL_SEGMENT_SIZE as u64)?;
+        let wal_seg = generate_wal_segment(&pg_control);
+        self.ar.append(&header, &wal_seg[..])?;
+        Ok(())
    }
 }

-impl fmt::Display for FilePathError {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        write!(f, "invalid filename")
-    }
-}
-
-fn forkname_to_forknum(forkname: Option<&str>) -> Result<u32, FilePathError> {
-    match forkname {
-        // "main" is not in filenames, it's implicit if the fork name is not present
-        None => Ok(0),
-        Some("fsm") => Ok(1),
-        Some("vm") => Ok(2),
-        Some("init") => Ok(3),
-        Some(_) => Err(FilePathError::new("invalid forkname")),
-    }
-}
-
-fn parse_filename(fname: &str) -> Result<(u32, u32, u32), FilePathError> {
-    let re = Regex::new(r"^(?P<relnode>\d+)(_(?P<forkname>[a-z]+))?(\.(?P<segno>\d+))?$").unwrap();
-
-    let caps = re
-        .captures(fname)
-        .ok_or_else(|| FilePathError::new("invalid relation data file name"))?;
-
-    let relnode_str = caps.name("relnode").unwrap().as_str();
-    let relnode = u32::from_str_radix(relnode_str, 10)?;
-
-    let forkname_match = caps.name("forkname");
-    let forkname = if forkname_match.is_none() {
-        None
-    } else {
-        Some(forkname_match.unwrap().as_str())
-    };
-    let forknum = forkname_to_forknum(forkname)?;
-
-    let segno_match = caps.name("segno");
-    let segno = if segno_match.is_none() {
-        0
-    } else {
-        u32::from_str_radix(segno_match.unwrap().as_str(), 10)?
-    };
-
-    Ok((relnode, forknum, segno))
-}
-
+///
+/// Parse a path, relative to the root of PostgreSQL data directory, as
+/// a PostgreSQL relation data file.
+///
 fn parse_rel_file_path(path: &str) -> Result<(), FilePathError> {
    /*
     * Relation data files can be in one of the following directories:
@@ -170,33 +295,52 @@ fn parse_rel_file_path(path: &str) -> Result<(), FilePathError> {
     * <oid>.<segment number>
     */
    if let Some(fname) = path.strip_prefix("global/") {
-        let (_relnode, _forknum, _segno) = parse_filename(fname)?;
+        let (_relnode, _forknum, _segno) = parse_relfilename(fname)?;

        Ok(())
    } else if let Some(dbpath) = path.strip_prefix("base/") {
        let mut s = dbpath.split('/');
-        let dbnode_str = s
-            .next()
-            .ok_or_else(|| FilePathError::new("invalid relation data file name"))?;
-        let _dbnode = u32::from_str_radix(dbnode_str, 10)?;
-        let fname = s
-            .next()
-            .ok_or_else(|| FilePathError::new("invalid relation data file name"))?;
+        let dbnode_str = s.next().ok_or(FilePathError::InvalidFileName)?;
+        let _dbnode = dbnode_str.parse::<u32>()?;
+        let fname = s.next().ok_or(FilePathError::InvalidFileName)?;
        if s.next().is_some() {
-            return Err(FilePathError::new("invalid relation data file name"));
+            return Err(FilePathError::InvalidFileName);
        };

-        let (_relnode, _forknum, _segno) = parse_filename(fname)?;
+        let (_relnode, _forknum, _segno) = parse_relfilename(fname)?;

        Ok(())
-    } else if let Some(_) = path.strip_prefix("pg_tblspc/") {
+    } else if path.strip_prefix("pg_tblspc/").is_some() {
        // TODO
-        Err(FilePathError::new("tablespaces not supported"))
+        error!("tablespaces not implemented yet");
+        Err(FilePathError::InvalidFileName)
    } else {
-        Err(FilePathError::new("invalid relation data file name"))
+        Err(FilePathError::InvalidFileName)
    }
 }

+//
+// Check if it is relational file
+//
 fn is_rel_file_path(path: &str) -> bool {
    parse_rel_file_path(path).is_ok()
 }
+
+//
+// Create new tarball entry header
+//
+fn new_tar_header(path: &str, size: u64) -> anyhow::Result<Header> {
+    let mut header = Header::new_gnu();
+    header.set_size(size);
+    header.set_path(path)?;
+    header.set_mode(0b110000000); // -rw-------
+    header.set_mtime(
+        // use the current time as the last-modified time
+        SystemTime::now()
+            .duration_since(SystemTime::UNIX_EPOCH)
+            .unwrap()
+            .as_secs(),
+    );
+    header.set_cksum();
+    Ok(header)
+}
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -3,24 +3,116 @@
 //

 use log::*;
-use parse_duration::parse;
-use std::fs::{self, OpenOptions};
-use std::io;
-use std::path::PathBuf;
-use std::process::exit;
-use std::thread;
-use std::time::Duration;
+use serde::{Deserialize, Serialize};
+use std::{
+    env,
+    fs::{File, OpenOptions},
+    io,
+    net::TcpListener,
+    path::{Path, PathBuf},
+    process::exit,
+    thread,
+    time::Duration,
+};

 use anyhow::{Context, Result};
-use clap::{App, Arg};
+use clap::{App, Arg, ArgMatches};
 use daemonize::Daemonize;

-use slog::Drain;
+use slog::{Drain, FnValue};

-use pageserver::{page_service, tui, zenith_repo_dir, PageServerConf};
+use pageserver::{branches, page_cache, page_service, tui, PageServerConf};
+
+const DEFAULT_LISTEN_ADDR: &str = "127.0.0.1:64000";

 const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
-const DEFAULT_GC_PERIOD_SEC: u64 = 10;
+const DEFAULT_GC_PERIOD: Duration = Duration::from_secs(100);
+
+const DEFAULT_WAL_REDOERS: usize = 1;
+
+/// String arguments that can be declared via CLI or config file
+#[derive(Serialize, Deserialize)]
+struct CfgFileParams {
+    listen_addr: Option<String>,
+    gc_horizon: Option<String>,
+    gc_period: Option<String>,
+    wal_redoers: Option<String>,
+    pg_distrib_dir: Option<String>,
+}
+
+impl CfgFileParams {
+    /// Extract string arguments from CLI
+    fn from_args(arg_matches: &ArgMatches) -> Self {
+        let get_arg = |arg_name: &str| -> Option<String> {
+            arg_matches.value_of(arg_name).map(str::to_owned)
+        };
+
+        Self {
+            listen_addr: get_arg("listen"),
+            gc_horizon: get_arg("gc_horizon"),
+            gc_period: get_arg("gc_period"),
+            wal_redoers: get_arg("wal_redoers"),
+            pg_distrib_dir: get_arg("postgres-distrib"),
+        }
+    }
+
+    /// Fill missing values in `self` with `other`
+    fn or(self, other: CfgFileParams) -> Self {
+        // TODO cleaner way to do this
+        Self {
+            listen_addr: self.listen_addr.or(other.listen_addr),
+            gc_horizon: self.gc_horizon.or(other.gc_horizon),
+            gc_period: self.gc_period.or(other.gc_period),
+            wal_redoers: self.wal_redoers.or(other.wal_redoers),
+            pg_distrib_dir: self.pg_distrib_dir.or(other.pg_distrib_dir),
+        }
+    }
+
+    /// Create a PageServerConf from these string parameters
+    fn try_into_config(&self) -> Result<PageServerConf> {
+        let listen_addr = match self.listen_addr.as_ref() {
+            Some(addr) => addr.clone(),
+            None => DEFAULT_LISTEN_ADDR.to_owned(),
+        };
+
+        let gc_horizon: u64 = match self.gc_horizon.as_ref() {
+            Some(horizon_str) => horizon_str.parse()?,
+            None => DEFAULT_GC_HORIZON,
+        };
+        let gc_period = match self.gc_period.as_ref() {
+            Some(period_str) => humantime::parse_duration(period_str)?,
+            None => DEFAULT_GC_PERIOD,
+        };
+
+        let wal_redoers = match self.wal_redoers.as_ref() {
+            Some(wal_redoers_str) => wal_redoers_str.parse::<usize>()?,
+            None => DEFAULT_WAL_REDOERS,
+        };
+
+        let pg_distrib_dir = match self.pg_distrib_dir.as_ref() {
+            Some(pg_distrib_dir_str) => PathBuf::from(pg_distrib_dir_str),
+            None => env::current_dir()?.join("tmp_install"),
+        };
+
+        if !pg_distrib_dir.join("bin/postgres").exists() {
+            anyhow::bail!("Can't find postgres binary at {:?}", pg_distrib_dir);
+        }
+
+        Ok(PageServerConf {
+            daemonize: false,
+            interactive: false,
+            materialize: false,
+
+            listen_addr,
+            gc_horizon,
+            gc_period,
+            wal_redoers,
+            workdir: PathBuf::from("."),
+
+            pg_distrib_dir,
+        })
+    }
+}

 fn main() -> Result<()> {
    let arg_matches = App::new("Zenith page server")
@@ -39,6 +131,12 @@ fn main() -> Result<()> {
                .takes_value(false)
                .help("Interactive mode"),
        )
+        .arg(
+            Arg::with_name("materialize")
+                .long("materialize")
+                .takes_value(false)
+                .help("Materialize pages constructed by get_page_at"),
+        )
        .arg(
            Arg::with_name("daemonize")
                .short("d")
@@ -46,6 +144,12 @@ fn main() -> Result<()> {
                .takes_value(false)
                .help("Run in the background"),
        )
+        .arg(
+            Arg::with_name("init")
+                .long("init")
+                .takes_value(false)
+                .help("Initialize pageserver repo"),
+        )
        .arg(
            Arg::with_name("gc_horizon")
                .long("gc_horizon")
@@ -58,22 +162,53 @@ fn main() -> Result<()> {
                .takes_value(true)
                .help("Interval between garbage collector iterations"),
        )
+        .arg(
+            Arg::with_name("wal_redoers")
+                .long("wal_redoers")
+                .takes_value(true)
+                .help("Number of wal-redo postgres instances"),
+        )
+        .arg(
+            Arg::with_name("workdir")
+                .short("D")
+                .long("workdir")
+                .takes_value(true)
+                .help("Working directory for the pageserver"),
+        )
+        .arg(
+            Arg::with_name("postgres-distrib")
+                .long("postgres-distrib")
+                .takes_value(true)
+                .help("Postgres distribution directory"),
+        )
        .get_matches();

-    let mut conf = PageServerConf {
-        daemonize: false,
-        interactive: false,
-        gc_horizon: DEFAULT_GC_HORIZON,
-        gc_period: Duration::from_secs(DEFAULT_GC_PERIOD_SEC),
-        listen_addr: "127.0.0.1:5430".parse().unwrap(),
+    let workdir = Path::new(arg_matches.value_of("workdir").unwrap_or(".zenith"));
+    let cfg_file_path = workdir.canonicalize()?.join("pageserver.toml");
+
+    let args_params = CfgFileParams::from_args(&arg_matches);
+
+    let init = arg_matches.is_present("init");
+    let params = if init {
+        // We're initializing the repo, so there's no config file yet
+        args_params
+    } else {
+        // Supplement the CLI arguments with the config file
+        let cfg_file_contents = std::fs::read_to_string(&cfg_file_path)?;
+        let file_params: CfgFileParams = toml::from_str(&cfg_file_contents)?;
+        args_params.or(file_params)
    };

-    if arg_matches.is_present("daemonize") {
-        conf.daemonize = true;
-    }
+    // Ensure the config is valid, even if just init-ing
+    let mut conf = params.try_into_config()?;

-    if arg_matches.is_present("interactive") {
-        conf.interactive = true;
+    conf.daemonize = arg_matches.is_present("daemonize");
+    conf.interactive = arg_matches.is_present("interactive");
+    conf.materialize = arg_matches.is_present("materialize");
+
+    if init && (conf.daemonize || conf.interactive) {
+        eprintln!("--daemonize and --interactive may not be used with --init");
+        exit(1);
    }

    if conf.daemonize && conf.interactive {
@@ -81,67 +216,73 @@ fn main() -> Result<()> {
        exit(1);
    }

-    if let Some(addr) = arg_matches.value_of("listen") {
-        conf.listen_addr = addr.parse()?;
+    // The configuration is all set up now. Turn it into a 'static
+    // that can be freely stored in structs and passed across threads
+    // as a ref.
+    let conf: &'static PageServerConf = Box::leak(Box::new(conf));
+
+    // Create repo and exit if init was requested
+    if init {
+        branches::init_repo(conf, &workdir)?;
+
+        // write the config file
+        let cfg_file_contents = toml::to_string_pretty(&params)?;
+        std::fs::write(&cfg_file_path, cfg_file_contents)?;
+
+        return Ok(());
    }

-    if let Some(horizon) = arg_matches.value_of("gc_horizon") {
-        conf.gc_horizon = horizon.parse()?;
-    }
+    // Set CWD to workdir for non-daemon modes
+    env::set_current_dir(&workdir)?;

-    if let Some(period) = arg_matches.value_of("gc_period") {
-        conf.gc_period = parse(period)?;
-    }
-
-    start_pageserver(&conf)
+    start_pageserver(conf)
 }

-fn start_pageserver(conf: &PageServerConf) -> Result<()> {
+fn start_pageserver(conf: &'static PageServerConf) -> Result<()> {
+    let log_filename = "pageserver.log";
+    // Don't open the same file for output multiple times;
+    // the different fds could overwrite each other's output.
+    let log_file = OpenOptions::new()
+        .create(true)
+        .append(true)
+        .open(&log_filename)
+        .with_context(|| format!("failed to open {:?}", &log_filename))?;
+
    // Initialize logger
-    let _scope_guard = init_logging(&conf)?;
+    let logger_file = log_file.try_clone().unwrap();
+    let _scope_guard = init_logging(&conf, logger_file)?;
    let _log_guard = slog_stdlog::init()?;

    // Note: this `info!(...)` macro comes from `log` crate
    info!("standard logging redirected to slog");

-    let tui_thread: Option<thread::JoinHandle<()>>;
-    if conf.interactive {
+    let tui_thread = if conf.interactive {
        // Initialize the UI
-        tui_thread = Some(
+        Some(
            thread::Builder::new()
                .name("UI thread".into())
                .spawn(|| {
                    let _ = tui::ui_main();
                })
                .unwrap(),
-        );
-        //threads.push(tui_thread);
+        )
    } else {
-        tui_thread = None;
-    }
+        None
+    };
+
+    // TODO: Check that it looks like a valid repository before going further

    if conf.daemonize {
        info!("daemonizing...");

-        let repodir = PathBuf::from(zenith_repo_dir());
-
        // There should'n be any logging to stdin/stdout. Redirect it to the main log so
        // that we will see any accidental manual fprintf's or backtraces.
-        let log_filename = repodir.join("pageserver.log");
-        let stdout = OpenOptions::new()
-            .create(true)
-            .append(true)
-            .open(&log_filename)
-            .with_context(|| format!("failed to open {:?}", &log_filename))?;
-        let stderr = OpenOptions::new()
-            .create(true)
-            .append(true)
-            .open(&log_filename)
-            .with_context(|| format!("failed to open {:?}", &log_filename))?;
+        let stdout = log_file.try_clone().unwrap();
+        let stderr = log_file;

        let daemonize = Daemonize::new()
-            .pid_file(repodir.join("pageserver.pid"))
-            .working_directory(repodir)
+            .pid_file("pageserver.pid")
+            .working_directory(".")
            .stdout(stdout)
            .stderr(stderr);

@@ -149,77 +290,64 @@ fn start_pageserver(conf: &PageServerConf) -> Result<()> {
            Ok(_) => info!("Success, daemonized"),
            Err(e) => error!("Error, {}", e),
        }
-    } else {
-        // change into the repository directory. In daemon mode, Daemonize
-        // does this for us.
-        let repodir = zenith_repo_dir();
-        std::env::set_current_dir(&repodir)?;
-        info!("Changed current directory to repository in {:?}", &repodir);
    }

-    let mut threads = Vec::new();
+    // Check that we can bind to address before starting threads to simplify shutdown
+    // sequence if port is occupied.
+    info!("Starting pageserver on {}", conf.listen_addr);
+    let pageserver_listener = TcpListener::bind(conf.listen_addr.clone())?;

-    // TODO: Check that it looks like a valid repository before going further
+    // Initialize page cache, this will spawn walredo_thread
+    page_cache::init(conf);

-    // Create directory for wal-redo datadirs
-    match fs::create_dir("wal-redo") {
-        Ok(_) => {}
-        Err(e) => match e.kind() {
-            io::ErrorKind::AlreadyExists => {}
-            _ => {
-                anyhow::bail!("Failed to create wal-redo data directory: {}", e);
-            }
-        },
-    }
-
-    // GetPage@LSN requests are served by another thread. (It uses async I/O,
-    // but the code in page_service sets up it own thread pool for that)
-    let conf_copy = conf.clone();
-    let page_server_thread = thread::Builder::new()
+    // Spawn a thread to listen for connections. It will spawn further threads
+    // for each connection.
+    let page_service_thread = thread::Builder::new()
        .name("Page Service thread".into())
-        .spawn(move || {
-            // thread code
-            page_service::thread_main(&conf_copy);
-        })
-        .unwrap();
-    threads.push(page_server_thread);
+        .spawn(move || page_service::thread_main(conf, pageserver_listener))?;

    if let Some(tui_thread) = tui_thread {
        // The TUI thread exits when the user asks to Quit.
        tui_thread.join().unwrap();
    } else {
-        // In non-interactive mode, wait forever.
-        for t in threads {
-            t.join().unwrap()
-        }
+        page_service_thread
+            .join()
+            .expect("Page service thread has panicked")?
    }
+
    Ok(())
 }

-fn init_logging(conf: &PageServerConf) -> Result<slog_scope::GlobalLoggerGuard, io::Error> {
+fn init_logging(
+    conf: &PageServerConf,
+    log_file: File,
+) -> Result<slog_scope::GlobalLoggerGuard, io::Error> {
    if conf.interactive {
        Ok(tui::init_logging())
    } else if conf.daemonize {
-        let log = zenith_repo_dir().join("pageserver.log");
-        let log_file = OpenOptions::new()
-            .create(true)
-            .append(true)
-            .open(&log)
-            .map_err(|err| {
-                // We failed to initialize logging, so we can't log this message with error!
-                eprintln!("Could not create log file {:?}: {}", log, err);
-                err
-            })?;
        let decorator = slog_term::PlainSyncDecorator::new(log_file);
-        let drain = slog_term::CompactFormat::new(decorator).build();
+        let drain = slog_term::FullFormat::new(decorator).build();
        let drain = slog::Filter::new(drain, |record: &slog::Record| {
-            if record.level().is_at_least(slog::Level::Debug) {
+            if record.level().is_at_least(slog::Level::Info) {
                return true;
            }
            false
        });
        let drain = std::sync::Mutex::new(drain).fuse();
-        let logger = slog::Logger::root(drain, slog::o!());
+        let logger = slog::Logger::root(
+            drain,
+            slog::o!(
+                "location" =>
+                FnValue(move |record| {
+                    format!("{}, {}:{}",
+                            record.module(),
+                            record.file(),
+                            record.line()
+                            )
+                    }
+                )
+            ),
+        );
        Ok(slog_scope::set_global_logger(logger))
    } else {
        let decorator = slog_term::TermDecorator::new().build();
--- a/pageserver/src/branches.rs
+++ b/pageserver/src/branches.rs
@@ -0,0 +1,433 @@
+//!
+//! Branch management code
+//!
+// TODO: move all paths construction to conf impl
+//
+
+use anyhow::{anyhow, bail, Context, Result};
+use fs::File;
+use postgres_ffi::{pg_constants, xlog_utils, ControlFileData};
+use rand::Rng;
+use serde::{Deserialize, Serialize};
+use std::env;
+use std::io::{Read, Write};
+use std::{
+    collections::HashMap,
+    fs, io,
+    path::{Path, PathBuf},
+    process::{Command, Stdio},
+    str::FromStr,
+};
+use zenith_utils::lsn::Lsn;
+
+use crate::page_cache;
+use crate::restore_local_repo;
+use crate::{repository::Repository, PageServerConf, ZTimelineId};
+
+/// Summary of a single branch, as produced by `get_branches` and `create_branch`.
+///
+/// Derives `Serialize`/`Deserialize`, so it can be passed through serde.
+#[derive(Serialize, Deserialize, Clone)]
+pub struct BranchInfo {
+    /// Branch name (in `get_branches`, the file name under `refs/branches`).
+    pub name: String,
+    /// Timeline that the branch refers to.
+    pub timeline_id: ZTimelineId,
+    /// Last valid LSN of the timeline; `None` if it could not be obtained.
+    pub latest_valid_lsn: Option<Lsn>,
+    /// Textual id of the ancestor timeline (the part before '@' in the
+    /// timeline's "ancestor" file), if one is recorded.
+    pub ancestor_id: Option<String>,
+    /// Textual LSN of the branch point (the part after '@' in the
+    /// timeline's "ancestor" file), if one is recorded.
+    pub ancestor_lsn: Option<String>,
+}
+
+/// A position in the repository history: an LSN on a particular timeline.
+///
+/// `parse_point_in_time` uses `Lsn(0)` when no explicit LSN was given, and
+/// `create_branch` then replaces it with the end of WAL on that timeline.
+#[derive(Debug, Clone, Copy)]
+pub struct PointInTime {
+    /// Timeline the point lies on.
+    pub timelineid: ZTimelineId,
+    /// LSN on that timeline.
+    pub lsn: Lsn,
+}
+
+///
+/// Create a new repository in `repo_dir` and bootstrap its 'main' branch.
+///
+/// Steps, in order: create the directory skeleton, run `initdb` into a
+/// temporary "tmp" directory, import that cluster into a freshly created
+/// timeline, move the first WAL segment into the timeline's "wal" directory,
+/// record the 'main' branch, and finally move the remaining data directory
+/// into the timeline's "snapshots" directory, named after the checkpoint LSN.
+///
+/// Note that this changes the current working directory of the whole process
+/// to `repo_dir`.
+///
+/// Returns an error if any filesystem operation fails or if `initdb` exits
+/// unsuccessfully.
+///
+pub fn init_repo(conf: &'static PageServerConf, repo_dir: &Path) -> Result<()> {
+    // top-level dir may exist if we are creating it through CLI
+    fs::create_dir_all(repo_dir)
+        .with_context(|| format!("could not create directory {}", repo_dir.display()))?;
+
+    // All relative paths below ("timelines", "refs", "tmp") are resolved
+    // against the repository directory from here on.
+    env::set_current_dir(repo_dir)?;
+
+    fs::create_dir(std::path::Path::new("timelines"))?;
+    fs::create_dir(std::path::Path::new("refs"))?;
+    fs::create_dir(std::path::Path::new("refs").join("branches"))?;
+    fs::create_dir(std::path::Path::new("refs").join("tags"))?;
+
+    println!("created directory structure in {}", repo_dir.display());
+
+    // Run initdb
+    //
+    // We create the cluster temporarily in a "tmp" directory inside the repository,
+    // and move it to the right location from there.
+    let tmppath = std::path::Path::new("tmp");
+
+    print!("running initdb... ");
+    // print! does not end the line, so flush to make the message visible
+    // while initdb is running.
+    io::stdout().flush()?;
+
+    let initdb_path = conf.pg_bin_dir().join("initdb");
+    // initdb runs with a cleared environment, except for the library search
+    // paths. Its stdout is discarded; stderr is captured by output() and
+    // used in the error message below.
+    let initdb_otput = Command::new(initdb_path)
+        .args(&["-D", tmppath.to_str().unwrap()])
+        .arg("--no-instructions")
+        .env_clear()
+        .env("LD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap())
+        .env("DYLD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap())
+        .stdout(Stdio::null())
+        .output()
+        .with_context(|| "failed to execute initdb")?;
+    if !initdb_otput.status.success() {
+        anyhow::bail!(
+            "initdb failed: '{}'",
+            String::from_utf8_lossy(&initdb_otput.stderr)
+        );
+    }
+    println!("initdb succeeded");
+
+    // Read control file to extract the LSN and system id
+    let controlfile_path = tmppath.join("global").join("pg_control");
+    let controlfile = ControlFileData::decode(&fs::read(controlfile_path)?)?;
+    // let systemid = controlfile.system_identifier;
+    let lsn = controlfile.checkPoint;
+    // Name of the initial snapshot directory: the checkpoint LSN as 16 hex digits
+    let lsnstr = format!("{:016X}", lsn);
+
+    // Bootstrap the repository by loading the newly-initdb'd cluster into 'main' branch.
+    let tli = create_timeline(conf, None)?;
+    let timelinedir = conf.timeline_path(tli);
+
+    // We don't use page_cache here, because we don't want to spawn the WAL redo thread during
+    // repository initialization.
+    //
+    // FIXME: That caused trouble, because the WAL redo thread launched initdb in the background,
+    // and it kept running even after the "zenith init" had exited. In tests, we started the
+    // page server immediately after that, so that initdb was still running in the background,
+    // and we failed to run initdb again in the same directory. This has been solved for the
+    // rapid init+start case now, but the general race condition remains if you restart the
+    // server quickly.
+    let storage = crate::rocksdb_storage::RocksObjectStore::create(conf)?;
+    //let storage = crate::inmem_storage::InmemObjectStore::create(conf)?;
+
+    let repo = crate::object_repository::ObjectRepository::new(
+        conf,
+        std::sync::Arc::new(storage),
+        std::sync::Arc::new(crate::walredo::DummyRedoManager {}),
+    );
+    let timeline = repo.create_empty_timeline(tli, Lsn(lsn))?;
+
+    restore_local_repo::import_timeline_from_postgres_datadir(&tmppath, &*timeline, Lsn(lsn))?;
+
+    // Move the initial WAL file
+    fs::rename(
+        tmppath.join("pg_wal").join("000000010000000000000001"),
+        timelinedir
+            .join("wal")
+            .join("000000010000000000000001.partial"),
+    )?;
+    println!("created initial timeline {}", tli);
+
+    // The branch file contains just the textual timeline id
+    let data = tli.to_string();
+    fs::write(conf.branch_path("main"), data)?;
+    println!("created main branch");
+
+    // Remove pg_wal
+    fs::remove_dir_all(tmppath.join("pg_wal"))?;
+
+    // Move the data directory as an initial base backup.
+    // FIXME: It would be enough to only copy the non-relation files here, the relation
+    // data was already loaded into the repository.
+    let target = timelinedir.join("snapshots").join(&lsnstr);
+    fs::rename(tmppath, &target)?;
+
+    println!(
+        "new zenith repository was created in {}",
+        repo_dir.display()
+    );
+
+    Ok(())
+}
+
+///
+/// List all branches of the repository.
+///
+/// Every file in `refs/branches` (relative to the current directory) is one
+/// branch: the file name is the branch name and the contents are the timeline
+/// id. If the timeline has an "ancestor" file, its "<timeline>@<lsn>" contents
+/// are split into `ancestor_id` and `ancestor_lsn`.
+///
+/// Fails on the first entry that cannot be read or parsed.
+///
+pub(crate) fn get_branches(conf: &PageServerConf) -> Result<Vec<BranchInfo>> {
+    let repo = page_cache::get_repository();
+
+    let mut branches = Vec::new();
+    for entry in std::fs::read_dir(Path::new("refs").join("branches"))? {
+        let entry = entry?;
+        let name = entry.file_name().to_str().unwrap().to_string();
+        let timeline_id = std::fs::read_to_string(entry.path())?.parse::<ZTimelineId>()?;
+
+        // A timeline that cannot be loaded is still listed, just without an LSN
+        let latest_valid_lsn = repo
+            .get_timeline(timeline_id)
+            .ok()
+            .map(|timeline| timeline.get_last_valid_lsn());
+
+        let ancestor_path = conf.ancestor_path(timeline_id);
+        let (ancestor_id, ancestor_lsn) = if ancestor_path.exists() {
+            let ancestor = std::fs::read_to_string(ancestor_path)?;
+            let mut parts = ancestor.split('@');
+            let id = parts
+                .next()
+                .with_context(|| "wrong branch ancestor point in time format")?
+                .to_owned();
+            let lsn = parts
+                .next()
+                .with_context(|| "wrong branch ancestor point in time format")?
+                .to_owned();
+            (Some(id), Some(lsn))
+        } else {
+            (None, None)
+        };
+
+        branches.push(BranchInfo {
+            name,
+            timeline_id,
+            latest_valid_lsn,
+            ancestor_id,
+            ancestor_lsn,
+        });
+    }
+
+    Ok(branches)
+}
+
+///
+/// Return the system identifier stored in the control file of the latest
+/// snapshot of the 'main' branch.
+///
+/// All files in `refs/branches` are read and parsed first, so a malformed
+/// branch file causes an error even if it is not 'main'.
+///
+pub(crate) fn get_system_id(conf: &PageServerConf) -> Result<u64> {
+    let mut timeline_by_branch: HashMap<String, ZTimelineId> = HashMap::new();
+    for entry in std::fs::read_dir(Path::new("refs").join("branches"))? {
+        let entry = entry?;
+        let branch_name = entry.file_name().to_str().unwrap().to_string();
+        let timeline_id = std::fs::read_to_string(entry.path())?.parse::<ZTimelineId>()?;
+        timeline_by_branch.insert(branch_name, timeline_id);
+    }
+
+    let main_tli = *timeline_by_branch
+        .get("main")
+        .ok_or_else(|| anyhow!("Branch main not found"))?;
+
+    let (_, main_snap_dir) = find_latest_snapshot(conf, main_tli)?;
+    let pg_control = fs::read(main_snap_dir.join("global").join("pg_control"))?;
+    let controlfile = ControlFileData::decode(&pg_control)?;
+    Ok(controlfile.system_identifier)
+}
+
+///
+/// Create a new branch named `branchname`, starting at `startpoint_str`.
+///
+/// `startpoint_str` is parsed by `parse_point_in_time`. If it carries no LSN
+/// (parsed as `Lsn(0)`), the branch starts at the last record LSN of the
+/// source timeline.
+///
+/// A new timeline directory is created, the repository backend is told about
+/// the branch, and the latest snapshot and the WAL up to the branch point are
+/// copied from the source timeline. Finally the branch name is recorded in
+/// `refs/branches`.
+///
+/// The returned `BranchInfo` includes the ancestor timeline and branch-point
+/// LSN, in the same textual form that `create_timeline` stores in the
+/// "ancestor" file and that `get_branches` reports.
+///
+/// Fails if the branch already exists, if the start point cannot be resolved,
+/// or if any of the filesystem operations fail.
+///
+pub(crate) fn create_branch(
+    conf: &PageServerConf,
+    branchname: &str,
+    startpoint_str: &str,
+) -> Result<BranchInfo> {
+    let repo = page_cache::get_repository();
+
+    if conf.branch_path(branchname).exists() {
+        anyhow::bail!("branch {} already exists", branchname);
+    }
+
+    let mut startpoint = parse_point_in_time(conf, startpoint_str)?;
+
+    if startpoint.lsn == Lsn(0) {
+        // Find end of WAL on the old timeline
+        let end_of_wal = repo
+            .get_timeline(startpoint.timelineid)?
+            .get_last_record_lsn();
+        println!("branching at end of WAL: {}", end_of_wal);
+        startpoint.lsn = end_of_wal;
+    }
+
+    // create a new timeline directory for it
+    let newtli = create_timeline(conf, Some(startpoint))?;
+    let newtimelinedir = conf.timeline_path(newtli);
+
+    // Let the Repository backend do its initialization
+    repo.branch_timeline(startpoint.timelineid, newtli, startpoint.lsn)?;
+
+    // Copy the latest snapshot (TODO: before the startpoint) and all WAL
+    // TODO: be smarter and avoid the copying...
+    let (_maxsnapshot, oldsnapshotdir) = find_latest_snapshot(conf, startpoint.timelineid)?;
+    let copy_opts = fs_extra::dir::CopyOptions::new();
+    fs_extra::dir::copy(oldsnapshotdir, newtimelinedir.join("snapshots"), &copy_opts)?;
+
+    let oldtimelinedir = conf.timeline_path(startpoint.timelineid);
+    copy_wal(
+        &oldtimelinedir.join("wal"),
+        &newtimelinedir.join("wal"),
+        startpoint.lsn,
+        pg_constants::WAL_SEGMENT_SIZE,
+    )?;
+
+    // Remember the human-readable branch name for the new timeline.
+    // FIXME: there's a race condition, if you create a branch with the same
+    // name concurrently.
+    let data = newtli.to_string();
+    fs::write(conf.branch_path(branchname), data)?;
+
+    Ok(BranchInfo {
+        name: branchname.to_string(),
+        timeline_id: newtli,
+        latest_valid_lsn: Some(startpoint.lsn),
+        // Report the ancestor just like get_branches would for this branch
+        ancestor_id: Some(startpoint.timelineid.to_string()),
+        ancestor_lsn: Some(startpoint.lsn.to_string()),
+    })
+}
+
+///
+/// Parse user-given string that represents a point-in-time.
+///
+/// We support multiple variants:
+///
+/// Raw timeline id in hex, meaning the end of that timeline:
+///    bc62e7d612d0e6fe8f99a6dd2f281f9d
+///
+/// A specific LSN on a timeline:
+///    bc62e7d612d0e6fe8f99a6dd2f281f9d@2/15D3DD8
+///
+/// Same, with a human-friendly branch name:
+///    main
+///    main@2/15D3DD8
+///
+/// Human-friendly tag name:
+///    mytag
+///
+/// When no LSN is given, the returned `lsn` is `Lsn(0)`. The name is tried
+/// as a tag first (only if no LSN was given), then as a branch, then as a
+/// raw timeline id; an error is returned if none of them match.
+///
+fn parse_point_in_time(conf: &PageServerConf, s: &str) -> Result<PointInTime> {
+    let mut parts = s.split('@');
+    let name = parts.next().unwrap();
+
+    // Optional "@<lsn>" suffix
+    let lsn = parts
+        .next()
+        .map(|lsnstr| {
+            Lsn::from_str(lsnstr).with_context(|| "invalid LSN in point-in-time specification")
+        })
+        .transpose()?;
+
+    // A tag is only considered when there is no explicit LSN. The tag file
+    // contains another point-in-time specification.
+    if lsn.is_none() {
+        let tagpath = conf.tag_path(name);
+        if tagpath.exists() {
+            return parse_point_in_time(conf, &fs::read_to_string(tagpath)?);
+        }
+    }
+
+    let start_lsn = lsn.unwrap_or(Lsn(0));
+
+    // Branch, or branch @ LSN. The branch file is resolved recursively and
+    // the LSN of the result is then overridden.
+    let branchpath = conf.branch_path(name);
+    if branchpath.exists() {
+        let resolved = parse_point_in_time(conf, &fs::read_to_string(branchpath)?)?;
+        return Ok(PointInTime {
+            lsn: start_lsn,
+            ..resolved
+        });
+    }
+
+    // Timeline id, or timeline id @ LSN. The timeline must exist on disk.
+    match ZTimelineId::from_str(name) {
+        Ok(timelineid) if conf.timeline_path(timelineid).exists() => Ok(PointInTime {
+            timelineid,
+            lsn: start_lsn,
+        }),
+        _ => bail!("could not parse point-in-time {}", s),
+    }
+}
+
+///
+/// Create the on-disk directory for a new timeline with a random id.
+///
+/// The timeline directory gets empty "snapshots" and "wal" subdirectories.
+/// If `ancestor` is given, it is recorded as "<timeline>@<lsn>" in a file
+/// called "ancestor". Returns the id of the new timeline.
+///
+fn create_timeline(conf: &PageServerConf, ancestor: Option<PointInTime>) -> Result<ZTimelineId> {
+    // Pick a random 16-byte timeline id
+    let mut id_bytes = [0u8; 16];
+    rand::thread_rng().fill(&mut id_bytes);
+    let timelineid = ZTimelineId::from(id_bytes);
+
+    let timelinedir = conf.timeline_path(timelineid);
+    fs::create_dir(&timelinedir)?;
+    for subdir in ["snapshots", "wal"].iter() {
+        fs::create_dir(timelinedir.join(subdir))?;
+    }
+
+    if let Some(PointInTime {
+        timelineid: parent,
+        lsn,
+    }) = ancestor
+    {
+        fs::write(timelinedir.join("ancestor"), format!("{}@{}", parent, lsn))?;
+    }
+
+    Ok(timelineid)
+}
+
+///
+/// Copy all WAL segments from one directory to another, up to given LSN.
+///
+/// If the given LSN is in the middle of a segment, the last segment containing it
+/// is written out as .partial, and padded with zeros.
+///
+/// Files in `src_dir` whose names do not look like WAL segments (or
+/// .partial segments) are ignored, and so are segments beyond `upto`.
+///
+/// Returns an error if the source directory or any of its entries cannot be
+/// read, or if copying a segment fails.
+///
+fn copy_wal(src_dir: &Path, dst_dir: &Path, upto: Lsn, wal_seg_size: usize) -> Result<()> {
+    let last_segno = upto.segment_number(wal_seg_size);
+    let last_segoff = upto.segment_offset(wal_seg_size);
+
+    let entries = fs::read_dir(src_dir)
+        .with_context(|| format!("could not read WAL directory {}", src_dir.display()))?;
+    for entry in entries {
+        // Don't silently skip unreadable entries: a missing segment would
+        // leave a hole in the copied WAL.
+        let entry = entry
+            .with_context(|| format!("could not read entry in {}", src_dir.display()))?;
+        let entry_name = entry.file_name();
+        let fname = match entry_name.to_str() {
+            Some(fname) => fname,
+            // A name that is not valid UTF-8 cannot be a WAL segment
+            None => continue,
+        };
+
+        // Check if the filename looks like an xlog file, or a .partial file.
+        if !xlog_utils::IsXLogFileName(fname) && !xlog_utils::IsPartialXLogFileName(fname) {
+            continue;
+        }
+        let (segno, _tli) = xlog_utils::XLogFromFileName(fname, wal_seg_size);
+
+        let copylen;
+        let mut dst_fname = PathBuf::from(fname);
+        if segno > last_segno {
+            // future segment, skip
+            continue;
+        } else if segno < last_segno {
+            // complete segment: copy in full, dropping any ".partial" suffix
+            copylen = wal_seg_size;
+            dst_fname.set_extension("");
+        } else {
+            // segment containing `upto`: copy only the part before it
+            copylen = last_segoff;
+            dst_fname.set_extension("partial");
+        }
+
+        let src_file = File::open(entry.path())?;
+        let mut dst_file = File::create(dst_dir.join(&dst_fname))?;
+        std::io::copy(&mut src_file.take(copylen as u64), &mut dst_file)?;
+
+        // Pad the partial segment with zeros up to the full segment size
+        if copylen < wal_seg_size {
+            std::io::copy(
+                &mut std::io::repeat(0).take((wal_seg_size - copylen) as u64),
+                &mut dst_file,
+            )?;
+        }
+    }
+    Ok(())
+}
+
+///
+/// Find the latest snapshot for a timeline.
+///
+/// Snapshot directories are named after their LSN in hex. Returns the highest
+/// LSN found together with the path of that snapshot's directory. Entries
+/// whose names do not parse as a hex LSN are ignored.
+///
+/// Fails if the snapshots directory cannot be read, or if it contains no
+/// snapshot with an LSN greater than zero.
+///
+fn find_latest_snapshot(conf: &PageServerConf, timeline: ZTimelineId) -> Result<(Lsn, PathBuf)> {
+    let snapshotsdir = conf.snapshots_path(timeline);
+    let mut maxsnapshot = Lsn(0);
+    let mut snapshotdir: Option<PathBuf> = None;
+    for entry in fs::read_dir(&snapshotsdir)? {
+        let entry = entry?;
+        // A name that is not valid UTF-8 will simply fail to parse below
+        let filename = entry.file_name().to_string_lossy().into_owned();
+        if let Ok(lsn) = Lsn::from_hex(&filename) {
+            // Remember the directory only when it is newer than the best
+            // one so far, so that the path always matches the returned LSN
+            // regardless of the order in which the entries are listed.
+            if lsn > maxsnapshot {
+                maxsnapshot = lsn;
+                snapshotdir = Some(entry.path());
+            }
+        }
+    }
+
+    match snapshotdir {
+        Some(dir) => Ok((maxsnapshot, dir)),
+        None => {
+            // TODO: check ancestor timeline
+            anyhow::bail!("no snapshot found in {}", snapshotsdir.display());
+        }
+    }
+}
--- a/pageserver/src/inmem_storage.rs
+++ b/pageserver/src/inmem_storage.rs
@@ -0,0 +1,345 @@
+//!
+//! An implementation of the ObjectStore interface, backed by BTreeMap
+//!
+use crate::object_key::*;
+use crate::object_store::ObjectStore;
+use crate::repository::RelTag;
+use crate::PageServerConf;
+use crate::ZTimelineId;
+use anyhow::{bail, Result};
+use serde::{Deserialize, Serialize};
+use std::collections::{BTreeMap, HashSet};
+use std::fs::File;
+use std::io::prelude::*;
+use std::ops::Bound::*;
+use std::sync::RwLock;
+use zenith_utils::bin_ser::BeSer;
+use zenith_utils::lsn::Lsn;
+
+/// Key of the in-memory map: an object key plus the LSN of the version.
+///
+/// The derived ordering compares `obj_key` first and `lsn` second, so all
+/// versions of one object are adjacent in the map, in ascending LSN order.
+#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Deserialize, Serialize)]
+pub struct StorageKey {
+    obj_key: ObjectKey,
+    lsn: Lsn,
+}
+
+impl StorageKey {
+    /// Smallest key used for `timeline`: `ObjectTag::FirstTag` at LSN 0.
+    /// Scans over a timeline start from here.
+    fn timeline_start(timeline: ZTimelineId) -> Self {
+        let obj_key = ObjectKey {
+            timeline,
+            tag: ObjectTag::FirstTag,
+        };
+        StorageKey {
+            obj_key,
+            lsn: Lsn(0),
+        }
+    }
+}
+
+/// Object store that keeps all object versions in a `BTreeMap` in memory.
+///
+/// The map is loaded from the file "objstore.dmp" in the work directory by
+/// `open`, and written back to the same file when the store is dropped.
+pub struct InmemObjectStore {
+    /// Page server configuration; `workdir` locates the dump file.
+    conf: &'static PageServerConf,
+    /// All object versions, ordered by (object key, LSN).
+    db: RwLock<BTreeMap<StorageKey, Vec<u8>>>,
+}
+
+impl ObjectStore for InmemObjectStore {
+    /// Fetch the version of `key` stored at exactly `lsn`.
+    ///
+    /// Fails if there is no entry for that exact (key, LSN) pair.
+    fn get(&self, key: &ObjectKey, lsn: Lsn) -> Result<Vec<u8>> {
+        let db = self.db.read().unwrap();
+        let val = db.get(&StorageKey {
+            obj_key: key.clone(),
+            lsn,
+        });
+        if let Some(val) = val {
+            Ok(val.clone())
+        } else {
+            bail!("could not find page {:?}", key);
+        }
+    }
+
+    /// Return the object key of the first entry at or after (`key`, LSN 0),
+    /// or `None` if there is no such entry.
+    fn get_next_key(&self, key: &ObjectKey) -> Result<Option<ObjectKey>> {
+        let search_key = StorageKey {
+            obj_key: key.clone(),
+            lsn: Lsn(0),
+        };
+        let db = self.db.read().unwrap();
+        let next_key = db
+            .range(&search_key..)
+            .next()
+            .map(|(found, _)| found.obj_key.clone());
+        Ok(next_key)
+    }
+
+    /// Store `value` as the version of `key` at `lsn`, replacing any
+    /// existing entry for the same (key, LSN) pair.
+    fn put(&self, key: &ObjectKey, lsn: Lsn, value: &[u8]) -> Result<()> {
+        let mut db = self.db.write().unwrap();
+        db.insert(
+            StorageKey {
+                obj_key: key.clone(),
+                lsn,
+            },
+            value.to_vec(),
+        );
+        Ok(())
+    }
+
+    /// Remove the version of `key` at exactly `lsn`. It is not an error if
+    /// there is no such entry.
+    fn unlink(&self, key: &ObjectKey, lsn: Lsn) -> Result<()> {
+        let mut db = self.db.write().unwrap();
+        db.remove(&StorageKey {
+            obj_key: key.clone(),
+            lsn,
+        });
+        Ok(())
+    }
+
+    /// Iterate through page versions of given page, starting from the given LSN.
+    /// The versions are walked in descending LSN order.
+    ///
+    /// The matching versions are copied out of the map up front, so the
+    /// returned iterator does not hold the lock.
+    fn object_versions<'a>(
+        &'a self,
+        key: &ObjectKey,
+        lsn: Lsn,
+    ) -> Result<Box<dyn Iterator<Item = (Lsn, Vec<u8>)> + 'a>> {
+        let from = StorageKey {
+            obj_key: key.clone(),
+            lsn: Lsn(0),
+        };
+        let till = StorageKey {
+            obj_key: key.clone(),
+            lsn,
+        };
+        let db = self.db.read().unwrap();
+        let versions: Vec<(Lsn, Vec<u8>)> = db
+            .range(from..=till)
+            .map(|(version_key, value)| (version_key.lsn, value.clone()))
+            .collect();
+        Ok(Box::new(InmemObjectVersionIter::new(versions)))
+    }
+
+    /// Iterate through all timeline objects
+    fn list_objects<'a>(
+        &'a self,
+        timeline: ZTimelineId,
+        nonrel_only: bool,
+        lsn: Lsn,
+    ) -> Result<Box<dyn Iterator<Item = ObjectTag> + 'a>> {
+        let curr_key = StorageKey::timeline_start(timeline);
+
+        Ok(Box::new(InmemObjectIter {
+            store: self,
+            curr_key,
+            timeline,
+            nonrel_only,
+            lsn,
+        }))
+    }
+
+    /// Get a list of all distinct relations in given tablespace and database.
+    ///
+    /// A `spcnode` or `dbnode` of 0 is not used for filtering.
+    ///
+    /// TODO: This implementation is very inefficient, it scans
+    /// through all entries in the given database. In practice, this
+    /// is used for CREATE DATABASE, and usually the template database is small.
+    /// But if it's not, this will be slow.
+    fn list_rels(
+        &self,
+        timelineid: ZTimelineId,
+        spcnode: u32,
+        dbnode: u32,
+        lsn: Lsn,
+    ) -> Result<HashSet<RelTag>> {
+        // FIXME: This scans everything. Very slow
+
+        let mut rels: HashSet<RelTag> = HashSet::new();
+
+        let mut search_rel_tag = RelTag {
+            spcnode,
+            dbnode,
+            relnode: 0,
+            forknum: 0u8,
+        };
+        let db = self.db.read().unwrap();
+        loop {
+            let search_key = StorageKey {
+                obj_key: ObjectKey {
+                    timeline: timelineid,
+                    tag: ObjectTag::RelationMetadata(search_rel_tag),
+                },
+                lsn: Lsn(0),
+            };
+
+            // Only the first entry at or after the search key is of interest;
+            // after looking at it we seek again, past that relation.
+            let first = db.range(&search_key..).next();
+            let key = match first {
+                Some((key, _)) => key,
+                // Nothing at or after the search key: we are done. Without
+                // this, the loop would spin forever.
+                None => break,
+            };
+
+            // Keys are scanned in ascending order starting within this
+            // timeline, so a different timeline means we have run past it.
+            if key.obj_key.timeline != timelineid {
+                break;
+            }
+
+            match key.obj_key.tag {
+                ObjectTag::RelationMetadata(rel_tag) => {
+                    if spcnode != 0 && rel_tag.spcnode != spcnode
+                        || dbnode != 0 && rel_tag.dbnode != dbnode
+                    {
+                        break;
+                    }
+                    if key.lsn <= lsn {
+                        // visible in this snapshot
+                        rels.insert(rel_tag);
+                    }
+                    search_rel_tag = rel_tag;
+                    // skip to next relation
+                    // FIXME: What if relnode is u32::MAX ?
+                    search_rel_tag.relnode += 1;
+                }
+                _ => {
+                    // no more relation metadata entries
+                    break;
+                }
+            }
+        }
+
+        Ok(rels)
+    }
+
+    /// Iterate through versions of all objects in a timeline.
+    ///
+    /// Returns objects in increasing key-version order.
+    /// Returns all versions up to and including the specified LSN.
+    fn objects<'a>(
+        &'a self,
+        timeline: ZTimelineId,
+        lsn: Lsn,
+    ) -> Result<Box<dyn Iterator<Item = Result<(ObjectTag, Lsn, Vec<u8>)>> + 'a>> {
+        let curr_key = StorageKey::timeline_start(timeline);
+
+        Ok(Box::new(InmemObjects {
+            store: self,
+            curr_key,
+            timeline,
+            lsn,
+        }))
+    }
+
+    /// Nothing to compact in the in-memory store.
+    fn compact(&self) {}
+}
+
+impl Drop for InmemObjectStore {
+    /// Dump the whole map to "objstore.dmp" in the work directory, so that
+    /// `InmemObjectStore::open` can load it again later.
+    ///
+    /// Panics if the file cannot be created or written, or if the map cannot
+    /// be serialized.
+    fn drop(&mut self) {
+        let path = self.conf.workdir.join("objstore.dmp");
+        let mut f = File::create(path).unwrap();
+        // A plain `write` is allowed to write only part of the buffer, which
+        // would leave a truncated dump behind. `write_all` keeps going until
+        // everything has been written.
+        f.write_all(&self.db.ser().unwrap()).unwrap();
+    }
+}
+
+impl InmemObjectStore {
+    /// Load a store from the "objstore.dmp" file in the work directory.
+    ///
+    /// Fails if the file cannot be read or its contents cannot be
+    /// deserialized.
+    pub fn open(conf: &'static PageServerConf) -> Result<InmemObjectStore> {
+        // read the whole file
+        let dump = std::fs::read(conf.workdir.join("objstore.dmp"))?;
+        let contents = BTreeMap::des(&dump)?;
+        Ok(InmemObjectStore {
+            conf,
+            db: RwLock::new(contents),
+        })
+    }
+
+    /// Create a new, empty store. Nothing is written to disk until the
+    /// store is dropped.
+    pub fn create(conf: &'static PageServerConf) -> Result<InmemObjectStore> {
+        let db = RwLock::new(BTreeMap::new());
+        Ok(InmemObjectStore { conf, db })
+    }
+}
+
+///
+/// Iterator for `object_versions`. Holds a copy of the versions in ascending
+/// LSN order and hands them out back to front, i.e. in reverse LSN order.
+///
+struct InmemObjectVersionIter {
+    versions: Vec<(Lsn, Vec<u8>)>,
+    /// Number of versions not returned yet; the next one is at `curr - 1`.
+    curr: usize,
+}
+
+impl InmemObjectVersionIter {
+    fn new(versions: Vec<(Lsn, Vec<u8>)>) -> InmemObjectVersionIter {
+        InmemObjectVersionIter {
+            curr: versions.len(),
+            versions,
+        }
+    }
+}
+
+impl Iterator for InmemObjectVersionIter {
+    type Item = (Lsn, Vec<u8>);
+
+    fn next(&mut self) -> std::option::Option<Self::Item> {
+        // Step backwards; stop once the front of the vector has been passed
+        let idx = self.curr.checked_sub(1)?;
+        self.curr = idx;
+        Some(self.versions[idx].clone())
+    }
+}
+
+///
+/// Iterator for `objects`. Returns all object versions of a timeline with an
+/// LSN up to and including `lsn`, in ascending key order.
+///
+struct InmemObjects<'r> {
+    store: &'r InmemObjectStore,
+    /// Key of the entry returned last; the scan resumes right after it.
+    curr_key: StorageKey,
+    timeline: ZTimelineId,
+    lsn: Lsn,
+}
+
+impl<'r> Iterator for InmemObjects<'r> {
+    // TODO consider returning Box<[u8]>
+    type Item = Result<(ObjectTag, Lsn, Vec<u8>)>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        match self.next_result() {
+            Ok(Some(item)) => Some(Ok(item)),
+            Ok(None) => None,
+            Err(err) => Some(Err(err)),
+        }
+    }
+}
+
+impl<'r> InmemObjects<'r> {
+    /// Find the next visible entry after `curr_key`, or `None` once the end
+    /// of the timeline (or of the map) has been reached.
+    fn next_result(&mut self) -> Result<Option<(ObjectTag, Lsn, Vec<u8>)>> {
+        let db = self.store.db.read().unwrap();
+        let timeline = self.timeline;
+        let max_lsn = self.lsn;
+
+        // TODO can speed up by seeking iterator
+        let next_visible = db
+            .range((Excluded(&self.curr_key), Unbounded))
+            .take_while(|(key, _)| key.obj_key.timeline == timeline)
+            .find(|(key, _)| key.lsn <= max_lsn);
+
+        match next_visible {
+            Some((key, value)) => {
+                self.curr_key = key.clone();
+                Ok(Some((key.obj_key.tag, key.lsn, value.clone())))
+            }
+            None => Ok(None),
+        }
+    }
+}
+
+///
+/// Iterator for `list_objects`. Returns the tags of all objects preceding
+/// the specified LSN, each object once.
+///
+struct InmemObjectIter<'a> {
+    store: &'a InmemObjectStore,
+    /// Position of the scan; the next lookup starts right after this key.
+    curr_key: StorageKey,
+    timeline: ZTimelineId,
+    /// If set, the iteration ends at the first visible relation entry.
+    nonrel_only: bool,
+    lsn: Lsn,
+}
+
+impl<'a> Iterator for InmemObjectIter<'a> {
+    type Item = ObjectTag;
+
+    fn next(&mut self) -> std::option::Option<Self::Item> {
+        let db = self.store.db.read().unwrap();
+        loop {
+            // Look at the first entry after the current position only
+            let (key, _) = db.range((Excluded(&self.curr_key), Unbounded)).next()?;
+            if key.obj_key.timeline != self.timeline {
+                return None;
+            }
+
+            // next seek should skip all versions
+            self.curr_key = StorageKey {
+                obj_key: key.obj_key.clone(),
+                lsn: Lsn(u64::MAX),
+            };
+
+            if key.lsn > self.lsn {
+                // not visible in this snapshot, move on to the next object
+                continue;
+            }
+
+            let is_relation = matches!(
+                key.obj_key.tag,
+                ObjectTag::RelationMetadata(_) | ObjectTag::RelationBuffer(_)
+            );
+            if self.nonrel_only && is_relation {
+                return None;
+            }
+            return Some(key.obj_key.tag);
+        }
+    }
+}
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -1,14 +1,21 @@
+use serde::{Deserialize, Serialize};
+
 use std::fmt;
-use std::net::SocketAddr;
 use std::path::PathBuf;
 use std::str::FromStr;
 use std::time::Duration;

 pub mod basebackup;
+pub mod branches;
+pub mod inmem_storage;
+pub mod object_key;
+pub mod object_repository;
+pub mod object_store;
 pub mod page_cache;
 pub mod page_service;
-pub mod pg_constants;
+pub mod repository;
 pub mod restore_local_repo;
+pub mod rocksdb_storage;
 pub mod tui;
 pub mod tui_event;
 mod tui_logger;
@@ -20,9 +27,59 @@ pub mod walredo;
 pub struct PageServerConf {
    pub daemonize: bool,
    pub interactive: bool,
-    pub listen_addr: SocketAddr,
+    pub materialize: bool,
+    pub listen_addr: String,
    pub gc_horizon: u64,
    pub gc_period: Duration,
+    pub wal_redoers: usize,
+
+    // Repository directory, relative to current working directory.
+    // Normally, the page server changes the current working directory
+    // to the repository, and 'workdir' is always '.'. But we don't do
+    // that during unit testing, because the current directory is global
+    // to the process but different unit tests work on different
+    // repositories.
+    pub workdir: PathBuf,
+
+    pub pg_distrib_dir: PathBuf,
+}
+
+impl PageServerConf {
+    //
+    // Repository paths, relative to workdir. These helpers are not `pub`.
+    //
+    /// Path for the tag `name`: `<workdir>/refs/tags/<name>`.
+    fn tag_path(&self, name: &str) -> PathBuf {
+        self.workdir.join("refs").join("tags").join(name)
+    }
+    /// Path for the branch `name`: `<workdir>/refs/branches/<name>`.
+    fn branch_path(&self, name: &str) -> PathBuf {
+        self.workdir.join("refs").join("branches").join(name)
+    }
+    /// Path for a timeline: `<workdir>/timelines/<timelineid>`.
+    fn timeline_path(&self, timelineid: ZTimelineId) -> PathBuf {
+        self.workdir.join("timelines").join(timelineid.to_string())
+    }
+    /// `snapshots` entry under the timeline's path.
+    fn snapshots_path(&self, timelineid: ZTimelineId) -> PathBuf {
+        self.timeline_path(timelineid).join("snapshots")
+    }
+    /// `ancestor` entry under the timeline's path.
+    fn ancestor_path(&self, timelineid: ZTimelineId) -> PathBuf {
+        self.timeline_path(timelineid).join("ancestor")
+    }
+
+    //
+    // Postgres distribution paths
+    //
+    /// `bin` directory under `pg_distrib_dir`.
+    pub fn pg_bin_dir(&self) -> PathBuf {
+        self.pg_distrib_dir.join("bin")
+    }
+    /// `lib` directory under `pg_distrib_dir`.
+    pub fn pg_lib_dir(&self) -> PathBuf {
+        self.pg_distrib_dir.join("lib")
+    }
 }

 /// Zenith Timeline ID is a 128-bit random ID.
@@ -49,7 +106,7 @@ pub struct PageServerConf {
 /// is separate from PostgreSQL timelines, and doesn't have those
 /// limitations. A zenith timeline is identified by a 128-bit ID, which
 /// is usually printed out as a hex string.
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
 pub struct ZTimelineId([u8; 16]);

 impl FromStr for ZTimelineId {
@@ -85,11 +142,3 @@ impl fmt::Display for ZTimelineId {
        f.write_str(&hex::encode(self.0))
    }
 }
-
-pub fn zenith_repo_dir() -> PathBuf {
-    // Find repository path
-    match std::env::var_os("ZENITH_REPO_DIR") {
-        Some(val) => PathBuf::from(val.to_str().unwrap()),
-        None => ".zenith".into(),
-    }
-}
--- a/pageserver/src/object_key.rs
+++ b/pageserver/src/object_key.rs
@@ -0,0 +1,84 @@
+use crate::repository::{BufferTag, RelTag};
+use crate::waldecoder::TransactionId;
+use crate::ZTimelineId;
+use serde::{Deserialize, Serialize};
+
+///
+/// ObjectKey is the key type used to identify objects stored in an object
+/// repository. It is shared between object_repository.rs and object_store.rs.
+/// It is mostly opaque to ObjectStore, which just stores and retrieves objects
+/// using the key given by the caller.
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
+pub struct ObjectKey {
+    pub timeline: ZTimelineId, // compared first by the derived ordering
+    pub tag: ObjectTag,        // compared second; see ObjectTag for variant order
+}
+
+///
+/// Non-relation transaction status files (clog (a.k.a. pg_xact) and pg_multixact)
+/// in Postgres are handled by an SLRU (Simple LRU) buffer, hence the name.
+///
+/// These files are global for a postgres instance.
+///
+/// These files are divided into segments, which are divided into pages
+/// of the same BLCKSZ as used for relation files.
+///
+#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
+pub struct SlruBufferTag {
+    pub blknum: u32, // presumably the page (block) number within the SLRU -- TODO confirm
+}
+
+///
+/// Special type of Postgres files: pg_filenode.map is needed to map
+/// catalog table OIDs to filenode numbers, which define the filename.
+///
+/// Each database has a map file for its local mapped catalogs,
+/// and there is a separate map file for shared catalogs.
+///
+/// These files have an atypical size of 512 bytes.
+///
+/// See PostgreSQL relmapper.c for details.
+///
+#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
+pub struct DatabaseTag {
+    pub spcnode: u32, // tablespace ID
+    pub dbnode: u32,  // database ID
+}
+
+///
+/// Non-relation files that keep state for prepared transactions.
+/// Unlike other files, these are not divided into pages.
+///
+/// See PostgreSQL twophase.c for details.
+///
+#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
+pub struct PrepareTag {
+    pub xid: TransactionId, // presumably the prepared transaction's XID -- TODO confirm
+}
+
+/// ObjectTag is a part of ObjectKey that is specific to the type of
+/// the stored object.
+///
+/// NB: the derived ordering follows declaration order, and that order is
+/// significant! rocksdb_storage.rs assumes that TimelineMetadataTag is first
+/// (NOTE(review): the dummy FirstTag is declared ahead of it -- confirm).
+#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
+pub enum ObjectTag {
+    // Dummy tag preceding all other keys.
+    FirstTag,
+    TimelineMetadataTag,
+    // Special entry that represents a PostgreSQL checkpoint.
+    // We use it to track the fields needed to restore the controlfile checkpoint.
+    Checkpoint,
+    // Various types of non-relation files.
+    // We need them to bootstrap a compute node.
+    ControlFile,
+    Clog(SlruBufferTag),
+    MultiXactMembers(SlruBufferTag),
+    MultiXactOffsets(SlruBufferTag),
+    FileNodeMap(DatabaseTag),
+    TwoPhase(PrepareTag),
+    // Relations go last so that a scan over non-rel objects can stop at the first relation tag.
+    RelationMetadata(RelTag),
+    RelationBuffer(BufferTag),
+}
--- a/pageserver/src/object_repository.rs
+++ b/pageserver/src/object_repository.rs
--- a/pageserver/src/object_store.rs
+++ b/pageserver/src/object_store.rs
@@ -0,0 +1,88 @@
+//! Low-level key-value storage abstraction.
+//!
+use crate::object_key::*;
+use crate::repository::RelTag;
+use crate::ZTimelineId;
+use anyhow::Result;
+use std::collections::HashSet;
+use std::iter::Iterator;
+use zenith_utils::lsn::Lsn;
+
+///
+/// Low-level storage abstraction.
+///
+/// All the data in the repository is stored in a key-value store. This trait
+/// abstracts the details of the key-value store.
+///
+/// A simple key-value store would support just GET and PUT operations with
+/// a key, but the upper layer needs slightly more complicated read operations.
+///
+/// The most frequently used function is 'object_versions'. It is used
+/// to look up a page version. It is LSN aware, in that the caller
+/// specifies an LSN, and the function returns all values for that
+/// block with the same or older LSN.
+///
+pub trait ObjectStore: Send + Sync {
+    ///
+    /// Store a value under the given key and LSN.
+    ///
+    fn put(&self, key: &ObjectKey, lsn: Lsn, value: &[u8]) -> Result<()>;
+
+    /// Read the entry with exactly the given key.
+    ///
+    /// This is used for retrieving metadata with a special key that doesn't
+    /// correspond to any real relation.
+    fn get(&self, key: &ObjectKey, lsn: Lsn) -> Result<Vec<u8>>;
+
+    /// Return the first key greater than or equal to the specified one, if any.
+    fn get_next_key(&self, key: &ObjectKey) -> Result<Option<ObjectKey>>;
+
+    /// Iterate through all page versions of one object.
+    ///
+    /// Returns all page versions in descending LSN order, along with the LSN
+    /// of each page version.
+    fn object_versions<'a>(
+        &'a self,
+        key: &ObjectKey,
+        lsn: Lsn,
+    ) -> Result<Box<dyn Iterator<Item = (Lsn, Vec<u8>)> + 'a>>;
+
+    /// Iterate through versions of all objects in a timeline.
+    ///
+    /// Returns objects in increasing key-version order.
+    /// Returns all versions up to and including the specified LSN.
+    fn objects<'a>(
+        &'a self,
+        timeline: ZTimelineId,
+        lsn: Lsn,
+    ) -> Result<Box<dyn Iterator<Item = Result<(ObjectTag, Lsn, Vec<u8>)>> + 'a>>;
+
+    /// Iterate through all keys with given tablespace and database ID, and LSN <= 'lsn'.
+    /// Both dbnode and spcnode can be InvalidId (0), which means get all relations in tablespace/cluster.
+    ///
+    /// This is used to implement 'create database'.
+    fn list_rels(
+        &self,
+        timelineid: ZTimelineId,
+        spcnode: u32,
+        dbnode: u32,
+        lsn: Lsn,
+    ) -> Result<HashSet<RelTag>>;
+
+    /// Iterate through object tags. If nonrel_only, then only non-relational data is iterated.
+    ///
+    /// This is used to implement GC and to prepare the tarball for new node startup.
+    /// Returns objects in increasing key-version order.
+    fn list_objects<'a>(
+        &'a self,
+        timelineid: ZTimelineId,
+        nonrel_only: bool,
+        lsn: Lsn,
+    ) -> Result<Box<dyn Iterator<Item = ObjectTag> + 'a>>;
+
+    /// Unlink object (used by GC). This method may actually delete the object or just mark it for deletion.
+    fn unlink(&self, key: &ObjectKey, lsn: Lsn) -> Result<()>;
+
+    /// Compact storage and remove versions marked for deletion.
+    fn compact(&self);
+}
--- a/pageserver/src/page_cache.rs
+++ b/pageserver/src/page_cache.rs
@@ -1,920 +1,37 @@
-//
-// Page Cache holds all the different page versions and WAL records
-//
-// The Page Cache is currenusing RocksDB for storing wal records and full page images, keyed by the RelFileNode, blocknumber, and the LSN.
-//
+//! This module acts as a switchboard to access different repositories managed by this
+//! page server. Currently, a Page Server can only manage one repository, so there
+//! isn't much here. If we implement multi-tenancy, this will probably be changed into
+//! a hash map, keyed by the tenant ID.

-use crate::restore_local_repo::restore_timeline;
-use crate::ZTimelineId;
-use crate::{walredo, zenith_repo_dir, PageServerConf};
-use anyhow::{bail, Context};
-use bytes::{Buf, BufMut, Bytes, BytesMut};
-use crossbeam_channel::unbounded;
-use crossbeam_channel::{Receiver, Sender};
+use crate::object_repository::ObjectRepository;
+use crate::repository::Repository;
+use crate::rocksdb_storage::RocksObjectStore;
+//use crate::inmem_storage::InmemObjectStore;
+use crate::walredo::PostgresRedoManager;
+use crate::PageServerConf;
 use lazy_static::lazy_static;
-use log::*;
-use rocksdb;
-use std::cmp::min;
-use std::collections::HashMap;
-use std::sync::atomic::Ordering;
-use std::sync::atomic::{AtomicU64};
-use std::sync::{Arc, Condvar, Mutex};
-use std::thread;
-use std::time::Duration;
-use std::{convert::TryInto, ops::AddAssign};
-use zenith_utils::seqwait::SeqWait;
-
-// Timeout when waiting or WAL receiver to catch up to an LSN given in a GetPage@LSN call.
-static TIMEOUT: Duration = Duration::from_secs(60);
-
-pub struct PageCache {
-    shared: Mutex<PageCacheShared>,
-
-    // RocksDB handle
-    db: rocksdb::DB,
-
-    // Channel for communicating with the WAL redo process here.
-    pub walredo_sender: Sender<Arc<CacheEntry>>,
-    pub walredo_receiver: Receiver<Arc<CacheEntry>>,
-
-    // Allows .await on the arrival of a particular LSN.
-    seqwait_lsn: SeqWait,
-
-    // Counters, for metrics collection.
-    pub num_entries: AtomicU64,
-    pub num_page_images: AtomicU64,
-    pub num_wal_records: AtomicU64,
-    pub num_getpage_requests: AtomicU64,
-
-    // copies of shared.first/last_valid_lsn fields (copied here so
-    // that they can be read without acquiring the mutex).
-    pub first_valid_lsn: AtomicU64,
-    pub last_valid_lsn: AtomicU64,
-    pub last_record_lsn: AtomicU64,
-}
-
-#[derive(Clone)]
-pub struct PageCacheStats {
-    pub num_entries: u64,
-    pub num_page_images: u64,
-    pub num_wal_records: u64,
-    pub num_getpage_requests: u64,
-    pub first_valid_lsn: u64,
-    pub last_valid_lsn: u64,
-    pub last_record_lsn: u64,
-}
-
-impl AddAssign for PageCacheStats {
-    fn add_assign(&mut self, other: Self) {
-        *self = Self {
-            num_entries: self.num_entries + other.num_entries,
-            num_page_images: self.num_page_images + other.num_page_images,
-            num_wal_records: self.num_wal_records + other.num_wal_records,
-            num_getpage_requests: self.num_getpage_requests + other.num_getpage_requests,
-            first_valid_lsn: self.first_valid_lsn + other.first_valid_lsn,
-            last_valid_lsn: self.last_valid_lsn + other.last_valid_lsn,
-            last_record_lsn: self.last_record_lsn + other.last_record_lsn,
-        }
-    }
-}
-
-//
-// Shared data structure, holding page cache and related auxiliary information
-//
-struct PageCacheShared {
-    // What page versions do we hold in the cache? If we get GetPage with
-    // LSN < first_valid_lsn, that's an error because we (no longer) hold that
-    // page version. If we get a request > last_valid_lsn, we need to wait until
-    // we receive all the WAL up to the request.
-    //
-    // last_record_lsn points to the end of last processed WAL record.
-    // It can lag behind last_valid_lsn, if the WAL receiver has received some WAL
-    // after the end of last record, but not the whole next record yet. In the
-    // page cache, we care about last_valid_lsn, but if the WAL receiver needs to
-    // restart the streaming, it needs to restart at the end of last record, so
-    // we track them separately. last_record_lsn should perhaps be in
-    // walreceiver.rs instead of here, but it seems convenient to keep all three
-    // values together.
-    //
-    first_valid_lsn: u64,
-    last_valid_lsn: u64,
-    last_record_lsn: u64,
-}
+use std::sync::{Arc, Mutex};

 lazy_static! {
-    pub static ref PAGECACHES: Mutex<HashMap<ZTimelineId, Arc<PageCache>>> =
-        Mutex::new(HashMap::new());
+    pub static ref REPOSITORY: Mutex<Option<Arc<dyn Repository>>> = Mutex::new(None);
 }

-// Get Page Cache for given timeline. It is assumed to already exist.
-pub fn get_pagecache(_conf: &PageServerConf, timelineid: ZTimelineId) -> Option<Arc<PageCache>> {
-    let pcaches = PAGECACHES.lock().unwrap();
+pub fn init(conf: &'static PageServerConf) {
+    let mut m = REPOSITORY.lock().unwrap();

-    match pcaches.get(&timelineid) {
-        Some(pcache) => Some(pcache.clone()),
-        None => None,
-    }
+    let obj_store = RocksObjectStore::open(conf).unwrap();
+    //let obj_store = InmemObjectStore::open(conf).unwrap();
+
+    // Set up a WAL redo manager, for applying WAL records.
+    let walredo_mgr = PostgresRedoManager::new(conf);
+
+    // We have already changed the current directory to the repository.
+    let repo = ObjectRepository::new(conf, Arc::new(obj_store), Arc::new(walredo_mgr));
+
+    *m = Some(Arc::new(repo));
 }

-pub fn get_or_restore_pagecache(
-    conf: &PageServerConf,
-    timelineid: ZTimelineId,
-) -> anyhow::Result<Arc<PageCache>> {
-    let mut pcaches = PAGECACHES.lock().unwrap();
-
-    match pcaches.get(&timelineid) {
-        Some(pcache) => Ok(pcache.clone()),
-        None => {
-            let pcache = init_page_cache(conf, timelineid);
-
-            restore_timeline(conf, &pcache, timelineid)?;
-
-            let result = Arc::new(pcache);
-
-            pcaches.insert(timelineid, result.clone());
-
-            // Initialize the WAL redo thread
-            //
-            // Now join_handle is not saved any where and we won'try restart tharead
-            // if it is dead. We may later stop that treads after some inactivity period
-            // and restart them on demand.
-            let conf_copy = conf.clone();
-            let _walredo_thread = thread::Builder::new()
-                .name("WAL redo thread".into())
-                .spawn(move || {
-                    walredo::wal_redo_main(&conf_copy, timelineid);
-                })
-                .unwrap();
-            if conf.gc_horizon != 0 {
-                let conf_copy = conf.clone();
-                let _gc_thread = thread::Builder::new()
-                    .name("Garbage collection thread".into())
-                    .spawn(move || {
-                        gc_thread_main(&conf_copy, timelineid);
-                    })
-                    .unwrap();
-            }
-            Ok(result)
-        }
-    }
-}
-
-fn gc_thread_main(conf: &PageServerConf, timelineid: ZTimelineId) {
-    info!("Garbage collection thread started {}", timelineid);
-    let pcache = get_pagecache(conf, timelineid).unwrap();
-    pcache.do_gc(conf).unwrap();
-}
-
-fn open_rocksdb(_conf: &PageServerConf, timelineid: ZTimelineId) -> rocksdb::DB {
-    let path = zenith_repo_dir().join(timelineid.to_string());
-    let mut opts = rocksdb::Options::default();
-    opts.create_if_missing(true);
-    opts.set_use_fsync(true);
-    opts.set_compression_type(rocksdb::DBCompressionType::Lz4);
-    opts.create_missing_column_families(true);
-    rocksdb::DB::open_cf(&opts, &path, &[rocksdb::DEFAULT_COLUMN_FAMILY_NAME]).unwrap()
-}
-
-fn init_page_cache(conf: &PageServerConf, timelineid: ZTimelineId) -> PageCache {
-    // Initialize the channel between the page cache and the WAL applicator
-    let (s, r) = unbounded();
-
-    PageCache {
-        db: open_rocksdb(&conf, timelineid),
-        shared: Mutex::new(PageCacheShared {
-            first_valid_lsn: 0,
-            last_valid_lsn: 0,
-            last_record_lsn: 0,
-        }),
-        seqwait_lsn: SeqWait::new(0),
-
-        walredo_sender: s,
-        walredo_receiver: r,
-
-        num_entries: AtomicU64::new(0),
-        num_page_images: AtomicU64::new(0),
-        num_wal_records: AtomicU64::new(0),
-        num_getpage_requests: AtomicU64::new(0),
-
-        first_valid_lsn: AtomicU64::new(0),
-        last_valid_lsn: AtomicU64::new(0),
-        last_record_lsn: AtomicU64::new(0),
-    }
-}
-
-//
-// We store two kinds of entries in the page cache:
-//
-// 1. Ready-made images of the block
-// 2. WAL records, to be applied on top of the "previous" entry
-//
-// Some WAL records will initialize the page from scratch. For such records,
-// the 'will_init' flag is set. They don't need the previous page image before
-// applying. The 'will_init' flag is set for records containing a full-page image,
-// and for records with the BKPBLOCK_WILL_INIT flag. These differ from PageImages
-// stored directly in the cache entry in that you still need to run the WAL redo
-// routine to generate the page image.
-//
-#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)]
-pub struct CacheKey {
-    pub tag: BufferTag,
-    pub lsn: u64,
-}
-
-impl CacheKey {
-    pub fn pack(&self, buf: &mut BytesMut) {
-        self.tag.pack(buf);
-        buf.put_u64(self.lsn);
-    }
-    pub fn unpack(buf: &mut BytesMut) -> CacheKey {
-        CacheKey {
-            tag: BufferTag::unpack(buf),
-            lsn: buf.get_u64(),
-        }
-    }
-}
-
-pub struct CacheEntry {
-    pub key: CacheKey,
-
-    pub content: Mutex<CacheEntryContent>,
-
-    // Condition variable used by the WAL redo service, to wake up
-    // requester.
-    //
-    // FIXME: this takes quite a lot of space. Consider using parking_lot::Condvar
-    // or something else.
-    pub walredo_condvar: Condvar,
-}
-
-pub struct CacheEntryContent {
-    pub page_image: Option<Bytes>,
-    pub wal_record: Option<WALRecord>,
-    pub apply_pending: bool,
-}
-
-impl CacheEntryContent {
-    pub fn pack(&self, buf: &mut BytesMut) {
-        if let Some(image) = &self.page_image {
-            buf.put_u8(1);
-            buf.put_u16(image.len() as u16);
-            buf.put_slice(&image[..]);
-        } else if let Some(rec) = &self.wal_record {
-            buf.put_u8(0);
-            rec.pack(buf);
-        }
-    }
-    pub fn unpack(buf: &mut BytesMut) -> CacheEntryContent {
-        if buf.get_u8() == 1 {
-            let mut dst = vec![0u8; buf.get_u16() as usize];
-            buf.copy_to_slice(&mut dst);
-            CacheEntryContent {
-                page_image: Some(Bytes::from(dst)),
-                wal_record: None,
-                apply_pending: false,
-            }
-        } else {
-            CacheEntryContent {
-                page_image: None,
-                wal_record: Some(WALRecord::unpack(buf)),
-                apply_pending: false,
-            }
-        }
-    }
-}
-
-impl CacheEntry {
-    fn new(key: CacheKey, content: CacheEntryContent) -> CacheEntry {
-        CacheEntry {
-            key,
-            content: Mutex::new(content),
-            walredo_condvar: Condvar::new(),
-        }
-    }
-}
-
-#[derive(Debug, PartialEq, Eq, PartialOrd, Hash, Ord, Clone, Copy)]
-pub struct RelTag {
-    pub spcnode: u32,
-    pub dbnode: u32,
-    pub relnode: u32,
-    pub forknum: u8,
-}
-
-impl RelTag {
-    pub fn pack(&self, buf: &mut BytesMut) {
-        buf.put_u32(self.spcnode);
-        buf.put_u32(self.dbnode);
-        buf.put_u32(self.relnode);
-        buf.put_u32(self.forknum as u32); // encode forknum as u32 to provide compatibility with wal_redo_postgres
-    }
-    pub fn unpack(buf: &mut BytesMut) -> RelTag {
-        RelTag {
-            spcnode: buf.get_u32(),
-            dbnode: buf.get_u32(),
-            relnode: buf.get_u32(),
-            forknum: buf.get_u32() as u8,
-        }
-    }
-}
-
-#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy)]
-pub struct BufferTag {
-    pub rel: RelTag,
-    pub blknum: u32,
-}
-
-impl BufferTag {
-    pub fn pack(&self, buf: &mut BytesMut) {
-        self.rel.pack(buf);
-        buf.put_u32(self.blknum);
-    }
-    pub fn unpack(buf: &mut BytesMut) -> BufferTag {
-        BufferTag {
-            rel: RelTag::unpack(buf),
-            blknum: buf.get_u32(),
-        }
-    }
-}
-
-#[derive(Clone)]
-pub struct WALRecord {
-    pub lsn: u64, // LSN at the *end* of the record
-    pub will_init: bool,
-    pub truncate: bool,
-    pub rec: Bytes,
-    // Remember the offset of main_data in rec,
-    // so that we don't have to parse the record again.
-    // If record has no main_data, this offset equals rec.len().
-    pub main_data_offset: u32,
-}
-
-impl WALRecord {
-    pub fn pack(&self, buf: &mut BytesMut) {
-        buf.put_u64(self.lsn);
-        buf.put_u8(self.will_init as u8);
-        buf.put_u8(self.truncate as u8);
-        buf.put_u32(self.main_data_offset);
-        buf.put_u32(self.rec.len() as u32);
-        buf.put_slice(&self.rec[..]);
-    }
-    pub fn unpack(buf: &mut BytesMut) -> WALRecord {
-        let lsn = buf.get_u64();
-        let will_init = buf.get_u8() != 0;
-        let truncate = buf.get_u8() != 0;
-        let main_data_offset = buf.get_u32();
-        let mut dst = vec![0u8; buf.get_u32() as usize];
-        buf.copy_to_slice(&mut dst);
-        WALRecord {
-            lsn,
-            will_init,
-            truncate,
-            rec: Bytes::from(dst),
-            main_data_offset,
-        }
-    }
-}
-
-// Public interface functions
-
-impl PageCache {
-    fn do_gc(&self, conf: &PageServerConf) -> anyhow::Result<Bytes> {
-        let mut minbuf = BytesMut::new();
-        let mut maxbuf = BytesMut::new();
-        let cf = self
-            .db
-            .cf_handle(rocksdb::DEFAULT_COLUMN_FAMILY_NAME)
-            .unwrap();
-        loop {
-            thread::sleep(conf.gc_period);
-            let last_lsn = self.get_last_valid_lsn();
-            if last_lsn > conf.gc_horizon {
-                let horizon = last_lsn - conf.gc_horizon;
-                let mut maxkey = CacheKey {
-                    tag: BufferTag {
-                        rel: RelTag {
-                            spcnode: u32::MAX,
-                            dbnode: u32::MAX,
-                            relnode: u32::MAX,
-                            forknum: u8::MAX,
-                        },
-                        blknum: u32::MAX,
-                    },
-                    lsn: u64::MAX,
-                };
-                loop {
-                    maxbuf.clear();
-                    maxkey.pack(&mut maxbuf);
-                    let mut iter = self.db.iterator(rocksdb::IteratorMode::From(
-                        &maxbuf[..],
-                        rocksdb::Direction::Reverse,
-                    ));
-                    if let Some((k, v)) = iter.next() {
-                        minbuf.clear();
-                        minbuf.extend_from_slice(&v);
-                        let content = CacheEntryContent::unpack(&mut minbuf);
-                        minbuf.clear();
-                        minbuf.extend_from_slice(&k);
-                        let key = CacheKey::unpack(&mut minbuf);
-
-                        // Construct boundaries for old records cleanup
-                        maxkey.tag = key.tag;
-                        let last_lsn = key.lsn;
-                        maxkey.lsn = min(horizon, last_lsn); // do not remove last version
-
-                        let mut minkey = maxkey.clone();
-                        minkey.lsn = 0;
-
-                        // reconstruct most recent page version
-                        if content.wal_record.is_some() {
-                            trace!("Reconstruct most recent page {:?}", key);
-                            // force reconstruction of most recent page version
-                            self.reconstruct_page(key, content)?;
-                        }
-
-                        maxbuf.clear();
-                        maxkey.pack(&mut maxbuf);
-
-                        if last_lsn > horizon {
-                            // locate most recent record before horizon
-                            let mut iter = self.db.iterator(rocksdb::IteratorMode::From(
-                                &maxbuf[..],
-                                rocksdb::Direction::Reverse,
-                            ));
-                            if let Some((k, v)) = iter.next() {
-                                minbuf.clear();
-                                minbuf.extend_from_slice(&v);
-                                let content = CacheEntryContent::unpack(&mut minbuf);
-                                if content.wal_record.is_some() {
-                                    minbuf.clear();
-                                    minbuf.extend_from_slice(&k);
-                                    let key = CacheKey::unpack(&mut minbuf);
-                                    trace!("Reconstruct horizon page {:?}", key);
-                                    self.reconstruct_page(key, content)?;
-                                }
-                            }
-                        }
-                        // remove records prior to horizon
-                        minbuf.clear();
-                        minkey.pack(&mut minbuf);
-                        trace!("Delete records in range {:?}..{:?}", minkey, maxkey);
-                        self.db.delete_range_cf(cf, &minbuf[..], &maxbuf[..])?;
-
-                        maxkey = minkey;
-                    } else {
-                        break;
-                    }
-                }
-            }
-        }
-    }
-
-    fn reconstruct_page(&self, key: CacheKey, content: CacheEntryContent) -> anyhow::Result<Bytes> {
-        let entry_rc = Arc::new(CacheEntry::new(key.clone(), content));
-
-        let mut entry_content = entry_rc.content.lock().unwrap();
-        entry_content.apply_pending = true;
-
-        let s = &self.walredo_sender;
-        s.send(entry_rc.clone())?;
-
-        while entry_content.apply_pending {
-            entry_content = entry_rc.walredo_condvar.wait(entry_content).unwrap();
-        }
-        // We should now have a page image. If we don't, it means that WAL redo
-        // failed to reconstruct it. WAL redo should've logged that error already.
-        let page_img = match &entry_content.page_image {
-            Some(p) => p.clone(),
-            None => {
-                error!("could not apply WAL to reconstruct page image for GetPage@LSN request");
-                bail!("could not apply WAL to reconstruct page image");
-            }
-        };
-        self.put_page_image(key.tag, key.lsn, page_img.clone());
-        Ok(page_img)
-    }
-
-    async fn wait_lsn(&self, lsn: u64) -> anyhow::Result<()> {
-        self.seqwait_lsn
-            .wait_for_timeout(lsn, TIMEOUT)
-            .await
-            .with_context(|| {
-                format!(
-                    "Timed out while waiting for WAL record at LSN {:X}/{:X} to arrive",
-                    lsn >> 32,
-                    lsn & 0xffff_ffff
-                )
-            })?;
-
-        Ok(())
-    }
-
-    //
-    // GetPage@LSN
-    //
-    // Returns an 8k page image
-    //
-    pub async fn get_page_at_lsn(&self, tag: BufferTag, req_lsn: u64) -> anyhow::Result<Bytes> {
-        self.num_getpage_requests.fetch_add(1, Ordering::Relaxed);
-
-        let mut lsn = req_lsn;
-        //When invalid LSN is requested, it means "don't wait, return latest version of the page"
-        //This is necessary for bootstrap.
-        if lsn == 0
-        {
-            lsn = self.last_valid_lsn.load(Ordering::Acquire);
-            trace!(
-                "walreceiver doesn't work yet last_valid_lsn {}, requested {}",
-                self.last_valid_lsn.load(Ordering::Acquire),
-                lsn
-            );
-        }
-        else
-        {
-            self.wait_lsn(lsn).await?;
-        }
-
-        // Look up cache entry. If it's a page image, return that. If it's a WAL record,
-        // ask the WAL redo service to reconstruct the page image from the WAL records.
-        let minkey = CacheKey { tag, lsn: 0 };
-        let maxkey = CacheKey { tag, lsn };
-
-        let mut buf = BytesMut::new();
-        minkey.pack(&mut buf);
-
-        let mut readopts = rocksdb::ReadOptions::default();
-        readopts.set_iterate_lower_bound(buf.to_vec());
-
-        buf.clear();
-        maxkey.pack(&mut buf);
-        let mut iter = self.db.iterator_opt(
-            rocksdb::IteratorMode::From(&buf[..], rocksdb::Direction::Reverse),
-            readopts,
-        );
-        let entry_opt = iter.next();
-
-        if entry_opt.is_none() {
-            static ZERO_PAGE: [u8; 8192] = [0u8; 8192];
-            return Ok(Bytes::from_static(&ZERO_PAGE));
-            /* return Err("could not find page image")?; */
-        }
-        let (k, v) = entry_opt.unwrap();
-        buf.clear();
-        buf.extend_from_slice(&v);
-        let content = CacheEntryContent::unpack(&mut buf);
-        let page_img: Bytes;
-        if let Some(img) = &content.page_image {
-            page_img = img.clone();
-        } else if content.wal_record.is_some() {
-            buf.clear();
-            buf.extend_from_slice(&k);
-            let key = CacheKey::unpack(&mut buf);
-            page_img = self.reconstruct_page(key, content)?;
-        } else {
-            // No base image, and no WAL record. Huh?
-            bail!("no page image or WAL record for requested page");
-        }
-
-        // FIXME: assumes little-endian. Only used for the debugging log though
-        let page_lsn_hi = u32::from_le_bytes(page_img.get(0..4).unwrap().try_into().unwrap());
-        let page_lsn_lo = u32::from_le_bytes(page_img.get(4..8).unwrap().try_into().unwrap());
-        trace!(
-            "Returning page with LSN {:X}/{:X} for {}/{}/{}.{} blk {}",
-            page_lsn_hi,
-            page_lsn_lo,
-            tag.rel.spcnode,
-            tag.rel.dbnode,
-            tag.rel.relnode,
-            tag.rel.forknum,
-            tag.blknum
-        );
-
-        Ok(page_img)
-    }
-
-    //
-    // Collect all the WAL records that are needed to reconstruct a page
-    // image for the given cache entry.
-    //
-    // Returns an old page image (if any), and a vector of WAL records to apply
-    // over it.
-    //
-    pub fn collect_records_for_apply(&self, entry: &CacheEntry) -> (Option<Bytes>, Vec<WALRecord>) {
-        let minkey = CacheKey {
-            tag: BufferTag {
-                rel: entry.key.tag.rel,
-                blknum: 0,
-            },
-            lsn: 0,
-        };
-
-        let mut buf = BytesMut::new();
-        minkey.pack(&mut buf);
-
-        let mut readopts = rocksdb::ReadOptions::default();
-        readopts.set_iterate_lower_bound(buf.to_vec());
-
-        buf.clear();
-        entry.key.pack(&mut buf);
-        let iter = self.db.iterator_opt(
-            rocksdb::IteratorMode::From(&buf[..], rocksdb::Direction::Reverse),
-            readopts,
-        );
-
-        let mut base_img: Option<Bytes> = None;
-        let mut records: Vec<WALRecord> = Vec::new();
-
-        // Scan backwards, collecting the WAL records, until we hit an
-        // old page image.
-        for (_k, v) in iter {
-            buf.clear();
-            buf.extend_from_slice(&v);
-            let content = CacheEntryContent::unpack(&mut buf);
-            if let Some(img) = &content.page_image {
-                // We have a base image. No need to dig deeper into the list of
-                // records
-                base_img = Some(img.clone());
-                break;
-            } else if let Some(rec) = &content.wal_record {
-                records.push(rec.clone());
-
-                // If this WAL record initializes the page, no need to dig deeper.
-                if rec.will_init {
-                    break;
-                }
-            } else {
-                panic!("no base image and no WAL record on cache entry");
-            }
-        }
-
-        records.reverse();
-        (base_img, records)
-    }
-
-    //
-    // Adds a WAL record to the page cache
-    //
-    pub fn put_wal_record(&self, tag: BufferTag, rec: WALRecord) {
-        let lsn = rec.lsn;
-        let key = CacheKey { tag, lsn };
-
-        let content = CacheEntryContent {
-            page_image: None,
-            wal_record: Some(rec),
-            apply_pending: false,
-        };
-
-        let mut key_buf = BytesMut::new();
-        key.pack(&mut key_buf);
-        let mut val_buf = BytesMut::new();
-        content.pack(&mut val_buf);
-
-        let _res = self.db.put(&key_buf[..], &val_buf[..]);
-        //trace!("put_wal_record lsn: {}", lsn);
-
-        self.num_entries.fetch_add(1, Ordering::Relaxed);
-        self.num_wal_records.fetch_add(1, Ordering::Relaxed);
-    }
-
-    //
-    // Adds a relation-wide WAL record (like truncate) to the page cache,
-    // associating it with all pages started with specified block number
-    //
-    pub async fn put_rel_wal_record(&self, tag: BufferTag, rec: WALRecord) -> anyhow::Result<()> {
-        let mut key = CacheKey { tag, lsn: rec.lsn };
-        let old_rel_size = self.relsize_get(&tag.rel, u64::MAX).await?;
-        let content = CacheEntryContent {
-            page_image: None,
-            wal_record: Some(rec),
-            apply_pending: false,
-        };
-        // set new relation size
-        trace!("Truncate relation {:?}", tag);
-        let mut key_buf = BytesMut::new();
-        let mut val_buf = BytesMut::new();
-        content.pack(&mut val_buf);
-
-        for blknum in tag.blknum..old_rel_size {
-            key_buf.clear();
-            key.tag.blknum = blknum;
-            key.pack(&mut key_buf);
-            trace!("put_wal_record lsn: {}", key.lsn);
-            let _res = self.db.put(&key_buf[..], &val_buf[..]);
-        }
-        let n = (old_rel_size - tag.blknum) as u64;
-        self.num_entries.fetch_add(n, Ordering::Relaxed);
-        self.num_wal_records.fetch_add(n, Ordering::Relaxed);
-        Ok(())
-    }
-
-    //
-    // Memorize a full image of a page version
-    //
-    pub fn put_page_image(&self, tag: BufferTag, lsn: u64, img: Bytes) {
-        let key = CacheKey { tag, lsn };
-        let content = CacheEntryContent {
-            page_image: Some(img),
-            wal_record: None,
-            apply_pending: false,
-        };
-
-        let mut key_buf = BytesMut::new();
-        key.pack(&mut key_buf);
-        let mut val_buf = BytesMut::new();
-        content.pack(&mut val_buf);
-
-        trace!("put_wal_record lsn: {}", key.lsn);
-        let _res = self.db.put(&key_buf[..], &val_buf[..]);
-
-        //debug!("inserted page image for {}/{}/{}_{} blk {} at {}",
-        //        tag.spcnode, tag.dbnode, tag.relnode, tag.forknum, tag.blknum, lsn);
-        self.num_page_images.fetch_add(1, Ordering::Relaxed);
-    }
-
-    //
-    pub fn advance_last_valid_lsn(&self, lsn: u64) {
-        let mut shared = self.shared.lock().unwrap();
-
-        // Can't move backwards.
-        let oldlsn = shared.last_valid_lsn;
-        if lsn >= oldlsn {
-
-            shared.last_valid_lsn = lsn;
-            self.seqwait_lsn.advance(lsn);
-
-            self.last_valid_lsn.store(lsn, Ordering::Relaxed);
-        } else {
-            warn!(
-                "attempted to move last valid LSN backwards (was {:X}/{:X}, new {:X}/{:X})",
-                oldlsn >> 32,
-                oldlsn & 0xffffffff,
-                lsn >> 32,
-                lsn & 0xffffffff
-            );
-        }
-    }
-
-    //
-    // NOTE: this updates last_valid_lsn as well.
-    //
-    pub fn advance_last_record_lsn(&self, lsn: u64) {
-        let mut shared = self.shared.lock().unwrap();
-
-        // Can't move backwards.
-        assert!(lsn >= shared.last_valid_lsn);
-        assert!(lsn >= shared.last_record_lsn);
-
-        shared.last_valid_lsn = lsn;
-        shared.last_record_lsn = lsn;
-        self.seqwait_lsn.advance(lsn);
-
-        self.last_valid_lsn.store(lsn, Ordering::Relaxed);
-        self.last_record_lsn.store(lsn, Ordering::Relaxed);
-    }
-
-    //
-    pub fn _advance_first_valid_lsn(&self, lsn: u64) {
-        let mut shared = self.shared.lock().unwrap();
-
-        // Can't move backwards.
-        assert!(lsn >= shared.first_valid_lsn);
-
-        // Can't overtake last_valid_lsn (except when we're
-        // initializing the system and last_valid_lsn hasn't been set yet.
-        assert!(shared.last_valid_lsn == 0 || lsn < shared.last_valid_lsn);
-
-        shared.first_valid_lsn = lsn;
-        self.first_valid_lsn.store(lsn, Ordering::Relaxed);
-    }
-
-    pub fn init_valid_lsn(&self, lsn: u64) {
-        let mut shared = self.shared.lock().unwrap();
-
-        assert!(shared.first_valid_lsn == 0);
-        assert!(shared.last_valid_lsn == 0);
-        assert!(shared.last_record_lsn == 0);
-
-        shared.first_valid_lsn = lsn;
-        shared.last_valid_lsn = lsn;
-        shared.last_record_lsn = lsn;
-
-        self.first_valid_lsn.store(lsn, Ordering::Relaxed);
-        self.last_valid_lsn.store(lsn, Ordering::Relaxed);
-        self.last_record_lsn.store(lsn, Ordering::Relaxed);
-    }
-
-    pub fn get_last_valid_lsn(&self) -> u64 {
-        let shared = self.shared.lock().unwrap();
-
-        shared.last_record_lsn
-    }
-
-    pub async fn relsize_get(&self, rel: &RelTag, lsn: u64) -> anyhow::Result<u32> {
-        if lsn != u64::MAX {
-            self.wait_lsn(lsn).await?;
-        }
-
-        let mut key = CacheKey {
-            tag: BufferTag {
-                rel: *rel,
-                blknum: u32::MAX,
-            },
-            lsn,
-        };
-        let mut buf = BytesMut::new();
-
-        loop {
-            buf.clear();
-            key.pack(&mut buf);
-            let mut iter = self.db.iterator(rocksdb::IteratorMode::From(
-                &buf[..],
-                rocksdb::Direction::Reverse,
-            ));
-            if let Some((k, v)) = iter.next() {
-                buf.clear();
-                buf.extend_from_slice(&k);
-                let tag = BufferTag::unpack(&mut buf);
-                if tag.rel == *rel {
-                    buf.clear();
-                    buf.extend_from_slice(&v);
-                    let content = CacheEntryContent::unpack(&mut buf);
-                    if let Some(rec) = &content.wal_record {
-                        if rec.truncate {
-                            if tag.blknum > 0 {
-                                key.tag.blknum = tag.blknum - 1;
-                                continue;
-                            }
-                            break;
-                        }
-                    }
-                    let relsize = tag.blknum + 1;
-                    trace!("Size of relation {:?} at {} is {}", rel, lsn, relsize);
-                    return Ok(relsize);
-                }
-            }
-            break;
-        }
-        trace!("Size of relation {:?} at {} is zero", rel, lsn);
-        Ok(0)
-    }
-
-    pub async fn relsize_exist(&self, rel: &RelTag, lsn: u64) -> anyhow::Result<bool> {
-        self.wait_lsn(lsn).await?;
-
-        let key = CacheKey {
-            tag: BufferTag {
-                rel: *rel,
-                blknum: u32::MAX,
-            },
-            lsn,
-        };
-        let mut buf = BytesMut::new();
-        key.pack(&mut buf);
-        let mut iter = self.db.iterator(rocksdb::IteratorMode::From(
-            &buf[..],
-            rocksdb::Direction::Reverse,
-        ));
-        if let Some((k, _v)) = iter.next() {
-            buf.clear();
-            buf.extend_from_slice(&k);
-            let tag = BufferTag::unpack(&mut buf);
-            if tag.rel == *rel {
-                trace!("Relation {:?} exists at {}", rel, lsn);
-                return Ok(true);
-            }
-        }
-        trace!("Relation {:?} doesn't exist at {}", rel, lsn);
-        Ok(false)
-    }
-
-    pub fn get_stats(&self) -> PageCacheStats {
-        PageCacheStats {
-            num_entries: self.num_entries.load(Ordering::Relaxed),
-            num_page_images: self.num_page_images.load(Ordering::Relaxed),
-            num_wal_records: self.num_wal_records.load(Ordering::Relaxed),
-            num_getpage_requests: self.num_getpage_requests.load(Ordering::Relaxed),
-            first_valid_lsn: self.first_valid_lsn.load(Ordering::Relaxed),
-            last_valid_lsn: self.last_valid_lsn.load(Ordering::Relaxed),
-            last_record_lsn: self.last_record_lsn.load(Ordering::Relaxed),
-        }
-    }
-}
-
-pub fn get_stats() -> PageCacheStats {
-    let pcaches = PAGECACHES.lock().unwrap();
-
-    let mut stats = PageCacheStats {
-        num_entries: 0,
-        num_page_images: 0,
-        num_wal_records: 0,
-        num_getpage_requests: 0,
-        first_valid_lsn: 0,
-        last_valid_lsn: 0,
-        last_record_lsn: 0,
-    };
-
-    pcaches.iter().for_each(|(_sys_id, pcache)| {
-        stats += pcache.get_stats();
-    });
-    stats
+pub fn get_repository() -> Arc<dyn Repository> {
+    let o = &REPOSITORY.lock().unwrap();
+    Arc::clone(o.as_ref().unwrap())
 }
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
--- a/pageserver/src/pg_constants.rs
+++ b/pageserver/src/pg_constants.rs
@@ -1,65 +0,0 @@
-// From pg_tablespace_d.h
-//
-pub const DEFAULTTABLESPACE_OID: u32 = 1663;
-pub const GLOBALTABLESPACE_OID: u32 = 1664;
-//Special values for non-rel files' tags
-//TODO maybe use enum?
-pub const PG_CONTROLFILE_FORKNUM: u32 = 42;
-pub const PG_FILENODEMAP_FORKNUM: u32 = 43;
-pub const PG_XACT_FORKNUM: u32 = 44;
-pub const PG_MXACT_OFFSETS_FORKNUM: u32 = 45;
-pub const PG_MXACT_MEMBERS_FORKNUM: u32 = 46;
-
-//
-// constants from clog.h
-//
-pub const CLOG_XACTS_PER_BYTE: u32 = 4;
-pub const CLOG_XACTS_PER_PAGE: u32 = 8192 * CLOG_XACTS_PER_BYTE;
-pub const CLOG_BITS_PER_XACT: u8 = 2;
-pub const CLOG_XACT_BITMASK: u8 = (1 << CLOG_BITS_PER_XACT) - 1;
-
-pub const TRANSACTION_STATUS_COMMITTED: u8 = 0x01;
-pub const TRANSACTION_STATUS_ABORTED: u8 = 0x02;
-pub const TRANSACTION_STATUS_SUB_COMMITTED: u8 = 0x03;
-
-pub const CLOG_ZEROPAGE: u8 = 0x00;
-pub const CLOG_TRUNCATE: u8 = 0x10;
-
-// From xact.h
-pub const XLOG_XACT_COMMIT: u8 = 0x00;
-pub const XLOG_XACT_ABORT: u8 = 0x20;
-
-/* mask for filtering opcodes out of xl_info */
-pub const XLOG_XACT_OPMASK: u8 = 0x70;
-/* does this record have a 'xinfo' field or not */
-pub const XLOG_XACT_HAS_INFO: u8 = 0x80;
-
-/*
- * The following flags, stored in xinfo, determine which information is
- * contained in commit/abort records.
- */
-pub const XACT_XINFO_HAS_DBINFO: u32 = 1;
-pub const XACT_XINFO_HAS_SUBXACTS: u32 = 2;
-pub const XACT_XINFO_HAS_RELFILENODES: u32 = 4;
-
-// From pg_control.h and rmgrlist.h
-pub const XLOG_SWITCH: u8 = 0x40;
-pub const XLOG_SMGR_TRUNCATE: u8 = 0x20;
-pub const RM_XLOG_ID: u8 = 0;
-pub const RM_XACT_ID: u8 = 1;
-pub const RM_SMGR_ID: u8 = 2;
-pub const RM_CLOG_ID: u8 = 3;
-pub const RM_DBASE_ID: u8 = 4;
-pub const RM_TBLSPC_ID: u8 = 5;
-// pub const RM_MULTIXACT_ID:u8 = 6;
-
-// from xlogreader.h
-pub const XLR_INFO_MASK: u8 = 0x0F;
-pub const XLR_RMGR_INFO_MASK: u8 = 0xF0;
-
-// from dbcommands_xlog.h
-pub const XLOG_DBASE_CREATE: u8 = 0x00;
-pub const XLOG_DBASE_DROP: u8 = 0x10;
-
-pub const XLOG_TBLSPC_CREATE: u8 = 0x00;
-pub const XLOG_TBLSPC_DROP: u8 = 0x10;
--- a/pageserver/src/repository.rs
+++ b/pageserver/src/repository.rs
@@ -0,0 +1,643 @@
+use crate::object_key::*;
+use crate::waldecoder::TransactionId;
+use crate::ZTimelineId;
+use anyhow::Result;
+use bytes::{Buf, BufMut, Bytes, BytesMut};
+use postgres_ffi::nonrelfile_utils::transaction_id_get_status;
+use postgres_ffi::pg_constants;
+use postgres_ffi::relfile_utils::forknumber_to_name;
+use serde::{Deserialize, Serialize};
+use std::collections::HashSet;
+use std::fmt;
+use std::iter::Iterator;
+use std::sync::Arc;
+use std::time::Duration;
+use zenith_utils::lsn::Lsn;
+
+///
+/// A repository corresponds to one .zenith directory. One repository holds multiple
+/// timelines, forked off from the same initial call to 'initdb'.
+pub trait Repository: Send + Sync {
+    /// Get Timeline handle for given zenith timeline ID.
+    fn get_timeline(&self, timelineid: ZTimelineId) -> Result<Arc<dyn Timeline>>;
+
+    /// Create a new, empty timeline. The caller is responsible for loading data into it
+    fn create_empty_timeline(
+        &self,
+        timelineid: ZTimelineId,
+        start_lsn: Lsn,
+    ) -> Result<Arc<dyn Timeline>>;
+
+    /// Branch a timeline
+    fn branch_timeline(&self, src: ZTimelineId, dst: ZTimelineId, start_lsn: Lsn) -> Result<()>;
+
+    //fn get_stats(&self) -> RepositoryStats;
+}
+
+///
+/// Result of performing GC
+///
+#[derive(Default)]
+pub struct GcResult {
+    pub n_relations: u64,
+    pub inspected: u64,
+    pub truncated: u64,
+    pub deleted: u64,
+    pub prep_deleted: u64, // 2PC prepare
+    pub slru_deleted: u64, // SLRU (clog, multixact)
+    pub chkp_deleted: u64, // Checkpoints
+    pub dropped: u64,
+    pub elapsed: Duration,
+}
+
+pub trait Timeline: Send + Sync {
+    //------------------------------------------------------------------------------
+    // Public GET functions
+    //------------------------------------------------------------------------------
+
+    /// Look up given page in the cache.
+    fn get_page_at_lsn(&self, tag: ObjectTag, lsn: Lsn) -> Result<Bytes>;
+
+    /// Look up given page in the cache.
+    fn get_page_at_lsn_nowait(&self, tag: ObjectTag, lsn: Lsn, materialize: bool) -> Result<Bytes>;
+
+    /// Get size of relation
+    fn get_rel_size(&self, tag: RelTag, lsn: Lsn) -> Result<u32>;
+
+    /// Does relation exist?
+    fn get_rel_exists(&self, tag: RelTag, lsn: Lsn) -> Result<bool>;
+
+    /// Get a list of all distinct relations in given tablespace and database.
+    fn list_rels(&self, spcnode: u32, dbnode: u32, lsn: Lsn) -> Result<HashSet<RelTag>>;
+
+    /// Get a list of non-relational objects
+    fn list_nonrels<'a>(&'a self, lsn: Lsn) -> Result<Box<dyn Iterator<Item = ObjectTag> + 'a>>;
+
+    //------------------------------------------------------------------------------
+    // Public PUT functions, to update the repository with new page versions.
+    //
+    // These are called by the WAL receiver to digest WAL records.
+    //------------------------------------------------------------------------------
+
+    /// Put a new page version that can be constructed from a WAL record
+    ///
+    /// This will implicitly extend the relation, if the page is beyond the
+    /// current end-of-file.
+    fn put_wal_record(&self, tag: ObjectTag, rec: WALRecord) -> Result<()>;
+
+    /// Put raw data
+    fn put_raw_data(&self, tag: ObjectTag, lsn: Lsn, data: &[u8]) -> Result<()>;
+
+    /// Like put_wal_record, but with ready-made image of the page.
+    fn put_page_image(&self, tag: ObjectTag, lsn: Lsn, img: Bytes, update_meta: bool)
+        -> Result<()>;
+
+    /// Truncate relation
+    fn put_truncation(&self, rel: RelTag, lsn: Lsn, nblocks: u32) -> Result<()>;
+
+    /// Unlink relation. This method is used for marking dropped relations.
+    fn put_unlink(&self, tag: RelTag, lsn: Lsn) -> Result<()>;
+
+    /// Truncate SLRU segment
+    fn put_slru_truncate(&self, tag: ObjectTag, lsn: Lsn) -> Result<()>;
+
+    // Get an object tag that is greater than or equal to the specified one
+    fn get_next_tag(&self, tag: ObjectTag) -> Result<Option<ObjectTag>>;
+
+    /// Remember that all WAL before the given LSN has been processed.
+    ///
+    /// The WAL receiver calls this after the put_* functions, to indicate that
+    /// all WAL before this point has been digested. Before that, if you call
+    /// GET on an earlier LSN, it will block.
+    fn advance_last_valid_lsn(&self, lsn: Lsn);
+    fn get_last_valid_lsn(&self) -> Lsn;
+    fn init_valid_lsn(&self, lsn: Lsn);
+
+    /// Like `advance_last_valid_lsn`, but this always points to the end of
+    /// a WAL record, not in the middle of one.
+    ///
+    /// This must be <= last valid LSN. This is tracked separately from last
+    /// valid LSN, so that the WAL receiver knows where to restart streaming.
+    fn advance_last_record_lsn(&self, lsn: Lsn);
+    fn get_last_record_lsn(&self) -> Lsn;
+
+    // Like `get_last_record_lsn`, but points to the start position of the last record
+    fn get_prev_record_lsn(&self) -> Lsn;
+
+    ///
+    /// Flush to disk all data that was written with the put_* functions
+    ///
+    /// NOTE: This has nothing to do with checkpoint in PostgreSQL. We don't
+    /// know anything about them here in the repository.
+    fn checkpoint(&self) -> Result<()>;
+
+    /// Events for all relations in the timeline.
+    /// Contains updates from start up to the last valid LSN
+    /// at time of history() call. This lsn can be read via the lsn() function.
+    ///
+    /// Relation size is increased implicitly and decreased with Truncate updates.
+    // TODO ordering guarantee?
+    fn history<'a>(&'a self) -> Result<Box<dyn History + 'a>>;
+
+    /// Perform one garbage collection iteration.
+    /// Garbage collection is periodically performed by GC thread,
+    /// but it can be explicitly requested through page server API.
+    ///
+    /// `horizon` specifies delta from last LSN to preserve all object versions (PITR interval).
+    /// `compact` parameter is used to force compaction of storage.
+    /// Some storage implementations are based on an LSM tree and require periodic merge (compaction).
+    /// Usually the storage implementation determines itself when compaction should be performed.
+    /// But for GC tests it may be useful to force compaction just after completion of a GC iteration
+    /// to make sure that all detected garbage is removed.
+    /// So right now `compact` is set to true when GC is explicitly requested through the page server API,
+    /// and is set to false in GC threads, which repeat GC iterations in an infinite loop.
+    fn gc_iteration(&self, horizon: u64, compact: bool) -> Result<GcResult>;
+
+    // Check transaction status
+    fn get_tx_status(&self, xid: TransactionId, lsn: Lsn) -> anyhow::Result<u8> {
+        let blknum = xid / pg_constants::CLOG_XACTS_PER_PAGE;
+        let tag = ObjectTag::Clog(SlruBufferTag { blknum });
+        let clog_page = self.get_page_at_lsn(tag, lsn)?;
+        let status = transaction_id_get_status(xid, &clog_page[..]);
+        Ok(status)
+    }
+}
+
+pub trait History: Iterator<Item = Result<RelationUpdate>> {
+    /// The last_valid_lsn at the time of history() call.
+    fn lsn(&self) -> Lsn;
+}
+
+#[derive(Debug, PartialEq, Eq, Serialize, Deserialize)]
+pub struct RelationUpdate {
+    pub rel: RelTag,
+    pub lsn: Lsn,
+    pub update: Update,
+}
+
+#[derive(Debug, PartialEq, Eq, Serialize, Deserialize)]
+pub enum Update {
+    Page { blknum: u32, img: Bytes },
+    WALRecord { blknum: u32, rec: WALRecord },
+    Truncate { n_blocks: u32 },
+    Unlink,
+}
+
+#[derive(Clone)]
+pub struct RepositoryStats {
+    pub num_entries: Lsn,
+    pub num_page_images: Lsn,
+    pub num_wal_records: Lsn,
+    pub num_getpage_requests: Lsn,
+}
+
+///
+/// Relation data file segment id throughout the Postgres cluster.
+///
+/// Every data file in Postgres is uniquely identified by 4 numbers:
+/// - relation id / node (`relnode`)
+/// - database id (`dbnode`)
+/// - tablespace id (`spcnode`), in short this is a unique id of a separate
+///   directory to store data files.
+/// - forknumber (`forknum`) is used to split different kinds of data of the same relation
+///   between some set of files (`relnode`, `relnode_fsm`, `relnode_vm`).
+///
+/// In native Postgres code `RelFileNode` structure and individual `ForkNumber` value
+/// are used for the same purpose.
+/// [See more related comments here](https://github.com/postgres/postgres/blob/99c5852e20a0987eca1c38ba0c09329d4076b6a0/src/include/storage/relfilenode.h#L57).
+///
+#[derive(Debug, PartialEq, Eq, PartialOrd, Hash, Ord, Clone, Copy, Serialize, Deserialize)]
+pub struct RelTag {
+    pub forknum: u8,
+    pub spcnode: u32,
+    pub dbnode: u32,
+    pub relnode: u32,
+}
+
+impl RelTag {
+    pub const ZEROED: Self = Self {
+        forknum: 0,
+        spcnode: 0,
+        dbnode: 0,
+        relnode: 0,
+    };
+}
+
+/// Display RelTag in the same format that's used in most PostgreSQL debug messages:
+///
+/// <spcnode>/<dbnode>/<relnode>[_fsm|_vm|_init]
+///
+impl fmt::Display for RelTag {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        if let Some(forkname) = forknumber_to_name(self.forknum) {
+            write!(
+                f,
+                "{}/{}/{}_{}",
+                self.spcnode, self.dbnode, self.relnode, forkname
+            )
+        } else {
+            write!(f, "{}/{}/{}", self.spcnode, self.dbnode, self.relnode)
+        }
+    }
+}
+
+///
+/// `RelTag` + block number (`blknum`) gives us a unique id of the page in the cluster.
+/// This is used as a part of the key inside key-value storage (RocksDB currently).
+///
+/// In Postgres `BufferTag` structure is used for exactly the same purpose.
+/// [See more related comments here](https://github.com/postgres/postgres/blob/99c5852e20a0987eca1c38ba0c09329d4076b6a0/src/include/storage/buf_internals.h#L91).
+///
+#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Serialize, Deserialize)]
+pub struct BufferTag {
+    pub rel: RelTag,
+    pub blknum: u32,
+}
+
+impl BufferTag {
+    pub const ZEROED: Self = Self {
+        rel: RelTag::ZEROED,
+        blknum: 0,
+    };
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+pub struct WALRecord {
+    pub lsn: Lsn, // LSN at the *end* of the record
+    pub will_init: bool,
+    pub rec: Bytes,
+    // Remember the offset of main_data in rec,
+    // so that we don't have to parse the record again.
+    // If record has no main_data, this offset equals rec.len().
+    pub main_data_offset: u32,
+}
+
+impl WALRecord {
+    pub fn pack(&self, buf: &mut BytesMut) {
+        buf.put_u64(self.lsn.0);
+        buf.put_u8(self.will_init as u8);
+        buf.put_u32(self.main_data_offset);
+        buf.put_u32(self.rec.len() as u32);
+        buf.put_slice(&self.rec[..]);
+    }
+    pub fn unpack(buf: &mut Bytes) -> WALRecord {
+        let lsn = Lsn::from(buf.get_u64());
+        let will_init = buf.get_u8() != 0;
+        let main_data_offset = buf.get_u32();
+        let mut dst = vec![0u8; buf.get_u32() as usize];
+        buf.copy_to_slice(&mut dst);
+        WALRecord {
+            lsn,
+            will_init,
+            rec: Bytes::from(dst),
+            main_data_offset,
+        }
+    }
+}
+
+///
+/// Tests that should work the same with any Repository/Timeline implementation.
+///
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::object_repository::ObjectRepository;
+    use crate::rocksdb_storage::RocksObjectStore;
+    use crate::walredo::{WalRedoError, WalRedoManager};
+    use crate::PageServerConf;
+    use postgres_ffi::pg_constants;
+    use std::fs;
+    use std::path::PathBuf;
+    use std::str::FromStr;
+    use std::time::Duration;
+
+    /// Arbitrary relation tag, for testing.
+    const TESTREL_A: RelTag = RelTag {
+        spcnode: 0,
+        dbnode: 111,
+        relnode: 1000,
+        forknum: 0,
+    };
+    const TESTREL_B: RelTag = RelTag {
+        spcnode: 0,
+        dbnode: 111,
+        relnode: 1001,
+        forknum: 0,
+    };
+
+    /// Convenience function to create a BufferTag for testing.
+    /// Helps to keep the tests shorter.
+    #[allow(non_snake_case)]
+    fn TEST_BUF(blknum: u32) -> ObjectTag {
+        ObjectTag::RelationBuffer(BufferTag {
+            rel: TESTREL_A,
+            blknum,
+        })
+    }
+
+    /// Convenience function to create a page image with given string as the only content
+    #[allow(non_snake_case)]
+    fn TEST_IMG(s: &str) -> Bytes {
+        let mut buf = BytesMut::new();
+        buf.extend_from_slice(s.as_bytes());
+        buf.resize(8192, 0);
+
+        buf.freeze()
+    }
+
+    fn get_test_repo(test_name: &str) -> Result<Box<dyn Repository>> {
+        let repo_dir = PathBuf::from(format!("../tmp_check/test_{}", test_name));
+        let _ = fs::remove_dir_all(&repo_dir);
+        fs::create_dir_all(&repo_dir)?;
+
+        let conf = PageServerConf {
+            daemonize: false,
+            interactive: false,
+            materialize: false,
+            gc_horizon: 64 * 1024 * 1024,
+            gc_period: Duration::from_secs(10),
+            wal_redoers: 1,
+            listen_addr: "127.0.0.1:5430".to_string(),
+            workdir: repo_dir,
+            pg_distrib_dir: "".into(),
+        };
+        // Make a static copy of the config. This can never be freed, but that's
+        // OK in a test.
+        let conf: &'static PageServerConf = Box::leak(Box::new(conf));
+
+        let obj_store = RocksObjectStore::create(conf)?;
+
+        let walredo_mgr = TestRedoManager {};
+
+        let repo = ObjectRepository::new(conf, Arc::new(obj_store), Arc::new(walredo_mgr));
+
+        Ok(Box::new(repo))
+    }
+
+    /// Test get_relsize() and truncation.
+    #[test]
+    fn test_relsize() -> Result<()> {
+        // get_timeline() with non-existent timeline id should fail
+        //repo.get_timeline("11223344556677881122334455667788");
+
+        // Create timeline to work on
+        let repo = get_test_repo("test_relsize")?;
+        let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
+        let tline = repo.create_empty_timeline(timelineid, Lsn(0))?;
+
+        tline.init_valid_lsn(Lsn(1));
+        tline.put_page_image(TEST_BUF(0), Lsn(2), TEST_IMG("foo blk 0 at 2"), true)?;
+        tline.put_page_image(TEST_BUF(0), Lsn(2), TEST_IMG("foo blk 0 at 2"), true)?;
+        tline.put_page_image(TEST_BUF(0), Lsn(3), TEST_IMG("foo blk 0 at 3"), true)?;
+        tline.put_page_image(TEST_BUF(1), Lsn(4), TEST_IMG("foo blk 1 at 4"), true)?;
+        tline.put_page_image(TEST_BUF(2), Lsn(5), TEST_IMG("foo blk 2 at 5"), true)?;
+
+        tline.advance_last_valid_lsn(Lsn(5));
+
+        // The relation was created at LSN 2, not visible at LSN 1 yet.
+        assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(1))?, false);
+        assert!(tline.get_rel_size(TESTREL_A, Lsn(1)).is_err());
+
+        assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(2))?, true);
+        assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(2))?, 1);
+        assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(5))?, 3);
+
+        // Check page contents at each LSN
+        assert_eq!(
+            tline.get_page_at_lsn(TEST_BUF(0), Lsn(2))?,
+            TEST_IMG("foo blk 0 at 2")
+        );
+
+        assert_eq!(
+            tline.get_page_at_lsn(TEST_BUF(0), Lsn(3))?,
+            TEST_IMG("foo blk 0 at 3")
+        );
+
+        assert_eq!(
+            tline.get_page_at_lsn(TEST_BUF(0), Lsn(4))?,
+            TEST_IMG("foo blk 0 at 3")
+        );
+        assert_eq!(
+            tline.get_page_at_lsn(TEST_BUF(1), Lsn(4))?,
+            TEST_IMG("foo blk 1 at 4")
+        );
+
+        assert_eq!(
+            tline.get_page_at_lsn(TEST_BUF(0), Lsn(5))?,
+            TEST_IMG("foo blk 0 at 3")
+        );
+        assert_eq!(
+            tline.get_page_at_lsn(TEST_BUF(1), Lsn(5))?,
+            TEST_IMG("foo blk 1 at 4")
+        );
+        assert_eq!(
+            tline.get_page_at_lsn(TEST_BUF(2), Lsn(5))?,
+            TEST_IMG("foo blk 2 at 5")
+        );
+
+        // Truncate last block
+        tline.put_truncation(TESTREL_A, Lsn(6), 2)?;
+        tline.advance_last_valid_lsn(Lsn(6));
+
+        // Check reported size and contents after truncation
+        assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(6))?, 2);
+        assert_eq!(
+            tline.get_page_at_lsn(TEST_BUF(0), Lsn(6))?,
+            TEST_IMG("foo blk 0 at 3")
+        );
+        assert_eq!(
+            tline.get_page_at_lsn(TEST_BUF(1), Lsn(6))?,
+            TEST_IMG("foo blk 1 at 4")
+        );
+
+        // should still see the truncated block with older LSN
+        assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(5))?, 3);
+        assert_eq!(
+            tline.get_page_at_lsn(TEST_BUF(2), Lsn(5))?,
+            TEST_IMG("foo blk 2 at 5")
+        );
+
+        Ok(())
+    }
+
+    /// Test get_relsize() and truncation with a file larger than 1 GB, so that it's
+    /// split into multiple 1 GB segments in Postgres.
+    ///
+    /// This isn't very interesting with the RocksDb implementation, as we don't pay
+    /// any attention to Postgres segment boundaries there.
+    #[test]
+    fn test_large_rel() -> Result<()> {
+        let repo = get_test_repo("test_large_rel")?;
+        let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
+        let tline = repo.create_empty_timeline(timelineid, Lsn(0))?;
+
+        tline.init_valid_lsn(Lsn(1));
+
+        let mut lsn = 0;
+        for i in 0..pg_constants::RELSEG_SIZE + 1 {
+            let img = TEST_IMG(&format!("foo blk {} at {}", i, Lsn(lsn)));
+            lsn += 1;
+            tline.put_page_image(TEST_BUF(i as u32), Lsn(lsn), img, true)?;
+        }
+        tline.advance_last_valid_lsn(Lsn(lsn));
+
+        assert_eq!(
+            tline.get_rel_size(TESTREL_A, Lsn(lsn))?,
+            pg_constants::RELSEG_SIZE + 1
+        );
+
+        // Truncate one block
+        lsn += 1;
+        tline.put_truncation(TESTREL_A, Lsn(lsn), pg_constants::RELSEG_SIZE)?;
+        tline.advance_last_valid_lsn(Lsn(lsn));
+        assert_eq!(
+            tline.get_rel_size(TESTREL_A, Lsn(lsn))?,
+            pg_constants::RELSEG_SIZE
+        );
+
+        // Truncate another block
+        lsn += 1;
+        tline.put_truncation(TESTREL_A, Lsn(lsn), pg_constants::RELSEG_SIZE - 1)?;
+        tline.advance_last_valid_lsn(Lsn(lsn));
+        assert_eq!(
+            tline.get_rel_size(TESTREL_A, Lsn(lsn))?,
+            pg_constants::RELSEG_SIZE - 1
+        );
+
+        Ok(())
+    }
+
+    ///
+    /// Test branch creation
+    ///
+    #[test]
+    fn test_branch() -> Result<()> {
+        let repo = get_test_repo("test_branch")?;
+        let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
+        let tline = repo.create_empty_timeline(timelineid, Lsn(0))?;
+
+        // Create a relation on the timeline
+        tline.init_valid_lsn(Lsn(1));
+        tline.put_page_image(TEST_BUF(0), Lsn(2), TEST_IMG("foo blk 0 at 2"), true)?;
+        tline.put_page_image(TEST_BUF(0), Lsn(3), TEST_IMG("foo blk 0 at 3"), true)?;
+        tline.put_page_image(TEST_BUF(0), Lsn(4), TEST_IMG("foo blk 0 at 4"), true)?;
+
+        // Create another relation
+        let buftag2 = ObjectTag::RelationBuffer(BufferTag {
+            rel: TESTREL_B,
+            blknum: 0,
+        });
+        tline.put_page_image(buftag2, Lsn(2), TEST_IMG("foobar blk 0 at 2"), true)?;
+
+        tline.advance_last_valid_lsn(Lsn(4));
+
+        // Branch the history, modify relation differently on the new timeline
+        let newtimelineid = ZTimelineId::from_str("AA223344556677881122334455667788").unwrap();
+        repo.branch_timeline(timelineid, newtimelineid, Lsn(3))?;
+        let newtline = repo.get_timeline(newtimelineid)?;
+
+        newtline.put_page_image(TEST_BUF(0), Lsn(4), TEST_IMG("bar blk 0 at 4"), true)?;
+        newtline.advance_last_valid_lsn(Lsn(4));
+
+        // Check page contents on both branches
+        assert_eq!(
+            tline.get_page_at_lsn(TEST_BUF(0), Lsn(4))?,
+            TEST_IMG("foo blk 0 at 4")
+        );
+
+        assert_eq!(
+            newtline.get_page_at_lsn(TEST_BUF(0), Lsn(4))?,
+            TEST_IMG("bar blk 0 at 4")
+        );
+
+        assert_eq!(
+            newtline.get_page_at_lsn(buftag2, Lsn(4))?,
+            TEST_IMG("foobar blk 0 at 2")
+        );
+
+        assert_eq!(newtline.get_rel_size(TESTREL_B, Lsn(4))?, 1);
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_history() -> Result<()> {
+        let repo = get_test_repo("test_snapshot")?;
+        let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
+        let tline = repo.create_empty_timeline(timelineid, Lsn(0))?;
+
+        let mut snapshot = tline.history()?;
+        assert_eq!(snapshot.lsn(), Lsn(0));
+        assert_eq!(None, snapshot.next().transpose()?);
+
+        // add a page and advance the last valid LSN
+        let rel = TESTREL_A;
+        let tag = TEST_BUF(1);
+        tline.put_page_image(tag, Lsn(1), TEST_IMG("blk 1 @ lsn 1"), true)?;
+        tline.advance_last_valid_lsn(Lsn(1));
+        let mut snapshot = tline.history()?;
+        assert_eq!(snapshot.lsn(), Lsn(1));
+        let expected_page = RelationUpdate {
+            rel: rel,
+            lsn: Lsn(1),
+            update: Update::Page {
+                blknum: 1,
+                img: TEST_IMG("blk 1 @ lsn 1"),
+            },
+        };
+        assert_eq!(Some(&expected_page), snapshot.next().transpose()?.as_ref());
+        assert_eq!(None, snapshot.next().transpose()?);
+
+        // truncate to zero, but don't advance the last valid LSN
+        tline.put_truncation(rel, Lsn(2), 0)?;
+        let mut snapshot = tline.history()?;
+        assert_eq!(snapshot.lsn(), Lsn(1));
+        assert_eq!(Some(&expected_page), snapshot.next().transpose()?.as_ref());
+        assert_eq!(None, snapshot.next().transpose()?);
+
+        // advance the last valid LSN and the truncation should be observable
+        tline.advance_last_valid_lsn(Lsn(2));
+        let mut snapshot = tline.history()?;
+        assert_eq!(snapshot.lsn(), Lsn(2));
+
+        // TODO ordering not guaranteed by API. But currently it returns the
+        // truncation entry before the block data.
+        let expected_truncate = RelationUpdate {
+            rel: rel,
+            lsn: Lsn(2),
+            update: Update::Truncate { n_blocks: 0 },
+        };
+        assert_eq!(Some(expected_truncate), snapshot.next().transpose()?);
+        assert_eq!(Some(&expected_page), snapshot.next().transpose()?.as_ref());
+        assert_eq!(None, snapshot.next().transpose()?);
+
+        Ok(())
+    }
+
+    // Mock WAL redo manager that doesn't do much
+    struct TestRedoManager {}
+
+    impl WalRedoManager for TestRedoManager {
+        fn request_redo(
+            &self,
+            tag: ObjectTag,
+            lsn: Lsn,
+            base_img: Option<Bytes>,
+            records: Vec<WALRecord>,
+        ) -> Result<Bytes, WalRedoError> {
+            let s = format!(
+                "redo for {:?} to get to {}, with {} and {} records",
+                tag,
+                lsn,
+                if base_img.is_some() {
+                    "base image"
+                } else {
+                    "no base image"
+                },
+                records.len()
+            );
+            println!("{}", s);
+            Ok(TEST_IMG(&s))
+        }
+    }
+}
--- a/pageserver/src/restore_local_repo.rs
+++ b/pageserver/src/restore_local_repo.rs
--- a/pageserver/src/restore_s3.rs
+++ b/pageserver/src/restore_s3.rs
@@ -1,322 +0,0 @@
-//
-// Restore chunks from S3
-//
-// This runs once at Page Server startup. It loads all the "base images" from
-// S3 into the in-memory page cache. It also initializes the "last valid LSN"
-// in the page cache to the LSN of the base image, so that when the WAL receiver
-// is started, it starts streaming from that LSN.
-//
-
-use bytes::{Buf, BytesMut};
-use log::*;
-use regex::Regex;
-use std::env;
-use std::fmt;
-
-use s3::bucket::Bucket;
-use s3::creds::Credentials;
-use s3::region::Region;
-use s3::S3Error;
-
-use tokio::runtime;
-
-use futures::future;
-
-use crate::{page_cache, PageServerConf};
-
-struct Storage {
-    region: Region,
-    credentials: Credentials,
-    bucket: String,
-}
-
-pub fn restore_main(conf: &PageServerConf) {
-    // Create a new thread pool
-    let runtime = runtime::Runtime::new().unwrap();
-
-    runtime.block_on(async {
-        let result = restore_chunk(conf).await;
-
-        match result {
-            Ok(_) => {}
-            Err(err) => {
-                error!("S3 error: {}", err);
-            }
-        }
-    });
-}
-
-//
-// Restores one chunk from S3.
-//
-// 1. Fetch the last base image >= given LSN
-// 2. Fetch all WAL
-//
-// Load it all into the page cache.
-//
-async fn restore_chunk(conf: &PageServerConf) -> Result<(), S3Error> {
-    let backend = Storage {
-        region: Region::Custom {
-            region: env::var("S3_REGION").unwrap(),
-            endpoint: env::var("S3_ENDPOINT").unwrap(),
-        },
-        credentials: Credentials::new(
-            Some(&env::var("S3_ACCESSKEY").unwrap()),
-            Some(&env::var("S3_SECRET").unwrap()),
-            None,
-            None,
-            None,
-        )
-        .unwrap(),
-        bucket: "zenith-testbucket".to_string(),
-    };
-
-    info!("Restoring from S3...");
-
-    // Create Bucket in REGION for BUCKET
-    let bucket = Bucket::new_with_path_style(&backend.bucket, backend.region, backend.credentials)?;
-
-    // List out contents of directory
-    let results: Vec<s3::serde_types::ListBucketResult> = bucket
-        .list("relationdata/".to_string(), Some("".to_string()))
-        .await?;
-
-    // TODO: get that from backup
-    let sys_id: u64 = 42;
-    let mut oldest_lsn = 0;
-    let mut slurp_futures: Vec<_> = Vec::new();
-
-    for result in results {
-        for object in result.contents {
-            // Download every relation file, slurping them into memory
-
-            let key = object.key;
-            let relpath = key.strip_prefix("relationdata/").unwrap();
-
-            let parsed = parse_rel_file_path(&relpath);
-
-            match parsed {
-                Ok(p) => {
-                    if oldest_lsn == 0 || p.lsn < oldest_lsn {
-                        oldest_lsn = p.lsn;
-                    }
-                    let b = bucket.clone();
-                    let f = slurp_base_file(conf, sys_id, b, key.to_string(), p);
-
-                    slurp_futures.push(f);
-                }
-                Err(e) => {
-                    warn!("unrecognized file: {} ({})", relpath, e);
-                }
-            };
-        }
-    }
-
-    if oldest_lsn == 0 {
-        panic!("no base backup found");
-    }
-
-    let pcache = page_cache::get_pagecache(conf, sys_id);
-    pcache.init_valid_lsn(oldest_lsn);
-
-    info!("{} files to restore...", slurp_futures.len());
-
-    future::join_all(slurp_futures).await;
-    info!("restored!");
-
-    Ok(())
-}
-
-// From pg_tablespace_d.h
-//
-// FIXME: we'll probably need these elsewhere too, move to some common location
-const DEFAULTTABLESPACE_OID: u32 = 1663;
-const GLOBALTABLESPACE_OID: u32 = 1664;
-
-#[derive(Debug)]
-struct FilePathError {
-    msg: String,
-}
-
-impl FilePathError {
-    fn new(msg: &str) -> FilePathError {
-        FilePathError {
-            msg: msg.to_string(),
-        }
-    }
-}
-
-impl From<core::num::ParseIntError> for FilePathError {
-    fn from(e: core::num::ParseIntError) -> Self {
-        return FilePathError {
-            msg: format!("invalid filename: {}", e),
-        };
-    }
-}
-
-impl fmt::Display for FilePathError {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        write!(f, "invalid filename")
-    }
-}
-
-fn forkname_to_forknum(forkname: Option<&str>) -> Result<u32, FilePathError> {
-    match forkname {
-        // "main" is not in filenames, it's implicit if the fork name is not present
-        None => Ok(0),
-        Some("fsm") => Ok(1),
-        Some("vm") => Ok(2),
-        Some("init") => Ok(3),
-        Some(_) => Err(FilePathError::new("invalid forkname")),
-    }
-}
-
-#[derive(Debug)]
-struct ParsedBaseImageFileName {
-    pub spcnode: u32,
-    pub dbnode: u32,
-    pub relnode: u32,
-    pub forknum: u32,
-    pub segno: u32,
-
-    pub lsn: u64,
-}
-
-// formats:
-// <oid>
-// <oid>_<fork name>
-// <oid>.<segment number>
-// <oid>_<fork name>.<segment number>
-
-fn parse_filename(fname: &str) -> Result<(u32, u32, u32, u64), FilePathError> {
-    let re = Regex::new(r"^(?P<relnode>\d+)(_(?P<forkname>[a-z]+))?(\.(?P<segno>\d+))?_(?P<lsnhi>[[:xdigit:]]{8})(?P<lsnlo>[[:xdigit:]]{8})$").unwrap();
-
-    let caps = re
-        .captures(fname)
-        .ok_or_else(|| FilePathError::new("invalid relation data file name"))?;
-
-    let relnode_str = caps.name("relnode").unwrap().as_str();
-    let relnode: u32 = relnode_str.parse()?;
-
-    let forkname_match = caps.name("forkname");
-    let forkname = if forkname_match.is_none() {
-        None
-    } else {
-        Some(forkname_match.unwrap().as_str())
-    };
-    let forknum = forkname_to_forknum(forkname)?;
-
-    let segno_match = caps.name("segno");
-    let segno = if segno_match.is_none() {
-        0
-    } else {
-        segno_match.unwrap().as_str().parse::<u32>()?
-    };
-
-    let lsn_hi: u64 = caps.name("lsnhi").unwrap().as_str().parse()?;
-    let lsn_lo: u64 = caps.name("lsnlo").unwrap().as_str().parse()?;
-    let lsn = lsn_hi << 32 | lsn_lo;
-
-    Ok((relnode, forknum, segno, lsn))
-}
-
-fn parse_rel_file_path(path: &str) -> Result<ParsedBaseImageFileName, FilePathError> {
-    /*
-     * Relation data files can be in one of the following directories:
-     *
-     * global/
-     *		shared relations
-     *
-     * base/<db oid>/
-     *		regular relations, default tablespace
-     *
-     * pg_tblspc/<tblspc oid>/<tblspc version>/
-     *		within a non-default tablespace (the name of the directory
-     *		depends on version)
-     *
-     * And the relation data files themselves have a filename like:
-     *
-     * <oid>.<segment number>
-     */
-    if let Some(fname) = path.strip_prefix("global/") {
-        let (relnode, forknum, segno, lsn) = parse_filename(fname)?;
-
-        Ok(ParsedBaseImageFileName {
-            spcnode: GLOBALTABLESPACE_OID,
-            dbnode: 0,
-            relnode,
-            forknum,
-            segno,
-            lsn,
-        })
-    } else if let Some(dbpath) = path.strip_prefix("base/") {
-        let mut s = dbpath.split("/");
-        let dbnode_str = s
-            .next()
-            .ok_or_else(|| FilePathError::new("invalid relation data file name"))?;
-        let dbnode: u32 = dbnode_str.parse()?;
-        let fname = s
-            .next()
-            .ok_or_else(|| FilePathError::new("invalid relation data file name"))?;
-        if s.next().is_some() {
-            return Err(FilePathError::new("invalid relation data file name"));
-        };
-
-        let (relnode, forknum, segno, lsn) = parse_filename(fname)?;
-
-        Ok(ParsedBaseImageFileName {
-            spcnode: DEFAULTTABLESPACE_OID,
-            dbnode,
-            relnode,
-            forknum,
-            segno,
-            lsn,
-        })
-    } else if let Some(_) = path.strip_prefix("pg_tblspc/") {
-        // TODO
-        Err(FilePathError::new("tablespaces not supported"))
-    } else {
-        Err(FilePathError::new("invalid relation data file name"))
-    }
-}
-
-//
-// Load a base file from S3, and insert it into the page cache
-//
-async fn slurp_base_file(
-    conf: &PageServerConf,
-    sys_id: u64,
-    bucket: Bucket,
-    s3path: String,
-    parsed: ParsedBaseImageFileName,
-) {
-    // FIXME: rust-s3 opens a new connection for each request. Should reuse
-    // the reqwest::Client object. But that requires changes to rust-s3 itself.
-    let (data, code) = bucket.get_object(s3path.clone()).await.unwrap();
-
-    trace!("got response: {} on {}", code, &s3path);
-    assert_eq!(200, code);
-
-    let mut bytes = BytesMut::from(data.as_slice()).freeze();
-
-    // FIXME: use constants (BLCKSZ)
-    let mut blknum: u32 = parsed.segno * (1024 * 1024 * 1024 / 8192);
-
-    let pcache = page_cache::get_pagecache(conf, sys_id);
-
-    while bytes.remaining() >= 8192 {
-        let tag = page_cache::BufferTag {
-            rel: page_cache::RelTag {
-                spcnode: parsed.spcnode,
-                dbnode: parsed.dbnode,
-                relnode: parsed.relnode,
-                forknum: parsed.forknum as u8,
-            },
-            blknum,
-        };
-
-        pcache.put_page_image(tag, parsed.lsn, bytes.copy_to_bytes(8192));
-
-        blknum += 1;
-    }
-}
--- a/pageserver/src/rocksdb_storage.rs
+++ b/pageserver/src/rocksdb_storage.rs
@@ -0,0 +1,443 @@
+//!
+//! An implementation of the ObjectStore interface, backed by RocksDB
+//!
+use crate::object_key::*;
+use crate::object_store::ObjectStore;
+use crate::repository::RelTag;
+use crate::PageServerConf;
+use crate::ZTimelineId;
+use anyhow::{bail, Result};
+use serde::{Deserialize, Serialize};
+use std::collections::HashSet;
+use std::sync::{Arc, Mutex};
+use zenith_utils::bin_ser::BeSer;
+use zenith_utils::lsn::Lsn;
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+struct StorageKey {
+    obj_key: ObjectKey,
+    lsn: Lsn,
+}
+
+impl StorageKey {
+    /// The first key for a given timeline
+    fn timeline_start(timeline: ZTimelineId) -> Self {
+        Self {
+            obj_key: ObjectKey {
+                timeline,
+                tag: ObjectTag::TimelineMetadataTag,
+            },
+            lsn: Lsn(0),
+        }
+    }
+}
+
+///
+/// RocksDB is very inefficient at deleting random records. Instead, we have to use a merge
+/// (compaction) filter, which allows records to be thrown away during the LSM merge phase.
+/// Unfortunately, it is hard (if at all possible) to determine at merge time whether a version
+/// can be removed. A version can be removed if:
+/// 1. It is above PITR horizon (we need to get current LSN and gc_horizon from config)
+/// 2. Page is reconstructed at horizon (all WAL records above horizon are applied and can be removed)
+///
+/// So we have a GC process which reconstructs pages at the horizon and marks obsolete WAL records
+/// for deletion. To mark an object for deletion, we could set a flag in the object itself, but that
+/// is complicated with the new object value format, because the RocksDB storage knows nothing about
+/// this format. Also, updating a whole record just to set one bit seems inefficient in any case.
+/// This is why we keep the keys of versions marked for deletion in an in-memory HashSet.
+/// When the LSM merge filter finds a key in this set, it removes it from the set, preventing memory overflow.
+///
+struct GarbageCollector {
+    garbage: Mutex<HashSet<Vec<u8>>>, // serialized keys of versions awaiting removal
+}
+
+impl GarbageCollector {
+    fn new() -> GarbageCollector {
+        GarbageCollector {
+            garbage: Mutex::new(HashSet::new()),
+        }
+    }
+
+    /// Called by GC to mark a version (identified by its serialized key) as deleted.
+    fn mark_for_deletion(&self, key: &[u8]) {
+        let mut garbage = self.garbage.lock().unwrap();
+        garbage.insert(key.to_vec());
+    }
+
+    /// Called by the LSM merge filter. Returns true, and forgets the key, if the key was
+    /// marked for deletion; the filter then drops that entry instead of merging it.
+    fn was_deleted(&self, key: &[u8]) -> bool {
+        // `Vec<u8>: Borrow<[u8]>`, so the set is probed with the slice; no per-call copy.
+        let mut garbage = self.garbage.lock().unwrap();
+        garbage.remove(key)
+    }
+}
+
+pub struct RocksObjectStore {
+    _conf: &'static PageServerConf,
+
+    // RocksDB handle
+    db: rocksdb::DB,
+    gc: Arc<GarbageCollector>,
+}
+
+impl ObjectStore for RocksObjectStore {
+    fn get(&self, key: &ObjectKey, lsn: Lsn) -> Result<Vec<u8>> {
+        let val = self.db.get(StorageKey::ser(&StorageKey {
+            obj_key: key.clone(),
+            lsn,
+        })?)?;
+        if let Some(val) = val {
+            Ok(val)
+        } else {
+            bail!("could not find page {:?}", key);
+        }
+    }
+
+    fn get_next_key(&self, key: &ObjectKey) -> Result<Option<ObjectKey>> {
+        let mut iter = self.db.raw_iterator();
+        let search_key = StorageKey {
+            obj_key: key.clone(),
+            lsn: Lsn(0),
+        };
+        iter.seek(search_key.ser()?);
+        if !iter.valid() {
+            Ok(None)
+        } else {
+            let key = StorageKey::des(iter.key().unwrap())?;
+            Ok(Some(key.obj_key.clone()))
+        }
+    }
+
+    fn put(&self, key: &ObjectKey, lsn: Lsn, value: &[u8]) -> Result<()> {
+        self.db.put(
+            StorageKey::ser(&StorageKey {
+                obj_key: key.clone(),
+                lsn,
+            })?,
+            value,
+        )?;
+        Ok(())
+    }
+
+    fn unlink(&self, key: &ObjectKey, lsn: Lsn) -> Result<()> {
+        self.gc.mark_for_deletion(&StorageKey::ser(&StorageKey {
+            obj_key: key.clone(),
+            lsn,
+        })?);
+        Ok(())
+    }
+
+    /// Iterate through page versions of given page, starting from the given LSN.
+    /// The versions are walked in descending LSN order.
+    fn object_versions<'a>(
+        &'a self,
+        key: &ObjectKey,
+        lsn: Lsn,
+    ) -> Result<Box<dyn Iterator<Item = (Lsn, Vec<u8>)> + 'a>> {
+        let iter = RocksObjectVersionIter::new(&self.db, key, lsn)?;
+        Ok(Box::new(iter))
+    }
+
+    /// Iterate through all timeline objects
+    fn list_objects<'a>(
+        &'a self,
+        timeline: ZTimelineId,
+        nonrel_only: bool,
+        lsn: Lsn,
+    ) -> Result<Box<dyn Iterator<Item = ObjectTag> + 'a>> {
+        let iter = RocksObjectIter::new(&self.db, timeline, nonrel_only, lsn)?;
+        Ok(Box::new(iter))
+    }
+
+    /// Get a list of all distinct relations in given tablespace and database on one timeline.
+    ///
+    /// TODO: This implementation is very inefficient, it scans
+    /// through all entries in the given database. In practice, this
+    /// is used for CREATE DATABASE, and usually the template database is small.
+    /// But if it's not, this will be slow.
+    fn list_rels(
+        &self,
+        timelineid: ZTimelineId,
+        spcnode: u32,
+        dbnode: u32,
+        lsn: Lsn,
+    ) -> Result<HashSet<RelTag>> {
+        // FIXME: This scans everything. Very slow
+        let mut rels: HashSet<RelTag> = HashSet::new();
+
+        let mut search_rel_tag = RelTag {
+            spcnode,
+            dbnode,
+            relnode: 0,
+            forknum: 0u8,
+        };
+        let mut iter = self.db.raw_iterator();
+        loop {
+            let search_key = StorageKey {
+                obj_key: ObjectKey {
+                    timeline: timelineid,
+                    tag: ObjectTag::RelationMetadata(search_rel_tag),
+                },
+                lsn: Lsn(0),
+            };
+            iter.seek(search_key.ser()?);
+            if !iter.valid() {
+                break;
+            }
+            let key = StorageKey::des(iter.key().unwrap())?;
+            // The seek may land beyond this timeline, tablespace or database: stop there.
+            if let ObjectTag::RelationMetadata(rel_tag) = key.obj_key.tag {
+                if key.obj_key.timeline != timelineid
+                    || spcnode != 0 && rel_tag.spcnode != spcnode
+                    || dbnode != 0 && rel_tag.dbnode != dbnode
+                {
+                    break;
+                }
+                if key.lsn <= lsn {
+                    // visible in this snapshot
+                    rels.insert(rel_tag);
+                }
+                search_rel_tag = rel_tag;
+                // skip to next relation
+                // FIXME: What if relnode is u32::MAX ?
+                search_rel_tag.relnode += 1;
+            } else {
+                // no more relation metadata entries
+                break;
+            }
+        }
+
+        Ok(rels)
+    }
+
+    /// Iterate through versions of all objects in a timeline.
+    ///
+    /// Returns objects in increasing key-version order.
+    /// Returns all versions up to and including the specified LSN.
+    fn objects<'a>(
+        &'a self,
+        timeline: ZTimelineId,
+        lsn: Lsn,
+    ) -> Result<Box<dyn Iterator<Item = Result<(ObjectTag, Lsn, Vec<u8>)>> + 'a>> {
+        let start_key = StorageKey::timeline_start(timeline);
+        let start_key_bytes = StorageKey::ser(&start_key)?;
+        let iter = self.db.iterator(rocksdb::IteratorMode::From(
+            &start_key_bytes,
+            rocksdb::Direction::Forward,
+        ));
+
+        Ok(Box::new(RocksObjects {
+            iter,
+            timeline,
+            lsn,
+        }))
+    }
+
+    fn compact(&self) {
+        self.db.compact_range::<&[u8], &[u8]>(None, None);
+    }
+}
+
+impl RocksObjectStore {
+    /// Open a RocksDB database.
+    pub fn open(conf: &'static PageServerConf) -> Result<RocksObjectStore> {
+        let opts = Self::get_rocksdb_opts();
+        let obj_store = Self::new(conf, opts)?;
+        Ok(obj_store)
+    }
+
+    /// Create a new, empty RocksDB database.
+    pub fn create(conf: &'static PageServerConf) -> Result<RocksObjectStore> {
+        let path = conf.workdir.join("rocksdb-storage");
+        std::fs::create_dir(&path)?;
+
+        let mut opts = Self::get_rocksdb_opts();
+        opts.create_if_missing(true);
+        opts.set_error_if_exists(true);
+        let obj_store = Self::new(conf, opts)?;
+        Ok(obj_store)
+    }
+
+    fn new(conf: &'static PageServerConf, mut opts: rocksdb::Options) -> Result<RocksObjectStore> {
+        let path = conf.workdir.join("rocksdb-storage");
+        let gc = Arc::new(GarbageCollector::new());
+        let gc_ref = gc.clone();
+        opts.set_compaction_filter("ttl", move |_level: u32, key: &[u8], _val: &[u8]| {
+            if gc_ref.was_deleted(key) {
+                rocksdb::compaction_filter::Decision::Remove
+            } else {
+                rocksdb::compaction_filter::Decision::Keep
+            }
+        });
+        let db = rocksdb::DB::open(&opts, &path)?;
+        let obj_store = RocksObjectStore {
+            _conf: conf,
+            db,
+            gc,
+        };
+        Ok(obj_store)
+    }
+
+    /// common options used by `open` and `create`
+    fn get_rocksdb_opts() -> rocksdb::Options {
+        let mut opts = rocksdb::Options::default();
+        opts.set_use_fsync(true);
+        opts.set_compression_type(rocksdb::DBCompressionType::Lz4);
+        opts
+    }
+}
+
+///
+/// Iterator for `object_versions`. Returns all page versions of a given block, in
+/// reverse LSN order.
+///
+struct RocksObjectVersionIter<'a> {
+    obj_key: ObjectKey,
+    dbiter: rocksdb::DBRawIterator<'a>,
+    first_call: bool,
+}
+impl<'a> RocksObjectVersionIter<'a> {
+    fn new(
+        db: &'a rocksdb::DB,
+        obj_key: &ObjectKey,
+        lsn: Lsn,
+    ) -> Result<RocksObjectVersionIter<'a>> {
+        let key = StorageKey {
+            obj_key: obj_key.clone(),
+            lsn,
+        };
+        let mut dbiter = db.raw_iterator();
+        dbiter.seek_for_prev(StorageKey::ser(&key)?); // locate last entry
+        Ok(RocksObjectVersionIter {
+            first_call: true,
+            obj_key: obj_key.clone(),
+            dbiter,
+        })
+    }
+}
+impl<'a> Iterator for RocksObjectVersionIter<'a> {
+    type Item = (Lsn, Vec<u8>);
+
+    /// Yield the next older version of the object, or `None` when there are no more.
+    fn next(&mut self) -> std::option::Option<Self::Item> {
+        if self.first_call {
+            self.first_call = false; // `new` already positioned the iterator
+        } else {
+            self.dbiter.prev(); // walk backwards
+        }
+
+        if !self.dbiter.valid() {
+            return None;
+        }
+        let key = StorageKey::des(self.dbiter.key().unwrap()).unwrap();
+        // Compare the timeline as well as the tag: the previous entry may carry the same tag on
+        // another timeline, and its versions must not be returned for this object.
+        if key.obj_key.timeline != self.obj_key.timeline || key.obj_key.tag != self.obj_key.tag {
+            return None;
+        }
+        Some((key.lsn, self.dbiter.value().unwrap().to_vec()))
+    }
+}
+
+struct RocksObjects<'r> {
+    iter: rocksdb::DBIterator<'r>,
+    timeline: ZTimelineId,
+    lsn: Lsn,
+}
+
+impl<'r> Iterator for RocksObjects<'r> {
+    // TODO consider returning Box<[u8]>
+    type Item = Result<(ObjectTag, Lsn, Vec<u8>)>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        self.next_result().transpose()
+    }
+}
+
+impl<'r> RocksObjects<'r> {
+    fn next_result(&mut self) -> Result<Option<(ObjectTag, Lsn, Vec<u8>)>> {
+        for (key_bytes, v) in &mut self.iter {
+            let key = StorageKey::des(&key_bytes)?;
+
+            if key.obj_key.timeline != self.timeline {
+                return Ok(None);
+            }
+
+            if key.lsn > self.lsn {
+                // TODO can speed up by seeking iterator
+                continue;
+            }
+
+            return Ok(Some((key.obj_key.tag, key.lsn, v.to_vec())));
+        }
+
+        Ok(None)
+    }
+}
+
+///
+/// Iterator for `list_objects`. Returns all objects preceding the specified LSN
+///
+struct RocksObjectIter<'a> {
+    timeline: ZTimelineId,
+    key: StorageKey,
+    nonrel_only: bool,
+    lsn: Lsn,
+    dbiter: rocksdb::DBRawIterator<'a>,
+}
+impl<'a> RocksObjectIter<'a> {
+    fn new(
+        db: &'a rocksdb::DB,
+        timeline: ZTimelineId,
+        nonrel_only: bool,
+        lsn: Lsn,
+    ) -> Result<RocksObjectIter<'a>> {
+        let key = StorageKey {
+            obj_key: ObjectKey {
+                timeline,
+                tag: ObjectTag::FirstTag,
+            },
+            lsn: Lsn(0),
+        };
+        let dbiter = db.raw_iterator();
+        Ok(RocksObjectIter {
+            key,
+            timeline,
+            nonrel_only,
+            lsn,
+            dbiter,
+        })
+    }
+}
+impl<'a> Iterator for RocksObjectIter<'a> {
+    type Item = ObjectTag;
+
+    fn next(&mut self) -> std::option::Option<Self::Item> {
+        loop {
+            self.dbiter.seek(StorageKey::ser(&self.key).unwrap());
+            if !self.dbiter.valid() {
+                return None;
+            }
+            let key = StorageKey::des(self.dbiter.key().unwrap()).unwrap();
+            if key.obj_key.timeline != self.timeline {
+                // End of this timeline
+                return None;
+            }
+            self.key = key.clone();
+            self.key.lsn = Lsn(u64::MAX); // next seek should skip all versions
+            if key.lsn <= self.lsn {
+                // visible in this snapshot
+                if self.nonrel_only {
+                    match key.obj_key.tag {
+                        ObjectTag::RelationMetadata(_) => return None,
+                        ObjectTag::RelationBuffer(_) => return None,
+                        _ => return Some(key.obj_key.tag),
+                    }
+                } else {
+                    return Some(key.obj_key.tag);
+                }
+            }
+        }
+    }
+}
--- a/pageserver/src/tui.rs
+++ b/pageserver/src/tui.rs
@@ -171,6 +171,11 @@ pub fn ui_main() -> Result<(), Box<dyn Error>> {
        })?;

        // If ther user presses 'q', quit.
+
+        // silence clippy's suggestion to rewrite this as an if-statement. Match
+        // makes more sense as soon as we get another command than 'q'.
+        #[allow(clippy::single_match)]
+        #[allow(clippy::collapsible_match)]
        if let Event::Input(key) = events.next()? {
            match key {
                Key::Char('q') => {
@@ -229,7 +234,7 @@ impl<'a> Widget for LogWidget<'a> {
 // Render a widget to show some metrics
 struct MetricsWidget {}

-fn get_metric_u64(title: &str, value: u64) -> Spans {
+fn _get_metric_u64(title: &str, value: u64) -> Spans {
    Spans::from(vec![
        Span::styled(format!("{:<20}", title), Style::default()),
        Span::raw(": "),
@@ -240,7 +245,9 @@ fn get_metric_u64(title: &str, value: u64) -> Spans {
    ])
 }

-fn get_metric_str<'a>(title: &str, value: &'a str) -> Spans<'a> {
+// This is not used since LSNs were removed from page cache stats.
+// Maybe it will be used in the future?
+fn _get_metric_str<'a>(title: &str, value: &'a str) -> Spans<'a> {
    Spans::from(vec![
        Span::styled(format!("{:<20}", title), Style::default()),
        Span::raw(": "),
@@ -248,13 +255,6 @@ fn get_metric_str<'a>(title: &str, value: &'a str) -> Spans<'a> {
    ])
 }

-// FIXME: We really should define a datatype for LSNs, with Display trait and
-// helper functions. There's one in tokio-postgres, but I don't think we want
-// to rely on that.
-fn format_lsn(lsn: u64) -> String {
-    return format!("{:X}/{:X}", lsn >> 32, lsn & 0xffff_ffff);
-}
-
 impl tui::widgets::Widget for MetricsWidget {
    fn render(self, area: Rect, buf: &mut Buffer) {
        let block = Block::default()
@@ -265,17 +265,24 @@ impl tui::widgets::Widget for MetricsWidget {

        block.render(area, buf);

+        #[allow(unused_mut)]
        let mut lines: Vec<Spans> = Vec::new();

-        let page_cache_stats = crate::page_cache::get_stats();
+        // FIXME
+        //let page_cache_stats = crate::page_cache::get_stats();
+
+        // This is not used since LSNs were removed from page cache stats.
+        // Maybe it will be used in the future?
+        /*
        let lsnrange = format!(
            "{} - {}",
-            format_lsn(page_cache_stats.first_valid_lsn),
-            format_lsn(page_cache_stats.last_valid_lsn)
+            page_cache_stats.first_valid_lsn, page_cache_stats.last_valid_lsn
        );
-        let last_valid_recordlsn_str = format_lsn(page_cache_stats.last_record_lsn);
+        let last_valid_recordlsn_str = page_cache_stats.last_record_lsn.to_string();
        lines.push(get_metric_str("Valid LSN range", &lsnrange));
        lines.push(get_metric_str("Last record LSN", &last_valid_recordlsn_str));
+        */
+        /*
        lines.push(get_metric_u64(
            "# of cache entries",
            page_cache_stats.num_entries,
@@ -292,7 +299,7 @@ impl tui::widgets::Widget for MetricsWidget {
            "# of GetPage@LSN calls",
            page_cache_stats.num_getpage_requests,
        ));
-
+        */
        let text = Text::from(lines);

        Paragraph::new(text).render(inner_area, buf);
--- a/pageserver/src/tui_event.rs
+++ b/pageserver/src/tui_event.rs
@@ -54,14 +54,14 @@ impl Events {
            thread::spawn(move || {
                let stdin = io::stdin();
                for evt in stdin.keys() {
-                    if let Ok(key) = evt {
-                        if let Err(err) = tx.send(Event::Input(key)) {
-                            eprintln!("{}", err);
-                            return;
-                        }
-                        if !ignore_exit_key.load(Ordering::Relaxed) && key == config.exit_key {
-                            return;
-                        }
+                    // This will panic if stdin returns EOF.
+                    let key = evt.unwrap();
+                    if let Err(err) = tx.send(Event::Input(key)) {
+                        eprintln!("{}", err);
+                        return;
+                    }
+                    if !ignore_exit_key.load(Ordering::Relaxed) && key == config.exit_key {
+                        return;
                    }
                }
            })
--- a/pageserver/src/waldecoder.rs
+++ b/pageserver/src/waldecoder.rs
@@ -1,49 +1,31 @@
-use crate::pg_constants;
+//!
+//! WAL decoder. For each WAL record, it decodes the record to figure out which data blocks
+//! the record affects, to add the records to the page cache.
+//!
 use bytes::{Buf, BufMut, Bytes, BytesMut};
 use log::*;
+use postgres_ffi::pg_constants;
+use postgres_ffi::xlog_utils::*;
+use postgres_ffi::XLogLongPageHeaderData;
+use postgres_ffi::XLogPageHeaderData;
+use postgres_ffi::XLogRecord;
 use std::cmp::min;
 use thiserror::Error;
-use std::str;
+use zenith_utils::lsn::Lsn;

-const XLOG_BLCKSZ: u32 = 8192;
-
-// FIXME: this is configurable in PostgreSQL, 16 MB is the default
-const WAL_SEGMENT_SIZE: u64 = 16 * 1024 * 1024;
-
-// From PostgreSQL headers
-
-#[repr(C)]
-#[derive(Debug)]
-pub struct XLogPageHeaderData {
-    xlp_magic: u16,    /* magic value for correctness checks */
-    xlp_info: u16,     /* flag bits, see below */
-    xlp_tli: u32,      /* TimeLineID of first record on page */
-    xlp_pageaddr: u64, /* XLOG address of this page */
-    xlp_rem_len: u32,  /* total len of remaining data for record */
-}
-
-// FIXME: this assumes MAXIMUM_ALIGNOF 8. There are 4 padding bytes at end
-#[allow(non_upper_case_globals)]
-const SizeOfXLogShortPHD: usize = 2 + 2 + 4 + 8 + 4 + 4;
-
-#[repr(C)]
-#[derive(Debug)]
-pub struct XLogLongPageHeaderData {
-    std: XLogPageHeaderData, /* standard header fields */
-    xlp_sysid: u64,          /* system identifier from pg_control */
-    xlp_seg_size: u32,       /* just as a cross-check */
-    xlp_xlog_blcksz: u32,    /* just as a cross-check */
-}
-
-// FIXME: this assumes MAXIMUM_ALIGNOF 8.
-#[allow(non_upper_case_globals)]
-const SizeOfXLogLongPHD: usize = (2 + 2 + 4 + 8 + 4) + 4 + 8 + 4 + 4;
+pub type Oid = u32;
+pub type TransactionId = u32;
+pub type BlockNumber = u32;
+pub type OffsetNumber = u16;
+pub type MultiXactId = TransactionId;
+pub type MultiXactOffset = u32;
+pub type MultiXactStatus = u32;

 #[allow(dead_code)]
 pub struct WalStreamDecoder {
-    lsn: u64,
+    lsn: Lsn,

-    startlsn: u64, // LSN where this record starts
+    startlsn: Lsn, // LSN where this record starts
    contlen: u32,
    padlen: u32,

@@ -56,7 +38,7 @@ pub struct WalStreamDecoder {
 #[error("{msg} at {lsn}")]
 pub struct WalDecodeError {
    msg: String,
-    lsn: u64,
+    lsn: Lsn,
 }

 //
@@ -64,11 +46,11 @@ pub struct WalDecodeError {
 // FIXME: This isn't a proper rust stream
 //
 impl WalStreamDecoder {
-    pub fn new(lsn: u64) -> WalStreamDecoder {
+    pub fn new(lsn: Lsn) -> WalStreamDecoder {
        WalStreamDecoder {
            lsn,

-            startlsn: 0,
+            startlsn: Lsn(0),
            contlen: 0,
            padlen: 0,

@@ -85,22 +67,23 @@ impl WalStreamDecoder {
    /// decoder so far.
    ///
    /// Returns one of the following:
-    ///     Ok((u64, Bytes)): a tuple containing the LSN of next record, and the record itself
+    ///     Ok((Lsn, Bytes)): a tuple containing the LSN of next record, and the record itself
    ///     Ok(None): there is not enough data in the input buffer. Feed more by calling the `feed_bytes` function
    ///     Err(WalDecodeError): an error occured while decoding, meaning the input was invalid.
    ///
-    pub fn poll_decode(&mut self) -> Result<Option<(u64, Bytes)>, WalDecodeError> {
+    pub fn poll_decode(&mut self) -> Result<Option<(Lsn, Bytes)>, WalDecodeError> {
        loop {
            // parse and verify page boundaries as we go
-            if self.lsn % WAL_SEGMENT_SIZE == 0 {
+            if self.lsn.segment_offset(pg_constants::WAL_SEGMENT_SIZE) == 0 {
                // parse long header

-                if self.inputbuf.remaining() < SizeOfXLogLongPHD {
+                if self.inputbuf.remaining() < XLOG_SIZE_OF_XLOG_LONG_PHD {
                    return Ok(None);
                }

-                let hdr = self.decode_XLogLongPageHeaderData();
-                if hdr.std.xlp_pageaddr != self.lsn {
+                let hdr = XLogLongPageHeaderData::from_bytes(&mut self.inputbuf);
+
+                if hdr.std.xlp_pageaddr != self.lsn.0 {
                    return Err(WalDecodeError {
                        msg: "invalid xlog segment header".into(),
                        lsn: self.lsn,
@@ -108,17 +91,16 @@ impl WalStreamDecoder {
                }
                // TODO: verify the remaining fields in the header

-                self.lsn += SizeOfXLogLongPHD as u64;
+                self.lsn += XLOG_SIZE_OF_XLOG_LONG_PHD as u64;
                continue;
-            } else if self.lsn % (XLOG_BLCKSZ as u64) == 0 {
-                // parse page header
-
-                if self.inputbuf.remaining() < SizeOfXLogShortPHD {
+            } else if self.lsn.block_offset() == 0 {
+                if self.inputbuf.remaining() < XLOG_SIZE_OF_XLOG_SHORT_PHD {
                    return Ok(None);
                }

-                let hdr = self.decode_XLogPageHeaderData();
-                if hdr.xlp_pageaddr != self.lsn {
+                let hdr = XLogPageHeaderData::from_bytes(&mut self.inputbuf);
+
+                if hdr.xlp_pageaddr != self.lsn.0 {
                    return Err(WalDecodeError {
                        msg: "invalid xlog page header".into(),
                        lsn: self.lsn,
@@ -126,7 +108,7 @@ impl WalStreamDecoder {
                }
                // TODO: verify the remaining fields in the header

-                self.lsn += SizeOfXLogShortPHD as u64;
+                self.lsn += XLOG_SIZE_OF_XLOG_SHORT_PHD as u64;
                continue;
            } else if self.padlen > 0 {
                if self.inputbuf.remaining() < self.padlen as usize {
@@ -147,7 +129,7 @@ impl WalStreamDecoder {
                // read xl_tot_len FIXME: assumes little-endian
                self.startlsn = self.lsn;
                let xl_tot_len = self.inputbuf.get_u32_le();
-                if xl_tot_len < SizeOfXLogRecord {
+                if (xl_tot_len as usize) < XLOG_SIZE_OF_XLOG_RECORD {
                    return Err(WalDecodeError {
                        msg: format!("invalid xl_tot_len {}", xl_tot_len),
                        lsn: self.lsn,
@@ -163,7 +145,7 @@ impl WalStreamDecoder {
                continue;
            } else {
                // we're continuing a record, possibly from previous page.
-                let pageleft: u32 = XLOG_BLCKSZ - (self.lsn % (XLOG_BLCKSZ as u64)) as u32;
+                let pageleft = self.lsn.remaining_in_block() as u32;

                // read the rest of the record, or as much as fits on this page.
                let n = min(self.contlen, pageleft) as usize;
@@ -180,20 +162,18 @@ impl WalStreamDecoder {
                    let recordbuf = std::mem::replace(&mut self.recordbuf, BytesMut::new());

                    let recordbuf = recordbuf.freeze();
+                    let mut buf = recordbuf.clone();

                    // XLOG_SWITCH records are special. If we see one, we need to skip
                    // to the next WAL segment.
-                    if is_xlog_switch_record(&recordbuf) {
-                        trace!(
-                            "saw xlog switch record at {:X}/{:X}",
-                            (self.lsn >> 32),
-                            self.lsn & 0xffffffff
-                        );
-                        self.padlen = (WAL_SEGMENT_SIZE - (self.lsn % WAL_SEGMENT_SIZE)) as u32;
-                    }
-
-                    if self.lsn % 8 != 0 {
-                        self.padlen = 8 - (self.lsn % 8) as u32;
+                    let xlogrec = XLogRecord::from_bytes(&mut buf);
+                    if xlogrec.is_xlog_switch_record() {
+                        trace!("saw xlog switch record at {}", self.lsn);
+                        self.padlen =
+                            self.lsn.calc_padding(pg_constants::WAL_SEGMENT_SIZE as u64) as u32;
+                    } else {
+                        // Pad to an 8-byte boundary
+                        self.padlen = self.lsn.calc_padding(8u32) as u32;
                    }

                    let result = (self.lsn, recordbuf);
@@ -208,68 +188,8 @@ impl WalStreamDecoder {

        // deal with xlog_switch records
    }
-
-    #[allow(non_snake_case)]
-    fn decode_XLogPageHeaderData(&mut self) -> XLogPageHeaderData {
-        let buf = &mut self.inputbuf;
-
-        // FIXME: Assume little-endian
-
-        let hdr: XLogPageHeaderData = XLogPageHeaderData {
-            xlp_magic: buf.get_u16_le(),
-            xlp_info: buf.get_u16_le(),
-            xlp_tli: buf.get_u32_le(),
-            xlp_pageaddr: buf.get_u64_le(),
-            xlp_rem_len: buf.get_u32_le(),
-        };
-        // 4 bytes of padding, on 64-bit systems
-        buf.advance(4);
-
-        // FIXME: check that hdr.xlp_rem_len matches self.contlen
-        //println!("next xlog page (xlp_rem_len: {})", hdr.xlp_rem_len);
-
-        hdr
-    }
-
-    #[allow(non_snake_case)]
-    fn decode_XLogLongPageHeaderData(&mut self) -> XLogLongPageHeaderData {
-        let hdr: XLogLongPageHeaderData = XLogLongPageHeaderData {
-            std: self.decode_XLogPageHeaderData(),
-            xlp_sysid: self.inputbuf.get_u64_le(),
-            xlp_seg_size: self.inputbuf.get_u32_le(),
-            xlp_xlog_blcksz: self.inputbuf.get_u32_le(),
-        };
-
-        hdr
-    }
 }

-// FIXME:
-const BLCKSZ: u16 = 8192;
-
-//
-// Constants from xlogrecord.h
-//
-
-const XLR_MAX_BLOCK_ID: u8 = 32;
-
-const XLR_BLOCK_ID_DATA_SHORT: u8 = 255;
-const XLR_BLOCK_ID_DATA_LONG: u8 = 254;
-const XLR_BLOCK_ID_ORIGIN: u8 = 253;
-const XLR_BLOCK_ID_TOPLEVEL_XID: u8 = 252;
-
-const BKPBLOCK_FORK_MASK: u8 = 0x0F;
-const _BKPBLOCK_FLAG_MASK: u8 = 0xF0;
-const BKPBLOCK_HAS_IMAGE: u8 = 0x10; /* block data is an XLogRecordBlockImage */
-const BKPBLOCK_HAS_DATA: u8 = 0x20;
-const BKPBLOCK_WILL_INIT: u8 = 0x40; /* redo will re-init the page */
-const BKPBLOCK_SAME_REL: u8 = 0x80; /* RelFileNode omitted, same as previous */
-
-/* Information stored in bimg_info */
-const BKPIMAGE_HAS_HOLE: u8 = 0x01; /* page image has "hole" */
-const BKPIMAGE_IS_COMPRESSED: u8 = 0x02; /* page image is compressed */
-const BKPIMAGE_APPLY: u8 = 0x04; /* page image should be restored during replay */
-
 #[allow(dead_code)]
 pub struct DecodedBkpBlock {
    /* Is this block ref in use? */
@@ -289,7 +209,7 @@ pub struct DecodedBkpBlock {
    /* Information on full-page image, if any */
    has_image: bool,       /* has image, even for consistency checking */
    pub apply_image: bool, /* has image that should be restored */
-    pub will_init: bool,
+    pub will_init: bool,   /* record doesn't need previous page version to apply */
    //char	   *bkp_image;
    hole_offset: u16,
    hole_length: u16,
@@ -325,10 +245,8 @@ impl DecodedBkpBlock {
    }
 }

-#[allow(non_upper_case_globals)]
-const SizeOfXLogRecord: u32 = 24;
-
 pub struct DecodedWALRecord {
+    pub xl_xid: TransactionId,
    pub xl_info: u8,
    pub xl_rmid: u8,
    pub record: Bytes, // raw XLogRecord
@@ -337,31 +255,6 @@ pub struct DecodedWALRecord {
    pub main_data_offset: usize,
 }

-// Is this record an XLOG_SWITCH record? They need some special processing,
-// so we need to check for that before the rest of the parsing.
-//
-// FIXME: refactor this and decode_wal_record() below to avoid the duplication.
-fn is_xlog_switch_record(rec: &Bytes) -> bool {
-    let mut buf = rec.clone();
-
-    // FIXME: assume little-endian here
-    let _xl_tot_len = buf.get_u32_le();
-    let _xl_xid = buf.get_u32_le();
-    let _xl_prev = buf.get_u64_le();
-    let xl_info = buf.get_u8();
-    let xl_rmid = buf.get_u8();
-    buf.advance(2); // 2 bytes of padding
-    let _xl_crc = buf.get_u32_le();
-
-    xl_info == pg_constants::XLOG_SWITCH && xl_rmid == pg_constants::RM_XLOG_ID
-}
-
-pub type Oid = u32;
-pub type BlockNumber = u32;
-
-pub const MAIN_FORKNUM: u8 = 0;
-pub const SMGR_TRUNCATE_HEAP: u32 = 0x0001;
-
 #[repr(C)]
 #[derive(Debug, Clone, Copy)]
 pub struct RelFileNode {
@@ -370,6 +263,24 @@ pub struct RelFileNode {
    pub relnode: Oid, /* relation */
 }

+#[repr(C)]
+#[derive(Debug)]
+pub struct XlRelmapUpdate {
+    pub dbid: Oid,   /* database ID, or 0 for shared map */
+    pub tsid: Oid,   /* database's tablespace, or pg_global */
+    pub nbytes: i32, /* size of relmap data */
+}
+
+impl XlRelmapUpdate {
+    pub fn decode(buf: &mut Bytes) -> XlRelmapUpdate {
+        XlRelmapUpdate {
+            dbid: buf.get_u32_le(),
+            tsid: buf.get_u32_le(),
+            nbytes: buf.get_i32_le(),
+        }
+    }
+}
+
 #[repr(C)]
 #[derive(Debug)]
 pub struct XlSmgrTruncate {
@@ -378,31 +289,291 @@ pub struct XlSmgrTruncate {
    pub flags: u32,
 }

-pub fn decode_truncate_record(decoded: &DecodedWALRecord) -> XlSmgrTruncate {
-    let mut buf = decoded.record.clone();
-    buf.advance((SizeOfXLogRecord + 2) as usize);
-    XlSmgrTruncate {
-        blkno: buf.get_u32_le(),
-        rnode: RelFileNode {
-            spcnode: buf.get_u32_le(), /* tablespace */
-            dbnode: buf.get_u32_le(),  /* database */
-            relnode: buf.get_u32_le(), /* relation */
-        },
-        flags: buf.get_u32_le(),
+impl XlSmgrTruncate {
+    pub fn decode(buf: &mut Bytes) -> XlSmgrTruncate {
+        XlSmgrTruncate {
+            blkno: buf.get_u32_le(),
+            rnode: RelFileNode {
+                spcnode: buf.get_u32_le(), /* tablespace */
+                dbnode: buf.get_u32_le(),  /* database */
+                relnode: buf.get_u32_le(), /* relation */
+            },
+            flags: buf.get_u32_le(),
+        }
    }
 }

-//
-// Routines to decode a WAL record and figure out which blocks are modified
+#[repr(C)]
+#[derive(Debug)]
+pub struct XlCreateDatabase {
+    pub db_id: Oid,
+    pub tablespace_id: Oid,
+    pub src_db_id: Oid,
+    pub src_tablespace_id: Oid,
+}
+
+impl XlCreateDatabase {
+    pub fn decode(buf: &mut Bytes) -> XlCreateDatabase {
+        XlCreateDatabase {
+            db_id: buf.get_u32_le(),
+            tablespace_id: buf.get_u32_le(),
+            src_db_id: buf.get_u32_le(),
+            src_tablespace_id: buf.get_u32_le(),
+        }
+    }
+}
+
+#[repr(C)]
+#[derive(Debug)]
+pub struct XlHeapInsert {
+    pub offnum: OffsetNumber,
+    pub flags: u8,
+}
+
+impl XlHeapInsert {
+    pub fn decode(buf: &mut Bytes) -> XlHeapInsert {
+        XlHeapInsert {
+            offnum: buf.get_u16_le(),
+            flags: buf.get_u8(),
+        }
+    }
+}
+
+#[repr(C)]
+#[derive(Debug)]
+pub struct XlHeapMultiInsert {
+    pub flags: u8,
+    pub ntuples: u16,
+}
+
+impl XlHeapMultiInsert {
+    pub fn decode(buf: &mut Bytes) -> XlHeapMultiInsert {
+        let flags = buf.get_u8();
+        buf.advance(1); // skip the alignment padding byte that precedes the u16 `ntuples`
+        let ntuples = buf.get_u16_le();
+        XlHeapMultiInsert { flags, ntuples }
+    }
+}
+
+#[repr(C)]
+#[derive(Debug)]
+pub struct XlHeapDelete {
+    pub xmax: TransactionId,
+    pub offnum: OffsetNumber,
+    pub infobits_set: u8,
+    pub flags: u8,
+}
+
+impl XlHeapDelete {
+    pub fn decode(buf: &mut Bytes) -> XlHeapDelete {
+        XlHeapDelete {
+            xmax: buf.get_u32_le(),
+            offnum: buf.get_u16_le(),
+            infobits_set: buf.get_u8(),
+            flags: buf.get_u8(),
+        }
+    }
+}
+
+#[repr(C)]
+#[derive(Debug)]
+pub struct XlHeapUpdate {
+    pub old_xmax: TransactionId,
+    pub old_offnum: OffsetNumber,
+    pub old_infobits_set: u8,
+    pub flags: u8,
+    pub new_xmax: TransactionId,
+    pub new_offnum: OffsetNumber,
+}
+
+impl XlHeapUpdate {
+    pub fn decode(buf: &mut Bytes) -> XlHeapUpdate {
+        XlHeapUpdate {
+            old_xmax: buf.get_u32_le(),
+            old_offnum: buf.get_u16_le(),
+            old_infobits_set: buf.get_u8(),
+            flags: buf.get_u8(),
+            new_xmax: buf.get_u32_le(),
+            new_offnum: buf.get_u16_le(),
+        }
+    }
+}
+
+///
+/// Note: Parsing some fields is missing, because they're not needed.
+///
+/// This is similar to the xl_xact_parsed_commit and
+/// xl_xact_parsed_abort structs in PostgreSQL, but we use the same
+/// struct for commits and aborts.
+///
+#[derive(Debug)]
+pub struct XlXactParsedRecord {
+    pub xid: TransactionId,
+    pub info: u8,
+    pub xact_time: TimestampTz,
+    pub xinfo: u32,
+
+    pub db_id: Oid, /* MyDatabaseId */
+    pub ts_id: Oid, /* MyDatabaseTableSpace */
+
+    pub subxacts: Vec<TransactionId>,
+
+    pub xnodes: Vec<RelFileNode>,
+}
+
+impl XlXactParsedRecord {
+    /// Decode a XLOG_XACT_COMMIT/ABORT/COMMIT_PREPARED/ABORT_PREPARED record,
+    /// mirroring ParseCommitRecord/ParseAbortRecord in PostgreSQL
+    /// (src/backend/access/rmgrdesc/xactdesc.c). `xid` is replaced by the 2PC xid if present.
+    pub fn decode(buf: &mut Bytes, mut xid: TransactionId, xl_info: u8) -> XlXactParsedRecord {
+        let info = xl_info & pg_constants::XLOG_XACT_OPMASK;
+        // The record starts with time of commit/abort
+        let xact_time = buf.get_i64_le();
+        let xinfo;
+        if xl_info & pg_constants::XLOG_XACT_HAS_INFO != 0 {
+            xinfo = buf.get_u32_le();
+        } else {
+            xinfo = 0;
+        }
+        let db_id;
+        let ts_id;
+        if xinfo & pg_constants::XACT_XINFO_HAS_DBINFO != 0 {
+            db_id = buf.get_u32_le();
+            ts_id = buf.get_u32_le();
+        } else {
+            db_id = 0;
+            ts_id = 0;
+        }
+        let mut subxacts = Vec::<TransactionId>::new();
+        if xinfo & pg_constants::XACT_XINFO_HAS_SUBXACTS != 0 {
+            let nsubxacts = buf.get_i32_le();
+            for _i in 0..nsubxacts {
+                let subxact = buf.get_u32_le();
+                subxacts.push(subxact);
+            }
+        }
+        let mut xnodes = Vec::<RelFileNode>::new();
+        if xinfo & pg_constants::XACT_XINFO_HAS_RELFILENODES != 0 {
+            let nrels = buf.get_i32_le();
+            for _i in 0..nrels {
+                let spcnode = buf.get_u32_le();
+                let dbnode = buf.get_u32_le();
+                let relnode = buf.get_u32_le();
+                trace!(
+                    "XLOG_XACT_COMMIT relfilenode {}/{}/{}",
+                    spcnode,
+                    dbnode,
+                    relnode
+                );
+                xnodes.push(RelFileNode {
+                    spcnode,
+                    dbnode,
+                    relnode,
+                });
+            }
+        }
+        if xinfo & pg_constants::XACT_XINFO_HAS_INVALS != 0 {
+            let nmsgs = buf.get_i32_le();
+            for _i in 0..nmsgs {
+                let sizeof_shared_invalidation_message = 16; // bytes, per PostgreSQL's sinval.h
+                buf.advance(sizeof_shared_invalidation_message);
+            }
+        }
+        if xinfo & pg_constants::XACT_XINFO_HAS_TWOPHASE != 0 {
+            xid = buf.get_u32_le();
+            trace!("XLOG_XACT_COMMIT-XACT_XINFO_HAS_TWOPHASE");
+        }
+        XlXactParsedRecord {
+            xid,
+            info,
+            xact_time,
+            xinfo,
+            db_id,
+            ts_id,
+            subxacts,
+            xnodes,
+        }
+    }
+}
+
+#[repr(C)]
+#[derive(Debug)]
+pub struct MultiXactMember {
+    pub xid: TransactionId,
+    pub status: MultiXactStatus,
+}
+
+impl MultiXactMember {
+    pub fn decode(buf: &mut Bytes) -> MultiXactMember {
+        MultiXactMember {
+            xid: buf.get_u32_le(),
+            status: buf.get_u32_le(),
+        }
+    }
+}
+
+#[repr(C)]
+#[derive(Debug)]
+pub struct XlMultiXactCreate {
+    pub mid: MultiXactId,      /* new MultiXact's ID */
+    pub moff: MultiXactOffset, /* its starting offset in members file */
+    pub nmembers: u32,         /* number of member XIDs */
+    pub members: Vec<MultiXactMember>,
+}
+
+impl XlMultiXactCreate {
+    pub fn decode(buf: &mut Bytes) -> XlMultiXactCreate {
+        let mid = buf.get_u32_le();
+        let moff = buf.get_u32_le();
+        let nmembers = buf.get_u32_le();
+        let mut members = Vec::new();
+        for _ in 0..nmembers {
+            members.push(MultiXactMember::decode(buf));
+        }
+        XlMultiXactCreate {
+            mid,
+            moff,
+            nmembers,
+            members,
+        }
+    }
+}
+
+#[repr(C)]
+#[derive(Debug)]
+pub struct XlMultiXactTruncate {
+    pub oldest_multi_db: Oid,
+    /* to-be-truncated range of multixact offsets */
+    pub start_trunc_off: MultiXactId, /* just for completeness' sake */
+    pub end_trunc_off: MultiXactId,
+
+    /* to-be-truncated range of multixact members */
+    pub start_trunc_memb: MultiXactOffset,
+    pub end_trunc_memb: MultiXactOffset,
+}
+
+impl XlMultiXactTruncate {
+    pub fn decode(buf: &mut Bytes) -> XlMultiXactTruncate {
+        XlMultiXactTruncate {
+            oldest_multi_db: buf.get_u32_le(),
+            start_trunc_off: buf.get_u32_le(),
+            end_trunc_off: buf.get_u32_le(),
+            start_trunc_memb: buf.get_u32_le(),
+            end_trunc_memb: buf.get_u32_le(),
+        }
+    }
+}
+
+/// Main routine to decode a WAL record and figure out which blocks are modified
 //
 // See xlogrecord.h for details
 // The overall layout of an XLOG record is:
 //		Fixed-size header (XLogRecord struct)
 //      XLogRecordBlockHeader struct
-//          If BKPBLOCK_HAS_IMAGE, an XLogRecordBlockImageHeader struct follows
-//	           If BKPIMAGE_HAS_HOLE and BKPIMAGE_IS_COMPRESSED, an
+//          If pg_constants::BKPBLOCK_HAS_IMAGE, an XLogRecordBlockImageHeader struct follows
+//	           If pg_constants::BKPIMAGE_HAS_HOLE and pg_constants::BKPIMAGE_IS_COMPRESSED, an
 //	           XLogRecordBlockCompressHeader struct follows.
-//          If BKPBLOCK_SAME_REL is not set, a RelFileNode follows
+//          If pg_constants::BKPBLOCK_SAME_REL is not set, a RelFileNode follows
 //          BlockNumber follows
 //      XLogRecordBlockHeader struct
 //      ...
@@ -422,23 +593,17 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
    // 1. Parse XLogRecord struct

    // FIXME: assume little-endian here
-    let xl_tot_len = buf.get_u32_le();
-    let xl_xid = buf.get_u32_le();
-    let xl_prev = buf.get_u64_le();
-    let xl_info = buf.get_u8();
-    let xl_rmid = buf.get_u8();
-    buf.advance(2); // 2 bytes of padding
-    let _xl_crc = buf.get_u32_le();
+    let xlogrec = XLogRecord::from_bytes(&mut buf);

    trace!(
        "decode_wal_record xl_rmid = {} xl_info = {}",
-        xl_rmid,
-        xl_info
+        xlogrec.xl_rmid,
+        xlogrec.xl_info
    );

-    let remaining = xl_tot_len - SizeOfXLogRecord;
+    let remaining: usize = xlogrec.xl_tot_len as usize - XLOG_SIZE_OF_XLOG_RECORD;

-    if buf.remaining() != remaining as usize {
+    if buf.remaining() != remaining {
        //TODO error
    }

@@ -455,29 +620,29 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
        let block_id = buf.get_u8();

        match block_id {
-            XLR_BLOCK_ID_DATA_SHORT => {
+            pg_constants::XLR_BLOCK_ID_DATA_SHORT => {
                /* XLogRecordDataHeaderShort */
                main_data_len = buf.get_u8() as u32;
                datatotal += main_data_len;
            }

-            XLR_BLOCK_ID_DATA_LONG => {
+            pg_constants::XLR_BLOCK_ID_DATA_LONG => {
                /* XLogRecordDataHeaderLong */
                main_data_len = buf.get_u32_le();
                datatotal += main_data_len;
            }

-            XLR_BLOCK_ID_ORIGIN => {
+            pg_constants::XLR_BLOCK_ID_ORIGIN => {
                // RepOriginId is uint16
                buf.advance(2);
            }

-            XLR_BLOCK_ID_TOPLEVEL_XID => {
+            pg_constants::XLR_BLOCK_ID_TOPLEVEL_XID => {
                // TransactionId is uint32
                buf.advance(4);
            }

-            0..=XLR_MAX_BLOCK_ID => {
+            0..=pg_constants::XLR_MAX_BLOCK_ID => {
                /* XLogRecordBlockHeader */
                let mut blk = DecodedBkpBlock::new();
                let fork_flags: u8;
@@ -494,11 +659,11 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
                max_block_id = block_id;

                fork_flags = buf.get_u8();
-                blk.forknum = fork_flags & BKPBLOCK_FORK_MASK;
+                blk.forknum = fork_flags & pg_constants::BKPBLOCK_FORK_MASK;
                blk.flags = fork_flags;
-                blk.has_image = (fork_flags & BKPBLOCK_HAS_IMAGE) != 0;
-                blk.has_data = (fork_flags & BKPBLOCK_HAS_DATA) != 0;
-                blk.will_init = (fork_flags & BKPBLOCK_WILL_INIT) != 0;
+                blk.has_image = (fork_flags & pg_constants::BKPBLOCK_HAS_IMAGE) != 0;
+                blk.has_data = (fork_flags & pg_constants::BKPBLOCK_HAS_DATA) != 0;
+                blk.will_init = (fork_flags & pg_constants::BKPBLOCK_WILL_INIT) != 0;
                blk.data_len = buf.get_u16_le();

                /* TODO cross-check that the HAS_DATA flag is set iff data_length > 0 */
@@ -511,16 +676,16 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
                    blk.hole_offset = buf.get_u16_le();
                    blk.bimg_info = buf.get_u8();

-                    blk.apply_image = (blk.bimg_info & BKPIMAGE_APPLY) != 0;
+                    blk.apply_image = (blk.bimg_info & pg_constants::BKPIMAGE_APPLY) != 0;

-                    if blk.bimg_info & BKPIMAGE_IS_COMPRESSED != 0 {
-                        if blk.bimg_info & BKPIMAGE_HAS_HOLE != 0 {
+                    if blk.bimg_info & pg_constants::BKPIMAGE_IS_COMPRESSED != 0 {
+                        if blk.bimg_info & pg_constants::BKPIMAGE_HAS_HOLE != 0 {
                            blk.hole_length = buf.get_u16_le();
                        } else {
                            blk.hole_length = 0;
                        }
                    } else {
-                        blk.hole_length = BLCKSZ - blk.bimg_len;
+                        blk.hole_length = pg_constants::BLCKSZ - blk.bimg_len;
                    }
                    datatotal += blk.bimg_len as u32;
                    blocks_total_len += blk.bimg_len as u32;
@@ -529,13 +694,15 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
                     * cross-check that hole_offset > 0, hole_length > 0 and
                     * bimg_len < BLCKSZ if the HAS_HOLE flag is set.
                     */
-                    if blk.bimg_info & BKPIMAGE_HAS_HOLE != 0
-                        && (blk.hole_offset == 0 || blk.hole_length == 0 || blk.bimg_len == BLCKSZ)
+                    if blk.bimg_info & pg_constants::BKPIMAGE_HAS_HOLE != 0
+                        && (blk.hole_offset == 0
+                            || blk.hole_length == 0
+                            || blk.bimg_len == pg_constants::BLCKSZ)
                    {
                        // TODO
                        /*
                        report_invalid_record(state,
-                                      "BKPIMAGE_HAS_HOLE set, but hole offset %u length %u block image length %u at %X/%X",
+                                      "pg_constants::BKPIMAGE_HAS_HOLE set, but hole offset %u length %u block image length %u at %X/%X",
                                      (unsigned int) blk->hole_offset,
                                      (unsigned int) blk->hole_length,
                                      (unsigned int) blk->bimg_len,
@@ -548,13 +715,13 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
                     * cross-check that hole_offset == 0 and hole_length == 0 if
                     * the HAS_HOLE flag is not set.
                     */
-                    if blk.bimg_info & BKPIMAGE_HAS_HOLE == 0
+                    if blk.bimg_info & pg_constants::BKPIMAGE_HAS_HOLE == 0
                        && (blk.hole_offset != 0 || blk.hole_length != 0)
                    {
                        // TODO
                        /*
                        report_invalid_record(state,
-                                      "BKPIMAGE_HAS_HOLE not set, but hole offset %u length %u at %X/%X",
+                                      "pg_constants::BKPIMAGE_HAS_HOLE not set, but hole offset %u length %u at %X/%X",
                                      (unsigned int) blk->hole_offset,
                                      (unsigned int) blk->hole_length,
                                      (uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr);
@@ -566,11 +733,13 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
                     * cross-check that bimg_len < BLCKSZ if the IS_COMPRESSED
                     * flag is set.
                     */
-                    if (blk.bimg_info & BKPIMAGE_IS_COMPRESSED == 0) && blk.bimg_len == BLCKSZ {
+                    if (blk.bimg_info & pg_constants::BKPIMAGE_IS_COMPRESSED == 0)
+                        && blk.bimg_len == pg_constants::BLCKSZ
+                    {
                        // TODO
                        /*
                        report_invalid_record(state,
-                                      "BKPIMAGE_IS_COMPRESSED set, but block image length %u at %X/%X",
+                                      "pg_constants::BKPIMAGE_IS_COMPRESSED set, but block image length %u at %X/%X",
                                      (unsigned int) blk->bimg_len,
                                      (uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr);
                        goto err;
@@ -581,21 +750,21 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
                     * cross-check that bimg_len = BLCKSZ if neither HAS_HOLE nor
                     * IS_COMPRESSED flag is set.
                     */
-                    if blk.bimg_info & BKPIMAGE_HAS_HOLE == 0
-                        && blk.bimg_info & BKPIMAGE_IS_COMPRESSED == 0
-                        && blk.bimg_len != BLCKSZ
+                    if blk.bimg_info & pg_constants::BKPIMAGE_HAS_HOLE == 0
+                        && blk.bimg_info & pg_constants::BKPIMAGE_IS_COMPRESSED == 0
+                        && blk.bimg_len != pg_constants::BLCKSZ
                    {
                        // TODO
                        /*
                        report_invalid_record(state,
-                                      "neither BKPIMAGE_HAS_HOLE nor BKPIMAGE_IS_COMPRESSED set, but block image length is %u at %X/%X",
+                                      "neither pg_constants::BKPIMAGE_HAS_HOLE nor pg_constants::BKPIMAGE_IS_COMPRESSED set, but block image length is %u at %X/%X",
                                      (unsigned int) blk->data_len,
                                      (uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr);
                        goto err;
                                     */
                    }
                }
-                if fork_flags & BKPBLOCK_SAME_REL == 0 {
+                if fork_flags & pg_constants::BKPBLOCK_SAME_REL == 0 {
                    rnode_spcnode = buf.get_u32_le();
                    rnode_dbnode = buf.get_u32_le();
                    rnode_relnode = buf.get_u32_le();
@@ -604,7 +773,7 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
                    // TODO
                    /*
                    report_invalid_record(state,
-                                    "BKPBLOCK_SAME_REL set but no previous rel at %X/%X",
+                                    "pg_constants::BKPBLOCK_SAME_REL set but no previous rel at %X/%X",
                                    (uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr);
                    goto err;           */
                }
@@ -635,88 +804,94 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
    // We don't need them, so just skip blocks_total_len bytes
    buf.advance(blocks_total_len as usize);

-    let main_data_offset = (xl_tot_len - main_data_len) as usize;
+    let main_data_offset = (xlogrec.xl_tot_len - main_data_len) as usize;

    // 4. Decode main_data
    if main_data_len > 0 {
        assert_eq!(buf.remaining(), main_data_len as usize);
    }

-    //5. Handle special CLOG and XACT records
-    if xl_rmid == pg_constants::RM_CLOG_ID {
-        let mut blk = DecodedBkpBlock::new();
-        blk.forknum = pg_constants::PG_XACT_FORKNUM as u8;
-        blk.blkno = buf.get_i32_le() as u32;
-        trace!("RM_CLOG_ID updates block {}", blk.blkno);
-        blocks.push(blk);
-    } else if xl_rmid == pg_constants::RM_XACT_ID {
-        let info = xl_info & pg_constants::XLOG_XACT_OPMASK;
-        if info == pg_constants::XLOG_XACT_COMMIT {
-            let mut blk = DecodedBkpBlock::new();
-            blk.forknum = pg_constants::PG_XACT_FORKNUM as u8;
-            blk.blkno = xl_xid / pg_constants::CLOG_XACTS_PER_PAGE;
-            trace!(
-                "XLOG_XACT_COMMIT xl_prev {:X}/{:X}  xid {} updates block {}",
-                (xl_prev >> 32),
-                xl_prev & 0xffffffff,
-                xl_xid,
-                blk.blkno
-            );
-            blocks.push(blk);
-            //TODO parse commit record to extract subtrans entries
-        } else if info == pg_constants::XLOG_XACT_ABORT {
-            let mut blk = DecodedBkpBlock::new();
-            blk.forknum = pg_constants::PG_XACT_FORKNUM as u8;
-            blk.blkno = xl_xid / pg_constants::CLOG_XACTS_PER_PAGE;
-            trace!(
-                "XLOG_XACT_ABORT xl_prev {:X}/{:X} xid {} updates block {}",
-                (xl_prev >> 32),
-                xl_prev & 0xffffffff,
-                xl_xid,
-                blk.blkno
-            );
-            blocks.push(blk);
-            //TODO parse abort record to extract subtrans entries
-        }
-    }
-    else if xl_rmid == pg_constants::RM_DBASE_ID
-    {
-        let info = xl_info & !pg_constants::XLR_INFO_MASK;
-        if info == pg_constants::XLOG_DBASE_CREATE
+    // 5. Handle a few special record types that modify blocks without registering
+    // them with the standard mechanism.
+    if xlogrec.xl_rmid == pg_constants::RM_HEAP_ID {
+        let info = xlogrec.xl_info & pg_constants::XLOG_HEAP_OPMASK;
+        let blkno = blocks[0].blkno / pg_constants::HEAPBLOCKS_PER_PAGE as u32;
+        if info == pg_constants::XLOG_HEAP_INSERT {
+            let xlrec = XlHeapInsert::decode(&mut buf);
+            if (xlrec.flags
+                & (pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED
+                    | pg_constants::XLH_INSERT_ALL_FROZEN_SET))
+                != 0
+            {
+                let mut blk = DecodedBkpBlock::new();
+                blk.forknum = pg_constants::VISIBILITYMAP_FORKNUM;
+                blk.blkno = blkno;
+                blk.rnode_spcnode = blocks[0].rnode_spcnode;
+                blk.rnode_dbnode = blocks[0].rnode_dbnode;
+                blk.rnode_relnode = blocks[0].rnode_relnode;
+                blocks.push(blk);
+            }
+        } else if info == pg_constants::XLOG_HEAP_DELETE {
+            let xlrec = XlHeapDelete::decode(&mut buf);
+            if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 {
+                let mut blk = DecodedBkpBlock::new();
+                blk.forknum = pg_constants::VISIBILITYMAP_FORKNUM;
+                blk.blkno = blkno;
+                blk.rnode_spcnode = blocks[0].rnode_spcnode;
+                blk.rnode_dbnode = blocks[0].rnode_dbnode;
+                blk.rnode_relnode = blocks[0].rnode_relnode;
+                blocks.push(blk);
+            }
+        } else if info == pg_constants::XLOG_HEAP_UPDATE
+            || info == pg_constants::XLOG_HEAP_HOT_UPDATE
        {
-            //buf points to main_data
-            let db_id =  buf.get_u32_le();
-            let tablespace_id =  buf.get_u32_le();
-            let src_db_id =  buf.get_u32_le();
-            let src_tablespace_id =  buf.get_u32_le();
-            trace!("XLOG_DBASE_CREATE db_id {} src_db_id {}", db_id, src_db_id);
-            // in postgres it is implemented as copydir
-            // we need to copy all pages in page_cache
+            let xlrec = XlHeapUpdate::decode(&mut buf);
+            if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 {
+                let mut blk = DecodedBkpBlock::new();
+                blk.forknum = pg_constants::VISIBILITYMAP_FORKNUM;
+                blk.blkno = blkno;
+                blk.rnode_spcnode = blocks[0].rnode_spcnode;
+                blk.rnode_dbnode = blocks[0].rnode_dbnode;
+                blk.rnode_relnode = blocks[0].rnode_relnode;
+                blocks.push(blk);
+            }
+            if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0
+                && blocks.len() > 1
+            {
+                let mut blk = DecodedBkpBlock::new();
+                blk.forknum = pg_constants::VISIBILITYMAP_FORKNUM;
+                blk.blkno = blocks[1].blkno / pg_constants::HEAPBLOCKS_PER_PAGE as u32;
+                blk.rnode_spcnode = blocks[1].rnode_spcnode;
+                blk.rnode_dbnode = blocks[1].rnode_dbnode;
+                blk.rnode_relnode = blocks[1].rnode_relnode;
+                blocks.push(blk);
+            }
        }
-        else
-        {
-            trace!("XLOG_DBASE_DROP is not handled yet");
-        }
-    }
-    else if xl_rmid == pg_constants::RM_TBLSPC_ID
-    {
-        let info = xl_info & !pg_constants::XLR_INFO_MASK;
-        if info == pg_constants::XLOG_TBLSPC_CREATE
-        {
-            //buf points to main_data
-            let ts_id =  buf.get_u32_le();
-            let ts_path = str::from_utf8(&buf).unwrap();
-            trace!("XLOG_TBLSPC_CREATE ts_id {} ts_path {}", ts_id, ts_path);
-        }
-        else
-        {
-            trace!("XLOG_TBLSPC_DROP is not handled yet");
+    } else if xlogrec.xl_rmid == pg_constants::RM_HEAP2_ID {
+        let info = xlogrec.xl_info & pg_constants::XLOG_HEAP_OPMASK;
+        if info == pg_constants::XLOG_HEAP2_MULTI_INSERT {
+            let xlrec = XlHeapMultiInsert::decode(&mut buf);
+            if (xlrec.flags
+                & (pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED
+                    | pg_constants::XLH_INSERT_ALL_FROZEN_SET))
+                != 0
+            {
+                let mut blk = DecodedBkpBlock::new();
+                let blkno = blocks[0].blkno / pg_constants::HEAPBLOCKS_PER_PAGE as u32;
+                blk.forknum = pg_constants::VISIBILITYMAP_FORKNUM;
+                blk.blkno = blkno;
+                blk.rnode_spcnode = blocks[0].rnode_spcnode;
+                blk.rnode_dbnode = blocks[0].rnode_dbnode;
+                blk.rnode_relnode = blocks[0].rnode_relnode;
+                blocks.push(blk);
+            }
        }
    }

    DecodedWALRecord {
-        xl_info,
-        xl_rmid,
+        xl_xid: xlogrec.xl_xid,
+        xl_info: xlogrec.xl_info,
+        xl_rmid: xlogrec.xl_rmid,
        record,
        blocks,
        main_data_offset,
--- a/pageserver/src/walreceiver.rs
+++ b/pageserver/src/walreceiver.rs
@@ -1,23 +1,26 @@
 //!
-//! WAL receiver
-//!
-//! The WAL receiver connects to the WAL safekeeper service, and streams WAL.
-//! For each WAL record, it decodes the record to figure out which data blocks
-//! the record affects, and adds the records to the page cache.
+//! WAL receiver connects to the WAL safekeeper service,
+//! streams WAL, decodes records and saves them in page cache.
 //!
+//! We keep one WAL receiver active per timeline.

+use crate::object_key::*;
 use crate::page_cache;
-use crate::page_cache::{BufferTag, RelTag};
-use crate::pg_constants;
+use crate::restore_local_repo;
 use crate::waldecoder::*;
 use crate::PageServerConf;
 use crate::ZTimelineId;
-use anyhow::Error;
+use anyhow::{Error, Result};
 use lazy_static::lazy_static;
 use log::*;
+use postgres::fallible_iterator::FallibleIterator;
+use postgres::replication::ReplicationIter;
+use postgres::{Client, NoTls, SimpleQueryMessage, SimpleQueryRow};
 use postgres_ffi::xlog_utils::*;
+use postgres_ffi::*;
 use postgres_protocol::message::backend::ReplicationMessage;
 use postgres_types::PgLsn;
+use std::cmp::{max, min};
 use std::collections::HashMap;
 use std::fs;
 use std::fs::{File, OpenOptions};
@@ -26,11 +29,9 @@ use std::path::PathBuf;
 use std::str::FromStr;
 use std::sync::Mutex;
 use std::thread;
-use tokio::runtime;
-use tokio::time::{sleep, Duration};
-use tokio_postgres::replication::{PgTimestamp, ReplicationStream};
-use tokio_postgres::{NoTls, SimpleQueryMessage, SimpleQueryRow};
-use tokio_stream::StreamExt;
+use std::thread::sleep;
+use std::time::{Duration, SystemTime};
+use zenith_utils::lsn::Lsn;

 //
 // We keep one WAL Receiver active per timeline.
@@ -46,7 +47,7 @@ lazy_static! {

 // Launch a new WAL receiver, or tell one that's running about change in connection string
 pub fn launch_wal_receiver(
-    conf: &PageServerConf,
+    conf: &'static PageServerConf,
    timelineid: ZTimelineId,
    wal_producer_connstr: &str,
 ) {
@@ -63,11 +64,10 @@ pub fn launch_wal_receiver(
            receivers.insert(timelineid, receiver);

            // Also launch a new thread to handle this connection
-            let conf_copy = conf.clone();
            let _walreceiver_thread = thread::Builder::new()
                .name("WAL receiver thread".into())
                .spawn(move || {
-                    thread_main(&conf_copy, timelineid);
+                    thread_main(conf, timelineid);
                })
                .unwrap();
        }
@@ -88,192 +88,130 @@ fn get_wal_producer_connstr(timelineid: ZTimelineId) -> String {
 //
 // This is the entry point for the WAL receiver thread.
 //
-fn thread_main(conf: &PageServerConf, timelineid: ZTimelineId) {
+fn thread_main(conf: &'static PageServerConf, timelineid: ZTimelineId) {
    info!(
        "WAL receiver thread started for timeline : '{}'",
        timelineid
    );

-    let runtime = runtime::Builder::new_current_thread()
-        .enable_all()
-        .build()
-        .unwrap();
+    //
+    // Make a connection to the WAL safekeeper, or directly to the primary PostgreSQL server,
+    // and start streaming WAL from it. If the connection is lost, keep retrying.
+    //
+    loop {
+        // Look up the current WAL producer address
+        let wal_producer_connstr = get_wal_producer_connstr(timelineid);

-    runtime.block_on(async {
-        loop {
-            // Look up the current WAL producer address
-            let wal_producer_connstr = get_wal_producer_connstr(timelineid);
+        let res = walreceiver_main(conf, timelineid, &wal_producer_connstr);

-            let res = walreceiver_main(conf, timelineid, &wal_producer_connstr).await;
-
-            if let Err(e) = res {
-                info!(
-                    "WAL streaming connection failed ({}), retrying in 1 second",
-                    e
-                );
-                sleep(Duration::from_secs(1)).await;
-            }
+        if let Err(e) = res {
+            info!(
+                "WAL streaming connection failed ({}), retrying in 1 second",
+                e
+            );
+            sleep(Duration::from_secs(1));
        }
-    });
+    }
 }

-async fn walreceiver_main(
-    conf: &PageServerConf,
+fn walreceiver_main(
+    _conf: &PageServerConf,
    timelineid: ZTimelineId,
    wal_producer_connstr: &str,
 ) -> Result<(), Error> {
    // Connect to the database in replication mode.
    info!("connecting to {:?}", wal_producer_connstr);
-    let connect_cfg = format!("{} replication=true", wal_producer_connstr);
-    let (rclient, connection) = tokio_postgres::connect(&connect_cfg, NoTls).await?;
+    let connect_cfg = format!(
+        "{} application_name=pageserver replication=true",
+        wal_producer_connstr
+    );
+
+    let mut rclient = Client::connect(&connect_cfg, NoTls)?;
    info!("connected!");

-    // The connection object performs the actual communication with the database,
-    // so spawn it off to run on its own.
-    tokio::spawn(async move {
-        if let Err(e) = connection.await {
-            error!("connection error: {}", e);
-        }
-    });
-
-    let identify = identify_system(&rclient).await?;
+    let identify = identify_system(&mut rclient)?;
    info!("{:?}", identify);
-    let end_of_wal = u64::from(identify.xlogpos);
+    let end_of_wal = Lsn::from(u64::from(identify.xlogpos));
    let mut caught_up = false;

-    let pcache = page_cache::get_pagecache(&conf, timelineid).unwrap();
+    let repository = page_cache::get_repository();
+    let timeline = repository.get_timeline(timelineid).unwrap();

    //
    // Start streaming the WAL, from where we left off previously.
    //
-    let mut startpoint = pcache.get_last_valid_lsn();
-    let last_valid_lsn = pcache.get_last_valid_lsn();
-    if startpoint == 0 {
-        // If we start here with identify.xlogpos we will have race condition with
-        // postgres start: insert into postgres may request page that was modified with lsn
-        // smaller than identify.xlogpos.
-        //
-        // Current procedure for starting postgres will anyway be changed to something
-        // different like having 'initdb' method on a pageserver (or importing some shared
-        // empty database snapshot), so for now I just put start of first segment which
-        // seems to be a valid record.
-        pcache.init_valid_lsn(0x_1_000_000_u64);
-        startpoint = 0x_1_000_000_u64;
-    } else {
-        // There might be some padding after the last full record, skip it.
-        //
-        // FIXME: It probably would be better to always start streaming from the beginning
-        // of the page, or the segment, so that we could check the page/segment headers
-        // too. Just for the sake of paranoia.
-        if startpoint % 8 != 0 {
-            startpoint += 8 - (startpoint % 8);
-        }
+    // If we had previously received WAL up to some point in the middle of a WAL record, we
+    // better start from the end of last full WAL record, not in the middle of one. Hence,
+    // use 'last_record_lsn' rather than 'last_valid_lsn' here.
+    let mut last_rec_lsn = timeline.get_last_record_lsn();
+    let mut startpoint = last_rec_lsn;
+
+    if startpoint == Lsn(0) {
+        error!("No previous WAL position");
    }
+
+    // There might be some padding after the last full record, skip it.
+    //
+    // FIXME: It probably would be better to always start streaming from the beginning
+    // of the page, or the segment, so that we could check the page/segment headers
+    // too. Just for the sake of paranoia.
+    startpoint += startpoint.calc_padding(8u32);
+
    debug!(
-        "last_valid_lsn {:X}/{:X} starting replication from {:X}/{:X}  for timeline {}, server is at {:X}/{:X}...",
-        (last_valid_lsn >> 32),
-        (last_valid_lsn & 0xffffffff),
-        (startpoint >> 32),
-        (startpoint & 0xffffffff),
-        timelineid,
-        (end_of_wal >> 32),
-        (end_of_wal & 0xffffffff)
+        "last_record_lsn {} starting replication from {} for timeline {}, server is at {}...",
+        last_rec_lsn, startpoint, timelineid, end_of_wal
    );

-    let startpoint = PgLsn::from(startpoint);
    let query = format!("START_REPLICATION PHYSICAL {}", startpoint);
-    let copy_stream = rclient.copy_both_simple::<bytes::Bytes>(&query).await?;

-    let physical_stream = ReplicationStream::new(copy_stream);
-    tokio::pin!(physical_stream);
+    let copy_stream = rclient.copy_both_simple(&query)?;
+    let mut physical_stream = ReplicationIter::new(copy_stream);

-    let mut waldecoder = WalStreamDecoder::new(u64::from(startpoint));
+    let mut waldecoder = WalStreamDecoder::new(startpoint);

-    while let Some(replication_message) = physical_stream.next().await {
-        match replication_message? {
+    let checkpoint_bytes =
+        timeline.get_page_at_lsn_nowait(ObjectTag::Checkpoint, startpoint, false)?;
+    let mut checkpoint = CheckPoint::decode(&checkpoint_bytes)?;
+    trace!("CheckPoint.nextXid = {}", checkpoint.nextXid.value);
+
+    while let Some(replication_message) = physical_stream.next()? {
+        let status_update = match replication_message {
            ReplicationMessage::XLogData(xlog_data) => {
                // Pass the WAL data to the decoder, and see if we can decode
                // more records as a result.
                let data = xlog_data.data();
-                let startlsn = xlog_data.wal_start();
+                let startlsn = Lsn::from(xlog_data.wal_start());
                let endlsn = startlsn + data.len() as u64;
+                let prev_last_rec_lsn = last_rec_lsn;

-                write_wal_file(
-                    startlsn,
-                    timelineid,
-                    16 * 1024 * 1024, // FIXME
-                    data,
-                )?;
+                write_wal_file(startlsn, timelineid, pg_constants::WAL_SEGMENT_SIZE, data)?;

-                trace!(
-                    "received XLogData between {:X}/{:X} and {:X}/{:X}",
-                    (startlsn >> 32),
-                    (startlsn & 0xffffffff),
-                    (endlsn >> 32),
-                    (endlsn & 0xffffffff)
-                );
+                trace!("received XLogData between {} and {}", startlsn, endlsn);

                waldecoder.feed_bytes(data);

-                loop {
-                    if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
-                        let decoded = decode_wal_record(recdata.clone());
-                        // Put the WAL record to the page cache. We make a separate copy of
-                        // it for every block it modifies. (The actual WAL record is kept in
-                        // a Bytes, which uses a reference counter for the underlying buffer,
-                        // so having multiple copies of it doesn't cost that much)
-                        for blk in decoded.blocks.iter() {
-                            let tag = BufferTag {
-                                rel: RelTag {
-                                    spcnode: blk.rnode_spcnode,
-                                    dbnode: blk.rnode_dbnode,
-                                    relnode: blk.rnode_relnode,
-                                    forknum: blk.forknum as u8,
-                                },
-                                blknum: blk.blkno,
-                            };
+                while let Some((lsn, recdata)) = waldecoder.poll_decode()? {
+                    // Save old checkpoint value to compare with it after decoding WAL record
+                    let old_checkpoint_bytes = checkpoint.encode();
+                    let decoded = decode_wal_record(recdata.clone());
+                    restore_local_repo::save_decoded_record(
+                        &mut checkpoint,
+                        &*timeline,
+                        &decoded,
+                        recdata,
+                        lsn,
+                    )?;
+                    last_rec_lsn = lsn;

-                            let rec = page_cache::WALRecord {
-                                lsn,
-                                will_init: blk.will_init || blk.apply_image,
-                                truncate: false,
-                                rec: recdata.clone(),
-                                main_data_offset: decoded.main_data_offset as u32,
-                            };
-
-                            pcache.put_wal_record(tag, rec);
-                        }
-                        // include truncate wal record in all pages
-                        if decoded.xl_rmid == pg_constants::RM_SMGR_ID
-                            && (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
-                                == pg_constants::XLOG_SMGR_TRUNCATE
-                        {
-                            let truncate = decode_truncate_record(&decoded);
-                            if (truncate.flags & SMGR_TRUNCATE_HEAP) != 0 {
-                                let tag = BufferTag {
-                                    rel: RelTag {
-                                        spcnode: truncate.rnode.spcnode,
-                                        dbnode: truncate.rnode.dbnode,
-                                        relnode: truncate.rnode.relnode,
-                                        forknum: MAIN_FORKNUM,
-                                    },
-                                    blknum: truncate.blkno,
-                                };
-                                let rec = page_cache::WALRecord {
-                                    lsn: lsn,
-                                    will_init: false,
-                                    truncate: true,
-                                    rec: recdata.clone(),
-                                    main_data_offset: decoded.main_data_offset as u32,
-                                };
-                                pcache.put_rel_wal_record(tag, rec).await?;
-                            }
-                        }
-                        // Now that this record has been handled, let the page cache know that
-                        // it is up-to-date to this LSN
-                        pcache.advance_last_record_lsn(lsn);
-                    } else {
-                        break;
+                    let new_checkpoint_bytes = checkpoint.encode();
+                    // Check if checkpoint data was updated by save_decoded_record
+                    if new_checkpoint_bytes != old_checkpoint_bytes {
+                        timeline.put_page_image(
+                            ObjectTag::Checkpoint,
+                            lsn,
+                            new_checkpoint_bytes,
+                            false,
+                        )?;
                    }
                }

@@ -283,48 +221,118 @@ async fn walreceiver_main(
                // better reflect that, because GetPage@LSN requests might also point in the
                // middle of a record, if the request LSN was taken from the server's current
                // flush ptr.
-                pcache.advance_last_valid_lsn(endlsn);
+                timeline.advance_last_valid_lsn(endlsn);
+
+                // Somewhat arbitrarily, if we have at least 10 complete wal segments (16 MB each),
+                // "checkpoint" the repository to flush all the changes from WAL we've processed
+                // so far to disk. After this, we don't need the original WAL anymore, and it
+                // can be removed. This is probably too aggressive for production, but it's useful
+                // to expose bugs now.
+                //
+                // TODO: We don't actually dare to remove the WAL. It's useful for debugging,
+                // and we might need it for logical decoding or other things in the future. Although
+                // we should also be able to fetch it back from the WAL safekeepers or S3 if
+                // needed.
+                if prev_last_rec_lsn.segment_number(pg_constants::WAL_SEGMENT_SIZE)
+                    != last_rec_lsn.segment_number(pg_constants::WAL_SEGMENT_SIZE)
+                {
+                    info!("switched segment {} to {}", prev_last_rec_lsn, last_rec_lsn);
+                    let (oldest_segno, newest_segno) = find_wal_file_range(
+                        timelineid,
+                        pg_constants::WAL_SEGMENT_SIZE,
+                        last_rec_lsn,
+                    )?;
+
+                    if newest_segno - oldest_segno >= 10 {
+                        timeline.checkpoint()?;
+
+                        // TODO: This is where we could remove WAL older than last_rec_lsn.
+                        //remove_wal_files(timelineid, pg_constants::WAL_SEGMENT_SIZE, last_rec_lsn)?;
+                    }
+                }

                if !caught_up && endlsn >= end_of_wal {
-                    info!(
-                        "caught up at LSN {:X}/{:X}",
-                        (endlsn >> 32),
-                        (endlsn & 0xffffffff)
-                    );
+                    info!("caught up at LSN {}", endlsn);
                    caught_up = true;
                }
+
+                Some(endlsn)
            }

            ReplicationMessage::PrimaryKeepAlive(keepalive) => {
                let wal_end = keepalive.wal_end();
                let timestamp = keepalive.timestamp();
-                let reply_requested: bool = keepalive.reply() != 0;
+                let reply_requested = keepalive.reply() != 0;

                trace!(
-                    "received PrimaryKeepAlive(wal_end: {}, timestamp: {} reply: {})",
+                    "received PrimaryKeepAlive(wal_end: {}, timestamp: {:?} reply: {})",
                    wal_end,
                    timestamp,
                    reply_requested,
                );
-                if reply_requested {
-                    // TODO: More thought should go into what values are sent here.
-                    let last_lsn = PgLsn::from(pcache.get_last_valid_lsn());
-                    let write_lsn = last_lsn;
-                    let flush_lsn = last_lsn;
-                    let apply_lsn = PgLsn::INVALID;
-                    let ts = PgTimestamp::now()?;
-                    const NO_REPLY: u8 = 0u8;

-                    physical_stream
-                        .as_mut()
-                        .standby_status_update(write_lsn, flush_lsn, apply_lsn, ts, NO_REPLY)
-                        .await?;
+                if reply_requested {
+                    Some(timeline.get_last_valid_lsn())
+                } else {
+                    None
                }
            }
-            _ => (),
+
+            _ => None,
+        };
+
+        if let Some(last_lsn) = status_update {
+            // TODO: More thought should go into what values are sent here.
+            let last_lsn = PgLsn::from(u64::from(last_lsn));
+            let write_lsn = last_lsn;
+            let flush_lsn = last_lsn;
+            let apply_lsn = PgLsn::from(0);
+            let ts = SystemTime::now();
+            const NO_REPLY: u8 = 0;
+
+            physical_stream.standby_status_update(write_lsn, flush_lsn, apply_lsn, ts, NO_REPLY)?;
        }
    }
-    return Ok(());
+    Ok(())
+}
+
+fn find_wal_file_range(
+    timeline: ZTimelineId,
+    wal_seg_size: usize,
+    written_upto: Lsn,
+) -> Result<(u64, u64)> {
+    let written_upto_segno = written_upto.segment_number(wal_seg_size);
+
+    let mut oldest_segno = written_upto_segno;
+    let mut newest_segno = written_upto_segno;
+    // Scan the wal directory, and count how many WAL files we could remove
+    let wal_dir = PathBuf::from(format!("timelines/{}/wal", timeline));
+    for entry in fs::read_dir(wal_dir)? {
+        let entry = entry?;
+        let path = entry.path();
+
+        if path.is_dir() {
+            continue;
+        }
+
+        let filename = path.file_name().unwrap().to_str().unwrap();
+
+        if IsXLogFileName(filename) {
+            let (segno, _tli) = XLogFromFileName(filename, wal_seg_size);
+
+            if segno > written_upto_segno {
+                // that's strange.
+                warn!("there is a WAL file from future at {}", path.display());
+                continue;
+            }
+
+            oldest_segno = min(oldest_segno, segno);
+            newest_segno = max(newest_segno, segno);
+        }
+    }
+    // FIXME: would be good to assert that there are no gaps in the WAL files
+
+    Ok((oldest_segno, newest_segno))
 }

 /// Data returned from the postgres `IDENTIFY_SYSTEM` command
@@ -347,9 +355,9 @@ pub struct IdentifySystem {
 pub struct IdentifyError;

 /// Run the postgres `IDENTIFY_SYSTEM` command
-pub async fn identify_system(client: &tokio_postgres::Client) -> Result<IdentifySystem, Error> {
+pub fn identify_system(client: &mut Client) -> Result<IdentifySystem, Error> {
    let query_str = "IDENTIFY_SYSTEM";
-    let response = client.simple_query(query_str).await?;
+    let response = client.simple_query(query_str)?;

    // get(N) from row, then parse it as some destination type.
    fn get_parse<T>(row: &SimpleQueryRow, idx: usize) -> Result<T, IdentifyError>
@@ -370,12 +378,12 @@ pub async fn identify_system(client: &tokio_postgres::Client) -> Result<Identify
            dbname: get_parse(first_row, 3).ok(),
        })
    } else {
-        Err(IdentifyError)?
+        Err(IdentifyError.into())
    }
 }

 fn write_wal_file(
-    startpos: XLogRecPtr,
+    startpos: Lsn,
    timeline: ZTimelineId,
    wal_seg_size: usize,
    buf: &[u8],
@@ -389,7 +397,7 @@ fn write_wal_file(
    let wal_dir = PathBuf::from(format!("timelines/{}/wal", timeline));

    /* Extract WAL location for this block */
-    let mut xlogoff = XLogSegmentOffset(start_pos, wal_seg_size) as usize;
+    let mut xlogoff = start_pos.segment_offset(wal_seg_size);

    while bytes_left != 0 {
        let bytes_to_write;
@@ -405,7 +413,7 @@ fn write_wal_file(
        }

        /* Open file */
-        let segno = XLByteToSeg(start_pos, wal_seg_size);
+        let segno = start_pos.segment_number(wal_seg_size);
        let wal_file_name = XLogFileName(
            1, // FIXME: always use Postgres timeline 1
            segno,
@@ -457,7 +465,7 @@ fn write_wal_file(
        xlogoff += bytes_to_write;

        /* Did we reach the end of a WAL segment? */
-        if XLogSegmentOffset(start_pos, wal_seg_size) == 0 {
+        if start_pos.segment_offset(wal_seg_size) == 0 {
            xlogoff = 0;
            if partial {
                fs::rename(&wal_file_partial_path, &wal_file_path)?;
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -1,21 +1,23 @@
-//
-// WAL redo
-//
-// We rely on Postgres to perform WAL redo for us. We launch a
-// postgres process in special "wal redo" mode that's similar to
-// single-user mode. We then pass the the previous page image, if any,
-// and all the WAL records we want to apply, to the postgress
-// process. Then we get the page image back. Communication with the
-// postgres process happens via stdin/stdout
-//
-// See src/backend/tcop/zenith_wal_redo.c for the other side of
-// this communication.
-//
-// TODO: Even though the postgres code runs in a separate process,
-// it's not a secure sandbox.
-//
+//!
+//! WAL redo. This service runs PostgreSQL in a special wal_redo mode
+//! to apply given WAL records over an old page image and return new page image.
+//!
+//! We rely on Postgres to perform WAL redo for us. We launch a
+//! postgres process in special "wal redo" mode that's similar to
+//! single-user mode. We then pass the previous page image, if any,
+//! and all the WAL records we want to apply, to the postgres
+//! process. Then we get the page image back. Communication with the
+//! postgres process happens via stdin/stdout
+//!
+//! See src/backend/tcop/zenith_wal_redo.c for the other side of
+//! this communication.
+//!
+//! TODO: Even though the postgres code runs in a separate process,
+//! it's not a secure sandbox.
+//!
+use byteorder::{ByteOrder, LittleEndian};
+use bytes::{Buf, BufMut, Bytes, BytesMut};
 use log::*;
-use std::assert;
 use std::cell::RefCell;
 use std::fs;
 use std::fs::OpenOptions;
@@ -23,241 +25,487 @@ use std::io::prelude::*;
 use std::io::Error;
 use std::path::PathBuf;
 use std::process::Stdio;
+use std::sync::mpsc;
 use std::sync::Arc;
+use std::sync::Mutex;
 use std::time::Duration;
 use std::time::Instant;
 use tokio::io::AsyncBufReadExt;
 use tokio::io::{AsyncReadExt, AsyncWriteExt};
-use tokio::process::{Child, ChildStdin, ChildStdout, Command};
-use tokio::runtime::Runtime;
+use tokio::process::{ChildStdin, ChildStdout, Command};
 use tokio::time::timeout;
+use zenith_utils::bin_ser::BeSer;
+use zenith_utils::lsn::Lsn;

-use bytes::{Buf, BufMut, Bytes, BytesMut};
+use crate::object_key::*;
+use crate::repository::BufferTag;
+use crate::repository::WALRecord;
+use crate::waldecoder::XlXactParsedRecord;
+use crate::waldecoder::{MultiXactId, XlMultiXactCreate};
+use crate::PageServerConf;
+use postgres_ffi::nonrelfile_utils::transaction_id_set_status;
+use postgres_ffi::pg_constants;
+use postgres_ffi::XLogRecord;

-use crate::page_cache;
-use crate::page_cache::CacheEntry;
-use crate::page_cache::WALRecord;
-use crate::ZTimelineId;
-use crate::{page_cache::BufferTag, pg_constants, PageServerConf};
+///
+/// WAL Redo Manager is responsible for replaying WAL records.
+///
+/// Callers use the WAL redo manager through this abstract interface,
+/// which makes it easy to mock it in tests.
+pub trait WalRedoManager: Send + Sync {
+    /// Apply some WAL records.
+    ///
+    /// The caller passes an old page image, and WAL records that should be
+    /// applied over it. The return value is a new page image, after applying
+    /// the records.
+    ///
+    /// * `tag` - identifies the object whose image is being reconstructed
+    /// * `lsn` - LSN of the page version being reconstructed
+    /// * `base_img` - the old image to start from, or `None` if there is none
+    /// * `records` - the WAL records to apply over the base image
+    ///
+    /// Returns a `WalRedoError` if the records could not be applied.
+    fn request_redo(
+        &self,
+        tag: ObjectTag,
+        lsn: Lsn,
+        base_img: Option<Bytes>,
+        records: Vec<WALRecord>,
+    ) -> Result<Bytes, WalRedoError>;
+}
+
+///
+/// Placeholder WAL Redo Manager that refuses to replay anything.
+///
+/// Currently used during bootstrapping (zenith init), where a Repository
+/// object is needed but the real WAL redo process should not be launched.
+///
+pub struct DummyRedoManager {}
+
+impl WalRedoManager for DummyRedoManager {
+    /// Ignores all of its arguments and always fails with
+    /// `WalRedoError::InvalidState`.
+    fn request_redo(
+        &self,
+        _tag: ObjectTag,
+        _lsn: Lsn,
+        _base_img: Option<Bytes>,
+        _records: Vec<WALRecord>,
+    ) -> Result<Bytes, WalRedoError> {
+        let refusal = WalRedoError::InvalidState;
+        Err(refusal)
+    }
+}

 static TIMEOUT: Duration = Duration::from_secs(20);

-//
-// Main entry point for the WAL applicator thread.
-//
-pub fn wal_redo_main(conf: &PageServerConf, timelineid: ZTimelineId) {
-    info!("WAL redo thread started {}", timelineid);
+///
+/// The implementation consists of two parts: PostgresRedoManager, and
+/// PostgresRedoManagerInternal. PostgresRedoManager is the public struct
+/// that can be used to send redo requests to the manager.
+/// PostgresRedoManagerInternal is used by the manager thread itself.
+///
+pub struct PostgresRedoManager {
+    // Sending half of the request channel to the WAL redo thread. It is kept
+    // behind a Mutex; `request_redo` locks it for each send.
+    request_tx: Mutex<mpsc::Sender<WalRedoRequest>>,
+}

-    // We block on waiting for requests on the walredo request channel, but
-    // use async I/O to communicate with the child process. Initialize the
-    // runtime for the async part.
-    let runtime = tokio::runtime::Builder::new_current_thread()
-        .enable_all()
-        .build()
-        .unwrap();
+// State owned by the WAL redo thread; constructed in PostgresRedoManager::new
+// and moved into the thread that runs `wal_redo_main`.
+struct PostgresRedoManagerInternal {
+    // Page server configuration; read for `wal_redoers` and passed on to
+    // PostgresRedoProcess::launch.
+    conf: &'static PageServerConf,

-    let pcache = page_cache::get_pagecache(conf, timelineid).unwrap();
+    // Receiving half of the request channel; the matching sender lives in
+    // PostgresRedoManager.
+    request_rx: mpsc::Receiver<WalRedoRequest>,
+}

-    // Loop forever, handling requests as they come.
-    let walredo_channel_receiver = &pcache.walredo_receiver;
-    loop {
-        let mut process: WalRedoProcess;
-        let datadir = format!("wal-redo/{}", timelineid);
+// The payload of one redo request: the arguments that were passed to
+// `request_redo`, bundled so they can be sent over the channel.
+#[derive(Debug)]
+struct WalRedoRequestData {
+    // Object whose image is to be reconstructed
+    tag: ObjectTag,
+    // LSN of the page version being reconstructed
+    lsn: Lsn,
+    // Old image to apply the records over, if any
+    base_img: Option<Bytes>,
+    // WAL records to apply
+    records: Vec<WALRecord>,
+}

-        info!("launching WAL redo postgres process {}", timelineid);
-        {
-            let _guard = runtime.enter();
-            process = WalRedoProcess::launch(&datadir, &runtime).unwrap();
+// A redo request as it travels to the WAL redo thread: the payload plus the
+// channel on which the thread sends the result back to the requester.
+#[derive(Debug)]
+struct WalRedoRequest {
+    data: WalRedoRequestData,
+    // The redo thread sends exactly one result per request on this channel.
+    response_channel: mpsc::Sender<Result<Bytes, WalRedoError>>,
+}
+
+/// An error happened in WAL redo
+#[derive(Debug, thiserror::Error)]
+pub enum WalRedoError {
+    /// An I/O error; displayed transparently as the underlying error and
+    /// convertible from `std::io::Error` with `?`.
+    #[error(transparent)]
+    IoError(#[from] std::io::Error),
+
+    /// The manager cannot perform redo at all; this is what
+    /// DummyRedoManager returns for every request.
+    #[error("cannot perform WAL redo now")]
+    InvalidState,
+}
+
+///
+/// Public interface of WAL redo manager
+///
+impl PostgresRedoManager {
+    ///
+    /// Create a new PostgresRedoManager.
+    ///
+    /// A dedicated thread is spawned to serve the requests. Panics if the
+    /// thread cannot be spawned.
+    pub fn new(conf: &'static PageServerConf) -> PostgresRedoManager {
+        // The sending half stays with the manager, the receiving half is
+        // handed over to the WAL redo thread.
+        let (request_tx, request_rx) = mpsc::channel();
+
+        let thread_main = move || {
+            let mut internal = PostgresRedoManagerInternal { conf, request_rx };
+            internal.wal_redo_main();
+        };
+
+        // The join handle is deliberately dropped: nothing waits for the
+        // thread, and it is not restarted if it dies.
+        let _walredo_thread = std::thread::Builder::new()
+            .name("WAL redo thread".into())
+            .spawn(thread_main)
+            .unwrap();
+
+        PostgresRedoManager {
+            request_tx: Mutex::new(request_tx),
        }
-        info!("WAL redo postgres started");
-
-        // Pretty arbitrarily, reuse the same Postgres process for 100 requests.
-        // After that, kill it and start a new one. This is mostly to avoid
-        // using up all shared buffers in Postgres's shared buffer cache; we don't
-        // want to write any pages to disk in the WAL redo process.
-        for _i in 1..100000 {
-            let request = walredo_channel_receiver.recv().unwrap();
-
-            let result = handle_apply_request(&pcache, &process, &runtime, request);
-            if result.is_err() {
-                // Something went wrong with handling the request. It's not clear
-                // if the request was faulty, and the next request would succeed
-                // again, or if the 'postgres' process went haywire. To be safe,
-                // kill the 'postgres' process so that we will start from a clean
-                // slate, with a new process, for the next request.
-                break;
-            }
-        }
-
-        // Time to kill the 'postgres' process. A new one will be launched on next
-        // iteration of the loop.
-        info!("killing WAL redo postgres process");
-        let _ = runtime.block_on(process.stdin.get_mut().shutdown());
-        let mut child = process.child;
-        drop(process.stdin);
-        let _ = runtime.block_on(child.wait());
    }
 }

-fn transaction_id_set_status_bit(
-    xl_info: u8,
-    xl_rmid: u8,
-    xl_xid: u32,
-    record: WALRecord,
-    page: &mut BytesMut,
-) {
-    let info = xl_info & pg_constants::XLOG_XACT_OPMASK;
-    let mut status = 0;
-    if info == pg_constants::XLOG_XACT_COMMIT {
-        status = pg_constants::TRANSACTION_STATUS_COMMITTED;
-    } else if info == pg_constants::XLOG_XACT_ABORT {
-        status = pg_constants::TRANSACTION_STATUS_ABORTED;
-    } else {
-        trace!("handle_apply_request for RM_XACT_ID-{} NOT SUPPORTED YET. RETURN. lsn {:X}/{:X} main_data_offset {}, rec.len {}",
-        status,
-        record.lsn >> 32,
-        record.lsn & 0xffffffff,
-        record.main_data_offset, record.rec.len());
-        return;
+impl WalRedoManager for PostgresRedoManager {
+    ///
+    /// Ask the WAL redo manager to apply some WAL records.
+    ///
+    /// The work itself is done by the separate WAL redo thread; this function
+    /// only queues the request and then blocks until the result comes back.
+    /// Panics if the request cannot be sent or no response is received.
+    ///
+    fn request_redo(
+        &self,
+        tag: ObjectTag,
+        lsn: Lsn,
+        base_img: Option<Bytes>,
+        records: Vec<WALRecord>,
+    ) -> Result<Bytes, WalRedoError> {
+        // Private channel on which the redo thread delivers the result
+        let (response_channel, response_rx) = mpsc::channel::<Result<Bytes, WalRedoError>>();
+
+        let data = WalRedoRequestData {
+            tag,
+            lsn,
+            base_img,
+            records,
+        };
+        let request = WalRedoRequest {
+            data,
+            response_channel,
+        };
+
+        // Hold the sender lock only for the duration of the send
+        {
+            let request_tx = self.request_tx.lock().unwrap();
+            request_tx
+                .send(request)
+                .expect("could not send WAL redo request");
+        }
+
+        response_rx
+            .recv()
+            .expect("could not receive response to WAL redo request")
    }
-
-    trace!("handle_apply_request for RM_XACT_ID-{} (1-commit, 2-abort) lsn {:X}/{:X} main_data_offset {}, rec.len {}",
-    status,
-    record.lsn >> 32,
-    record.lsn & 0xffffffff,
-    record.main_data_offset, record.rec.len());
-
-    let byteno: usize = ((xl_rmid as u32 % pg_constants::CLOG_XACTS_PER_PAGE as u32)
-        / pg_constants::CLOG_XACTS_PER_BYTE) as usize;
-
-    let byteptr = &mut page[byteno..byteno + 1];
-    let bshift: u8 = ((xl_xid % pg_constants::CLOG_XACTS_PER_BYTE)
-        * pg_constants::CLOG_BITS_PER_XACT as u32) as u8;
-
-    let mut curval = byteptr[0];
-    curval = (curval >> bshift) & pg_constants::CLOG_XACT_BITMASK;
-
-    let mut byteval = [0];
-    byteval[0] = curval;
-    byteval[0] &= !(((1 << pg_constants::CLOG_BITS_PER_XACT as u8) - 1) << bshift);
-    byteval[0] |= status << bshift;
-
-    byteptr.copy_from_slice(&byteval);
-    trace!(
-        "xl_xid {} byteno {} curval {} byteval {}",
-        xl_xid,
-        byteno,
-        curval,
-        byteval[0]
-    );
 }

-fn handle_apply_request(
-    pcache: &page_cache::PageCache,
-    process: &WalRedoProcess,
-    runtime: &Runtime,
-    entry_rc: Arc<CacheEntry>,
-) -> Result<(), Error> {
-    let tag = entry_rc.key.tag;
-    let lsn = entry_rc.key.lsn;
-    let (base_img, records) = pcache.collect_records_for_apply(entry_rc.as_ref());
+/// Location (byte offset within page) of the flags word of the member group
+/// that the given multixact member offset belongs to.
+fn mx_offset_to_flags_offset(xid: MultiXactId) -> usize {
+    // Do all of the arithmetic in u32. Narrowing the group number to u16
+    // before taking the modulo wraps it at 65536, which gives a wrong in-page
+    // offset for large member offsets unless the number of groups per page
+    // divides 65536 (PostgreSQL uses 409 groups per 8 kB page -- confirm
+    // against pg_constants).
+    let group = xid / pg_constants::MULTIXACT_MEMBERS_PER_MEMBERGROUP as u32;
+    let group_in_page = group % pg_constants::MULTIXACT_MEMBERGROUPS_PER_PAGE as u32;
+    (group_in_page * pg_constants::MULTIXACT_MEMBERGROUP_SIZE as u32) as usize
+}

-    let mut entry = entry_rc.content.lock().unwrap();
-    assert!(entry.apply_pending);
-    entry.apply_pending = false;
+/// Bit position, within its group's flags word, of the flag bits of the
+/// member at the given multixact member offset.
+fn mx_offset_to_flags_bitshift(xid: MultiXactId) -> u16 {
+    let member_in_group = (xid as u16) % pg_constants::MULTIXACT_MEMBERS_PER_MEMBERGROUP;
+    member_in_group * pg_constants::MXACT_MEMBER_BITS_PER_XACT
+}

-    let nrecords = records.len();
+/// Location (byte offset within page) of TransactionId of given member
+fn mx_offset_to_member_offset(xid: MultiXactId) -> usize {
+    // Position inside the group: the flag bytes come first, followed by one
+    // 4-byte slot per member.
+    let member_in_group = xid as u16 % pg_constants::MULTIXACT_MEMBERS_PER_MEMBERGROUP;
+    let offset_in_group = pg_constants::MULTIXACT_FLAGBYTES_PER_GROUP + member_in_group * 4;
+
+    mx_offset_to_flags_offset(xid) + offset_in_group as usize
+}

-    let start = Instant::now();
+///
+/// WAL redo thread
+///
+impl PostgresRedoManagerInternal {
+    //
+    // Main entry point for the WAL applicator thread.
+    //
+    // Launches the wal-redo postgres processes and then serves requests from
+    // `request_rx` forever. Each iteration handles one batch of requests and
+    // sends one result per request on that request's response channel.
+    //
+    fn wal_redo_main(&mut self) {
+        info!("WAL redo thread started");

-    let apply_result: Result<Bytes, Error>;
-    if tag.rel.forknum == pg_constants::PG_XACT_FORKNUM as u8 {
-        //TODO use base image if any
-        static ZERO_PAGE: [u8; 8192] = [0u8; 8192];
-        let zero_page_bytes: &[u8] = &ZERO_PAGE;
-        let mut page = BytesMut::from(zero_page_bytes);
+        // We block on waiting for requests on the walredo request channel, but
+        // use async I/O to communicate with the child process. Initialize the
+        // runtime for the async part.
+        let runtime = tokio::runtime::Builder::new_current_thread()
+            .enable_all()
+            .build()
+            .unwrap();

-        for record in records {
-            let mut buf = record.rec.clone();
+        let processes: Vec<PostgresRedoProcess>;

-            // 1. Parse XLogRecord struct
-            // FIXME: refactor to avoid code duplication.
-            let _xl_tot_len = buf.get_u32_le();
-            let xl_xid = buf.get_u32_le();
-            let _xl_prev = buf.get_u64_le();
-            let xl_info = buf.get_u8();
-            let xl_rmid = buf.get_u8();
-            buf.advance(2); // 2 bytes of padding
-            let _xl_crc = buf.get_u32_le();
+        info!("launching WAL redo postgres process");

-            if xl_rmid == pg_constants::RM_CLOG_ID {
-                let info = xl_info & !pg_constants::XLR_INFO_MASK;
-                if info == pg_constants::CLOG_ZEROPAGE {
-                    page.clone_from_slice(zero_page_bytes);
-                    trace!("handle_apply_request for RM_CLOG_ID-CLOG_ZEROPAGE lsn {:X}/{:X} main_data_offset {}, rec.len {}",
-                    record.lsn >> 32,
-                    record.lsn & 0xffffffff,
-                    record.main_data_offset, record.rec.len());
+        // Launch `conf.wal_redoers` child processes up front, one after
+        // another. A failed launch panics the thread through unwrap().
+        let wal_redoers = self.conf.wal_redoers;
+        processes = (0..wal_redoers)
+            .map(|i| {
+                runtime
+                    .block_on(PostgresRedoProcess::launch(self.conf, i))
+                    .unwrap()
+            })
+            .collect();
+
+        // Loop forever, handling requests as they come.
+        loop {
+            // Block until at least one request is available...
+            let mut requests: Vec<WalRedoRequest> = Vec::new();
+            requests.push(
+                self.request_rx
+                    .recv()
+                    .expect("WAL redo request channel was closed"),
+            );
+            // ...then pick up whatever else is already queued, without
+            // blocking, so that it is handled in the same batch.
+            loop {
+                let req = self.request_rx.try_recv();
+                match req {
+                    Ok(req) => requests.push(req),
+                    Err(_) => break,
+                }
+            }
+            let request_data = requests.iter().map(|req| &req.data);
+            // `rr` is incremented before it is used, so the first request of a
+            // batch goes to process index 1 % wal_redoers.
+            // NOTE(review): `rr % wal_redoers` panics if wal_redoers is 0 --
+            // confirm that the configuration guarantees at least one.
+            let mut rr = 0; // round robin
+            let results = runtime.block_on(async {
+                let futures = request_data.map(|req| {
+                    rr += 1;
+                    self.handle_apply_request(&processes[rr % wal_redoers], &req)
+                });
+                // The futures are created lazily by the iterator and awaited
+                // one at a time, so requests are processed sequentially, in
+                // the order they were received.
+                let mut results: Vec<Result<Bytes, WalRedoError>> = Vec::new();
+                for future in futures {
+                    results.push(future.await);
+                }
+                results
+            });
+            // `results` is in the same order as `requests`, so zipping pairs
+            // every result with the request it belongs to.
+            for (result, request) in results.into_iter().zip(requests.iter()) {
+                let result_ok = result.is_ok();
+
+                // Send the result to the requester
+                let _ = request.response_channel.send(result);
+
+                if !result_ok {
+                    error!("wal-redo-postgres failed to apply request {:?}", request);
                }
-            } else if xl_rmid == pg_constants::RM_XACT_ID {
-                transaction_id_set_status_bit(xl_info, xl_rmid, xl_xid, record, &mut page);
            }
        }
-
-        apply_result = Ok::<Bytes, Error>(page.freeze());
-    } else {
-        apply_result = process.apply_wal_records(runtime, tag, base_img, records);
    }

-    let duration = start.elapsed();
+    ///
+    /// Process one request for WAL redo.
+    ///
+    /// Requests for relation buffers are forwarded to the given wal-redo
+    /// postgres `process`. For every other kind of object the records are
+    /// applied right here, starting from the base image if one was provided
+    /// and from a zeroed 8192-byte page otherwise.
+    ///
+    /// Returns the reconstructed image, or `WalRedoError::IoError` if the
+    /// postgres process failed. Panics on records or object tags that the
+    /// in-process redo code does not expect.
+    ///
+    async fn handle_apply_request(
+        &self,
+        process: &PostgresRedoProcess,
+        request: &WalRedoRequestData,
+    ) -> Result<Bytes, WalRedoError> {
+        let tag = request.tag;
+        let lsn = request.lsn;
+        let base_img = request.base_img.clone();
+        let records = &request.records;

-    let result;
+        let nrecords = records.len();

-    trace!(
-        "applied {} WAL records in {} ms to reconstruct page image at LSN {:X}/{:X}",
-        nrecords,
-        duration.as_millis(),
-        lsn >> 32,
-        lsn & 0xffff_ffff
-    );
+        let start = Instant::now();

-    if let Err(e) = apply_result {
-        error!("could not apply WAL records: {}", e);
-        result = Err(e);
-    } else {
-        entry.page_image = Some(apply_result.unwrap());
-        result = Ok(());
+        let apply_result: Result<Bytes, Error>;
+        if let ObjectTag::RelationBuffer(buf_tag) = tag {
+            // Relational WAL records are applied using wal-redo-postgres
+            apply_result = process.apply_wal_records(buf_tag, base_img, records).await;
+        } else {
+            // Non-relational WAL records we apply ourselves.
+            const ZERO_PAGE: [u8; 8192] = [0u8; 8192];
+            let mut page = BytesMut::new();
+            if let Some(fpi) = base_img {
+                // If full-page image is provided, then use it...
+                page.extend_from_slice(&fpi[..]);
+            } else {
+                // otherwise initialize page with zeros
+                page.extend_from_slice(&ZERO_PAGE);
+            }
+            // Apply all collected WAL records
+            for record in records {
+                let mut buf = record.rec.clone();
+
+                // 1. Parse XLogRecord struct
+                // FIXME: refactor to avoid code duplication.
+                let xlogrec = XLogRecord::from_bytes(&mut buf);
+
+                //move to main data
+                // TODO probably, we should store some records in our special format
+                // to avoid this weird parsing on replay
+                let skip = (record.main_data_offset - pg_constants::SIZEOF_XLOGRECORD) as usize;
+                if buf.remaining() > skip {
+                    buf.advance(skip);
+                }
+
+                if xlogrec.xl_rmid == pg_constants::RM_CLOG_ID {
+                    let info = xlogrec.xl_info & !pg_constants::XLR_INFO_MASK;
+                    if info == pg_constants::CLOG_ZEROPAGE {
+                        // The only operation we need to implement is CLOG_ZEROPAGE
+                        page.copy_from_slice(&ZERO_PAGE);
+                    }
+                } else if xlogrec.xl_rmid == pg_constants::RM_XACT_ID {
+                    // Transaction manager stuff
+                    let info = xlogrec.xl_info & pg_constants::XLOG_XACT_OPMASK;
+                    let tag_blknum = match tag {
+                        ObjectTag::Clog(slru) => slru.blknum,
+                        ObjectTag::TwoPhase(_) => {
+                            // For a two-phase object the image is simply the
+                            // main data of the PREPARE record.
+                            assert!(info == pg_constants::XLOG_XACT_PREPARE);
+                            trace!("Apply prepare {} record", xlogrec.xl_xid);
+                            page.clear();
+                            page.extend_from_slice(&buf[..]);
+                            continue;
+                        }
+                        _ => panic!("Not valid XACT object tag {:?}", tag),
+                    };
+                    let parsed_xact =
+                        XlXactParsedRecord::decode(&mut buf, xlogrec.xl_xid, xlogrec.xl_info);
+                    if parsed_xact.info == pg_constants::XLOG_XACT_COMMIT
+                        || parsed_xact.info == pg_constants::XLOG_XACT_COMMIT_PREPARED
+                    {
+                        transaction_id_set_status(
+                            parsed_xact.xid,
+                            pg_constants::TRANSACTION_STATUS_COMMITTED,
+                            &mut page,
+                        );
+                        for subxact in &parsed_xact.subxacts {
+                            let blkno = *subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
+                            // only update xids on the requested page
+                            if tag_blknum == blkno {
+                                transaction_id_set_status(
+                                    *subxact,
+                                    pg_constants::TRANSACTION_STATUS_SUB_COMMITTED,
+                                    &mut page,
+                                );
+                            }
+                        }
+                    } else if parsed_xact.info == pg_constants::XLOG_XACT_ABORT
+                        || parsed_xact.info == pg_constants::XLOG_XACT_ABORT_PREPARED
+                    {
+                        transaction_id_set_status(
+                            parsed_xact.xid,
+                            pg_constants::TRANSACTION_STATUS_ABORTED,
+                            &mut page,
+                        );
+                        for subxact in &parsed_xact.subxacts {
+                            let blkno = *subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
+                            // only update xids on the requested page
+                            if tag_blknum == blkno {
+                                transaction_id_set_status(
+                                    *subxact,
+                                    pg_constants::TRANSACTION_STATUS_ABORTED,
+                                    &mut page,
+                                );
+                            }
+                        }
+                    }
+                } else if xlogrec.xl_rmid == pg_constants::RM_MULTIXACT_ID {
+                    // Multixact operations
+                    let info = xlogrec.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
+                    if info == pg_constants::XLOG_MULTIXACT_ZERO_OFF_PAGE
+                        || info == pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE
+                    {
+                        // Just need to zero page
+                        page.copy_from_slice(&ZERO_PAGE);
+                    } else if info == pg_constants::XLOG_MULTIXACT_CREATE_ID {
+                        let xlrec = XlMultiXactCreate::decode(&mut buf);
+                        if let ObjectTag::MultiXactMembers(slru) = tag {
+                            for i in 0..xlrec.nmembers {
+                                // The page a member lives on follows from its
+                                // offset in the members SLRU (the same value
+                                // the in-page positions below are derived
+                                // from), not from its index within this record.
+                                let offset = xlrec.moff + i;
+                                let blkno =
+                                    offset / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
+                                if blkno == slru.blknum {
+                                    // update only target block
+                                    let memberoff = mx_offset_to_member_offset(offset);
+                                    let flagsoff = mx_offset_to_flags_offset(offset);
+                                    let bshift = mx_offset_to_flags_bitshift(offset);
+                                    // Replace this member's bits in the group's
+                                    // flags word, then store its xid.
+                                    let mut flagsval =
+                                        LittleEndian::read_u32(&page[flagsoff..flagsoff + 4]);
+                                    flagsval &=
+                                        !(((1 << pg_constants::MXACT_MEMBER_BITS_PER_XACT) - 1)
+                                            << bshift);
+                                    flagsval |= xlrec.members[i as usize].status << bshift;
+                                    LittleEndian::write_u32(
+                                        &mut page[flagsoff..flagsoff + 4],
+                                        flagsval,
+                                    );
+                                    LittleEndian::write_u32(
+                                        &mut page[memberoff..memberoff + 4],
+                                        xlrec.members[i as usize].xid,
+                                    );
+                                }
+                            }
+                        } else {
+                            // Multixact offsets SLRU
+                            let offs = (xlrec.mid % pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32
+                                * 4) as usize;
+                            LittleEndian::write_u32(&mut page[offs..offs + 4], xlrec.moff);
+                        }
+                    } else {
+                        panic!();
+                    }
+                } else if xlogrec.xl_rmid == pg_constants::RM_RELMAP_ID {
+                    // Relation map file has size 512 bytes
+                    page.clear();
+                    page.extend_from_slice(&buf[12..]); // skip xl_relmap_update
+                    assert!(page.len() == 512); // size of pg_filenode.map
+                }
+            }
+
+            apply_result = Ok::<Bytes, Error>(page.freeze());
+        }
+
+        let duration = start.elapsed();
+
+        let result: Result<Bytes, WalRedoError>;
+
+        debug!(
+            "applied {} WAL records in {} ms to reconstruct page image at LSN {}",
+            nrecords,
+            duration.as_millis(),
+            lsn
+        );
+
+        if let Err(e) = apply_result {
+            error!("could not apply WAL records: {}", e);
+            result = Err(WalRedoError::IoError(e));
+        } else {
+            let img = apply_result.unwrap();
+
+            result = Ok(img);
+        }
+
+        // The caller is responsible for sending the response
+        result
    }
-
-    // Wake up the requester, whether the operation succeeded or not.
-    entry_rc.walredo_condvar.notify_all();
-
-    result
 }

-struct WalRedoProcess {
-    child: Child,
-    stdin: RefCell<ChildStdin>,
-    stdout: RefCell<ChildStdout>,
+// Handle to one running wal-redo postgres child process: the two pipes used
+// to talk to it. `apply_wal_records` borrows both mutably for the duration of
+// a request.
+// NOTE(review): RefCell::borrow_mut panics if the same process is borrowed
+// again while a request is still in flight -- confirm that requests to one
+// process are never run concurrently.
+struct PostgresRedoProcess {
+    // Pipe to the child's stdin; requests are written here
+    stdin: Arc<RefCell<ChildStdin>>,
+    // Pipe from the child's stdout; results are read from here
+    stdout: Arc<RefCell<ChildStdout>>,
 }

-impl WalRedoProcess {
+impl PostgresRedoProcess {
    //
    // Start postgres binary in special WAL redo mode.
    //
-    // Tests who run pageserver binary are setting proper PG_BIN_DIR
-    // and PG_LIB_DIR so that WalRedo would start right postgres. We may later
-    // switch to setting same things in pageserver config file.
-    fn launch(datadir: &str, runtime: &Runtime) -> Result<WalRedoProcess, Error> {
-        // Create empty data directory for wal-redo postgres deleting old one.
-        fs::remove_dir_all(datadir).ok();
-        let initdb = runtime
-            .block_on(
-                Command::new("initdb")
-                    .args(&["-D", datadir])
-                    .arg("-N")
-                    .output(),
-            )
+    async fn launch(conf: &PageServerConf, id: usize) -> Result<PostgresRedoProcess, Error> {
+        // FIXME: We need a dummy Postgres cluster to run the process in. Currently, we
+        // just create one with constant name. That fails if you try to launch more than
+        // one WAL redo manager concurrently.
+        let datadir = conf.workdir.join(format!("wal-redo-datadir-{}", id));
+
+        // Create empty data directory for wal-redo postgres, deleting old one first.
+        if datadir.exists() {
+            info!("directory {:?} exists, removing", &datadir);
+            if let Err(e) = fs::remove_dir_all(&datadir) {
+                error!("could not remove old wal-redo-datadir: {:?}", e);
+            }
+        }
+        info!("running initdb in {:?}", datadir.display());
+        let initdb = Command::new(conf.pg_bin_dir().join("initdb"))
+            .args(&["-D", datadir.to_str().unwrap()])
+            .arg("-N")
+            .env_clear()
+            .env("LD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap())
+            .env("DYLD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap())
+            .output()
+            .await
            .expect("failed to execute initdb");

        if !initdb.status.success() {
@@ -271,20 +519,28 @@ impl WalRedoProcess {
            let mut config = OpenOptions::new()
                .append(true)
                .open(PathBuf::from(&datadir).join("postgresql.conf"))?;
-            config.write(b"shared_buffers=128kB\n")?;
-            config.write(b"fsync=off\n")?;
+            config.write_all(b"shared_buffers=128kB\n")?;
+            config.write_all(b"fsync=off\n")?;
+            config.write_all(b"shared_preload_libraries=zenith\n")?;
+            config.write_all(b"zenith.wal_redo=on\n")?;
        }
        // Start postgres itself
-        let mut child = Command::new("postgres")
+        let mut child = Command::new(conf.pg_bin_dir().join("postgres"))
            .arg("--wal-redo")
            .stdin(Stdio::piped())
            .stderr(Stdio::piped())
            .stdout(Stdio::piped())
-            .env("PGDATA", datadir)
+            .env_clear()
+            .env("LD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap())
+            .env("DYLD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap())
+            .env("PGDATA", &datadir)
            .spawn()
            .expect("postgres --wal-redo command failed to start");

-        info!("launched WAL redo postgres process on {}", datadir);
+        info!(
+            "launched WAL redo postgres process on {:?}",
+            datadir.display()
+        );

        let stdin = child.stdin.take().expect("failed to open child's stdin");
        let stderr = child.stderr.take().expect("failed to open child's stderr");
@@ -311,10 +567,9 @@ impl WalRedoProcess {
        };
        tokio::spawn(f_stderr);

-        Ok(WalRedoProcess {
-            child,
-            stdin: RefCell::new(stdin),
-            stdout: RefCell::new(stdout),
+        Ok(PostgresRedoProcess {
+            stdin: Arc::new(RefCell::new(stdin)),
+            stdout: Arc::new(RefCell::new(stdout)),
        })
    }

@@ -322,91 +577,106 @@ impl WalRedoProcess {
    // Apply given WAL records ('records') over an old page image. Returns
    // new page image.
    //
-    fn apply_wal_records(
+    async fn apply_wal_records(
        &self,
-        runtime: &Runtime,
        tag: BufferTag,
        base_img: Option<Bytes>,
-        records: Vec<WALRecord>,
-    ) -> Result<Bytes, Error> {
+        records: &[WALRecord],
+    ) -> Result<Bytes, std::io::Error> {
        let mut stdin = self.stdin.borrow_mut();
        let mut stdout = self.stdout.borrow_mut();
-        runtime.block_on(async {
-            //
-            // This async block sends all the commands to the process.
-            //
-            // For reasons I don't understand, this needs to be a "move" block;
-            // otherwise the stdin pipe doesn't get closed, despite the shutdown()
-            // call.
-            //
-            let f_stdin = async {
-                // Send base image, if any. (If the record initializes the page, previous page
-                // version is not needed.)
+
+        // We do three things simultaneously: send the old base image and WAL records to
+        // the child process's stdin, read the result from child's stdout, and forward any logging
+        // information that the child writes to its stderr to the page server's log.
+        //
+        // 'f_stdin' handles writing the base image and WAL records to the child process.
+        // 'f_stdout' below reads the result back. And 'f_stderr', which was spawned into the
+        // tokio runtime in the 'launch' function already, forwards the logging.
+        let f_stdin = async {
+            // Send base image, if any. (If the record initializes the page, previous page
+            // version is not needed.)
+            timeout(
+                TIMEOUT,
+                stdin.write_all(&build_begin_redo_for_block_msg(tag)),
+            )
+            .await??;
+            if base_img.is_some() {
                timeout(
                    TIMEOUT,
-                    stdin.write_all(&build_begin_redo_for_block_msg(tag)),
+                    stdin.write_all(&build_push_page_msg(tag, base_img.unwrap())),
                )
                .await??;
-                if base_img.is_some() {
-                    timeout(
-                        TIMEOUT,
-                        stdin.write_all(&build_push_page_msg(tag, base_img.unwrap())),
-                    )
-                    .await??;
-                }
+            }

-                // Send WAL records.
-                for rec in records.iter() {
-                    let r = rec.clone();
+            // Send WAL records.
+            for rec in records.iter() {
+                let r = rec.clone();

-                    stdin
-                        .write_all(&build_apply_record_msg(r.lsn, r.rec))
-                        .await?;
+                stdin
+                    .write_all(&build_apply_record_msg(r.lsn, r.rec))
+                    .await?;

-                    //debug!("sent WAL record to wal redo postgres process ({:X}/{:X}",
-                    //       r.lsn >> 32, r.lsn & 0xffff_ffff);
-                }
-                //debug!("sent {} WAL records to wal redo postgres process ({:X}/{:X}",
-                //       records.len(), lsn >> 32, lsn & 0xffff_ffff);
+                //debug!("sent WAL record to wal redo postgres process ({:X}/{:X}",
+                //       r.lsn >> 32, r.lsn & 0xffff_ffff);
+            }
+            //debug!("sent {} WAL records to wal redo postgres process ({:X}/{:X}",
+            //       records.len(), lsn >> 32, lsn & 0xffff_ffff);

-                // Send GetPage command to get the result back
-                timeout(TIMEOUT, stdin.write_all(&build_get_page_msg(tag))).await??;
-                timeout(TIMEOUT, stdin.flush()).await??;
-                //debug!("sent GetPage for {}", tag.blknum);
-                Ok::<(), Error>(())
-            };
+            // Send GetPage command to get the result back
+            timeout(TIMEOUT, stdin.write_all(&build_get_page_msg(tag))).await??;
+            timeout(TIMEOUT, stdin.flush()).await??;
+            //debug!("sent GetPage for {}", tag.blknum);
+            Ok::<(), Error>(())
+        };

-            // Read back new page image
-            let f_stdout = async {
-                let mut buf = [0u8; 8192];
+        // Read back new page image
+        let f_stdout = async {
+            let mut buf = [0u8; 8192];

-                timeout(TIMEOUT, stdout.read_exact(&mut buf)).await??;
-                //debug!("got response for {}", tag.blknum);
-                Ok::<[u8; 8192], Error>(buf)
-            };
+            timeout(TIMEOUT, stdout.read_exact(&mut buf)).await??;
+            //debug!("got response for {}", tag.blknum);
+            Ok::<[u8; 8192], Error>(buf)
+        };

-            // Kill the process. This closes its stdin, which should signal the process
-            // to terminate. TODO: SIGKILL if needed
-            //child.wait();
+        let res = tokio::try_join!(f_stdout, f_stdin)?;

-            let res = futures::try_join!(f_stdout, f_stdin)?;
+        let buf = res.0;

-            let buf = res.0;
-
-            Ok::<Bytes, Error>(Bytes::from(std::vec::Vec::from(buf)))
-        })
+        Ok::<Bytes, Error>(Bytes::from(std::vec::Vec::from(buf)))
    }
 }

+// Functions for constructing messages to send to the postgres WAL redo
+// process. See vendor/postgres/src/backend/tcop/zenith_wal_redo.c for
+// explanation of the protocol.
+
 fn build_begin_redo_for_block_msg(tag: BufferTag) -> Bytes {
-    let len = 4 + 5 * 4;
+    let len = 4 + 1 + 4 * 4;
    let mut buf = BytesMut::with_capacity(1 + len);

    buf.put_u8(b'B');
    buf.put_u32(len as u32);
-    tag.pack(&mut buf);

-    assert!(buf.len() == 1 + len);
+    // FIXME: this is a temporary hack that should go away when we refactor
+    // the postgres protocol serialization + handlers.
+    //
+    // BytesMut is a dynamic growable buffer, used a lot in tokio code but
+    // not in the std library. To write to a BytesMut from a serde serializer,
+    // we need to either:
+    // - pre-allocate the required buffer space. This is annoying because we
+    //   shouldn't care what the exact serialized size is-- that's the
+    //   serializer's job.
+    // - Or, we need to create a temporary "writer" (which implements the
+    //   `Write` trait). It's a bit awkward, because the writer consumes the
+    //   underlying BytesMut, and we need to extract it later with
+    //   `into_inner`.
+    let mut writer = buf.writer();
+    tag.ser_into(&mut writer)
+        .expect("serialize BufferTag should always succeed");
+    let buf = writer.into_inner();
+
+    debug_assert!(buf.len() == 1 + len);

    buf.freeze()
 }
@@ -414,42 +684,48 @@ fn build_begin_redo_for_block_msg(tag: BufferTag) -> Bytes {
 fn build_push_page_msg(tag: BufferTag, base_img: Bytes) -> Bytes {
    assert!(base_img.len() == 8192);

-    let len = 4 + 5 * 4 + base_img.len();
+    let len = 4 + 1 + 4 * 4 + base_img.len();
    let mut buf = BytesMut::with_capacity(1 + len);

    buf.put_u8(b'P');
    buf.put_u32(len as u32);
-    tag.pack(&mut buf);
+    let mut writer = buf.writer();
+    tag.ser_into(&mut writer)
+        .expect("serialize BufferTag should always succeed");
+    let mut buf = writer.into_inner();
    buf.put(base_img);

-    assert!(buf.len() == 1 + len);
+    debug_assert!(buf.len() == 1 + len);

    buf.freeze()
 }

-fn build_apply_record_msg(endlsn: u64, rec: Bytes) -> Bytes {
+fn build_apply_record_msg(endlsn: Lsn, rec: Bytes) -> Bytes {
    let len = 4 + 8 + rec.len();
    let mut buf = BytesMut::with_capacity(1 + len);

    buf.put_u8(b'A');
    buf.put_u32(len as u32);
-    buf.put_u64(endlsn);
+    buf.put_u64(endlsn.0);
    buf.put(rec);

-    assert!(buf.len() == 1 + len);
+    debug_assert!(buf.len() == 1 + len);

    buf.freeze()
 }

 fn build_get_page_msg(tag: BufferTag) -> Bytes {
-    let len = 4 + 5 * 4;
+    let len = 4 + 1 + 4 * 4;
    let mut buf = BytesMut::with_capacity(1 + len);

    buf.put_u8(b'G');
    buf.put_u32(len as u32);
-    tag.pack(&mut buf);
+    let mut writer = buf.writer();
+    tag.ser_into(&mut writer)
+        .expect("serialize BufferTag should always succeed");
+    let buf = writer.into_inner();

-    assert!(buf.len() == 1 + len);
+    debug_assert!(buf.len() == 1 + len);

    buf.freeze()
 }
--- a/pgbuild.sh
+++ b/pgbuild.sh
@@ -1,33 +0,0 @@
-#!/bin/sh
-#
-#   Purpose of this script is to build and install postgres in a local directory
-# so that zenith intergation tests would find pg binaries and support files.
-#
-# ./pgbuild.sh would do following:
-#
-#   1) run out-of-source build of postgres in REPO_ROOT/tmp_install/build directory (I'm reusing
-#  tmp_install path here since it is already present in .gitignore)
-#
-#   2) installs postgres to REPO_ROOT/tmp_install/
-#
-
-# Halt immediately if any command fails
-set -e
-
-REPO_ROOT=$(dirname "$0")
-REPO_ROOT="`( cd \"$REPO_ROOT\" && pwd )`"
-
-# configure
-echo "Configuring postgres build"
-mkdir -p $REPO_ROOT/tmp_install/build
-cd $REPO_ROOT/tmp_install/build
-../../vendor/postgres/configure CFLAGS='-O0' --enable-debug --enable-cassert \
-    --enable-depend --with-libxml --prefix=/ > configure.log
-
-# compile
-echo "Compiling postgres"
-make -j8 -s
-export DESTDIR=$REPO_ROOT/tmp_install
-
-echo "Installing postgres to $DESTDIR"
-make install -s
--- a/postgres_ffi/Cargo.toml
+++ b/postgres_ffi/Cargo.toml
@@ -9,12 +9,17 @@ edition = "2018"
 [dependencies]
 chrono = "0.4.19"
 rand = "0.8.3"
+regex = "1.4.5"
 bytes = "1.0.1"
 byteorder = "1.4.3"
 anyhow = "1.0"
 crc32c = "0.6.0"
 hex = "0.4.3"
+lazy_static = "1.4"
 log = "0.4.14"
+memoffset = "0.6.2"
+thiserror = "1.0"
+workspace_hack = { path = "../workspace_hack" }

 [build-dependencies]
 bindgen = "0.57"
--- a/postgres_ffi/README
+++ b/postgres_ffi/README
@@ -1,3 +1,25 @@
-This module contains utility functions for interacting with PostgreSQL
-file formats.
+This module contains utilities for working with PostgreSQL file
+formats. It's a collection of structs that are auto-generated from the
+PostgreSQL header files using bindgen, and Rust functions to read and
+manipulate them.

+There are also a bunch of constants in `pg_constants.rs` that are copied
+from various PostgreSQL headers, rather than auto-generated. They mostly
+should be auto-generated too, but that's a TODO.
+
+The PostgreSQL on-disk file format is not portable across different
+CPU architectures and operating systems. It is also subject to change
+in each major PostgreSQL version. Currently, this module is based on
+PostgreSQL v14, but in the future we will probably need a separate
+copy for each PostgreSQL version.
+
+To interact with the C structs, there is some unsafe code in this
+module. Do not copy-paste that to the rest of the codebase! Keep the
+amount of unsafe code to a minimum, and limited to this module only,
+and only where it's truly needed.
+
+TODO: Currently, there is also some code that deals with WAL records
+in pageserver/src/waldecoder.rs.  That should be moved into this
+module. The rest of the codebase should not have intimate knowledge of
+PostgreSQL file formats or WAL layout; that knowledge should be
+encapsulated in this module.
--- a/postgres_ffi/build.rs
+++ b/postgres_ffi/build.rs
@@ -11,27 +11,42 @@ fn main() {
    // to bindgen, and lets you build up options for
    // the resulting bindings.
    let bindings = bindgen::Builder::default()
-        // The input header we would like to generate
-        // bindings for.
+        //
+        // All the needed PostgreSQL headers are included from 'pg_control_ffi.h'
+        //
        .header("pg_control_ffi.h")
+        //
        // Tell cargo to invalidate the built crate whenever any of the
        // included header files changed.
+        //
        .parse_callbacks(Box::new(bindgen::CargoCallbacks))
+        //
+        // These are the types and constants that we want to generate bindings for
+        //
        .whitelist_type("ControlFileData")
+        .whitelist_type("CheckPoint")
+        .whitelist_type("FullTransactionId")
+        .whitelist_type("XLogRecord")
+        .whitelist_type("XLogPageHeaderData")
+        .whitelist_type("XLogLongPageHeaderData")
+        .whitelist_var("XLOG_PAGE_MAGIC")
        .whitelist_var("PG_CONTROL_FILE_SIZE")
        .whitelist_var("PG_CONTROLFILEDATA_OFFSETOF_CRC")
        .whitelist_type("DBState")
+        //
        // Path the server include dir. It is in tmp_install/include/server, if you did
        // "configure --prefix=<path to tmp_install>". But if you used "configure --prefix=/",
        // and used DESTDIR to move it into tmp_install, then it's in
-        // tmp_install/include/postgres/server (that's how the pgbuild.sh script does it).
+        // tmp_install/include/postgresql/server.
        // 'pg_config --includedir-server' would perhaps be the more proper way to find it,
        // but this will do for now.
+        //
        .clang_arg("-I../tmp_install/include/server")
        .clang_arg("-I../tmp_install/include/postgresql/server")
+        //
        // Finish the builder and generate the bindings.
+        //
        .generate()
-        // Unwrap the Result and panic on failure.
        .expect("Unable to generate bindings");

    // Write the bindings to the $OUT_DIR/bindings.rs file.
--- a/postgres_ffi/pg_control_ffi.h
+++ b/postgres_ffi/pg_control_ffi.h
@@ -1,4 +1,10 @@
+/*
+ * This header file is the input to bindgen. It includes all the
+ * PostgreSQL headers that we need to auto-generate Rust structs
+ * from. If you need to expose a new struct to Rust code, add the
+ * header here, and whitelist the struct in the build.rs file.
+ */
 #include "c.h"
 #include "catalog/pg_control.h"
+#include "access/xlog_internal.h"

-const uint32 PG_CONTROLFILEDATA_OFFSETOF_CRC = offsetof(ControlFileData, crc);
--- a/postgres_ffi/src/controlfile_utils.rs
+++ b/postgres_ffi/src/controlfile_utils.rs
@@ -0,0 +1,124 @@
+//!
+//! Utilities for reading and writing the PostgreSQL control file.
+//!
+//! The PostgreSQL control file is one of the first things that the PostgreSQL
+//! server reads when it starts up. It indicates whether the server was shut
+//! down cleanly, or if it crashed or was restored from online backup so that
+//! WAL recovery needs to be performed. It also contains a copy of the latest
+//! checkpoint record and its location in the WAL.
+//!
+//! The control file also contains fields for detecting whether the
+//! data directory is compatible with a postgres binary. That includes
+//! a version number, configuration options that can be set at
+//! compilation time like the block size, and the platform's alignment
+//! and endianness information. (The PostgreSQL on-disk file format is
+//! not portable across platforms.)
+//!
+//! The control file is stored in the PostgreSQL data directory, as
+//! `global/pg_control`. The data stored in it is designed to be smaller than
+//! 512 bytes, on the assumption that it can be updated atomically. The actual
+//! file is larger, 8192 bytes, but the rest of it is just filled with zeros.
+//!
+//! See src/include/catalog/pg_control.h in the PostgreSQL sources for more
+//! information. You can use PostgreSQL's pg_controldata utility to view its
+//! contents.
+//!
+use crate::{ControlFileData, PG_CONTROL_FILE_SIZE};
+
+use anyhow::{bail, Result};
+use bytes::{Bytes, BytesMut};
+
+/// Equivalent to sizeof(ControlFileData) in C
+const SIZEOF_CONTROLDATA: usize = std::mem::size_of::<ControlFileData>();
+
+impl ControlFileData {
+    /// Compute the offset of the `crc` field within the `ControlFileData` struct.
+    /// Equivalent to offsetof(ControlFileData, crc) in C.
+    // Someday this can be const when the right compiler features land.
+    fn pg_control_crc_offset() -> usize {
+        memoffset::offset_of!(ControlFileData, crc)
+    }
+
+    ///
+    /// Interpret a slice of bytes as a Postgres control file.
+    ///
+    pub fn decode(buf: &[u8]) -> Result<ControlFileData> {
+        // Check that the slice has the expected size. The control file is
+        // padded with zeros up to a 512 byte sector size, so accept a
+        // larger size too, so that the caller can just pass the whole file
+        // contents without knowing the exact size of the struct.
+        if buf.len() < SIZEOF_CONTROLDATA {
+            bail!("control file is too short");
+        }
+
+        // Compute the expected CRC of the content.
+        let OFFSETOF_CRC = Self::pg_control_crc_offset();
+        let expectedcrc = crc32c::crc32c(&buf[0..OFFSETOF_CRC]);
+
+        // Convert the slice into an array of the right size, and use `transmute` to
+        // reinterpret the raw bytes as a ControlFileData struct.
+        //
+        // NB: Ideally we would use 'zerocopy::FromBytes' for this, but bindgen doesn't
+        // derive FromBytes for us. The safety of this depends on the same constraints
+        // as for FromBytes, namely, all of its fields must implement FromBytes. That
+        // includes the primitive integer types, like `u8`, `u16`, `u32`, `u64` and their
+        // signed variants. But `bool` is not safe, because the contents of the high bits
+        // in a rust bool are undefined. In practice, PostgreSQL uses 1 to represent
+        // true and 0 for false, which is compatible with Rust bool, but let's try not to
+        // depend on it.
+        //
+        // FIXME: ControlFileData does contain 'bool's at the moment.
+        //
+        // See https://github.com/zenithdb/zenith/issues/207 for discussion on the safety
+        // of this.
+        let mut b: [u8; SIZEOF_CONTROLDATA] = [0u8; SIZEOF_CONTROLDATA];
+        b.copy_from_slice(&buf[0..SIZEOF_CONTROLDATA]);
+        let controlfile: ControlFileData =
+            unsafe { std::mem::transmute::<[u8; SIZEOF_CONTROLDATA], ControlFileData>(b) };
+
+        // Check the CRC
+        if expectedcrc != controlfile.crc {
+            bail!(
+                "invalid CRC in control file: expected {:08X}, was {:08X}",
+                expectedcrc,
+                controlfile.crc
+            );
+        }
+
+        Ok(controlfile)
+    }
+
+    ///
+    /// Convert a struct representing a Postgres control file into raw bytes.
+    ///
+    /// The CRC is recomputed to match the contents of the fields.
+    pub fn encode(&self) -> Bytes {
+        //
+        // Use `transmute` to reinterpret struct as raw bytes.
+        //
+        // FIXME: This triggers undefined behavior, because the contents
+        // of the padding bytes are undefined, and this leaks those
+        // undefined bytes into the resulting array. The Rust code won't
+        // care what's in those bytes, and PostgreSQL doesn't care
+        // either. HOWEVER, it is a potential security issue, because the
+        // bytes can contain arbitrary pieces of memory from the page
+        // server. In the worst case, that could be private keys or
+        // another tenant's data.
+        //
+        // See https://github.com/zenithdb/zenith/issues/207 for discussion.
+        let b: [u8; SIZEOF_CONTROLDATA] =
+            unsafe { std::mem::transmute::<ControlFileData, [u8; SIZEOF_CONTROLDATA]>(*self) };
+
+        // Recompute the CRC
+        let OFFSETOF_CRC = Self::pg_control_crc_offset();
+        let newcrc = crc32c::crc32c(&b[0..OFFSETOF_CRC]);
+
+        let mut buf = BytesMut::with_capacity(PG_CONTROL_FILE_SIZE as usize);
+        buf.extend_from_slice(&b[0..OFFSETOF_CRC]);
+        buf.extend_from_slice(&newcrc.to_ne_bytes());
+        // Fill the rest of the control file with zeros.
+        buf.resize(PG_CONTROL_FILE_SIZE as usize, 0);
+
+        buf.into()
+    }
+}
--- a/postgres_ffi/src/lib.rs
+++ b/postgres_ffi/src/lib.rs
@@ -3,67 +3,8 @@
 #![allow(non_snake_case)]
 include!(concat!(env!("OUT_DIR"), "/bindings.rs"));

+pub mod controlfile_utils;
+pub mod nonrelfile_utils;
+pub mod pg_constants;
+pub mod relfile_utils;
 pub mod xlog_utils;
-
-use bytes::{Buf, Bytes, BytesMut};
-
-// sizeof(ControlFileData)
-const SIZEOF_CONTROLDATA: usize = std::mem::size_of::<ControlFileData>();
-const OFFSETOF_CRC: usize = PG_CONTROLFILEDATA_OFFSETOF_CRC as usize;
-
-impl ControlFileData {
-    // Initialize an all-zeros ControlFileData struct
-    pub fn new() -> ControlFileData {
-        let controlfile: ControlFileData;
-
-        let b = [0u8; SIZEOF_CONTROLDATA];
-        controlfile =
-            unsafe { std::mem::transmute::<[u8; SIZEOF_CONTROLDATA], ControlFileData>(b) };
-
-        controlfile
-    }
-}
-
-pub fn decode_pg_control(mut buf: Bytes) -> Result<ControlFileData, anyhow::Error> {
-    let mut b: [u8; SIZEOF_CONTROLDATA] = [0u8; SIZEOF_CONTROLDATA];
-    buf.copy_to_slice(&mut b);
-
-    let controlfile: ControlFileData;
-
-    // TODO: verify CRC
-    let mut data_without_crc: [u8; OFFSETOF_CRC] = [0u8; OFFSETOF_CRC];
-    data_without_crc.copy_from_slice(&b[0..OFFSETOF_CRC]);
-    let expectedcrc = crc32c::crc32c(&data_without_crc);
-
-    controlfile = unsafe { std::mem::transmute::<[u8; SIZEOF_CONTROLDATA], ControlFileData>(b) };
-
-    if expectedcrc != controlfile.crc {
-        anyhow::bail!(
-            "invalid CRC in control file: expected {:08X}, was {:08X}",
-            expectedcrc,
-            controlfile.crc
-        );
-    }
-
-    Ok(controlfile)
-}
-
-pub fn encode_pg_control(controlfile: ControlFileData) -> Bytes {
-    let b: [u8; SIZEOF_CONTROLDATA];
-
-    b = unsafe { std::mem::transmute::<ControlFileData, [u8; SIZEOF_CONTROLDATA]>(controlfile) };
-
-    // Recompute the CRC
-    let mut data_without_crc: [u8; OFFSETOF_CRC] = [0u8; OFFSETOF_CRC];
-    data_without_crc.copy_from_slice(&b[0..OFFSETOF_CRC]);
-    let newcrc = crc32c::crc32c(&data_without_crc);
-
-    let mut buf = BytesMut::with_capacity(PG_CONTROL_FILE_SIZE as usize);
-
-    buf.extend_from_slice(&b[0..OFFSETOF_CRC]);
-    buf.extend_from_slice(&newcrc.to_ne_bytes());
-    // Fill the rest of the control file with zeros.
-    buf.resize(PG_CONTROL_FILE_SIZE as usize, 0);
-
-    buf.into()
-}
--- a/postgres_ffi/src/nonrelfile_utils.rs
+++ b/postgres_ffi/src/nonrelfile_utils.rs
@@ -0,0 +1,32 @@
+//!
+//! Common utilities for dealing with PostgreSQL non-relation files.
+//!
+use crate::pg_constants;
+use bytes::BytesMut;
+use log::*;
+
+pub fn transaction_id_set_status(xid: u32, status: u8, page: &mut BytesMut) {
+    trace!(
+        "handle_apply_request for RM_XACT_ID-{} (1-commit, 2-abort, 3-sub_commit)",
+        status
+    );
+
+    let byteno: usize = ((xid as u32 % pg_constants::CLOG_XACTS_PER_PAGE as u32)
+        / pg_constants::CLOG_XACTS_PER_BYTE) as usize;
+
+    let bshift: u8 =
+        ((xid % pg_constants::CLOG_XACTS_PER_BYTE) * pg_constants::CLOG_BITS_PER_XACT as u32) as u8;
+
+    page[byteno] =
+        (page[byteno] & !(pg_constants::CLOG_XACT_BITMASK << bshift)) | (status << bshift);
+}
+
+pub fn transaction_id_get_status(xid: u32, page: &[u8]) -> u8 {
+    let byteno: usize = ((xid as u32 % pg_constants::CLOG_XACTS_PER_PAGE as u32)
+        / pg_constants::CLOG_XACTS_PER_BYTE) as usize;
+
+    let bshift: u8 =
+        ((xid % pg_constants::CLOG_XACTS_PER_BYTE) * pg_constants::CLOG_BITS_PER_XACT as u32) as u8;
+
+    ((page[byteno] >> bshift) & pg_constants::CLOG_XACT_BITMASK) as u8
+}
--- a/postgres_ffi/src/pg_constants.rs
+++ b/postgres_ffi/src/pg_constants.rs
@@ -0,0 +1,187 @@
+//!
+//! Misc constants, copied from PostgreSQL headers.
+//!
+//! TODO: These probably should be auto-generated using bindgen,
+//! rather than copied by hand. Although on the other hand, it's nice
+//! to have them all here in one place, and have the ability to add
+//! comments on them.
+//!
+
+//
+// From pg_tablespace_d.h
+//
+pub const DEFAULTTABLESPACE_OID: u32 = 1663;
+pub const GLOBALTABLESPACE_OID: u32 = 1664;
+
+//
+// Fork numbers, from relpath.h
+//
+pub const MAIN_FORKNUM: u8 = 0;
+pub const FSM_FORKNUM: u8 = 1;
+pub const VISIBILITYMAP_FORKNUM: u8 = 2;
+pub const INIT_FORKNUM: u8 = 3;
+
+// From storage_xlog.h
+pub const SMGR_TRUNCATE_HEAP: u32 = 0x0001;
+pub const SMGR_TRUNCATE_VM: u32 = 0x0002;
+pub const SMGR_TRUNCATE_FSM: u32 = 0x0004;
+
+// from pg_config.h. These can be changed with configure options --with-blocksize=BLOCKSIZE and
+// --with-segsize=SEGSIZE, but assume the defaults for now.
+pub const BLCKSZ: u16 = 8192;
+pub const RELSEG_SIZE: u32 = 1024 * 1024 * 1024 / (BLCKSZ as u32);
+
+//
+// constants from clog.h
+//
+pub const CLOG_XACTS_PER_BYTE: u32 = 4;
+pub const CLOG_XACTS_PER_PAGE: u32 = BLCKSZ as u32 * CLOG_XACTS_PER_BYTE;
+pub const CLOG_BITS_PER_XACT: u8 = 2;
+pub const CLOG_XACT_BITMASK: u8 = (1 << CLOG_BITS_PER_XACT) - 1;
+
+//
+// Constants from visibilitymap.h
+//
+pub const SIZE_OF_PAGE_HEADER: u16 = 24;
+pub const BITS_PER_HEAPBLOCK: u16 = 2;
+pub const HEAPBLOCKS_PER_PAGE: u16 = (BLCKSZ - SIZE_OF_PAGE_HEADER) * 8 / BITS_PER_HEAPBLOCK;
+
+pub const TRANSACTION_STATUS_IN_PROGRESS: u8 = 0x00;
+pub const TRANSACTION_STATUS_COMMITTED: u8 = 0x01;
+pub const TRANSACTION_STATUS_ABORTED: u8 = 0x02;
+pub const TRANSACTION_STATUS_SUB_COMMITTED: u8 = 0x03;
+
+pub const CLOG_ZEROPAGE: u8 = 0x00;
+pub const CLOG_TRUNCATE: u8 = 0x10;
+
+// From xact.h
+pub const XLOG_XACT_COMMIT: u8 = 0x00;
+pub const XLOG_XACT_PREPARE: u8 = 0x10;
+pub const XLOG_XACT_ABORT: u8 = 0x20;
+pub const XLOG_XACT_COMMIT_PREPARED: u8 = 0x30;
+pub const XLOG_XACT_ABORT_PREPARED: u8 = 0x40;
+
+// From slru.h
+pub const SLRU_PAGES_PER_SEGMENT: u32 = 32;
+pub const SLRU_SEG_SIZE: usize = BLCKSZ as usize * SLRU_PAGES_PER_SEGMENT as usize;
+
+/* mask for filtering opcodes out of xl_info */
+pub const XLOG_XACT_OPMASK: u8 = 0x70;
+pub const XLOG_HEAP_OPMASK: u8 = 0x70;
+/* does this record have a 'xinfo' field or not */
+pub const XLOG_XACT_HAS_INFO: u8 = 0x80;
+
+/*
+ * The following flags, stored in xinfo, determine which information is
+ * contained in commit/abort records.
+ */
+pub const XACT_XINFO_HAS_DBINFO: u32 = 1u32 << 0;
+pub const XACT_XINFO_HAS_SUBXACTS: u32 = 1u32 << 1;
+pub const XACT_XINFO_HAS_RELFILENODES: u32 = 1u32 << 2;
+pub const XACT_XINFO_HAS_INVALS: u32 = 1u32 << 3;
+pub const XACT_XINFO_HAS_TWOPHASE: u32 = 1u32 << 4;
+// pub const XACT_XINFO_HAS_ORIGIN: u32 = 1u32 << 5;
+// pub const XACT_XINFO_HAS_AE_LOCKS: u32 = 1u32 << 6;
+// pub const XACT_XINFO_HAS_GID: u32 = 1u32 << 7;
+
+// From pg_control.h and rmgrlist.h
+pub const XLOG_NEXTOID: u8 = 0x30;
+pub const XLOG_SWITCH: u8 = 0x40;
+pub const XLOG_SMGR_TRUNCATE: u8 = 0x20;
+pub const DB_SHUTDOWNED: u32 = 1;
+
+// From multixact.h
+pub const FIRST_MULTIXACT_ID: u32 = 1;
+pub const MAX_MULTIXACT_ID: u32 = 0xFFFFFFFF;
+
+pub const XLOG_MULTIXACT_ZERO_OFF_PAGE: u8 = 0x00;
+pub const XLOG_MULTIXACT_ZERO_MEM_PAGE: u8 = 0x10;
+pub const XLOG_MULTIXACT_CREATE_ID: u8 = 0x20;
+pub const XLOG_MULTIXACT_TRUNCATE_ID: u8 = 0x30;
+
+pub const MULTIXACT_OFFSETS_PER_PAGE: u16 = BLCKSZ / 4;
+pub const MXACT_MEMBER_BITS_PER_XACT: u16 = 8;
+pub const MXACT_MEMBER_FLAGS_PER_BYTE: u16 = 1;
+pub const MULTIXACT_FLAGBYTES_PER_GROUP: u16 = 4;
+pub const MULTIXACT_MEMBERS_PER_MEMBERGROUP: u16 =
+    MULTIXACT_FLAGBYTES_PER_GROUP * MXACT_MEMBER_FLAGS_PER_BYTE;
+/* size in bytes of a complete group */
+pub const MULTIXACT_MEMBERGROUP_SIZE: u16 =
+    4 * MULTIXACT_MEMBERS_PER_MEMBERGROUP + MULTIXACT_FLAGBYTES_PER_GROUP;
+pub const MULTIXACT_MEMBERGROUPS_PER_PAGE: u16 = BLCKSZ / MULTIXACT_MEMBERGROUP_SIZE;
+pub const MULTIXACT_MEMBERS_PER_PAGE: u16 =
+    MULTIXACT_MEMBERGROUPS_PER_PAGE * MULTIXACT_MEMBERS_PER_MEMBERGROUP;
+
+// From heapam_xlog.h
+pub const XLOG_HEAP_INSERT: u8 = 0x00;
+pub const XLOG_HEAP_DELETE: u8 = 0x10;
+pub const XLOG_HEAP_UPDATE: u8 = 0x20;
+pub const XLOG_HEAP_HOT_UPDATE: u8 = 0x40;
+pub const XLOG_HEAP2_VISIBLE: u8 = 0x40;
+pub const XLOG_HEAP2_MULTI_INSERT: u8 = 0x50;
+pub const XLH_INSERT_ALL_FROZEN_SET: u8 = (1 << 5) as u8;
+pub const XLH_INSERT_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8;
+pub const XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8;
+pub const XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED: u8 = (1 << 1) as u8;
+pub const XLH_DELETE_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8;
+
+pub const RM_XLOG_ID: u8 = 0;
+pub const RM_XACT_ID: u8 = 1;
+pub const RM_SMGR_ID: u8 = 2;
+pub const RM_CLOG_ID: u8 = 3;
+pub const RM_DBASE_ID: u8 = 4;
+pub const RM_TBLSPC_ID: u8 = 5;
+pub const RM_MULTIXACT_ID: u8 = 6;
+pub const RM_RELMAP_ID: u8 = 7;
+pub const RM_STANDBY_ID: u8 = 8;
+pub const RM_HEAP2_ID: u8 = 9;
+pub const RM_HEAP_ID: u8 = 10;
+
+// from xlogreader.h
+pub const XLR_INFO_MASK: u8 = 0x0F;
+pub const XLR_RMGR_INFO_MASK: u8 = 0xF0;
+
+// from dbcommands_xlog.h
+pub const XLOG_DBASE_CREATE: u8 = 0x00;
+pub const XLOG_DBASE_DROP: u8 = 0x10;
+
+pub const XLOG_TBLSPC_CREATE: u8 = 0x00;
+pub const XLOG_TBLSPC_DROP: u8 = 0x10;
+
+pub const SIZEOF_XLOGRECORD: u32 = 24;
+
+//
+// from xlogrecord.h
+//
+pub const XLR_MAX_BLOCK_ID: u8 = 32;
+
+pub const XLR_BLOCK_ID_DATA_SHORT: u8 = 255;
+pub const XLR_BLOCK_ID_DATA_LONG: u8 = 254;
+pub const XLR_BLOCK_ID_ORIGIN: u8 = 253;
+pub const XLR_BLOCK_ID_TOPLEVEL_XID: u8 = 252;
+
+pub const BKPBLOCK_FORK_MASK: u8 = 0x0F;
+pub const _BKPBLOCK_FLAG_MASK: u8 = 0xF0;
+pub const BKPBLOCK_HAS_IMAGE: u8 = 0x10; /* block data is an XLogRecordBlockImage */
+pub const BKPBLOCK_HAS_DATA: u8 = 0x20;
+pub const BKPBLOCK_WILL_INIT: u8 = 0x40; /* redo will re-init the page */
+pub const BKPBLOCK_SAME_REL: u8 = 0x80; /* RelFileNode omitted, same as previous */
+
+/* Information stored in bimg_info */
+pub const BKPIMAGE_HAS_HOLE: u8 = 0x01; /* page image has "hole" */
+pub const BKPIMAGE_IS_COMPRESSED: u8 = 0x02; /* page image is compressed */
+pub const BKPIMAGE_APPLY: u8 = 0x04; /* page image should be restored during replay */
+
+/* From transam.h */
+pub const FIRST_NORMAL_TRANSACTION_ID: u32 = 3;
+pub const INVALID_TRANSACTION_ID: u32 = 0;
+pub const FIRST_BOOTSTRAP_OBJECT_ID: u32 = 12000;
+pub const FIRST_NORMAL_OBJECT_ID: u32 = 16384;
+
+/* FIXME: pageserver should request wal_seg_size from compute node */
+pub const WAL_SEGMENT_SIZE: usize = 16 * 1024 * 1024;
+
+pub const XLOG_BLCKSZ: usize = 8192;
+pub const XLOG_CHECKPOINT_SHUTDOWN: u8 = 0x00;
+pub const XLOG_CHECKPOINT_ONLINE: u8 = 0x10;
+pub const XLP_LONG_HEADER: u16 = 0x0002;
--- a/postgres_ffi/src/relfile_utils.rs
+++ b/postgres_ffi/src/relfile_utils.rs
@@ -0,0 +1,141 @@
+//!
+//! Common utilities for dealing with PostgreSQL relation files.
+//!
+use crate::pg_constants;
+use lazy_static::lazy_static;
+use regex::Regex;
+
+/// Errors produced while interpreting the name of a relation data file.
+#[derive(Debug, Clone, thiserror::Error, PartialEq)]
+pub enum FilePathError {
+    /// The fork suffix is not one of the names known to `forkname_to_number`.
+    #[error("invalid relation fork name")]
+    InvalidForkName,
+    /// The name does not have the expected shape, or one of its numeric parts
+    /// could not be parsed (see the `From<ParseIntError>` impl).
+    #[error("invalid relation data file name")]
+    InvalidFileName,
+}
+
+// Every integer parse failure is reported as an invalid file name, which lets
+// `?` be applied directly to `str::parse` results.
+impl From<core::num::ParseIntError> for FilePathError {
+    fn from(_: core::num::ParseIntError) -> Self {
+        Self::InvalidFileName
+    }
+}
+
+/// Map the fork suffix of a relation file name to its fork number.
+///
+/// `None` means the name carried no suffix, which stands for the main fork.
+/// A suffix other than "fsm", "vm" or "init" yields
+/// `FilePathError::InvalidForkName`.
+pub fn forkname_to_number(forkname: Option<&str>) -> Result<u8, FilePathError> {
+    let suffix = match forkname {
+        Some(suffix) => suffix,
+        // "main" never appears in file names; a missing suffix implies it.
+        None => return Ok(pg_constants::MAIN_FORKNUM),
+    };
+
+    let forknum = match suffix {
+        "fsm" => pg_constants::FSM_FORKNUM,
+        "vm" => pg_constants::VISIBILITYMAP_FORKNUM,
+        "init" => pg_constants::INIT_FORKNUM,
+        _ => return Err(FilePathError::InvalidForkName),
+    };
+    Ok(forknum)
+}
+
+/// Map a fork number to the suffix used in relation data file names.
+///
+/// The main fork has no suffix, hence `None`. A fork number that matches none
+/// of the known constants maps to the placeholder "UNKNOWN FORKNUM".
+pub fn forknumber_to_name(forknum: u8) -> Option<&'static str> {
+    let suffix = match forknum {
+        pg_constants::MAIN_FORKNUM => return None,
+        pg_constants::FSM_FORKNUM => "fsm",
+        pg_constants::VISIBILITYMAP_FORKNUM => "vm",
+        pg_constants::INIT_FORKNUM => "init",
+        _ => "UNKNOWN FORKNUM",
+    };
+    Some(suffix)
+}
+
+///
+/// Parse a filename of a relation file. Returns (relfilenode, forknum, segno) tuple.
+///
+/// Formats:
+/// <oid>
+/// <oid>_<fork name>
+/// <oid>.<segment number>
+/// <oid>_<fork name>.<segment number>
+///
+/// A missing fork name means the main fork and a missing segment number means
+/// segment 0.
+///
+/// Errors:
+/// - `FilePathError::InvalidFileName` if the name does not match the pattern
+///   or a numeric part does not fit in `u32`.
+/// - `FilePathError::InvalidForkName` if the fork suffix is not recognized.
+///
+/// See functions relpath() and _mdfd_segpath() in PostgreSQL sources.
+///
+pub fn parse_relfilename(fname: &str) -> Result<(u32, u8, u32), FilePathError> {
+    lazy_static! {
+        static ref RELFILE_RE: Regex =
+            Regex::new(r"^(?P<relnode>\d+)(_(?P<forkname>[a-z]+))?(\.(?P<segno>\d+))?$").unwrap();
+    }
+    let caps = RELFILE_RE
+        .captures(fname)
+        .ok_or(FilePathError::InvalidFileName)?;
+
+    // "relnode" is a mandatory group of the pattern, so once the regex has
+    // matched, indexing by its name cannot panic.
+    let relnode = caps["relnode"].parse::<u32>()?;
+
+    let forknum = forkname_to_number(caps.name("forkname").map(|m| m.as_str()))?;
+
+    // The segment number is optional; its absence means the first segment.
+    let segno = match caps.name("segno") {
+        Some(m) => m.as_str().parse::<u32>()?,
+        None => 0,
+    };
+
+    Ok((relnode, forknum, segno))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Every listed name must parse to the paired (relfilenode, forknum, segno).
+    fn assert_parses(cases: &[(&str, (u32, u8, u32))]) {
+        for (fname, expected) in cases {
+            assert_eq!(parse_relfilename(fname), Ok(*expected));
+        }
+    }
+
+    /// Every listed name must be rejected with the paired error.
+    fn assert_rejected(cases: &[(&str, FilePathError)]) {
+        for (fname, expected) in cases {
+            assert_eq!(parse_relfilename(fname), Err(expected.clone()));
+        }
+    }
+
+    #[test]
+    fn test_parse_valid_relfilenames() {
+        assert_parses(&[
+            ("1234", (1234, 0, 0)),
+            ("1234_fsm", (1234, 1, 0)),
+            ("1234_vm", (1234, 2, 0)),
+            ("1234_init", (1234, 3, 0)),
+            ("1234.12", (1234, 0, 12)),
+            ("1234_fsm.12", (1234, 1, 12)),
+            ("1234_vm.12", (1234, 2, 12)),
+            ("1234_init.12", (1234, 3, 12)),
+            // relfilenode is unsigned, so it can go up to 2^32-1
+            ("3147483648", (3147483648, 0, 0)),
+        ]);
+    }
+
+    #[test]
+    fn test_parse_invalid_relfilenames() {
+        assert_rejected(&[
+            ("foo", FilePathError::InvalidFileName),
+            ("1.2.3", FilePathError::InvalidFileName),
+            ("1234_invalid", FilePathError::InvalidForkName),
+            ("1234_", FilePathError::InvalidFileName),
+            // too large for u32
+            ("12345678901", FilePathError::InvalidFileName),
+            ("-1234", FilePathError::InvalidFileName),
+        ]);
+    }
+
+    #[test]
+    fn test_parse_weird_relfilenames() {
+        assert_parses(&[
+            // we accept 0 for the relfilenode, but PostgreSQL should never do that.
+            ("0", (0, 0, 0)),
+            // PostgreSQL has a limit of 2^32-2 blocks in a table. With 8k block size and
+            // 1 GB segments, the max segment number is 32767. But we accept larger values
+            // currently.
+            ("1.123456", (1, 0, 123456)),
+        ]);
+    }
+}
--- a/postgres_ffi/src/xlog_utils.rs
+++ b/postgres_ffi/src/xlog_utils.rs
@@ -7,7 +7,18 @@
 // have been named the same as the corresponding PostgreSQL functions instead.
 //

+use crate::pg_constants;
+use crate::CheckPoint;
+use crate::ControlFileData;
+use crate::FullTransactionId;
+use crate::XLogLongPageHeaderData;
+use crate::XLogPageHeaderData;
+use crate::XLogRecord;
+use crate::XLOG_PAGE_MAGIC;
+
 use byteorder::{ByteOrder, LittleEndian};
+use bytes::{Buf, Bytes};
+use bytes::{BufMut, BytesMut};
 use crc32c::*;
 use log::*;
 use std::cmp::min;
@@ -19,32 +30,27 @@ use std::time::SystemTime;
 pub const XLOG_FNAME_LEN: usize = 24;
 pub const XLOG_BLCKSZ: usize = 8192;
 pub const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001;
-pub const XLOG_PAGE_MAGIC: u16 = 0xD109;
 pub const XLP_REM_LEN_OFFS: usize = 2 + 2 + 4 + 8;
-pub const XLOG_SIZE_OF_XLOG_SHORT_PHD: usize = XLP_REM_LEN_OFFS + 4 + 4;
-pub const XLOG_SIZE_OF_XLOG_LONG_PHD: usize = XLOG_SIZE_OF_XLOG_SHORT_PHD + 8 + 4 + 4;
 pub const XLOG_RECORD_CRC_OFFS: usize = 4 + 4 + 8 + 1 + 1 + 2;
-pub const XLOG_SIZE_OF_XLOG_RECORD: usize = XLOG_RECORD_CRC_OFFS + 4;
+pub const MAX_SEND_SIZE: usize = XLOG_BLCKSZ * 16;
+
+pub const XLOG_SIZE_OF_XLOG_SHORT_PHD: usize = std::mem::size_of::<XLogPageHeaderData>();
+pub const XLOG_SIZE_OF_XLOG_LONG_PHD: usize = std::mem::size_of::<XLogLongPageHeaderData>();
+pub const XLOG_SIZE_OF_XLOG_RECORD: usize = std::mem::size_of::<XLogRecord>();
+pub const SIZE_OF_XLOG_RECORD_DATA_HEADER_SHORT: usize = 1 * 2;
+
 pub type XLogRecPtr = u64;
 pub type TimeLineID = u32;
-pub type TimestampTz = u64;
+pub type TimestampTz = i64;
 pub type XLogSegNo = u64;

-#[allow(non_snake_case)]
-pub fn XLogSegmentOffset(xlogptr: XLogRecPtr, wal_segsz_bytes: usize) -> u32 {
-    (xlogptr as u32) & (wal_segsz_bytes as u32 - 1)
-}
+const XID_CHECKPOINT_INTERVAL: u32 = 1024;

 #[allow(non_snake_case)]
 pub fn XLogSegmentsPerXLogId(wal_segsz_bytes: usize) -> XLogSegNo {
    (0x100000000u64 / wal_segsz_bytes as u64) as XLogSegNo
 }

-#[allow(non_snake_case)]
-pub fn XLByteToSeg(xlogptr: XLogRecPtr, wal_segsz_bytes: usize) -> XLogSegNo {
-    xlogptr / wal_segsz_bytes as u64
-}
-
 #[allow(non_snake_case)]
 pub fn XLogSegNoOffsetToRecPtr(
    segno: XLogSegNo,
@@ -89,9 +95,9 @@ pub fn get_current_timestamp() -> TimestampTz {
    const USECS_PER_SEC: u64 = 1000000;
    match SystemTime::now().duration_since(SystemTime::UNIX_EPOCH) {
        Ok(n) => {
-            (n.as_secs() - ((POSTGRES_EPOCH_JDATE - UNIX_EPOCH_JDATE) * SECS_PER_DAY))
+            ((n.as_secs() - ((POSTGRES_EPOCH_JDATE - UNIX_EPOCH_JDATE) * SECS_PER_DAY))
                * USECS_PER_SEC
-                + n.subsec_micros() as u64
+                + n.subsec_micros() as u64) as i64
        }
        Err(_) => panic!("SystemTime before UNIX EPOCH!"),
    }
@@ -126,7 +132,7 @@ fn find_end_of_wal_segment(
            let xlp_magic = LittleEndian::read_u16(&buf[0..2]);
            let xlp_info = LittleEndian::read_u16(&buf[2..4]);
            let xlp_rem_len = LittleEndian::read_u32(&buf[XLP_REM_LEN_OFFS..XLP_REM_LEN_OFFS + 4]);
-            if xlp_magic != XLOG_PAGE_MAGIC {
+            if xlp_magic != XLOG_PAGE_MAGIC as u16 {
                info!("Invalid WAL file {}.partial magic {}", file_name, xlp_magic);
                break;
            }
@@ -205,33 +211,31 @@ pub fn find_end_of_wal(
    let mut high_tli: TimeLineID = 0;
    let mut high_ispartial = false;

-    for entry in fs::read_dir(data_dir).unwrap() {
-        if let Ok(entry) = entry {
-            let ispartial: bool;
-            let entry_name = entry.file_name();
-            let fname = entry_name.to_str().unwrap();
-            /*
-             * Check if the filename looks like an xlog file, or a .partial file.
-             */
-            if IsXLogFileName(fname) {
-                ispartial = false;
-            } else if IsPartialXLogFileName(fname) {
-                ispartial = true;
-            } else {
-                continue;
-            }
-            let (segno, tli) = XLogFromFileName(fname, wal_seg_size);
-            if !ispartial && entry.metadata().unwrap().len() != wal_seg_size as u64 {
-                continue;
-            }
-            if segno > high_segno
-                || (segno == high_segno && tli > high_tli)
-                || (segno == high_segno && tli == high_tli && high_ispartial && !ispartial)
-            {
-                high_segno = segno;
-                high_tli = tli;
-                high_ispartial = ispartial;
-            }
+    for entry in fs::read_dir(data_dir).unwrap().flatten() {
+        let ispartial: bool;
+        let entry_name = entry.file_name();
+        let fname = entry_name.to_str().unwrap();
+        /*
+         * Check if the filename looks like an xlog file, or a .partial file.
+         */
+        if IsXLogFileName(fname) {
+            ispartial = false;
+        } else if IsPartialXLogFileName(fname) {
+            ispartial = true;
+        } else {
+            continue;
+        }
+        let (segno, tli) = XLogFromFileName(fname, wal_seg_size);
+        if !ispartial && entry.metadata().unwrap().len() != wal_seg_size as u64 {
+            continue;
+        }
+        if segno > high_segno
+            || (segno == high_segno && tli > high_tli)
+            || (segno == high_segno && tli == high_tli && high_ispartial && !ispartial)
+        {
+            high_segno = segno;
+            high_tli = tli;
+            high_ispartial = ispartial;
        }
    }
    if high_segno > 0 {
@@ -264,3 +268,186 @@ pub fn main() {
        tli
    );
 }
+
+impl XLogRecord {
+    /// Decode a record header from the front of `buf`, advancing the buffer
+    /// past it.
+    ///
+    /// All multi-byte fields are read as little-endian. The two bytes between
+    /// `xl_rmid` and `xl_crc` are skipped.
+    ///
+    /// Accepts any `Buf` implementation (including `Bytes`), like the
+    /// `from_bytes` constructors of the page header types.
+    ///
+    /// Panics inside the `Buf` accessors if `buf` holds fewer bytes than a
+    /// full header.
+    pub fn from_bytes<B: Buf>(buf: &mut B) -> XLogRecord {
+        XLogRecord {
+            xl_tot_len: buf.get_u32_le(),
+            xl_xid: buf.get_u32_le(),
+            xl_prev: buf.get_u64_le(),
+            xl_info: buf.get_u8(),
+            xl_rmid: buf.get_u8(),
+            xl_crc: {
+                // skip the two padding bytes that precede the CRC
+                buf.advance(2);
+                buf.get_u32_le()
+            },
+        }
+    }
+
+    /// Serialize the header into `XLOG_SIZE_OF_XLOG_RECORD` bytes.
+    ///
+    /// The output mirrors what `from_bytes` reads: little-endian fields, with
+    /// the two padding bytes before `xl_crc` written as zeros. Fields are
+    /// written one by one rather than transmuting the struct, because a
+    /// transmute would read the struct's padding bytes, which are not
+    /// initialized, and would use the host byte order.
+    pub fn encode(&self) -> Bytes {
+        let mut buf = BytesMut::with_capacity(XLOG_SIZE_OF_XLOG_RECORD);
+        buf.put_u32_le(self.xl_tot_len);
+        buf.put_u32_le(self.xl_xid);
+        buf.put_u64_le(self.xl_prev);
+        buf.put_u8(self.xl_info);
+        buf.put_u8(self.xl_rmid);
+        buf.put_u16_le(0); // padding
+        buf.put_u32_le(self.xl_crc);
+        debug_assert_eq!(buf.len(), XLOG_SIZE_OF_XLOG_RECORD);
+        buf.freeze()
+    }
+
+    /// Is this record an XLOG_SWITCH record? They need some special processing.
+    ///
+    /// True when `xl_info` equals `XLOG_SWITCH` exactly and the record belongs
+    /// to the XLOG resource manager.
+    pub fn is_xlog_switch_record(&self) -> bool {
+        self.xl_info == pg_constants::XLOG_SWITCH && self.xl_rmid == pg_constants::RM_XLOG_ID
+    }
+}
+
+impl XLogPageHeaderData {
+    /// Decode a short page header from the front of `buf`, advancing the
+    /// buffer past the header and its four trailing padding bytes.
+    ///
+    /// All fields are read as little-endian.
+    pub fn from_bytes<B: Buf>(buf: &mut B) -> XLogPageHeaderData {
+        let xlp_magic = buf.get_u16_le();
+        let xlp_info = buf.get_u16_le();
+        let xlp_tli = buf.get_u32_le();
+        let xlp_pageaddr = buf.get_u64_le();
+        let xlp_rem_len = buf.get_u32_le();
+        // consume the padding that follows xlp_rem_len
+        let _padding = buf.get_u32_le();
+
+        XLogPageHeaderData {
+            xlp_magic,
+            xlp_info,
+            xlp_tli,
+            xlp_pageaddr,
+            xlp_rem_len,
+        }
+    }
+}
+
+impl XLogLongPageHeaderData {
+    /// Decode a long page header from the front of `buf`: the short header
+    /// (including its padding) followed by the system identifier, the segment
+    /// size and the block size, all little-endian.
+    pub fn from_bytes<B: Buf>(buf: &mut B) -> XLogLongPageHeaderData {
+        XLogLongPageHeaderData {
+            std: XLogPageHeaderData::from_bytes(buf),
+            xlp_sysid: buf.get_u64_le(),
+            xlp_seg_size: buf.get_u32_le(),
+            xlp_xlog_blcksz: buf.get_u32_le(),
+        }
+    }
+
+    /// Serialize the header into `XLOG_SIZE_OF_XLOG_LONG_PHD` bytes.
+    ///
+    /// The output mirrors what `from_bytes` reads: little-endian fields, with
+    /// the four padding bytes after `xlp_rem_len` written as zeros. Fields are
+    /// written one by one rather than transmuting the struct, because a
+    /// transmute would read the struct's padding bytes, which are not
+    /// initialized, and would use the host byte order.
+    pub fn encode(&self) -> Bytes {
+        let mut buf = BytesMut::with_capacity(XLOG_SIZE_OF_XLOG_LONG_PHD);
+        buf.put_u16_le(self.std.xlp_magic);
+        buf.put_u16_le(self.std.xlp_info);
+        buf.put_u32_le(self.std.xlp_tli);
+        buf.put_u64_le(self.std.xlp_pageaddr);
+        buf.put_u32_le(self.std.xlp_rem_len);
+        buf.put_u32_le(0); // padding
+        buf.put_u64_le(self.xlp_sysid);
+        buf.put_u32_le(self.xlp_seg_size);
+        buf.put_u32_le(self.xlp_xlog_blcksz);
+        debug_assert_eq!(buf.len(), XLOG_SIZE_OF_XLOG_LONG_PHD);
+        buf.freeze()
+    }
+}
+
+pub const SIZEOF_CHECKPOINT: usize = std::mem::size_of::<CheckPoint>();
+
+impl CheckPoint {
+    /// Build a checkpoint for a fresh timeline.
+    ///
+    /// `lsn` becomes the redo pointer and `timeline` is used as both the
+    /// current and the previous timeline ID. The XID and OID counters start at
+    /// the first normal transaction ID and the first bootstrap object ID; the
+    /// remaining fields get fixed initial values.
+    pub fn new(lsn: u64, timeline: u32) -> CheckPoint {
+        CheckPoint {
+            redo: lsn,
+            ThisTimeLineID: timeline,
+            PrevTimeLineID: timeline,
+            fullPageWrites: true, // TODO: get actual value of full_page_writes
+            nextXid: FullTransactionId {
+                value: pg_constants::FIRST_NORMAL_TRANSACTION_ID as u64,
+            }, // TODO: handle epoch?
+            nextOid: pg_constants::FIRST_BOOTSTRAP_OBJECT_ID,
+            nextMulti: 1,
+            nextMultiOffset: 0,
+            oldestXid: pg_constants::FIRST_NORMAL_TRANSACTION_ID,
+            oldestXidDB: 0,
+            oldestMulti: 1,
+            oldestMultiDB: 0,
+            time: 0,
+            oldestCommitTsXid: 0,
+            newestCommitTsXid: 0,
+            oldestActiveXid: pg_constants::INVALID_TRANSACTION_ID,
+        }
+    }
+
+    /// Return the in-memory representation of the struct as
+    /// `SIZEOF_CHECKPOINT` bytes.
+    ///
+    /// NOTE(review): this copies the struct verbatim, so the result uses the
+    /// host byte order and includes whatever padding the struct layout has —
+    /// confirm this is acceptable for all targets.
+    pub fn encode(&self) -> Bytes {
+        let b = unsafe { std::mem::transmute::<CheckPoint, [u8; SIZEOF_CHECKPOINT]>(*self) };
+        Bytes::copy_from_slice(&b[..])
+    }
+
+    /// Reconstruct a checkpoint from the first `SIZEOF_CHECKPOINT` bytes of
+    /// `buf`; any bytes beyond that are ignored.
+    ///
+    /// Returns an error, instead of panicking, when `buf` is too short.
+    ///
+    /// NOTE(review): the bytes are reinterpreted without validation; the
+    /// struct contains a `bool` field (`fullPageWrites`), so callers must only
+    /// pass data produced by `encode` or by PostgreSQL itself.
+    pub fn decode(buf: &[u8]) -> Result<CheckPoint, anyhow::Error> {
+        if buf.len() < SIZEOF_CHECKPOINT {
+            anyhow::bail!(
+                "checkpoint data is too short: {} bytes, expected at least {}",
+                buf.len(),
+                SIZEOF_CHECKPOINT
+            );
+        }
+        let mut b = [0u8; SIZEOF_CHECKPOINT];
+        b.copy_from_slice(&buf[..SIZEOF_CHECKPOINT]);
+        let checkpoint = unsafe { std::mem::transmute::<[u8; SIZEOF_CHECKPOINT], CheckPoint>(b) };
+        Ok(checkpoint)
+    }
+
+    // Update next XID based on provided new_xid and stored epoch.
+    // Next XID should be greater than new_xid.
+    // Also take in account 32-bit wrap-around.
+    pub fn update_next_xid(&mut self, xid: u32) {
+        // Round up to a multiple of XID_CHECKPOINT_INTERVAL (wrapping at 2^32).
+        let xid = xid.wrapping_add(XID_CHECKPOINT_INTERVAL - 1) & !(XID_CHECKPOINT_INTERVAL - 1);
+        let full_xid = self.nextXid.value;
+        // `xid` is a multiple of the interval here, so `xid + 1` cannot overflow.
+        let new_xid = std::cmp::max(xid + 1, pg_constants::FIRST_NORMAL_TRANSACTION_ID);
+        // Low 32 bits of the stored value are the XID, high 32 bits the epoch.
+        let old_xid = full_xid as u32;
+        // Advance only if new_xid is ahead of old_xid in modulo-2^32 order.
+        if new_xid.wrapping_sub(old_xid) as i32 > 0 {
+            let mut epoch = full_xid >> 32;
+            if new_xid < old_xid {
+                // wrap-around
+                epoch += 1;
+            }
+            self.nextXid = FullTransactionId {
+                value: (epoch << 32) | new_xid as u64,
+            };
+        }
+    }
+}
+
+//
+// Generate a new WAL segment containing a single XLOG_CHECKPOINT_SHUTDOWN record.
+// We need this segment to start a compute node.
+// In order to minimize changes in Postgres core, we prefer to provide a WAL
+// segment from which it can extract the checkpoint record in the standard way,
+// rather than implement some alternative mechanism.
+//
+// Layout of the result: long page header, record header (with CRC), short
+// data header, checkpoint data, then zeros up to WAL_SEGMENT_SIZE.
+//
+pub fn generate_wal_segment(pg_control: &ControlFileData) -> Bytes {
+    let page_hdr = XLogLongPageHeaderData {
+        std: XLogPageHeaderData {
+            xlp_magic: XLOG_PAGE_MAGIC as u16,
+            xlp_info: pg_constants::XLP_LONG_HEADER,
+            xlp_tli: 1, // FIXME: always use Postgres timeline 1
+            xlp_pageaddr: pg_control.checkPoint - XLOG_SIZE_OF_XLOG_LONG_PHD as u64,
+            xlp_rem_len: 0,
+        },
+        xlp_sysid: pg_control.system_identifier,
+        xlp_seg_size: pg_constants::WAL_SEGMENT_SIZE as u32,
+        xlp_xlog_blcksz: XLOG_BLCKSZ as u32,
+    };
+
+    // Record header; the CRC field is a placeholder, the real value is
+    // computed below and written separately.
+    let total_len =
+        XLOG_SIZE_OF_XLOG_RECORD + SIZE_OF_XLOG_RECORD_DATA_HEADER_SHORT + SIZEOF_CHECKPOINT;
+    let rec_bytes = XLogRecord {
+        xl_tot_len: total_len as u32,
+        xl_xid: 0, // 0 is for InvalidTransactionId
+        xl_prev: 0,
+        xl_info: pg_constants::XLOG_CHECKPOINT_SHUTDOWN,
+        xl_rmid: pg_constants::RM_XLOG_ID,
+        xl_crc: 0,
+    }
+    .encode();
+    let rec_hdr_before_crc = &rec_bytes[0..XLOG_RECORD_CRC_OFFS];
+
+    // Short data header: block id marker followed by the payload length.
+    let mut data_hdr = BytesMut::new();
+    data_hdr.put_u8(pg_constants::XLR_BLOCK_ID_DATA_SHORT);
+    data_hdr.put_u8(SIZEOF_CHECKPOINT as u8);
+
+    let checkpoint_bytes = pg_control.checkPointCopy.encode();
+
+    // Record checksum: the data header and payload first, then the record
+    // header up to (not including) the CRC field.
+    let crc = crc32c_append(0, &data_hdr[..]);
+    let crc = crc32c_append(crc, &checkpoint_bytes[..]);
+    let crc = crc32c_append(crc, rec_hdr_before_crc);
+
+    let mut seg_buf = BytesMut::with_capacity(pg_constants::WAL_SEGMENT_SIZE);
+    seg_buf.extend_from_slice(&page_hdr.encode());
+    seg_buf.extend_from_slice(rec_hdr_before_crc);
+    seg_buf.put_u32_le(crc);
+    seg_buf.extend_from_slice(&data_hdr);
+    seg_buf.extend_from_slice(&checkpoint_bytes);
+
+    // zero out the rest of the file
+    seg_buf.resize(pg_constants::WAL_SEGMENT_SIZE, 0);
+    seg_buf.freeze()
+}
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -0,0 +1,21 @@
+[package]
+name = "proxy"
+version = "0.1.0"
+authors = ["Stas Kelvich <stas.kelvich@gmail.com>"]
+edition = "2018"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+anyhow = "1.0"
+bytes = { version = "1.0.1", features = ['serde'] }
+md5 = "0.7.0"
+rand = "0.8.3"
+hex = "0.4.3"
+serde = "1"
+serde_json = "1"
+tokio = { version = "1.7.1", features = ["full"] }
+tokio-postgres = "0.7.2"
+clap = "2.33.0"
+
+zenith_utils = { path = "../zenith_utils" }
--- a/proxy/src/cplane_api.rs
+++ b/proxy/src/cplane_api.rs
@@ -0,0 +1,92 @@
+use anyhow::{bail, Result};
+use serde::{Deserialize, Serialize};
+use std::{
+    collections::HashMap,
+    net::{IpAddr, SocketAddr},
+};
+
+/// Handle for talking to the control plane API.
+///
+/// Currently a mock: it carries no state (the address field is commented out).
+pub struct CPlaneApi {
+    // address: SocketAddr,
+}
+
+/// Address and credentials of the Postgres database that a client connection
+/// is forwarded to. Serializable so it can be carried in JSON messages.
+#[derive(Serialize, Deserialize)]
+pub struct DatabaseInfo {
+    pub host: IpAddr, // TODO: allow host name here too
+    pub port: u16,
+    pub dbname: String,
+    pub user: String,
+    pub password: String,
+}
+
+impl DatabaseInfo {
+    /// Socket address (`host`:`port`) of the database server.
+    pub fn socket_addr(&self) -> SocketAddr {
+        SocketAddr::new(self.host, self.port)
+    }
+
+    /// Build a key/value connection string carrying the database name, user
+    /// and password (host and port are not included; see `socket_addr`).
+    ///
+    /// Every value is wrapped in single quotes, with embedded single quotes
+    /// and backslashes escaped by a backslash. Without this, a value that is
+    /// empty or contains whitespace or quotes would corrupt the string or
+    /// inject extra options into it.
+    pub fn conn_string(&self) -> String {
+        /// Quote one connection-string value.
+        fn quote(value: &str) -> String {
+            let mut quoted = String::with_capacity(value.len() + 2);
+            quoted.push('\'');
+            for ch in value.chars() {
+                if ch == '\'' || ch == '\\' {
+                    quoted.push('\\');
+                }
+                quoted.push(ch);
+            }
+            quoted.push('\'');
+            quoted
+        }
+
+        format!(
+            "dbname={} user={} password={}",
+            quote(&self.dbname),
+            quote(&self.user),
+            quote(&self.password)
+        )
+    }
+}
+
+// mock cplane api
+impl CPlaneApi {
+    /// Create the API handle. The address is currently ignored (mock).
+    pub fn new(_address: &SocketAddr) -> CPlaneApi {
+        CPlaneApi {
+            // address: address.clone(),
+        }
+    }
+
+    /// Verify an MD5 password response against the built-in user table.
+    ///
+    /// The expected response is "md5" followed by the hex MD5 digest of the
+    /// stored hash concatenated with `salt`.
+    ///
+    /// Returns an error if the user is unknown, if `md5_response` is not valid
+    /// UTF-8, or if the response does not match.
+    pub fn check_auth(&self, user: &str, md5_response: &[u8], salt: &[u8; 4]) -> Result<()> {
+        // the password for both users is "mypass"
+        let auth_map: HashMap<_, &str> = vec![
+            ("stas@zenith", "716ee6e1c4a9364d66285452c47402b1"),
+            ("stas2@zenith", "3996f75df64c16a8bfaf01301b61d582"),
+        ]
+        .into_iter()
+        .collect();
+
+        let stored_hash = auth_map
+            .get(&user)
+            .ok_or_else(|| anyhow::Error::msg("user not found"))?;
+        let salted_stored_hash = format!(
+            "md5{:x}",
+            md5::compute([stored_hash.as_bytes(), salt].concat())
+        );
+
+        let received_hash = std::str::from_utf8(md5_response)?;
+        let matched = received_hash == salted_stored_hash;
+
+        // Log the outcome only. The stored hash, the salted hash and the
+        // client's response are credential material and must not be printed.
+        println!(
+            "auth: {} {}",
+            user,
+            if matched { "ok" } else { "failed" }
+        );
+
+        if matched {
+            Ok(())
+        } else {
+            bail!("Auth failed")
+        }
+    }
+
+    /// Return the database a user should be connected to.
+    ///
+    /// Mock: both arguments are ignored and a fixed local database is returned.
+    pub fn get_database_uri(&self, _user: &str, _database: &str) -> Result<DatabaseInfo> {
+        Ok(DatabaseInfo {
+            host: "127.0.0.1".parse()?,
+            port: 5432,
+            dbname: "stas".to_string(),
+            user: "stas".to_string(),
+            password: "mypass".to_string(),
+        })
+    }
+
+    // pub fn create_database(&self, _user: &String, _database: &String) -> Result<DatabaseInfo> {
+    //     Ok(DatabaseInfo {
+    //         host: "127.0.0.1".parse()?,
+    //         port: 5432,
+    //         dbname: "stas".to_string(),
+    //         user: "stas".to_string(),
+    //         password: "mypass".to_string(),
+    //     })
+    // }
+}
--- a/proxy/src/main.rs
+++ b/proxy/src/main.rs
@@ -0,0 +1,106 @@
+///
+/// Postgres protocol proxy/router.
+///
+/// This service listens on the psql port, can check auth via an external service
+/// (the control plane API in our case), and can create new databases and accounts
+/// in a somewhat transparent manner (again by communicating with the control plane API).
+///
+use std::{
+    collections::HashMap,
+    net::{SocketAddr, TcpListener},
+    sync::{mpsc, Mutex},
+    thread,
+};
+
+use clap::{App, Arg};
+
+use cplane_api::DatabaseInfo;
+
+mod cplane_api;
+mod mgmt;
+mod proxy;
+
+/// Static configuration of the proxy, filled in from command-line arguments
+/// (and one hard-coded value) in `main`.
+pub struct ProxyConf {
+    /// main entrypoint for users to connect to
+    pub proxy_address: SocketAddr,
+
+    /// http management endpoint. Upon user account creation control plane
+    /// will notify us here, so that we can 'unfreeze' user session.
+    pub mgmt_address: SocketAddr,
+
+    /// send unauthenticated users to this URI
+    pub redirect_uri: String,
+
+    /// control plane address where we would check auth.
+    pub cplane_address: SocketAddr,
+}
+
+/// State shared by all proxy and mgmt threads for the lifetime of the process.
+pub struct ProxyState {
+    /// Configuration the process was started with.
+    pub conf: ProxyConf,
+    /// Sessions waiting for a control plane response, keyed by psql session
+    /// id. The value is the channel on which the result is delivered to the
+    /// waiting connection thread.
+    pub waiters: Mutex<HashMap<String, mpsc::Sender<anyhow::Result<DatabaseInfo>>>>,
+}
+
+/// Entry point: parse the command line, bind both listening sockets, then run
+/// the proxy and mgmt listener threads until one of them fails.
+fn main() -> anyhow::Result<()> {
+    let proxy_arg = Arg::with_name("proxy")
+        .short("p")
+        .long("proxy")
+        .takes_value(true)
+        .help("listen for incoming client connections on ip:port")
+        .default_value("127.0.0.1:4432");
+    let mgmt_arg = Arg::with_name("mgmt")
+        .short("m")
+        .long("mgmt")
+        .takes_value(true)
+        .help("listen for management callback connection on ip:port")
+        .default_value("127.0.0.1:7000");
+    let uri_arg = Arg::with_name("uri")
+        .short("u")
+        .long("uri")
+        .takes_value(true)
+        .help("redirect unauthenticated users to given uri")
+        .default_value("http://localhost:3000/psql_session/");
+
+    let arg_matches = App::new("Zenith proxy/router")
+        .arg(proxy_arg)
+        .arg(mgmt_arg)
+        .arg(uri_arg)
+        .get_matches();
+
+    // All three arguments have defaults, so `value_of` always returns Some.
+    let proxy_address = arg_matches.value_of("proxy").unwrap().parse()?;
+    let mgmt_address = arg_matches.value_of("mgmt").unwrap().parse()?;
+    let redirect_uri = arg_matches.value_of("uri").unwrap().to_string();
+    let cplane_address = "127.0.0.1:3000".parse()?;
+
+    // The state is leaked on purpose: it has to outlive every spawned thread.
+    let state: &'static ProxyState = Box::leak(Box::new(ProxyState {
+        conf: ProxyConf {
+            proxy_address,
+            mgmt_address,
+            redirect_uri,
+            cplane_address,
+        },
+        waiters: Mutex::new(HashMap::new()),
+    }));
+
+    // Check that we can bind to address before further initialization
+    println!("Starting proxy on {}", state.conf.proxy_address);
+    let proxy_listener = TcpListener::bind(state.conf.proxy_address)?;
+
+    println!("Starting mgmt on {}", state.conf.mgmt_address);
+    let mgmt_listener = TcpListener::bind(state.conf.mgmt_address)?;
+
+    // Each listener thread spawns further threads, one per connection.
+    let proxy_thread = thread::Builder::new()
+        .name("Proxy thread".into())
+        .spawn(move || proxy::thread_main(state, proxy_listener))?;
+    let mgmt_thread = thread::Builder::new()
+        .name("Mgmt thread".into())
+        .spawn(move || mgmt::thread_main(state, mgmt_listener))?;
+
+    proxy_thread.join().unwrap()?;
+    mgmt_thread.join().unwrap()?;
+
+    Ok(())
+}
--- a/proxy/src/mgmt.rs
+++ b/proxy/src/mgmt.rs
@@ -0,0 +1,111 @@
+use std::{
+    net::{TcpListener, TcpStream},
+    thread,
+};
+
+use anyhow::bail;
+use bytes::Bytes;
+use serde::{Deserialize, Serialize};
+use zenith_utils::{
+    postgres_backend::{self, query_from_cstring, PostgresBackend},
+    pq_proto::{BeMessage, SINGLE_COL_ROWDESC},
+};
+
+use crate::{cplane_api::DatabaseInfo, ProxyState};
+
+///
+/// Main mgmt listener loop.
+///
+/// Accepts management connections and hands each one to its own handler
+/// thread. Returns only when `accept` fails.
+///
+pub fn thread_main(state: &'static ProxyState, listener: TcpListener) -> anyhow::Result<()> {
+    loop {
+        let (socket, peer_addr) = listener.accept()?;
+        println!("accepted connection from {}", peer_addr);
+        socket.set_nodelay(true).unwrap();
+
+        // Errors of a single connection are logged and do not stop the loop.
+        thread::spawn(move || match mgmt_conn_main(state, socket) {
+            Ok(()) => {}
+            Err(err) => println!("error: {}", err),
+        });
+    }
+}
+
+/// Serve one management connection: run a Postgres backend with `Trust`
+/// authentication on `socket`, dispatching queries to a `MgmtHandler`.
+pub fn mgmt_conn_main(state: &'static ProxyState, socket: TcpStream) -> anyhow::Result<()> {
+    let mut handler = MgmtHandler { state };
+    PostgresBackend::new(socket, postgres_backend::AuthType::Trust)?.run(&mut handler)
+}
+
+/// Query handler for management connections; holds the shared proxy state so
+/// it can reach the table of waiting sessions.
+struct MgmtHandler {
+    state: &'static ProxyState,
+}
+/// Message received on the mgmt connection with the outcome for one pending
+/// psql session.
+///
+/// Serialized examples:
+///
+/// ```json
+/// {
+///     "session_id": "71d6d03e6d93d99a",
+///     "result": {
+///         "Success": {
+///             "host": "127.0.0.1",
+///             "port": 5432,
+///             "dbname": "stas",
+///             "user": "stas",
+///             "password": "mypass"
+///         }
+///     }
+/// }
+/// {
+///     "session_id": "71d6d03e6d93d99a",
+///     "result": {
+///         "Failure": "oops"
+///     }
+/// }
+/// ```
+#[derive(Serialize, Deserialize)]
+pub struct PsqlSessionResponse {
+    session_id: String,
+    result: PsqlSessionResult,
+}
+
+/// Outcome carried by a `PsqlSessionResponse`.
+#[derive(Serialize, Deserialize)]
+pub enum PsqlSessionResult {
+    /// The database is ready; carries its connection details.
+    Success(DatabaseInfo),
+    /// The request failed; carries the error message.
+    Failure(String),
+}
+
+impl postgres_backend::Handler for MgmtHandler {
+    /// Handle one mgmt "query": a JSON-encoded `PsqlSessionResponse`.
+    ///
+    /// The result is forwarded to the connection thread waiting on the named
+    /// session. On success a single-row "ok" result is sent back to the mgmt
+    /// client; on failure the error is forwarded to the waiter and also
+    /// returned from this function.
+    ///
+    /// Errors: invalid UTF-8 or JSON in the query, unknown session id, a
+    /// waiter that has already gone away, or a write failure on `pgb`.
+    fn process_query(
+        &mut self,
+        pgb: &mut PostgresBackend,
+        query_string: Bytes,
+    ) -> anyhow::Result<()> {
+        let query_string = query_from_cstring(query_string);
+
+        println!("Got mgmt query: '{}'", std::str::from_utf8(&query_string)?);
+
+        let resp: PsqlSessionResponse = serde_json::from_slice(&query_string)?;
+
+        // Take the sender out of the table. This drops the entry once the
+        // session has been answered (entries used to stay in the map forever)
+        // and releases the lock at the end of this statement, so it is not
+        // held during the channel send and socket writes below.
+        let sender = self
+            .state
+            .waiters
+            .lock()
+            .unwrap()
+            .remove(&resp.session_id)
+            .ok_or_else(|| anyhow::Error::msg("psql_session_id is not found"))?;
+
+        match resp.result {
+            PsqlSessionResult::Success(db_info) => {
+                sender.send(Ok(db_info))?;
+
+                pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?
+                    .write_message_noflush(&BeMessage::DataRow(&[Some(b"ok")]))?
+                    .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
+                pgb.flush()?;
+                Ok(())
+            }
+
+            PsqlSessionResult::Failure(message) => {
+                sender.send(Err(anyhow::Error::msg(message.clone())))?;
+
+                bail!("psql session request failed: {}", message)
+            }
+        }
+    }
+}
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -0,0 +1,256 @@
+use crate::cplane_api::CPlaneApi;
+use crate::cplane_api::DatabaseInfo;
+use crate::ProxyState;
+
+use anyhow::bail;
+use tokio_postgres::NoTls;
+
+use rand::Rng;
+use std::sync::mpsc::channel;
+use std::thread;
+use tokio::io::AsyncWriteExt;
+use zenith_utils::postgres_backend::{PostgresBackend, ProtoState};
+use zenith_utils::pq_proto::*;
+use zenith_utils::{postgres_backend, pq_proto::BeMessage};
+
+///
+/// Main proxy listener loop.
+///
+/// Accepts client connections and hands each one to its own handler thread.
+/// Returns only when `accept` fails.
+///
+pub fn thread_main(
+    state: &'static ProxyState,
+    listener: std::net::TcpListener,
+) -> anyhow::Result<()> {
+    loop {
+        let (socket, peer_addr) = listener.accept()?;
+        println!("accepted connection from {}", peer_addr);
+        socket.set_nodelay(true).unwrap();
+
+        // Errors of a single connection are logged and do not stop the loop.
+        thread::spawn(move || match proxy_conn_main(state, socket) {
+            Ok(()) => {}
+            Err(err) => println!("error: {}", err),
+        });
+    }
+}
+
+// Per-connection state of one client session handled by the proxy.
+// XXX: clean up fields
+struct ProxyConnection {
+    // shared, process-wide proxy state
+    state: &'static ProxyState,
+
+    // control plane API handle used for auth checks and database lookup
+    cplane: CPlaneApi,
+
+    // "user" and "database" parameters from the startup packet;
+    // empty until handle_startup has seen a normal startup message
+    user: String,
+    database: String,
+
+    // protocol backend wrapping the client socket
+    pgb: PostgresBackend,
+    // salt sent in the MD5 challenge; randomized in handle_existing_user
+    md5_salt: [u8; 4],
+
+    // random hex id identifying this session to the web console;
+    // set in handle_new_user
+    psql_session_id: String,
+}
+
+/// Serve one client connection from startup to the end of the proxied session.
+///
+/// Reads the startup packet, obtains the target database either by MD5
+/// authentication (existing users) or by waiting for the web console (new
+/// users), then relays bytes between the client and that database until
+/// either side closes.
+pub fn proxy_conn_main(
+    state: &'static ProxyState,
+    socket: std::net::TcpStream,
+) -> anyhow::Result<()> {
+    let cplane = CPlaneApi::new(&state.conf.cplane_address);
+    let pgb = PostgresBackend::new(socket, postgres_backend::AuthType::MD5)?;
+    let mut conn = ProxyConnection {
+        state,
+        cplane,
+        user: String::new(),
+        database: String::new(),
+        pgb,
+        md5_salt: [0u8; 4],
+        psql_session_id: String::new(),
+    };
+
+    // Check StartupMessage.
+    // This fills in conn.user and conn.database, which decide the next step.
+    conn.handle_startup()?;
+
+    // both scenarios end up producing the database connection details
+    let db_info = if conn.is_existing_user() {
+        conn.handle_existing_user()?
+    } else {
+        conn.handle_new_user()?
+    };
+
+    // ok, pass the user connection through to the database on a
+    // single-threaded runtime owned by this connection thread
+    let runtime = tokio::runtime::Builder::new_current_thread()
+        .enable_all()
+        .build()
+        .unwrap();
+    runtime.block_on(proxy_pass(conn.pgb, db_info))?;
+
+    println!("proxy_conn_main done;");
+
+    Ok(())
+}
+
+impl ProxyConnection {
+    /// Users whose name ends with "@zenith" are treated as existing users and
+    /// must authenticate with a password.
+    fn is_existing_user(&self) -> bool {
+        self.user.ends_with("@zenith")
+    }
+
+    /// Read messages until the startup handshake is complete.
+    ///
+    /// SSL/GSS negotiation requests are answered with `Negotiate` and the loop
+    /// continues. A normal startup message must carry both "user" and
+    /// "database", which are stored on `self`. A cancel request ends the loop
+    /// without setting them.
+    ///
+    /// Errors if the connection closes, a required parameter is missing, or
+    /// any other message type arrives.
+    fn handle_startup(&mut self) -> anyhow::Result<()> {
+        loop {
+            let msg = self.pgb.read_message()?;
+            println!("got message {:?}", msg);
+            match msg {
+                Some(FeMessage::StartupMessage(m)) => {
+                    println!("got startup message {:?}", m);
+
+                    match m.kind {
+                        StartupRequestCode::NegotiateGss | StartupRequestCode::NegotiateSsl => {
+                            println!("SSL requested");
+                            self.pgb.write_message(&BeMessage::Negotiate)?;
+                        }
+                        StartupRequestCode::Normal => {
+                            self.user = m
+                                .params
+                                .get("user")
+                                .ok_or_else(|| {
+                                    anyhow::Error::msg("user is required in startup packet")
+                                })?
+                                .into();
+                            self.database = m
+                                .params
+                                .get("database")
+                                .ok_or_else(|| {
+                                    anyhow::Error::msg("database is required in startup packet")
+                                })?
+                                .into();
+
+                            break;
+                        }
+                        StartupRequestCode::Cancel => break,
+                    }
+                }
+                None => {
+                    bail!("connection closed")
+                }
+                unexpected => {
+                    bail!("unexpected message type : {:?}", unexpected)
+                }
+            }
+        }
+        Ok(())
+    }
+
+    /// Authenticate an existing user with an MD5 challenge and look up the
+    /// database to connect to.
+    ///
+    /// Sends a fresh random salt, then requires the next message to be a
+    /// password message whose content passes `check_auth_md5`. Any other
+    /// outcome (connection closed, different message type, wrong password)
+    /// is an error, so database details are never returned to an
+    /// unauthenticated client.
+    fn handle_existing_user(&mut self) -> anyhow::Result<DatabaseInfo> {
+        // ask password
+        rand::thread_rng().fill(&mut self.md5_salt);
+        self.pgb
+            .write_message(&BeMessage::AuthenticationMD5Password(&self.md5_salt))?;
+        self.pgb.state = ProtoState::Authentication; // XXX
+
+        // check password
+        println!("handle_existing_user");
+        let msg = self.pgb.read_message()?;
+        println!("got message {:?}", msg);
+
+        // Anything but a password message must abort the connection here.
+        // Falling through would hand out the database connection without
+        // any authentication having taken place.
+        let m = match msg {
+            Some(FeMessage::PasswordMessage(m)) => m,
+            None => bail!("connection closed"),
+            unexpected => bail!("unexpected message type : {:?}", unexpected),
+        };
+        println!("got password message '{:?}'", m);
+
+        assert!(self.is_existing_user());
+
+        // the password arrives as a C string; strip the trailing NUL
+        let (_trailing_null, md5_response) = m
+            .split_last()
+            .ok_or_else(|| anyhow::Error::msg("unexpected password message"))?;
+
+        if let Err(e) = self.check_auth_md5(md5_response) {
+            self.pgb
+                .write_message(&BeMessage::ErrorResponse(format!("{}", e)))?;
+            bail!("auth failed: {}", e);
+        }
+
+        self.pgb
+            .write_message_noflush(&BeMessage::AuthenticationOk)?;
+        self.pgb
+            .write_message_noflush(&BeMessage::ParameterStatus)?;
+        self.pgb.write_message(&BeMessage::ReadyForQuery)?;
+
+        // ok, we are authorized
+        self.cplane.get_database_uri(&self.user, &self.database)
+    }
+
+    /// Onboard a new user: show a link to the web console and block until the
+    /// mgmt endpoint delivers the result for this session.
+    ///
+    /// A random session id is generated, appended to the redirect URI in the
+    /// greeting notice, and registered in the shared waiters table. The
+    /// registration is removed again once a result (or channel error) has
+    /// been received.
+    fn handle_new_user(&mut self) -> anyhow::Result<DatabaseInfo> {
+        let mut psql_session_id_buf = [0u8; 8];
+        rand::thread_rng().fill(&mut psql_session_id_buf);
+        self.psql_session_id = hex::encode(psql_session_id_buf);
+
+        let hello_message = format!("☀️  Welcome to Zenith!
+
+To proceed with database creation open following link:
+
+    {}{}
+
+It needed to be done once and we will send you '.pgpass' file which will allow you to access or create
+databases without opening the browser.
+
+", self.state.conf.redirect_uri,self.psql_session_id);
+
+        self.pgb
+            .write_message_noflush(&BeMessage::AuthenticationOk)?;
+        self.pgb
+            .write_message_noflush(&BeMessage::ParameterStatus)?;
+        self.pgb
+            .write_message(&BeMessage::NoticeResponse(hello_message))?;
+
+        // await for database creation
+        let (tx, rx) = channel::<anyhow::Result<DatabaseInfo>>();
+        let _ = self
+            .state
+            .waiters
+            .lock()
+            .unwrap()
+            .insert(self.psql_session_id.clone(), tx);
+
+        // Wait for web console response
+        // XXX: respond with error to client
+        let response = rx.recv();
+
+        // The session has been answered (or its sender is gone); drop the
+        // registration so the waiters table does not grow without bound.
+        // Removing a key that is already absent is a no-op.
+        self.state
+            .waiters
+            .lock()
+            .unwrap()
+            .remove(&self.psql_session_id);
+
+        let dbinfo = response??;
+
+        self.pgb.write_message_noflush(&BeMessage::NoticeResponse(
+            "Connecting to database.".to_string(),
+        ))?;
+        self.pgb.write_message(&BeMessage::ReadyForQuery)?;
+
+        Ok(dbinfo)
+    }
+
+    /// Check the client's MD5 response for the current user against the
+    /// control plane, using the salt that was sent in the challenge.
+    fn check_auth_md5(&self, md5_response: &[u8]) -> anyhow::Result<()> {
+        assert!(self.is_existing_user());
+        self.cplane
+            .check_auth(self.user.as_str(), md5_response, &self.md5_salt)
+    }
+}
+
+/// Connect to the target database, complete the Postgres handshake there with
+/// the credentials from `db_info`, then copy bytes in both directions between
+/// the client and the database until both directions have finished or one
+/// fails.
+async fn proxy_pass(pgb: PostgresBackend, db_info: DatabaseInfo) -> anyhow::Result<()> {
+    let mut socket = tokio::net::TcpStream::connect(db_info.socket_addr()).await?;
+    let config: tokio_postgres::Config = db_info.conn_string().parse()?;
+    // Only the handshake matters here; the returned client/connection pair is
+    // dropped right away and the raw socket is used from now on.
+    drop(config.connect_raw(&mut socket, NoTls).await?);
+
+    println!("Connected to pg, proxying");
+
+    // Turn the blocking client socket into a tokio stream.
+    let incoming_std = pgb.into_stream();
+    incoming_std.set_nonblocking(true)?;
+    let mut incoming_conn = tokio::net::TcpStream::from_std(incoming_std)?;
+
+    let (mut client_rd, mut client_wr) = incoming_conn.split();
+    let (mut server_rd, mut server_wr) = socket.split();
+
+    // When one side's input ends, shut down the write half of the other side.
+    let client_to_server = async {
+        tokio::io::copy(&mut client_rd, &mut server_wr).await?;
+        server_wr.shutdown().await
+    };
+    let server_to_client = async {
+        tokio::io::copy(&mut server_rd, &mut client_wr).await?;
+        client_wr.shutdown().await
+    };
+
+    tokio::try_join!(client_to_server, server_to_client)?;
+
+    Ok(())
+}
--- a/run_clippy.sh
+++ b/run_clippy.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+# Wrapper that runs "cargo clippy" with a fixed set of lints allowed.
+#
+# Save it somewhere in your PATH as "cargo-zclippy" (any name you like works)
+# and you can invoke it from the shell prompt as "cargo zclippy".
+#
+# Editors with rust-analyzer integration can use this command in place of
+# "cargo check" or "cargo clippy", which shows clippy warnings and errors
+# right in the editor.
+# In vscode, this setting is Rust-analyzer>Check On Save:Command
+
+# Lints that are explicitly allowed.
+allowed_lints=(
+    -A clippy::new_without_default
+    -A clippy::manual_range_contains
+    -A clippy::comparison_chain
+)
+
+# "${@:2}" forwards every argument except the first one.
+# NOTE(review): presumably the first argument is the subcommand name that
+# cargo passes along when the script is run as "cargo zclippy" - confirm.
+cargo clippy "${@:2}" -- "${allowed_lints[@]}"
--- a/test_runner/Pipfile
+++ b/test_runner/Pipfile
@@ -0,0 +1,18 @@
+[[source]]
+url = "https://pypi.python.org/simple"
+verify_ssl = true
+name = "pypi"
+
+[packages]
+pytest = ">=6.0.0"
+psycopg2 = "*"
+typing-extensions = "*"
+
+[dev-packages]
+yapf = "*"
+flake8 = "*"
+mypy = "*"
+
+[requires]
+# We need at least Python 3.6, but pipenv doesn't let us express that directly
+python_version = "3"
--- a/test_runner/Pipfile.lock
+++ b/test_runner/Pipfile.lock
@@ -0,0 +1,269 @@
+{
+    "_meta": {
+        "hash": {
+            "sha256": "4c20c05c20c50cf7e8f78ab461ab23841125345e63e00e2efa7661c165b6b364"
+        },
+        "pipfile-spec": 6,
+        "requires": {
+            "python_version": "3"
+        },
+        "sources": [
+            {
+                "name": "pypi",
+                "url": "https://pypi.python.org/simple",
+                "verify_ssl": true
+            }
+        ]
+    },
+    "default": {
+        "attrs": {
+            "hashes": [
+                "sha256:149e90d6d8ac20db7a955ad60cf0e6881a3f20d37096140088356da6c716b0b1",
+                "sha256:ef6aaac3ca6cd92904cdd0d83f629a15f18053ec84e6432106f7a4d04ae4f5fb"
+            ],
+            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
+            "version": "==21.2.0"
+        },
+        "importlib-metadata": {
+            "hashes": [
+                "sha256:833b26fb89d5de469b24a390e9df088d4e52e4ba33b01dc5e0e4f41b81a16c00",
+                "sha256:b142cc1dd1342f31ff04bb7d022492b09920cb64fed867cd3ea6f80fe3ebd139"
+            ],
+            "markers": "python_version < '3.8'",
+            "version": "==4.5.0"
+        },
+        "iniconfig": {
+            "hashes": [
+                "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3",
+                "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"
+            ],
+            "version": "==1.1.1"
+        },
+        "packaging": {
+            "hashes": [
+                "sha256:5b327ac1320dc863dca72f4514ecc086f31186744b84a230374cc1fd776feae5",
+                "sha256:67714da7f7bc052e064859c05c595155bd1ee9f69f76557e21f051443c20947a"
+            ],
+            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
+            "version": "==20.9"
+        },
+        "pluggy": {
+            "hashes": [
+                "sha256:15b2acde666561e1298d71b523007ed7364de07029219b604cf808bfa1c765b0",
+                "sha256:966c145cd83c96502c3c3868f50408687b38434af77734af1e9ca461a4081d2d"
+            ],
+            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
+            "version": "==0.13.1"
+        },
+        "psycopg2": {
+            "hashes": [
+                "sha256:03a485bf71498870e38b535c0e6e7162d6ac06a91487edddc3b959894d65f79c",
+                "sha256:22102cfeb904898254f287b1a77360bf66c636858e7476593acd5267e5c24ff9",
+                "sha256:8f4c1800e57ad128d20b2e91d222ca238fffd316cef65be781361cdf35e37979",
+                "sha256:b12073fdf2002e828e5921be2c39ff9c6eab361c5c0bd6c529619fc23677accc",
+                "sha256:b6f47af317af8110818d255e693cfa80b7f1e435285be09778db7b66efd95789",
+                "sha256:d549db98fc0e6db41a2aa0d65f7434c4308a9f64012adb209b9e489f26fe87c6",
+                "sha256:e44e39a46af7c30566b7667fb27e701e652ab0a51e05c263a01d3ff0e223b765",
+                "sha256:e84c80be7a238d3c9c099b71f6890eaa35fc881146232cce888a88ab1bfb431e",
+                "sha256:f3d42bd42302293767b84206d9a446abc67ed4a133e4fe04dad8952de06c2091"
+            ],
+            "index": "pypi",
+            "version": "==2.9"
+        },
+        "py": {
+            "hashes": [
+                "sha256:21b81bda15b66ef5e1a777a21c4dcd9c20ad3efd0b3f817e7a809035269e1bd3",
+                "sha256:3b80836aa6d1feeaa108e046da6423ab8f6ceda6468545ae8d02d9d58d18818a"
+            ],
+            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
+            "version": "==1.10.0"
+        },
+        "pyparsing": {
+            "hashes": [
+                "sha256:c203ec8783bf771a155b207279b9bccb8dea02d8f0c9e5f8ead507bc3246ecc1",
+                "sha256:ef9d7589ef3c200abe66653d3f1ab1033c3c419ae9b9bdb1240a85b024efc88b"
+            ],
+            "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
+            "version": "==2.4.7"
+        },
+        "pytest": {
+            "hashes": [
+                "sha256:50bcad0a0b9c5a72c8e4e7c9855a3ad496ca6a881a3641b4260605450772c54b",
+                "sha256:91ef2131a9bd6be8f76f1f08eac5c5317221d6ad1e143ae03894b862e8976890"
+            ],
+            "index": "pypi",
+            "version": "==6.2.4"
+        },
+        "toml": {
+            "hashes": [
+                "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b",
+                "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"
+            ],
+            "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
+            "version": "==0.10.2"
+        },
+        "typing-extensions": {
+            "hashes": [
+                "sha256:0ac0f89795dd19de6b97debb0c6af1c70987fd80a2d62d1958f7e56fcc31b497",
+                "sha256:50b6f157849174217d0656f99dc82fe932884fb250826c18350e159ec6cdf342",
+                "sha256:779383f6086d90c99ae41cf0ff39aac8a7937a9283ce0a414e5dd782f4c94a84"
+            ],
+            "index": "pypi",
+            "version": "==3.10.0.0"
+        },
+        "zipp": {
+            "hashes": [
+                "sha256:3607921face881ba3e026887d8150cca609d517579abe052ac81fc5aeffdbd76",
+                "sha256:51cb66cc54621609dd593d1787f286ee42a5c0adbb4b29abea5a63edc3e03098"
+            ],
+            "markers": "python_version >= '3.6'",
+            "version": "==3.4.1"
+        }
+    },
+    "develop": {
+        "flake8": {
+            "hashes": [
+                "sha256:07528381786f2a6237b061f6e96610a4167b226cb926e2aa2b6b1d78057c576b",
+                "sha256:bf8fd333346d844f616e8d47905ef3a3384edae6b4e9beb0c5101e25e3110907"
+            ],
+            "index": "pypi",
+            "version": "==3.9.2"
+        },
+        "importlib-metadata": {
+            "hashes": [
+                "sha256:833b26fb89d5de469b24a390e9df088d4e52e4ba33b01dc5e0e4f41b81a16c00",
+                "sha256:b142cc1dd1342f31ff04bb7d022492b09920cb64fed867cd3ea6f80fe3ebd139"
+            ],
+            "markers": "python_version < '3.8'",
+            "version": "==4.5.0"
+        },
+        "mccabe": {
+            "hashes": [
+                "sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42",
+                "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"
+            ],
+            "version": "==0.6.1"
+        },
+        "mypy": {
+            "hashes": [
+                "sha256:0190fb77e93ce971954c9e54ea61de2802065174e5e990c9d4c1d0f54fbeeca2",
+                "sha256:0756529da2dd4d53d26096b7969ce0a47997123261a5432b48cc6848a2cb0bd4",
+                "sha256:2f9fedc1f186697fda191e634ac1d02f03d4c260212ccb018fabbb6d4b03eee8",
+                "sha256:353aac2ce41ddeaf7599f1c73fed2b75750bef3b44b6ad12985a991bc002a0da",
+                "sha256:3f12705eabdd274b98f676e3e5a89f247ea86dc1af48a2d5a2b080abac4e1243",
+                "sha256:4efc67b9b3e2fddbe395700f91d5b8deb5980bfaaccb77b306310bd0b9e002eb",
+                "sha256:517e7528d1be7e187a5db7f0a3e479747307c1b897d9706b1c662014faba3116",
+                "sha256:68a098c104ae2b75e946b107ef69dd8398d54cb52ad57580dfb9fc78f7f997f0",
+                "sha256:746e0b0101b8efec34902810047f26a8c80e1efbb4fc554956d848c05ef85d76",
+                "sha256:8be7bbd091886bde9fcafed8dd089a766fa76eb223135fe5c9e9798f78023a20",
+                "sha256:9236c21194fde5df1b4d8ebc2ef2c1f2a5dc7f18bcbea54274937cae2e20a01c",
+                "sha256:9ef5355eaaf7a23ab157c21a44c614365238a7bdb3552ec3b80c393697d974e1",
+                "sha256:9f1d74eeb3f58c7bd3f3f92b8f63cb1678466a55e2c4612bf36909105d0724ab",
+                "sha256:a26d0e53e90815c765f91966442775cf03b8a7514a4e960de7b5320208b07269",
+                "sha256:ae94c31bb556ddb2310e4f913b706696ccbd43c62d3331cd3511caef466871d2",
+                "sha256:b5ba1f0d5f9087e03bf5958c28d421a03a4c1ad260bf81556195dffeccd979c4",
+                "sha256:b5dfcd22c6bab08dfeded8d5b44bdcb68c6f1ab261861e35c470b89074f78a70",
+                "sha256:cd01c599cf9f897b6b6c6b5d8b182557fb7d99326bcdf5d449a0fbbb4ccee4b9",
+                "sha256:e89880168c67cf4fde4506b80ee42f1537ad66ad366c101d388b3fd7d7ce2afd",
+                "sha256:ebe2bc9cb638475f5d39068d2dbe8ae1d605bb8d8d3ff281c695df1670ab3987",
+                "sha256:f89bfda7f0f66b789792ab64ce0978e4a991a0e4dd6197349d0767b0f1095b21",
+                "sha256:fc4d63da57ef0e8cd4ab45131f3fe5c286ce7dd7f032650d0fbc239c6190e167",
+                "sha256:fd634bc17b1e2d6ce716f0e43446d0d61cdadb1efcad5c56ca211c22b246ebc8"
+            ],
+            "index": "pypi",
+            "version": "==0.902"
+        },
+        "mypy-extensions": {
+            "hashes": [
+                "sha256:090fedd75945a69ae91ce1303b5824f428daf5a028d2f6ab8a299250a846f15d",
+                "sha256:2d82818f5bb3e369420cb3c4060a7970edba416647068eb4c5343488a6c604a8"
+            ],
+            "version": "==0.4.3"
+        },
+        "pycodestyle": {
+            "hashes": [
+                "sha256:514f76d918fcc0b55c6680472f0a37970994e07bbb80725808c17089be302068",
+                "sha256:c389c1d06bf7904078ca03399a4816f974a1d590090fecea0c63ec26ebaf1cef"
+            ],
+            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
+            "version": "==2.7.0"
+        },
+        "pyflakes": {
+            "hashes": [
+                "sha256:7893783d01b8a89811dd72d7dfd4d84ff098e5eed95cfa8905b22bbffe52efc3",
+                "sha256:f5bc8ecabc05bb9d291eb5203d6810b49040f6ff446a756326104746cc00c1db"
+            ],
+            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
+            "version": "==2.3.1"
+        },
+        "toml": {
+            "hashes": [
+                "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b",
+                "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"
+            ],
+            "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
+            "version": "==0.10.2"
+        },
+        "typed-ast": {
+            "hashes": [
+                "sha256:01ae5f73431d21eead5015997ab41afa53aa1fbe252f9da060be5dad2c730ace",
+                "sha256:067a74454df670dcaa4e59349a2e5c81e567d8d65458d480a5b3dfecec08c5ff",
+                "sha256:0fb71b8c643187d7492c1f8352f2c15b4c4af3f6338f21681d3681b3dc31a266",
+                "sha256:1b3ead4a96c9101bef08f9f7d1217c096f31667617b58de957f690c92378b528",
+                "sha256:2068531575a125b87a41802130fa7e29f26c09a2833fea68d9a40cf33902eba6",
+                "sha256:209596a4ec71d990d71d5e0d312ac935d86930e6eecff6ccc7007fe54d703808",
+                "sha256:2c726c276d09fc5c414693a2de063f521052d9ea7c240ce553316f70656c84d4",
+                "sha256:398e44cd480f4d2b7ee8d98385ca104e35c81525dd98c519acff1b79bdaac363",
+                "sha256:52b1eb8c83f178ab787f3a4283f68258525f8d70f778a2f6dd54d3b5e5fb4341",
+                "sha256:5feca99c17af94057417d744607b82dd0a664fd5e4ca98061480fd8b14b18d04",
+                "sha256:7538e495704e2ccda9b234b82423a4038f324f3a10c43bc088a1636180f11a41",
+                "sha256:760ad187b1041a154f0e4d0f6aae3e40fdb51d6de16e5c99aedadd9246450e9e",
+                "sha256:777a26c84bea6cd934422ac2e3b78863a37017618b6e5c08f92ef69853e765d3",
+                "sha256:95431a26309a21874005845c21118c83991c63ea800dd44843e42a916aec5899",
+                "sha256:9ad2c92ec681e02baf81fdfa056fe0d818645efa9af1f1cd5fd6f1bd2bdfd805",
+                "sha256:9c6d1a54552b5330bc657b7ef0eae25d00ba7ffe85d9ea8ae6540d2197a3788c",
+                "sha256:aee0c1256be6c07bd3e1263ff920c325b59849dc95392a05f258bb9b259cf39c",
+                "sha256:af3d4a73793725138d6b334d9d247ce7e5f084d96284ed23f22ee626a7b88e39",
+                "sha256:b36b4f3920103a25e1d5d024d155c504080959582b928e91cb608a65c3a49e1a",
+                "sha256:b9574c6f03f685070d859e75c7f9eeca02d6933273b5e69572e5ff9d5e3931c3",
+                "sha256:bff6ad71c81b3bba8fa35f0f1921fb24ff4476235a6e94a26ada2e54370e6da7",
+                "sha256:c190f0899e9f9f8b6b7863debfb739abcb21a5c054f911ca3596d12b8a4c4c7f",
+                "sha256:c907f561b1e83e93fad565bac5ba9c22d96a54e7ea0267c708bffe863cbe4075",
+                "sha256:cae53c389825d3b46fb37538441f75d6aecc4174f615d048321b716df2757fb0",
+                "sha256:dd4a21253f42b8d2b48410cb31fe501d32f8b9fbeb1f55063ad102fe9c425e40",
+                "sha256:dde816ca9dac1d9c01dd504ea5967821606f02e510438120091b84e852367428",
+                "sha256:f2362f3cb0f3172c42938946dbc5b7843c2a28aec307c49100c8b38764eb6927",
+                "sha256:f328adcfebed9f11301eaedfa48e15bdece9b519fb27e6a8c01aa52a17ec31b3",
+                "sha256:f8afcf15cc511ada719a88e013cec87c11aff7b91f019295eb4530f96fe5ef2f",
+                "sha256:fb1bbeac803adea29cedd70781399c99138358c26d05fcbd23c13016b7f5ec65"
+            ],
+            "markers": "python_version < '3.8'",
+            "version": "==1.4.3"
+        },
+        "typing-extensions": {
+            "hashes": [
+                "sha256:0ac0f89795dd19de6b97debb0c6af1c70987fd80a2d62d1958f7e56fcc31b497",
+                "sha256:50b6f157849174217d0656f99dc82fe932884fb250826c18350e159ec6cdf342",
+                "sha256:779383f6086d90c99ae41cf0ff39aac8a7937a9283ce0a414e5dd782f4c94a84"
+            ],
+            "index": "pypi",
+            "version": "==3.10.0.0"
+        },
+        "yapf": {
+            "hashes": [
+                "sha256:408fb9a2b254c302f49db83c59f9aa0b4b0fd0ec25be3a5c51181327922ff63d",
+                "sha256:e3a234ba8455fe201eaa649cdac872d590089a18b661e39bbac7020978dd9c2e"
+            ],
+            "index": "pypi",
+            "version": "==0.31.0"
+        },
+        "zipp": {
+            "hashes": [
+                "sha256:3607921face881ba3e026887d8150cca609d517579abe052ac81fc5aeffdbd76",
+                "sha256:51cb66cc54621609dd593d1787f286ee42a5c0adbb4b29abea5a63edc3e03098"
+            ],
+            "markers": "python_version >= '3.6'",
+            "version": "==3.4.1"
+        }
+    }
+}
--- a/test_runner/README.md
+++ b/test_runner/README.md
@@ -0,0 +1,104 @@
+## Zenith test runner
+
+This directory contains integration tests.
+
+Prerequisites:
+- Python 3.6 or later
+- Dependencies: install them via `pipenv install`. Note that the
+  Debian/Ubuntu packages are often stale, so installing them manually is not
+  recommended.
+  Run `pipenv shell` to activate the venv.
+- Zenith and Postgres binaries
+    - See the root README.md for build directions
+    - Tests can be run from the git tree; or see the environment variables
+      below to run from other directories.
+- The zenith git repo, including the postgres submodule
+  (for some tests, e.g. pg_regress)
+
+### Test Organization
+
+The tests are divided into a few batches, such that each batch takes roughly
+the same amount of time. The batches can be run in parallel, to minimize total
+runtime. Currently, there are only two batches:
+
+- batch_pg_regress: Runs PostgreSQL regression tests
+- batch_others: All other tests
+
+### Running the tests
+
+Because pytest will search all subdirectories for tests, it's easiest to
+run the tests from within the `test_runner` directory.
+
+Test state (postgres data, pageserver state, and log files) will
+be stored under a directory `test_output`.
+
+You can run all the tests with:
+
+`pytest`
+
+If you want to run all the tests in a particular file:
+
+`pytest test_pgbench.py`
+
+If you want to run all tests that have the string "bench" in their names:
+
+`pytest -k bench`
+
+Useful environment variables:
+
+- `ZENITH_BIN`: The directory where zenith binaries can be found.
+- `POSTGRES_DISTRIB_DIR`: The directory where the postgres distribution can be found.
+- `TEST_OUTPUT`: Set the directory where test state and test output files
+  should go.
+- `TEST_SHARED_FIXTURES`: Try to re-use a single pageserver for all the tests.
+
+Let stdout and stderr go to the terminal instead of capturing them:
+`pytest -s ...`
+(Note many tests capture subprocess outputs separately, so this may not
+show much.)
+
+Exit after the first test failure:
+`pytest -x ...`
+(there are many more pytest options; run `pytest -h` to see them.)
+
+
+### Building new tests
+
+The tests make heavy use of pytest fixtures. You can read about how they work here: https://docs.pytest.org/en/stable/fixture.html
+
+Essentially, this means that each time a test function names a fixture as an input parameter, the fixture function with that name is run and its result is passed to the test function as that parameter.
+
+So this code:
+
+```python
+def test_something(zenith_cli, pg_bin):
+    pass
+```
+
+... will run the fixtures called `zenith_cli` and `pg_bin` and deliver those results to the test function.
+
+Fixtures can't be imported using the normal python syntax. Instead, use this:
+
+```python
+pytest_plugins = ("fixtures.something")
+```
+
+That will make all the fixtures in the `fixtures/something.py` file available.
+
+Anything that's likely to be used in multiple tests should be built into a fixture.
+
+Note that fixtures can clean up after themselves if they use the `yield` syntax.
+Cleanup will happen even if the test fails (raises an unhandled exception).
+Python destructors, e.g. `__del__()`, aren't recommended for cleanup.
+
+
+### Code quality
+
+Before submitting a patch, please consider:
+
+* Writing a couple of docstrings to clarify the reasoning behind a new test.
+* Running `flake8` (or a linter of your choice, e.g. `pycodestyle`) and fixing possible defects, if any.
+* Formatting the code with `yapf -r -i .` (TODO: implement an opt-in pre-commit hook for that).
+* (Optional) Typechecking the code with `mypy .`. Currently this mostly affects `fixtures/zenith_fixtures.py`.
+
+The tools can be installed with `pipenv install --dev`.
--- a/test_runner/batch_others/test_branch_behind.py
+++ b/test_runner/batch_others/test_branch_behind.py
@@ -0,0 +1,73 @@
+pytest_plugins = ("fixtures.zenith_fixtures")
+
+
+#
+# Create a couple of branches off the main branch, at a historical point in time.
+#
+def test_branch_behind(zenith_cli, pageserver, postgres, pg_bin):
+    # Branch at the point where only 100 rows were inserted
+    zenith_cli.run(["branch", "test_branch_behind", "empty"])
+
+    pgmain = postgres.create_start('test_branch_behind')
+    print("postgres is running on 'test_branch_behind' branch")
+
+    main_pg_conn = pgmain.connect()
+    main_cur = main_pg_conn.cursor()
+
+    # Create table, and insert the first 100 rows
+    main_cur.execute('CREATE TABLE foo (t text)')
+    main_cur.execute('''
+        INSERT INTO foo
+            SELECT 'long string to consume some space' || g
+            FROM generate_series(1, 100) g
+    ''')
+    main_cur.execute('SELECT pg_current_wal_insert_lsn()')
+    lsn_a = main_cur.fetchone()[0]
+    print('LSN after 100 rows: ' + lsn_a)
+
+    # Insert some more rows. (This generates enough WAL to fill a few segments.)
+    main_cur.execute('''
+        INSERT INTO foo
+            SELECT 'long string to consume some space' || g
+            FROM generate_series(1, 100000) g
+    ''')
+    main_cur.execute('SELECT pg_current_wal_insert_lsn()')
+    lsn_b = main_cur.fetchone()[0]
+    print('LSN after 100100 rows: ' + lsn_b)
+
+    # Branch at the point where only 100 rows were inserted
+    zenith_cli.run(["branch", "test_branch_behind_hundred", "test_branch_behind@" + lsn_a])
+
+    # Insert many more rows. This generates enough WAL to fill a few segments.
+    main_cur.execute('''
+        INSERT INTO foo
+            SELECT 'long string to consume some space' || g
+            FROM generate_series(1, 100000) g
+    ''')
+    main_cur.execute('SELECT pg_current_wal_insert_lsn()')
+
+    main_cur.execute('SELECT pg_current_wal_insert_lsn()')
+    lsn_c = main_cur.fetchone()[0]
+    print('LSN after 200100 rows: ' + lsn_c)
+
+    # Branch at the point where only 200 rows were inserted
+    zenith_cli.run(["branch", "test_branch_behind_more", "test_branch_behind@" + lsn_b])
+
+    pg_hundred = postgres.create_start("test_branch_behind_hundred")
+    pg_more = postgres.create_start("test_branch_behind_more")
+
+    # On the 'hundred' branch, we should see only 100 rows
+    hundred_pg_conn = pg_hundred.connect()
+    hundred_cur = hundred_pg_conn.cursor()
+    hundred_cur.execute('SELECT count(*) FROM foo')
+    assert hundred_cur.fetchone() == (100, )
+
+    # On the 'more' branch, we should see 100200 rows
+    more_pg_conn = pg_more.connect()
+    more_cur = more_pg_conn.cursor()
+    more_cur.execute('SELECT count(*) FROM foo')
+    assert more_cur.fetchone() == (100100, )
+
+    # All the rows are visible on the main branch
+    main_cur.execute('SELECT count(*) FROM foo')
+    assert main_cur.fetchone() == (200100, )
--- a/test_runner/batch_others/test_bulk_insert.py
+++ b/test_runner/batch_others/test_bulk_insert.py
@@ -0,0 +1,25 @@
+from contextlib import closing
+import psycopg2.extras
+
+pytest_plugins = ("fixtures.zenith_fixtures")
+
+#
+# Test insertion of a large number of records
+#
+# This test is pretty tightly coupled with the current implementation of page version storage
+# and garbage collection in object_repository.rs.
+#
+def test_bulk_insert(zenith_cli, pageserver, postgres, pg_bin):
+    zenith_cli.run(["branch", "test_bulk_insert", "empty"])
+    pg = postgres.create_start('test_bulk_insert')
+
+    with closing(pg.connect()) as conn:
+        with conn.cursor() as cur:
+          cur.execute("create table t(c1 bigint, c2 bigint, c3 bigint, c4 bigint, c5 bigint)")
+          cur.execute("create index on t(c1)")
+          cur.execute("create index on t(c2)")
+          cur.execute("create index on t(c3)")
+          cur.execute("create index on t(c4)")
+          cur.execute("create index on t(c5)")
+          cur.execute("insert into t values (generate_series(1,1000000),generate_series(1,1000000),generate_series(1,1000000),generate_series(1,1000000),generate_series(1,1000000))")
+          cur.execute("insert into t values (generate_series(1,1000000),random()*1000000,random()*1000000,random()*1000000,random()*1000000)")
--- a/test_runner/batch_others/test_config.py
+++ b/test_runner/batch_others/test_config.py
@@ -0,0 +1,29 @@
+from contextlib import closing
+
+pytest_plugins = ("fixtures.zenith_fixtures")
+
+
+#
+# Test starting Postgres with custom options
+#
+def test_config(zenith_cli, pageserver, postgres, pg_bin):
+    # Create a branch for us
+    zenith_cli.run(["branch", "test_config", "empty"])
+
+    # change config
+    pg = postgres.create_start('test_config', config_lines=['log_min_messages=debug1'])
+    print('postgres is running on test_config branch')
+
+    with closing(pg.connect()) as conn:
+        with conn.cursor() as cur:
+            cur.execute('''
+                SELECT setting
+                FROM pg_settings
+                WHERE
+                    source != 'default'
+                    AND source != 'override'
+                    AND name = 'log_min_messages'
+            ''')
+
+            # check that config change was applied
+            assert cur.fetchone() == ('debug1', )
--- a/test_runner/batch_others/test_createdb.py
+++ b/test_runner/batch_others/test_createdb.py
@@ -0,0 +1,32 @@
+from contextlib import closing
+
+pytest_plugins = ("fixtures.zenith_fixtures")
+
+
+#
+# Test CREATE DATABASE when there have been relmapper changes
+#
+def test_createdb(zenith_cli, pageserver, postgres, pg_bin):
+    zenith_cli.run(["branch", "test_createdb", "empty"])
+
+    pg = postgres.create_start('test_createdb')
+    print("postgres is running on 'test_createdb' branch")
+
+    with closing(pg.connect()) as conn:
+        with conn.cursor() as cur:
+            # Cause a 'relmapper' change in the original branch
+            cur.execute('VACUUM FULL pg_class')
+
+            cur.execute('CREATE DATABASE foodb')
+
+            cur.execute('SELECT pg_current_wal_insert_lsn()')
+            lsn = cur.fetchone()[0]
+
+    # Create a branch
+    zenith_cli.run(["branch", "test_createdb2", "test_createdb@" + lsn])
+
+    pg2 = postgres.create_start('test_createdb2')
+
+    # Test that you can connect to the new database on both branches
+    for db in (pg, pg2):
+        db.connect(dbname='foodb').close()
--- a/test_runner/batch_others/test_createuser.py
+++ b/test_runner/batch_others/test_createuser.py
@@ -0,0 +1,31 @@
+from contextlib import closing
+
+pytest_plugins = ("fixtures.zenith_fixtures")
+
+
+#
+# Test CREATE USER to check shared catalog restore
+#
+def test_createuser(zenith_cli, pageserver, postgres, pg_bin):
+    zenith_cli.run(["branch", "test_createuser", "empty"])
+
+    pg = postgres.create_start('test_createuser')
+    print("postgres is running on 'test_createuser' branch")
+
+    with closing(pg.connect()) as conn:
+        with conn.cursor() as cur:
+            # Cause a 'relmapper' change in the original branch
+            cur.execute('CREATE USER testuser with password %s', ('testpwd', ))
+
+            cur.execute('CHECKPOINT')
+
+            cur.execute('SELECT pg_current_wal_insert_lsn()')
+            lsn = cur.fetchone()[0]
+
+    # Create a branch
+    zenith_cli.run(["branch", "test_createuser2", "test_createuser@" + lsn])
+
+    pg2 = postgres.create_start('test_createuser2')
+
+    # Test that you can connect to new branch as a new user
+    assert pg2.safe_psql('select current_user', username='testuser') == [('testuser', )]
--- a/test_runner/batch_others/test_gc.py
+++ b/test_runner/batch_others/test_gc.py
@@ -0,0 +1,97 @@
+from contextlib import closing
+import psycopg2.extras
+
+pytest_plugins = ("fixtures.zenith_fixtures")
+
+#
+# Test Garbage Collection of old page versions.
+#
+# This test is pretty tightly coupled with the current implementation of page version storage
+# and garbage collection in object_repository.rs.
+#
+def test_gc(zenith_cli, pageserver, postgres, pg_bin):
+    zenith_cli.run(["branch", "test_gc", "empty"])
+    pg = postgres.create_start('test_gc')
+
+    with closing(pg.connect()) as conn:
+        with conn.cursor() as cur:
+            with closing(pageserver.connect()) as psconn:
+                with psconn.cursor(cursor_factory = psycopg2.extras.DictCursor) as pscur:
+
+                    # Get the timeline ID of our branch. We need it for the 'do_gc' command
+                    cur.execute("SHOW zenith.zenith_timeline")
+                    timeline = cur.fetchone()[0]
+
+                    # Create a test table
+                    cur.execute("CREATE TABLE foo(x integer)")
+
+                    # Run GC, to clear out any old page versions left behind in the catalogs by
+                    # the CREATE TABLE command. We want to have a clean slate with no garbage
+                    # before running the actual tests below, otherwise the counts won't match
+                    # what we expect.
+                    print("Running GC before test")
+                    pscur.execute(f"do_gc {timeline} 0")
+                    row = pscur.fetchone()
+                    print("GC duration {elapsed} ms, relations: {n_relations}, dropped {dropped}, truncated: {truncated}, deleted: {deleted}".format_map(row))
+                    # remember the number of relations
+                    n_relations = row['n_relations']
+                    assert n_relations > 0
+
+                    # Insert a row. The first insert will also create a metadata entry for the
+                    # relation, with size == 1 block. Hence, bump up the expected relation count.
+                    n_relations += 1;
+                    print("Inserting one row and running GC")
+                    cur.execute("INSERT INTO foo VALUES (1)")
+                    pscur.execute(f"do_gc {timeline} 0")
+                    row = pscur.fetchone()
+                    print("GC duration {elapsed} ms, relations: {n_relations}, dropped {dropped}, truncated: {truncated}, deleted: {deleted}".format_map(row))
+                    assert row['n_relations'] == n_relations
+                    assert row['dropped'] == 0
+                    assert row['truncated'] == 30
+                    assert row['deleted'] == 3
+
+                    # Insert two more rows and run GC.
+                    print("Inserting two more rows and running GC")
+                    cur.execute("INSERT INTO foo VALUES (2)")
+                    cur.execute("INSERT INTO foo VALUES (3)")
+
+                    pscur.execute(f"do_gc {timeline} 0")
+                    row = pscur.fetchone()
+                    print("GC duration {elapsed} ms, relations: {n_relations}, dropped {dropped}, truncated: {truncated}, deleted: {deleted}".format_map(row))
+                    assert row['n_relations'] == n_relations
+                    assert row['dropped'] == 0
+                    assert row['truncated'] == 30
+                    assert row['deleted'] == 2
+
+                    # Insert one more row. It creates one more page version, but doesn't affect the
+                    # relation size.
+                    print("Inserting one more row")
+                    cur.execute("INSERT INTO foo VALUES (3)")
+
+                    pscur.execute(f"do_gc {timeline} 0")
+                    row = pscur.fetchone()
+                    print("GC duration {elapsed} ms, relations: {n_relations}, dropped {dropped}, truncated: {truncated}, deleted: {deleted}".format_map(row))
+                    assert row['n_relations'] == n_relations
+                    assert row['dropped'] == 0
+                    assert row['truncated'] == 30
+                    assert row['deleted'] == 1
+
+                    # Run GC again, with no changes in the database. Should not remove anything.
+                    pscur.execute(f"do_gc {timeline} 0")
+                    row = pscur.fetchone()
+                    print("GC duration {elapsed} ms, relations: {n_relations}, dropped {dropped}, truncated: {truncated}, deleted: {deleted}".format_map(row))
+                    assert row['n_relations'] == n_relations
+                    assert row['dropped'] == 0
+                    assert row['truncated'] == 30
+                    assert row['deleted'] == 0
+
+                    #
+                    # Test DROP TABLE checks that relation data and metadata was deleted by GC from object storage
+                    #
+                    cur.execute("DROP TABLE foo")
+
+                    pscur.execute(f"do_gc {timeline} 0")
+                    row = pscur.fetchone()
+                    print("GC duration {elapsed} ms, relations: {n_relations}, dropped {dropped}, truncated: {truncated}, deleted: {deleted}".format_map(row))
+                    # Each relation fork is counted separately, hence 3.
+                    assert row['dropped'] == 3
--- a/test_runner/batch_others/test_multixact.py
+++ b/test_runner/batch_others/test_multixact.py
@@ -0,0 +1,63 @@
+pytest_plugins = ("fixtures.zenith_fixtures")
+
+
+#
+# Test multixact state after branching
+# For now this test is very minimal:
+# it only checks the next_multixact_id field in the restored pg_control,
+# since we don't have functions to check multixact internals.
+#
+def test_multixact(pageserver, postgres, pg_bin, zenith_cli, base_dir):
+    # Create a branch for us
+    zenith_cli.run(["branch", "test_multixact", "empty"])
+    pg = postgres.create_start('test_multixact')
+
+    print("postgres is running on 'test_multixact' branch")
+    pg_conn = pg.connect()
+    cur = pg_conn.cursor()
+
+    cur.execute('''
+        CREATE TABLE t1(i int primary key);
+        INSERT INTO t1 select * from generate_series(1, 100);
+    ''')
+
+    cur.execute('SELECT next_multixact_id FROM pg_control_checkpoint()')
+    next_multixact_id_old = cur.fetchone()[0]
+
+    # Lock entries in parallel connections to set multixact
+    nclients = 3
+    connections = []
+    for i in range(nclients):
+        # Do not turn on autocommit. We want to hold the key-share locks.
+        conn = pg.connect(autocommit=False)
+        conn.cursor().execute('select * from t1 for key share')
+        connections.append(conn)
+
+    # We should have a multixact now. We can close the connections.
+    for c in connections:
+        c.close()
+
+    # force wal flush
+    cur.execute('checkpoint')
+
+    cur.execute('SELECT next_multixact_id, pg_current_wal_flush_lsn() FROM pg_control_checkpoint()')
+    res = cur.fetchone()
+    next_multixact_id = res[0]
+    lsn = res[1]
+
+    # Ensure that we did lock some tuples
+    assert int(next_multixact_id) > int(next_multixact_id_old)
+
+    # Branch at this point
+    zenith_cli.run(["branch", "test_multixact_new", "test_multixact@" + lsn])
+    pg_new = postgres.create_start('test_multixact_new')
+
+    print("postgres is running on 'test_multixact_new' branch")
+    pg_new_conn = pg_new.connect()
+    cur_new = pg_new_conn.cursor()
+
+    cur_new.execute('SELECT next_multixact_id FROM pg_control_checkpoint()')
+    next_multixact_id_new = cur_new.fetchone()[0]
+
+    # Check that we restored pg_controlfile correctly
+    assert next_multixact_id_new == next_multixact_id
--- a/test_runner/batch_others/test_pageserver_api.py
+++ b/test_runner/batch_others/test_pageserver_api.py
@@ -0,0 +1,48 @@
+import json
+
+pytest_plugins = ("fixtures.zenith_fixtures")
+
+
+def test_status(pageserver):
+    assert pageserver.safe_psql('status') == [
+        ('hello world', ),
+    ]
+
+
+def test_branch_list(pageserver, zenith_cli):
+    # Create a branch for us
+    zenith_cli.run(["branch", "test_branch_list_main", "empty"])
+
+    conn = pageserver.connect()
+    cur = conn.cursor()
+
+    cur.execute('branch_list')
+    branches = json.loads(cur.fetchone()[0])
+    # Filter out branches created by other tests
+    branches = [x for x in branches if x['name'].startswith('test_branch_list')]
+
+    assert len(branches) == 1
+    assert branches[0]['name'] == 'test_branch_list_main'
+    assert 'timeline_id' in branches[0]
+    assert 'latest_valid_lsn' in branches[0]
+    assert 'ancestor_id' in branches[0]
+    assert 'ancestor_lsn' in branches[0]
+
+    # Create another branch, and start Postgres on it
+    zenith_cli.run(['branch', 'test_branch_list_experimental', 'test_branch_list_main'])
+    zenith_cli.run(['pg', 'create', 'test_branch_list_experimental'])
+
+    cur.execute('branch_list')
+    new_branches = json.loads(cur.fetchone()[0])
+    # Filter out branches created by other tests
+    new_branches = [x for x in new_branches if x['name'].startswith('test_branch_list')]
+    assert len(new_branches) == 2
+    new_branches.sort(key=lambda k: k['name'])
+
+    assert new_branches[0]['name'] == 'test_branch_list_experimental'
+    assert new_branches[0]['timeline_id'] != branches[0]['timeline_id']
+
+    # TODO: do the LSNs have to match here?
+    assert new_branches[1] == branches[0]
+
+    conn.close()
--- a/test_runner/batch_others/test_pgbench.py
+++ b/test_runner/batch_others/test_pgbench.py
@@ -0,0 +1,15 @@
+pytest_plugins = ("fixtures.zenith_fixtures")
+
+
+def test_pgbench(pageserver, postgres, pg_bin, zenith_cli):
+
+    # Create a branch for us
+    zenith_cli.run(["branch", "test_pgbench", "empty"])
+
+    pg = postgres.create_start('test_pgbench')
+    print("postgres is running on 'test_pgbench' branch")
+
+    connstr = pg.connstr()
+
+    pg_bin.run_capture(['pgbench', '-i', connstr])
+    pg_bin.run_capture(['pgbench'] + '-c 10 -T 5 -P 1 -M prepared'.split() + [connstr])
--- a/test_runner/batch_others/test_restart_compute.py
+++ b/test_runner/batch_others/test_restart_compute.py
@@ -0,0 +1,42 @@
+from contextlib import closing
+
+pytest_plugins = ("fixtures.zenith_fixtures")
+
+
+#
+# Test restarting and recreating a postgres instance
+#
+def test_restart_compute(zenith_cli, pageserver, postgres, pg_bin):
+    zenith_cli.run(["branch", "test_restart_compute", "empty"])
+
+    pg = postgres.create_start('test_restart_compute')
+    print("postgres is running on 'test_restart_compute' branch")
+
+    with closing(pg.connect()) as conn:
+        with conn.cursor() as cur:
+            # Create table, and insert a row
+            cur.execute('CREATE TABLE foo (t text)')
+            cur.execute("INSERT INTO foo VALUES ('bar')")
+
+    # Stop and restart the Postgres instance
+    pg.stop_and_destroy().create_start('test_restart_compute')
+
+    with closing(pg.connect()) as conn:
+        with conn.cursor() as cur:
+            # We can still see the row
+            cur.execute('SELECT count(*) FROM foo')
+            assert cur.fetchone() == (1, )
+
+            # Insert another row
+            cur.execute("INSERT INTO foo VALUES ('bar2')")
+            cur.execute('SELECT count(*) FROM foo')
+            assert cur.fetchone() == (2, )
+
+    # Stop, and destroy the Postgres instance. Then recreate and restart it.
+    pg.stop_and_destroy().create_start('test_restart_compute')
+
+    with closing(pg.connect()) as conn:
+        with conn.cursor() as cur:
+            # We can still see the rows
+            cur.execute('SELECT count(*) FROM foo')
+            assert cur.fetchone() == (2, )
--- a/test_runner/batch_others/test_seq_scan.py
+++ b/test_runner/batch_others/test_seq_scan.py
@@ -0,0 +1,26 @@
+from contextlib import closing
+import psycopg2.extras
+import time
+
+pytest_plugins = ("fixtures.zenith_fixtures")
+
+#
+# Test insertion of a large number of records
+#
+# This test is pretty tightly coupled with the current implementation of page version storage
+# and garbage collection in object_repository.rs.
+#
+def test_seq_scan(zenith_cli, pageserver, postgres, pg_bin):
+    zenith_cli.run(["branch", "test_seq_scan", "empty"])
+    pg = postgres.create_start('test_seq_scan')
+
+    with closing(pg.connect()) as conn:
+        with conn.cursor() as cur:
+          cur.execute("create table t(c1 bigint, c2 bigint, c3 bigint, c4 bigint, c5 bigint)")
+          cur.execute("insert into t values (generate_series(1,1000000),generate_series(1,1000000),generate_series(1,1000000),generate_series(1,1000000),generate_series(1,1000000))")
+          cur.execute("set max_parallel_workers_per_gather=0");
+          for i in range(100):
+              start = time.time()
+              cur.execute("select count(*) from t");
+              stop = time.time()
+              print(f'Elapsed time for iterating through 1000000 records is {stop - start}')
--- a/test_runner/batch_others/test_twophase.py
+++ b/test_runner/batch_others/test_twophase.py
@@ -0,0 +1,46 @@
+pytest_plugins = ("fixtures.zenith_fixtures")
+
+
+#
+# Test branching, when a transaction is in prepared state
+#
+def test_twophase(zenith_cli, pageserver, postgres, pg_bin):
+    zenith_cli.run(["branch", "test_twophase", "empty"])
+
+    pg = postgres.create_start('test_twophase', config_lines=['max_prepared_transactions=5'])
+    print("postgres is running on 'test_twophase' branch")
+
+    conn = pg.connect()
+    cur = conn.cursor()
+
+    cur.execute('CREATE TABLE foo (t text)')
+
+    # Prepare a transaction that will insert a row
+    cur.execute('BEGIN')
+    cur.execute("INSERT INTO foo VALUES ('one')")
+    cur.execute("PREPARE TRANSACTION 'insert_one'")
+
+    # Prepare another transaction that will insert a row
+    cur.execute('BEGIN')
+    cur.execute("INSERT INTO foo VALUES ('two')")
+    cur.execute("PREPARE TRANSACTION 'insert_two'")
+
+    # Create a branch with the transaction in prepared state
+    zenith_cli.run(["branch", "test_twophase_prepared", "test_twophase"])
+
+    pg2 = postgres.create_start('test_twophase_prepared',
+                                config_lines=['max_prepared_transactions=5'])
+    conn2 = pg2.connect()
+    cur2 = conn2.cursor()
+
+    # On the new branch, commit one of the prepared transactions, abort the other one.
+    cur2.execute("COMMIT PREPARED 'insert_one'")
+    cur2.execute("ROLLBACK PREPARED 'insert_two'")
+
+    cur2.execute('SELECT * FROM foo')
+    assert cur2.fetchall() == [('one', )]
+
+    # Neither insert is visible on the original branch, the transactions are still
+    # in prepared state there.
+    cur.execute('SELECT * FROM foo')
+    assert cur.fetchall() == []
--- a/Show More
+++ b/Show More