Merge branch 'main' into bojan-psbench-over-kvstore

This commit is contained in:
Bojan Serafimov
2022-04-12 13:04:59 -04:00
124 changed files with 9381 additions and 3770 deletions

.circleci/ansible/.gitignore

@@ -0,0 +1,2 @@
zenith_install.tar.gz
.zenith_current_version


@@ -1,14 +1,11 @@
- name: Upload Zenith binaries
hosts: pageservers:safekeepers
hosts: storage
gather_facts: False
remote_user: admin
vars:
force_deploy: false
tasks:
- name: get latest version of Zenith binaries
ignore_errors: true
register: current_version_file
set_fact:
current_version: "{{ lookup('file', '.zenith_current_version') | trim }}"
@@ -16,48 +13,13 @@
- pageserver
- safekeeper
- name: set zero value for current_version
when: current_version_file is failed
set_fact:
current_version: "0"
tags:
- pageserver
- safekeeper
- name: get deployed version from content of remote file
ignore_errors: true
ansible.builtin.slurp:
src: /usr/local/.zenith_current_version
register: remote_version_file
tags:
- pageserver
- safekeeper
- name: decode remote file content
when: remote_version_file is succeeded
set_fact:
remote_version: "{{ remote_version_file['content'] | b64decode | trim }}"
tags:
- pageserver
- safekeeper
- name: set zero value for remote_version
when: remote_version_file is failed
set_fact:
remote_version: "0"
tags:
- pageserver
- safekeeper
- name: inform about versions
debug: msg="Version to deploy - {{ current_version }}, version on storage node - {{ remote_version }}"
debug: msg="Version to deploy - {{ current_version }}"
tags:
- pageserver
- safekeeper
- name: upload and extract Zenith binaries to /usr/local
when: current_version > remote_version or force_deploy
ansible.builtin.unarchive:
owner: root
group: root
@@ -74,14 +36,24 @@
hosts: pageservers
gather_facts: False
remote_user: admin
vars:
force_deploy: false
tasks:
- name: upload init script
when: console_mgmt_base_url is defined
ansible.builtin.template:
src: scripts/init_pageserver.sh
dest: /tmp/init_pageserver.sh
owner: root
group: root
mode: '0755'
become: true
tags:
- pageserver
- name: init pageserver
when: current_version > remote_version or force_deploy
shell:
cmd: sudo -u pageserver /usr/local/bin/pageserver -c "pg_distrib_dir='/usr/local'" --init -D /storage/pageserver/data
cmd: /tmp/init_pageserver.sh
args:
creates: "/storage/pageserver/data/tenants"
environment:
@@ -91,8 +63,23 @@
tags:
- pageserver
# It seems that currently the S3 integration does not play well
# even with a fresh pageserver without the burden of old data.
# TODO: turn this back on once the issue is solved.
# - name: update remote storage (s3) config
# lineinfile:
# path: /storage/pageserver/data/pageserver.toml
# line: "{{ item }}"
# loop:
# - "[remote_storage]"
# - "bucket_name = '{{ bucket_name }}'"
# - "bucket_region = '{{ bucket_region }}'"
# - "prefix_in_bucket = '{{ inventory_hostname }}'"
# become: true
# tags:
# - pageserver
- name: upload systemd service definition
when: current_version > remote_version or force_deploy
ansible.builtin.template:
src: systemd/pageserver.service
dest: /etc/systemd/system/pageserver.service
@@ -104,7 +91,6 @@
- pageserver
- name: start systemd service
when: current_version > remote_version or force_deploy
ansible.builtin.systemd:
daemon_reload: yes
name: pageserver
@@ -115,7 +101,7 @@
- pageserver
- name: post version to console
when: (current_version > remote_version or force_deploy) and console_mgmt_base_url is defined
when: console_mgmt_base_url is defined
shell:
cmd: |
INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id)
@@ -127,22 +113,42 @@
hosts: safekeepers
gather_facts: False
remote_user: admin
vars:
force_deploy: false
tasks:
- name: upload init script
when: console_mgmt_base_url is defined
ansible.builtin.template:
src: scripts/init_safekeeper.sh
dest: /tmp/init_safekeeper.sh
owner: root
group: root
mode: '0755'
become: true
tags:
- safekeeper
- name: init safekeeper
shell:
cmd: /tmp/init_safekeeper.sh
args:
creates: "/storage/safekeeper/data/safekeeper.id"
environment:
ZENITH_REPO_DIR: "/storage/safekeeper/data"
LD_LIBRARY_PATH: "/usr/local/lib"
become: true
tags:
- safekeeper
# in the future safekeepers should discover pageservers by themselves
# but currently we use the first pageserver that was discovered
- name: set first pageserver var for safekeepers
when: current_version > remote_version or force_deploy
set_fact:
first_pageserver: "{{ hostvars[groups['pageservers'][0]]['inventory_hostname'] }}"
tags:
- safekeeper
- name: upload systemd service definition
when: current_version > remote_version or force_deploy
ansible.builtin.template:
src: systemd/safekeeper.service
dest: /etc/systemd/system/safekeeper.service
@@ -154,7 +160,6 @@
- safekeeper
- name: start systemd service
when: current_version > remote_version or force_deploy
ansible.builtin.systemd:
daemon_reload: yes
name: safekeeper
@@ -165,7 +170,7 @@
- safekeeper
- name: post version to console
when: (current_version > remote_version or force_deploy) and console_mgmt_base_url is defined
when: console_mgmt_base_url is defined
shell:
cmd: |
INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id)


@@ -1,7 +1,16 @@
[pageservers]
zenith-1-ps-1
zenith-1-ps-1 console_region_id=1
[safekeepers]
zenith-1-sk-1
zenith-1-sk-2
zenith-1-sk-3
zenith-1-sk-1 console_region_id=1
zenith-1-sk-2 console_region_id=1
zenith-1-sk-3 console_region_id=1
[storage:children]
pageservers
safekeepers
[storage:vars]
console_mgmt_base_url = http://console-release.local
bucket_name = zenith-storage-oregon
bucket_region = us-west-2


@@ -0,0 +1,30 @@
#!/bin/sh
# get instance id from meta-data service
INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id)
# store fqdn hostname in var
HOST=$(hostname -f)
cat <<EOF | tee /tmp/payload
{
"version": 1,
"host": "${HOST}",
"port": 6400,
"region_id": {{ console_region_id }},
"instance_id": "${INSTANCE_ID}",
"http_host": "${HOST}",
"http_port": 9898
}
EOF
# check if pageserver already registered or not
if ! curl -sf -X PATCH -d '{}' {{ console_mgmt_base_url }}/api/v1/pageservers/${INSTANCE_ID} -o /dev/null; then
# not registered, so register it now
ID=$(curl -sf -X POST {{ console_mgmt_base_url }}/api/v1/pageservers -d@/tmp/payload | jq -r '.ID')
# init pageserver
sudo -u pageserver /usr/local/bin/pageserver -c "id=${ID}" -c "pg_distrib_dir='/usr/local'" --init -D /storage/pageserver/data
fi


@@ -0,0 +1,30 @@
#!/bin/sh
# get instance id from meta-data service
INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id)
# store fqdn hostname in var
HOST=$(hostname -f)
cat <<EOF | tee /tmp/payload
{
"version": 1,
"host": "${HOST}",
"port": 6500,
"region_id": {{ console_region_id }},
"instance_id": "${INSTANCE_ID}",
"http_host": "${HOST}",
"http_port": 7676
}
EOF
# check if safekeeper already registered or not
if ! curl -sf -X PATCH -d '{}' {{ console_mgmt_base_url }}/api/v1/safekeepers/${INSTANCE_ID} -o /dev/null; then
# not registered, so register it now
ID=$(curl -sf -X POST {{ console_mgmt_base_url }}/api/v1/safekeepers -d@/tmp/payload | jq -r '.ID')
# init safekeeper
sudo -u safekeeper /usr/local/bin/safekeeper --id ${ID} --init -D /storage/safekeeper/data
fi


@@ -1,7 +1,18 @@
[pageservers]
zenith-us-stage-ps-1
#zenith-us-stage-ps-1 console_region_id=27
zenith-us-stage-ps-2 console_region_id=27
[safekeepers]
zenith-us-stage-sk-1
zenith-us-stage-sk-2
zenith-us-stage-sk-3
zenith-us-stage-sk-1 console_region_id=27
zenith-us-stage-sk-2 console_region_id=27
zenith-us-stage-sk-3 console_region_id=27
zenith-us-stage-sk-4 console_region_id=27
[storage:children]
pageservers
safekeepers
[storage:vars]
console_mgmt_base_url = http://console-staging.local
bucket_name = zenith-staging-storage-us-east-1
bucket_region = us-east-1


@@ -34,10 +34,13 @@ jobs:
- checkout
# Grab the postgres git revision to build a cache key.
# Append makefile as it could change the way postgres is built.
# Note this works even though the submodule hasn't been checked out yet.
- run:
name: Get postgres cache key
command: git rev-parse HEAD:vendor/postgres > /tmp/cache-key-postgres
command: |
git rev-parse HEAD:vendor/postgres > /tmp/cache-key-postgres
cat Makefile >> /tmp/cache-key-postgres
- restore_cache:
name: Restore postgres cache
@@ -78,11 +81,14 @@ jobs:
- checkout
# Grab the postgres git revision to build a cache key.
# Append makefile as it could change the way postgres is built.
# Note this works even though the submodule hasn't been checked out yet.
- run:
name: Get postgres cache key
command: |
git rev-parse HEAD:vendor/postgres > /tmp/cache-key-postgres
cat Makefile >> /tmp/cache-key-postgres
- restore_cache:
name: Restore postgres cache
@@ -111,7 +117,12 @@ jobs:
fi
export CARGO_INCREMENTAL=0
export CACHEPOT_BUCKET=zenith-rust-cachepot
export RUSTC_WRAPPER=cachepot
export AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}"
export AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}"
"${cov_prefix[@]}" mold -run cargo build $CARGO_FLAGS --bins --tests
cachepot -s
- save_cache:
name: Save rust cache
@@ -141,11 +152,13 @@ jobs:
command: |
if [[ $BUILD_TYPE == "debug" ]]; then
cov_prefix=(scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage run)
CARGO_FLAGS=
elif [[ $BUILD_TYPE == "release" ]]; then
cov_prefix=()
CARGO_FLAGS=--release
fi
"${cov_prefix[@]}" cargo test
"${cov_prefix[@]}" cargo test $CARGO_FLAGS
# Install the rust binaries, for use by test jobs
- run:
@@ -215,12 +228,12 @@ jobs:
- checkout
- restore_cache:
keys:
- v1-python-deps-{{ checksum "poetry.lock" }}
- v2-python-deps-{{ checksum "poetry.lock" }}
- run:
name: Install deps
command: ./scripts/pysync
- save_cache:
key: v1-python-deps-{{ checksum "poetry.lock" }}
key: v2-python-deps-{{ checksum "poetry.lock" }}
paths:
- /home/circleci/.cache/pypoetry/virtualenvs
- run:
@@ -274,12 +287,12 @@ jobs:
- run: git submodule update --init --depth 1
- restore_cache:
keys:
- v1-python-deps-{{ checksum "poetry.lock" }}
- v2-python-deps-{{ checksum "poetry.lock" }}
- run:
name: Install deps
command: ./scripts/pysync
- save_cache:
key: v1-python-deps-{{ checksum "poetry.lock" }}
key: v2-python-deps-{{ checksum "poetry.lock" }}
paths:
- /home/circleci/.cache/pypoetry/virtualenvs
- run:
@@ -464,7 +477,10 @@ jobs:
name: Build and push compute-tools Docker image
command: |
echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin
docker build -t zenithdb/compute-tools:latest -f Dockerfile.compute-tools .
docker build \
--build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \
--build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \
--tag zenithdb/compute-tools:latest -f Dockerfile.compute-tools .
docker push zenithdb/compute-tools:latest
- run:
name: Init postgres submodule
@@ -518,7 +534,10 @@ jobs:
name: Build and push compute-tools Docker image
command: |
echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin
docker build -t zenithdb/compute-tools:release -f Dockerfile.compute-tools .
docker build \
--build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \
--build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \
--tag zenithdb/compute-tools:release -f Dockerfile.compute-tools .
docker push zenithdb/compute-tools:release
- run:
name: Init postgres submodule
@@ -605,7 +624,7 @@ jobs:
ssh-add ssh-key
rm -f ssh-key ssh-key-cert.pub
ansible-playbook deploy.yaml -i production.hosts -e console_mgmt_base_url=http://console-release.local
ansible-playbook deploy.yaml -i production.hosts
rm -f zenith_install.tar.gz .zenith_current_version
deploy-release-proxy:

.config/hakari.toml

@@ -0,0 +1,24 @@
# This file contains settings for `cargo hakari`.
# See https://docs.rs/cargo-hakari/latest/cargo_hakari/config for a full list of options.
hakari-package = "workspace_hack"
# Format for `workspace-hack = ...` lines in other Cargo.tomls. Requires cargo-hakari 0.9.8 or above.
dep-format-version = "2"
# Setting workspace.resolver = "2" in the root Cargo.toml is HIGHLY recommended.
# Hakari works much better with the new feature resolver.
# For more about the new feature resolver, see:
# https://blog.rust-lang.org/2021/03/25/Rust-1.51.0.html#cargos-new-feature-resolver
resolver = "2"
# Add triples corresponding to platforms commonly used by developers here.
# https://doc.rust-lang.org/rustc/platform-support.html
platforms = [
# "x86_64-unknown-linux-gnu",
# "x86_64-apple-darwin",
# "x86_64-pc-windows-msvc",
]
# Write out exact versions rather than a semver range. (Defaults to false.)
# exact-versions = true


@@ -1,10 +1,6 @@
name: Build and Test
on:
push:
branches: [ main ]
pull_request:
branches: [ main ]
on: push
jobs:
regression-check:
@@ -13,7 +9,7 @@ jobs:
# If we want to duplicate this job for different
# Rust toolchains (e.g. nightly or 1.37.0), add them here.
rust_toolchain: [stable]
os: [ubuntu-latest]
os: [ubuntu-latest, macos-latest]
timeout-minutes: 30
name: run regression test suite
runs-on: ${{ matrix.os }}
@@ -32,11 +28,17 @@ jobs:
toolchain: ${{ matrix.rust_toolchain }}
override: true
- name: Install postgres dependencies
- name: Install Ubuntu postgres dependencies
if: matrix.os == 'ubuntu-latest'
run: |
sudo apt update
sudo apt install build-essential libreadline-dev zlib1g-dev flex bison libseccomp-dev
- name: Install macOs postgres dependencies
if: matrix.os == 'macos-latest'
run: |
brew install flex bison
- name: Set pg revision for caching
id: pg_ver
run: echo ::set-output name=pg_rev::$(git rev-parse HEAD:vendor/postgres)

Cargo.lock

File diff suppressed because it is too large


@@ -11,6 +11,7 @@ members = [
"zenith_metrics",
"zenith_utils",
]
resolver = "2"
[profile.release]
# This is useful for profiling and, to some extent, debug.


@@ -24,13 +24,14 @@ ARG GIT_VERSION=local
ARG CACHEPOT_BUCKET=zenith-rust-cachepot
ARG AWS_ACCESS_KEY_ID
ARG AWS_SECRET_ACCESS_KEY
#ENV RUSTC_WRAPPER cachepot
ENV RUSTC_WRAPPER /usr/local/cargo/bin/cachepot
COPY --from=pg-build /pg/tmp_install/include/postgresql/server tmp_install/include/postgresql/server
COPY . .
RUN cargo build --release
# Show build caching stats to check if it was used in the end.
# Has to be part of the same RUN since the cachepot daemon is killed at the end of this RUN, losing the compilation stats.
RUN cargo build --release && /usr/local/cargo/bin/cachepot -s
# Build final image
#


@@ -1,12 +1,17 @@
# First transient image to build compute_tools binaries
# NB: keep in sync with rust image version in .circleci/config.yml
FROM rust:1.56.1-slim-buster AS rust-build
FROM zenithdb/build:buster-20220309 AS rust-build
WORKDIR /zenith
ARG CACHEPOT_BUCKET=zenith-rust-cachepot
ARG AWS_ACCESS_KEY_ID
ARG AWS_SECRET_ACCESS_KEY
ENV RUSTC_WRAPPER /usr/local/cargo/bin/cachepot
COPY . .
RUN cargo build -p compute_tools --release
RUN cargo build -p compute_tools --release && /usr/local/cargo/bin/cachepot -s
# Final image that only has one binary
FROM debian:buster-slim


@@ -78,6 +78,11 @@ postgres: postgres-configure \
$(MAKE) -C tmp_install/build/contrib/zenith install
+@echo "Compiling contrib/zenith_test_utils"
$(MAKE) -C tmp_install/build/contrib/zenith_test_utils install
+@echo "Compiling pg_buffercache"
$(MAKE) -C tmp_install/build/contrib/pg_buffercache install
+@echo "Compiling pageinspect"
$(MAKE) -C tmp_install/build/contrib/pageinspect install
.PHONY: postgres-clean
postgres-clean:


@@ -16,4 +16,5 @@ regex = "1"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1"
tar = "0.4"
tokio = { version = "1", features = ["macros", "rt", "rt-multi-thread"] }
tokio = { version = "1.17", features = ["macros", "rt", "rt-multi-thread"] }
workspace_hack = { version = "0.1", path = "../workspace_hack" }


@@ -7,6 +7,7 @@ edition = "2021"
tar = "0.4.33"
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" }
serde = { version = "1.0", features = ["derive"] }
serde_with = "1.12.0"
toml = "0.5"
lazy_static = "1.4"
regex = "1"
@@ -19,4 +20,4 @@ reqwest = { version = "0.11", default-features = false, features = ["blocking",
pageserver = { path = "../pageserver" }
walkeeper = { path = "../walkeeper" }
zenith_utils = { path = "../zenith_utils" }
workspace_hack = { path = "../workspace_hack" }
workspace_hack = { version = "0.1", path = "../workspace_hack" }


@@ -5,6 +5,7 @@
use anyhow::{bail, ensure, Context};
use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr};
use std::collections::HashMap;
use std::env;
use std::fs;
@@ -12,9 +13,7 @@ use std::path::{Path, PathBuf};
use std::process::{Command, Stdio};
use zenith_utils::auth::{encode_from_key_file, Claims, Scope};
use zenith_utils::postgres_backend::AuthType;
use zenith_utils::zid::{
HexZTenantId, HexZTimelineId, ZNodeId, ZTenantId, ZTenantTimelineId, ZTimelineId,
};
use zenith_utils::zid::{ZNodeId, ZTenantId, ZTenantTimelineId, ZTimelineId};
use crate::safekeeper::SafekeeperNode;
@@ -25,6 +24,7 @@ use crate::safekeeper::SafekeeperNode;
// to 'zenith init --config=<path>' option. See control_plane/simple.conf for
// an example.
//
#[serde_as]
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
pub struct LocalEnv {
// Base directory for all the nodes (the pageserver, safekeepers and
@@ -50,12 +50,17 @@ pub struct LocalEnv {
// Default tenant ID to use with the 'zenith' command line utility, when
// --tenantid is not explicitly specified.
#[serde(default)]
pub default_tenant_id: Option<HexZTenantId>,
#[serde_as(as = "Option<DisplayFromStr>")]
pub default_tenant_id: Option<ZTenantId>,
// used to issue tokens during e.g pg start
#[serde(default)]
pub private_key_path: PathBuf,
// A comma separated broker (etcd) endpoints for storage nodes coordination, e.g. 'http://127.0.0.1:2379'.
#[serde(default)]
pub broker_endpoints: Option<String>,
pub pageserver: PageServerConf,
#[serde(default)]
@@ -66,7 +71,8 @@ pub struct LocalEnv {
// A `HashMap<String, HashMap<ZTenantId, ZTimelineId>>` would be more appropriate here,
// but deserialization into a generic toml object as `toml::Value::try_from` fails with an error.
// https://toml.io/en/v1.0.0 does not contain a concept of "a table inside another table".
branch_name_mappings: HashMap<String, Vec<(HexZTenantId, HexZTimelineId)>>,
#[serde_as(as = "HashMap<_, Vec<(DisplayFromStr, DisplayFromStr)>>")]
branch_name_mappings: HashMap<String, Vec<(ZTenantId, ZTimelineId)>>,
}
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
@@ -164,9 +170,6 @@ impl LocalEnv {
.entry(branch_name.clone())
.or_default();
let tenant_id = HexZTenantId::from(tenant_id);
let timeline_id = HexZTimelineId::from(timeline_id);
let existing_ids = existing_values
.iter()
.find(|(existing_tenant_id, _)| existing_tenant_id == &tenant_id);
@@ -193,7 +196,6 @@ impl LocalEnv {
branch_name: &str,
tenant_id: ZTenantId,
) -> Option<ZTimelineId> {
let tenant_id = HexZTenantId::from(tenant_id);
self.branch_name_mappings
.get(branch_name)?
.iter()
@@ -207,13 +209,7 @@ impl LocalEnv {
.iter()
.flat_map(|(name, tenant_timelines)| {
tenant_timelines.iter().map(|&(tenant_id, timeline_id)| {
(
ZTenantTimelineId::new(
ZTenantId::from(tenant_id),
ZTimelineId::from(timeline_id),
),
name.clone(),
)
(ZTenantTimelineId::new(tenant_id, timeline_id), name.clone())
})
})
.collect()
@@ -259,7 +255,7 @@ impl LocalEnv {
// If no initial tenant ID was given, generate it.
if env.default_tenant_id.is_none() {
env.default_tenant_id = Some(HexZTenantId::from(ZTenantId::generate()));
env.default_tenant_id = Some(ZTenantId::generate());
}
env.base_data_dir = base_path();


@@ -73,6 +73,8 @@ pub struct SafekeeperNode {
pub http_base_url: String,
pub pageserver: Arc<PageServerNode>,
broker_endpoints: Option<String>,
}
impl SafekeeperNode {
@@ -89,6 +91,7 @@ impl SafekeeperNode {
http_client: Client::new(),
http_base_url: format!("http://127.0.0.1:{}/v1", conf.http_port),
pageserver,
broker_endpoints: env.broker_endpoints.clone(),
}
}
@@ -135,6 +138,9 @@ impl SafekeeperNode {
if !self.conf.sync {
cmd.arg("--no-sync");
}
if let Some(ref ep) = self.broker_endpoints {
cmd.args(&["--broker-endpoints", ep]);
}
if !cmd.status()?.success() {
bail!(


@@ -1,4 +1,3 @@
use std::convert::TryFrom;
use std::io::Write;
use std::net::TcpStream;
use std::path::PathBuf;
@@ -10,7 +9,7 @@ use anyhow::{bail, Context};
use nix::errno::Errno;
use nix::sys::signal::{kill, Signal};
use nix::unistd::Pid;
use pageserver::http::models::{TenantCreateRequest, TimelineCreateRequest, TimelineInfoResponse};
use pageserver::http::models::{TenantCreateRequest, TimelineCreateRequest};
use pageserver::timelines::TimelineInfo;
use postgres::{Config, NoTls};
use reqwest::blocking::{Client, RequestBuilder, Response};
@@ -19,7 +18,7 @@ use thiserror::Error;
use zenith_utils::http::error::HttpErrorBody;
use zenith_utils::lsn::Lsn;
use zenith_utils::postgres_backend::AuthType;
use zenith_utils::zid::{HexZTenantId, HexZTimelineId, ZTenantId, ZTimelineId};
use zenith_utils::zid::{ZTenantId, ZTimelineId};
use crate::local_env::LocalEnv;
use crate::{fill_rust_env_vars, read_pidfile};
@@ -149,12 +148,20 @@ impl PageServerNode {
let initial_timeline_id_string = initial_timeline_id.to_string();
args.extend(["--initial-timeline-id", &initial_timeline_id_string]);
let init_output = fill_rust_env_vars(cmd.args(args))
let cmd_with_args = cmd.args(args);
let init_output = fill_rust_env_vars(cmd_with_args)
.output()
.context("pageserver init failed")?;
.with_context(|| {
format!("failed to init pageserver with command {:?}", cmd_with_args)
})?;
if !init_output.status.success() {
bail!("pageserver init failed");
bail!(
"init invocation failed, {}\nStdout: {}\nStderr: {}",
init_output.status,
String::from_utf8_lossy(&init_output.stdout),
String::from_utf8_lossy(&init_output.stderr)
);
}
Ok(initial_timeline_id)
@@ -338,9 +345,7 @@ impl PageServerNode {
) -> anyhow::Result<Option<ZTenantId>> {
let tenant_id_string = self
.http_request(Method::POST, format!("{}/tenant", self.http_base_url))
.json(&TenantCreateRequest {
new_tenant_id: new_tenant_id.map(HexZTenantId::from),
})
.json(&TenantCreateRequest { new_tenant_id })
.send()?
.error_from_body()?
.json::<Option<String>>()?;
@@ -358,7 +363,7 @@ impl PageServerNode {
}
pub fn timeline_list(&self, tenant_id: &ZTenantId) -> anyhow::Result<Vec<TimelineInfo>> {
let timeline_infos: Vec<TimelineInfoResponse> = self
let timeline_infos: Vec<TimelineInfo> = self
.http_request(
Method::GET,
format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id),
@@ -367,10 +372,7 @@ impl PageServerNode {
.error_from_body()?
.json()?;
timeline_infos
.into_iter()
.map(TimelineInfo::try_from)
.collect()
Ok(timeline_infos)
}
pub fn timeline_create(
@@ -386,16 +388,14 @@ impl PageServerNode {
format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id),
)
.json(&TimelineCreateRequest {
new_timeline_id: new_timeline_id.map(HexZTimelineId::from),
new_timeline_id,
ancestor_start_lsn,
ancestor_timeline_id: ancestor_timeline_id.map(HexZTimelineId::from),
ancestor_timeline_id,
})
.send()?
.error_from_body()?
.json::<Option<TimelineInfoResponse>>()?;
.json::<Option<TimelineInfo>>()?;
timeline_info_response
.map(TimelineInfo::try_from)
.transpose()
Ok(timeline_info_response)
}
}


@@ -21,7 +21,7 @@ NOTE:It has nothing to do with PostgreSQL pg_basebackup.
### Branch
We can create branch at certain LSN using `zenith branch` command.
We can create a branch at a certain LSN using the `zenith timeline branch` command.
Each Branch lives in a corresponding timeline[] and has an ancestor[].
@@ -36,17 +36,25 @@ A checkpoint record in the WAL marks a point in the WAL sequence at which it is
NOTE: This is an overloaded term.
Whenever enough WAL has been accumulated in memory, the page server []
writes out the changes from in-memory layers into new layer files[]. This process
is called "checkpointing". The page server only creates layer files for
relations that have been modified since the last checkpoint.
writes out the changes from the in-memory layer into a new delta layer file. This process
is called "checkpointing".
Configuration parameter `checkpoint_distance` defines the distance
from current LSN to perform checkpoint of in-memory layers.
Default is `DEFAULT_CHECKPOINT_DISTANCE`.
Set this parameter to `0` to force checkpoint of every layer.
Configuration parameter `checkpoint_period` defines the interval between checkpoint iterations.
Default is `DEFAULT_CHECKPOINT_PERIOD`.
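As a rough illustration of the rule these two parameters describe (names and types here are illustrative, not the actual pageserver code):

```rust
// Illustrative stand-in for the pageserver's LSN type.
type Lsn = u64;

/// Evaluated every `checkpoint_period`: flush the in-memory layer once it
/// holds at least `checkpoint_distance` bytes of WAL beyond the last flush.
/// With `checkpoint_distance = 0`, every iteration flushes, which matches the
/// "force checkpoint of every layer" behaviour described above.
fn needs_checkpoint(checkpoint_distance: u64, last_record_lsn: Lsn, last_flushed_lsn: Lsn) -> bool {
    last_record_lsn - last_flushed_lsn >= checkpoint_distance
}
```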
### Compaction
A background operation on layer files. Compaction takes a number of L0
layer files, each of which covers the whole key space and a range of
LSN, and reshuffles the data in them into L1 files so that each file
covers the whole LSN range, but only part of the key space.
Compaction should also opportunistically leave out obsolete page versions
from the L1 files, and materialize other page versions for faster
access. That hasn't been implemented as of this writing, though.
### Compute node
Stateless Postgres node that stores data in pageserver.
@@ -54,10 +62,10 @@ Stateless Postgres node that stores data in pageserver.
### Garbage collection
The process of removing old on-disk layers that are not needed by any timeline anymore.
### Fork
Each of the separate segmented file sets in which a relation is stored. The main fork is where the actual data resides. There also exist two secondary forks for metadata: the free space map and the visibility map.
Each PostgreSQL fork is considered a separate relish.
### Layer
@@ -72,15 +80,15 @@ are immutable. See pageserver/src/layered_repository/README.md for more.
### Layer file (on-disk layer)
Layered repository on-disk format is based on immutable files. The
files are called "layer files". Each file corresponds to one RELISH_SEG_SIZE
segment of a PostgreSQL relation fork. There are two kinds of layer
files: image files and delta files. An image file contains a
"snapshot" of the segment at a particular LSN, and a delta file
contains WAL records applicable to the segment, in a range of LSNs.
files are called "layer files". There are two kinds of layer files:
image files and delta files. An image file contains a "snapshot" of a
range of keys at a particular LSN, and a delta file contains WAL
records applicable to a range of keys, in a range of LSNs.
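To make the distinction concrete, here is a hypothetical sketch of what identifies each kind of layer file, based only on the description above (these are not the actual pageserver types):

```rust
use std::ops::Range;

// Placeholder types standing in for the pageserver's key and LSN types.
type Key = u128;
type Lsn = u64;

/// An image layer: a "snapshot" of a range of keys at a single LSN.
struct ImageLayerDesc {
    key_range: Range<Key>,
    lsn: Lsn,
}

/// A delta layer: WAL records applicable to a range of keys, over a range of LSNs.
struct DeltaLayerDesc {
    key_range: Range<Key>,
    lsn_range: Range<Lsn>,
}
```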
### Layer map
The layer map tracks what layers exist for all the relishes in a timeline.
The layer map tracks what layers exist in a timeline.
### Layered repository
Zenith repository implementation that keeps data in layers.
@@ -149,14 +157,6 @@ and create new databases and accounts (control plane API in our case).
The generic term in PostgreSQL for all objects in a database that have a name and a list of attributes defined in a specific order.
### Relish
We call each relation and other file that is stored in the
repository a "relish". It comes from "rel"-ish, as in "kind of a
rel", because it covers relations as well as other things that are
not relations, but are treated similarly for the purposes of the
storage layer.
### Replication slot
@@ -173,27 +173,18 @@ One repository corresponds to one Tenant.
How much history do we need to keep around for PITR and read-only nodes?
### Segment (PostgreSQL)
NOTE: This is an overloaded term.
### Segment
A physical file that stores data for a given relation. File segments are
limited in size by a compile-time setting (1 gigabyte by default), so if a
relation exceeds that size, it is split into multiple segments.
### Segment (Layered Repository)
NOTE: This is an overloaded term.
Segment is a RELISH_SEG_SIZE slice of relish (identified by a SegmentTag).
### SLRU
SLRUs include pg_clog, pg_multixact/members, and
pg_multixact/offsets. There are other SLRUs in PostgreSQL, but
they don't need to be stored permanently (e.g. pg_subtrans),
or we do not support them in zenith yet (pg_commit_ts).
Each SLRU segment is considered a separate relish[].
### Tenant (Multitenancy)
Tenant represents a single customer, interacting with Zenith.


@@ -0,0 +1,69 @@
# Safekeeper gossip
Extracted from this [PR](https://github.com/zenithdb/rfcs/pull/13)
## Motivation
In some situations, safekeeper (SK) needs coordination with other SK's that serve the same tenant:
1. WAL deletion. SK needs to know what WAL was already safely replicated to delete it. Now we keep WAL indefinitely.
2. Deciding on who is sending WAL to the pageserver. Now a crash of the sending SK may lead to a livelock where nobody sends WAL to the pageserver.
3. To enable SK to SK direct recovery without involving the compute
## Summary
The compute node has connection strings to each safekeeper. During each compute->safekeeper connection establishment, the compute node should pass down all those connection strings to each safekeeper. With that info, safekeepers may establish Postgres connections to each other and periodically send ping messages with an LSN payload.
## Components
safekeeper, compute, compute<->safekeeper protocol, possibly console (group SK addresses)
## Proposed implementation
Each safekeeper can periodically ping all its peers and share connectivity and liveness info. If no ping has been received for, let's say, four ping periods, we may consider the sending safekeeper dead. That would mean one of the alive safekeepers should connect to the pageserver. One way to decide which one exactly: `make_connection = my_node_id == min(alive_nodes)`
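A minimal sketch of that liveness rule, with illustrative names (not the actual safekeeper code); it assumes the alive set includes the local node itself:

```rust
use std::collections::HashMap;
use std::time::{Duration, Instant};

/// Last time we heard a ping from a peer safekeeper.
struct PeerState {
    last_ping: Instant,
}

/// A peer counts as alive if we heard from it within the last four ping periods.
fn alive_nodes(peers: &HashMap<u64, PeerState>, ping_period: Duration, now: Instant) -> Vec<u64> {
    peers
        .iter()
        .filter(|(_, peer)| now.duration_since(peer.last_ping) < 4 * ping_period)
        .map(|(&id, _)| id)
        .collect()
}

/// `make_connection = my_node_id == min(alive_nodes)`: the alive safekeeper
/// with the smallest node id is the one that connects to the pageserver.
fn make_connection(my_node_id: u64, alive: &[u64]) -> bool {
    alive.iter().copied().min() == Some(my_node_id)
}
```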
Since safekeepers are multi-tenant, we may establish either per-tenant physical connections or per-safekeeper ones. So it makes sense to group "logical" connections between corresponding tenants on different nodes into a single physical connection. That means that we should implement an interconnect thread that maintains physical connections and periodically broadcasts info about all tenants.
Right now the console may assign any 3 SK addresses to a given compute node. That may lead to a high number of gossip connections between SKs. Instead, we can assign safekeeper triples to the compute node. But if we want to "break" or "change" a group by an ad-hoc action, we can do it.
### Corner cases
- Current safekeeper may be alive but may not have connectivity to the pageserver
To address that, we need to gossip visibility info. Based on that info, we may define SK as alive only when it can connect to the pageserver.
- Current safekeeper may be alive but may not have connectivity with the compute node.
We may broadcast last_received_lsn and presence of compute connection and decide who is alive based on that.
- It is tricky to decide when to shut down gossip connections because we need to be sure that the pageserver got all the committed (in the distributed sense, so local SK info is not enough) records and may never lose them. It is not a strict requirement, since the `--sync-safekeepers` run that happens before the compute start lets the pageserver consume the missing WAL, but it is better to do that in the background. So the condition may look like this (see the sketch below): `majority_max(flush_lsn) == pageserver_s3_lsn` Here we rely on two facts:
- that `--sync-safekeepers` happened after the compute shutdown, and it advanced local commit_lsns, allowing the pageserver to consume that WAL.
- we wait for the `pageserver_s3_lsn` advancement to avoid the pageserver's last_received_lsn/disk_consistent_lsn going backward due to a disk/hardware failure and subsequent S3 recovery
If those conditions are not met, we will have some gossip activity (but that may be okay).
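A sketch of the `majority_max` helper used in that condition (illustrative only, not taken from the safekeeper code):

```rust
/// The highest LSN that a majority of safekeepers have flushed: sort the
/// reported flush LSNs in descending order and take the (n/2 + 1)-th one.
fn majority_max(mut flush_lsns: Vec<u64>) -> u64 {
    assert!(!flush_lsns.is_empty());
    flush_lsns.sort_unstable_by(|a, b| b.cmp(a)); // descending
    flush_lsns[flush_lsns.len() / 2]
}

// Gossip connections for a timeline can be shut down once
// majority_max(flush_lsns) == pageserver_s3_lsn.
```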
## Pros/cons
Pros:
- distributed, does not introduce new services (like etcd), does not add console as a storage dependency
- lays the foundation for gossip-based recovery
Cons:
- Only the compute knows the set of safekeepers, but they should communicate even without the compute node. In case of a safekeeper restart, we will lose that info and can't gossip anymore. Hence we can't trim some WAL tail until the compute node starts. Also, it is ugly.
- If the console assigns a random set of safekeepers to each Postgres, we may end up in a situation where each safekeeper needs to have a connection with all other safekeepers. We can group safekeepers into isolated triples in the console to avoid that. Then "mixing" would happen only if we do rebalancing.
## Alternative implementation
We can have a selected node (e.g., console) with everybody reporting to it.
## Security implications
We don't increase the attack surface here. Communication can happen in a private network that is not exposed to users.
## Scalability implications
The only thing that may grow as we grow the number of computes is the number of gossip connections. But if we group safekeepers and assign each compute node to a random SK triple, the number of connections would be constant.


@@ -73,8 +73,11 @@ is accessed through a buffer cache.
If the page server crashes, the Memtable is lost. It is rebuilt by
processing again the WAL that's newer than the latest layer in L0.
The size of the Memtable is equal to the "checkpoint distance", or the
amount of WAL that we need to keep in the safekeeper.
The size of the Memtable is configured by the "checkpoint distance"
setting. Because anything that hasn't been flushed to disk and
uploaded to S3 yet needs to be kept in the safekeeper, the "checkpoint
distance" also determines the amount of WAL that needs to kept in the
safekeeper.
# L0
@@ -129,7 +132,7 @@ collecting layers that are older than the GC horizon.
# Partitioning scheme
When compaction happens and creates a new set of files in L1, how do
we partition the data into the files?
we partition the data into the files?
- Goal is that each file is ~ 1 GB in size
- Try to match partition boundaries at relation boundaries. (See [1]


@@ -0,0 +1,295 @@
# Storage messaging
Created on 19.01.22
Initially created [here](https://github.com/zenithdb/rfcs/pull/16) by @kelvich.
It is an alternative to [014-safekeeper-gossip]()
## Motivation
As in 014-safekeeper-gossip we need to solve the following problems:
* Trim WAL on safekeepers
* Decide on which SK should push WAL to the S3
* Decide on which SK should forward WAL to the pageserver
* Decide on when to shut down SK<->pageserver connection
This RFC suggests a more generic and hopefully more manageable way to address those problems. However, unlike 014-safekeeper-gossip, it does not bring us any closer to safekeeper-to-safekeeper recovery but rather unties two sets of different issues we previously wanted to solve with gossip.
Also, with this approach, we would not need "call me maybe" anymore, and the pageserver will have all the data required to understand that it needs to reconnect to another safekeeper.
## Summary
Instead of p2p gossip, let's have a centralized broker where all the storage nodes report per-timeline state. Each storage node should have a `--broker-url=1.2.3.4` CLI param.
Here I propose two ways to do that. After a lot of arguing with myself, I'm leaning towards the etcd approach. My arguments for it are in the pros/cons section. Both options require adding a gRPC client to our codebase, either directly or as an etcd dependency.
## Non-goals
This RFC does *not* suggest moving the compute-to-pageserver and compute-to-safekeeper mappings out of the console. The console is still the only place in the cluster responsible for the persistence of that info. So I'm implying that each pageserver and safekeeper knows exactly which timelines it serves, as is currently the case. We need some mechanism for a new pageserver to discover the mapping info, but that is out of the scope of this RFC.
## Impacted components
pageserver, safekeeper
adds either etcd or console as a storage dependency
## Possible implementation: custom message broker in the console
We've decided to go with an etcd approach instead of the message broker.
<details closed>
<summary>Original suggestion</summary>
<br>
We can add a Grpc service in the console that acts as a message broker since the console knows the addresses of all the components. The broker can ignore the payload and only redirect messages. So, for example, each safekeeper may send a message to the peering safekeepers or to the pageserver responsible for a given timeline.
Message format could be `{sender, destination, payload}`.
The destination is either:
1. `sk_#{tenant}_#{timeline}` -- to be broadcasted on all safekeepers, responsible for that timeline, or
2. `pserver_#{tenant}_#{timeline}` -- to be broadcasted on all pageservers, responsible for that timeline
Sender is either:
1. `sk_#{sk_id}`, or
2. `pserver_#{pserver_id}`
I can think of the following behavior to address our original problems:
* WAL trimming
Each safekeeper periodically broadcasts `(write_lsn, commit_lsn)` to all peering (peering == responsible for that timeline) safekeepers
* Decide on which SK should push WAL to the S3
Each safekeeper periodically broadcasts an `i_am_alive_#{current_timestamp}` message to all peering safekeepers. That way, safekeepers may maintain a vector of alive peers (a loose one, with false negatives). The alive safekeeper with the minimal id pushes data to S3.
* Decide on which SK should forward WAL to the pageserver
Each safekeeper periodically sends (write_lsn, commit_lsn, compute_connected) to the relevant pageservers. With that info, the pageserver can maintain a view of the safekeepers' state, connect to a random one, and detect the moments (e.g., one of the safekeepers is not making progress or is down) when it needs to reconnect to another safekeeper. The pageserver should resolve exact IP addresses through the console, e.g., exchange `#sk_#{sk_id}` for `4.5.6.7:6400`.
The pageserver's connection to the safekeeper is triggered by the state change `compute_connected: false -> true`. With that, we don't need "call me maybe" anymore.
Also, we don't have a "peer address amnesia" problem as in the gossip approach (with gossip, after a simultaneous reboot, safekeepers wouldn't know each other's addresses until the next compute connection).
* Decide on when to shut down the SK<->pageserver connection
Again, the pageserver would have all the info to understand when to shut down the safekeeper connection.
### Scalability
One node is enough (c) No, seriously, it is enough.
### High Availability
Broker lives in the console, so we can rely on k8s maintaining the console app alive.
If the console is down, we won't trim WAL or reconnect the pageserver to another safekeeper. But, at the same time, if the console is down, we already can't accept new compute connections or start stopped computes, so we are making things a bit worse, but not dramatically.
### Interactions
```
.________________.
sk_1 <-> | | <-> pserver_1
... | Console broker | ...
sk_n <-> |________________| <-> pserver_m
```
</details>
## Implementation: etcd state store
Alternatively, we can set up `etcd` and maintain the following data structure in it:
```ruby
"compute_#{tenant}_#{timeline}" => {
safekeepers => {
"sk_#{sk_id}" => {
write_lsn: "0/AEDF130",
commit_lsn: "0/AEDF100",
compute_connected: true,
last_updated: 1642621138,
},
}
}
```
As etcd doesn't support field updates in nested objects, that translates to the following set of keys:
```ruby
"compute_#{tenant}_#{timeline}/safekeepers/sk_#{sk_id}/write_lsn",
"compute_#{tenant}_#{timeline}/safekeepers/sk_#{sk_id}/commit_lsn",
...
```
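For illustration, a tiny sketch of how those flattened keys could be built (a hypothetical helper, not the actual safekeeper code):

```rust
/// Build the etcd key for one per-safekeeper field of a timeline, e.g.
/// "compute_<tenant>_<timeline>/safekeepers/sk_<id>/commit_lsn".
fn timeline_sk_key(tenant: &str, timeline: &str, sk_id: u64, field: &str) -> String {
    format!(
        "compute_{}_{}/safekeepers/sk_{}/{}",
        tenant, timeline, sk_id, field
    )
}

// A subscriber (safekeeper or pageserver) would then watch the prefix
// "compute_<tenant>_<timeline>/" to rebuild its local view of this structure.
```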
Each storage node can subscribe to the relevant sets of keys and maintain a local view of that structure. So in terms of the data flow, everything is the same as in the previous approach. Still, we can avoid implementing the message broker and avoid a runtime storage dependency on the console.
### Safekeeper address discovery
During startup the safekeeper should publish the address it is listening on as part of `{"sk_#{sk_id}" => ip_address}`. Then the pageserver can resolve `sk_#{sk_id}` to the actual address. This way it would work both locally and in the cloud setup. The safekeeper should have an `--advertised-address` CLI option so that we can listen on e.g. 0.0.0.0 but advertise something more useful.
### Safekeeper behavior
For each timeline, the safekeeper periodically broadcasts the `compute_#{tenant}_#{timeline}/safekeepers/sk_#{sk_id}/*` fields. It subscribes to changes of `compute_#{tenant}_#{timeline}` -- that way the safekeeper will have information about its peering safekeepers.
That amount of information is enough to properly trim WAL. To decide who pushes the data to S3, a safekeeper may use etcd leases or broadcast a timestamp and hence track who is alive.
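One plausible way to compute a trim horizon from that information (an assumption for illustration, not necessarily the exact rule the safekeeper implements):

```rust
/// WAL below this LSN can be removed: every peer safekeeper has already
/// written it (their write_lsn is at least this high) and the pageserver has
/// persisted it to S3 (remote_consistent_lsn).
fn wal_trim_horizon(peer_write_lsns: &[u64], remote_consistent_lsn: u64) -> u64 {
    let peers_min = peer_write_lsns.iter().copied().min().unwrap_or(0);
    peers_min.min(remote_consistent_lsn)
}
```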
### Pageserver behavior
Pageserver subscribes to `compute_#{tenant}_#{timeline}` for each tenant it owns. With that info, the pageserver can maintain a view of the safekeepers' state, connect to a random one, and detect the moments (e.g., one of the safekeepers is not making progress or is down) when it needs to reconnect to another safekeeper. The pageserver should resolve exact IP addresses through the console, e.g., exchange `#sk_#{sk_id}` for `4.5.6.7:6400`.
The pageserver's connection to the safekeeper can be triggered by the state change `compute_connected: false -> true`. With that, we don't need "call me maybe" anymore.
As an alternative to compute_connected, we can track the timestamp of the latest message that arrived at the safekeeper from the compute. Usually the compute broadcasts a KeepAlive to all safekeepers every second, so the timestamp will be updated every second while the connection is OK. Then the connection can be considered down when this timestamp isn't updated for several seconds.
This will help detect issues with a safekeeper faster (and switch to another one) in the following cases:
1. when the compute failed but the TCP connection stays alive until a timeout (usually about a minute)
2. when the safekeeper failed and didn't set compute_connected to false
Another way to deal with [2] is to process (write_lsn, commit_lsn, compute_connected) as a KeepAlive on the pageserver side and detect issues when sk_id doesn't send anything for some time. This way is fully compatible with this RFC.
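A minimal sketch of that staleness check on the pageserver side, with illustrative names and thresholds:

```rust
use std::time::{Duration, SystemTime};

/// Per-safekeeper state for a timeline, as seen by the pageserver via the broker.
struct SafekeeperTimelineInfo {
    /// Timestamp of the latest compute message, as broadcast by the safekeeper.
    last_compute_message_at: SystemTime,
}

/// Consider the compute->safekeeper connection down if the safekeeper has not
/// reported a compute message for longer than `threshold` (a few seconds).
fn compute_connection_down(
    info: &SafekeeperTimelineInfo,
    now: SystemTime,
    threshold: Duration,
) -> bool {
    match now.duration_since(info.last_compute_message_at) {
        Ok(age) => age > threshold,
        Err(_) => false, // timestamp is in the future (clock skew): treat as alive
    }
}
```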
Also, we don't have a "peer address amnesia" problem as in the gossip approach (with gossip, after a simultaneous reboot, safekeepers wouldn't know each other's addresses until the next compute connection).
### Interactions
```
.________________.
sk_1 <-> | | <-> pserver_1
... | etcd | ...
sk_n <-> |________________| <-> pserver_m
```
### Sequence diagrams for different workflows
#### Cluster startup
```mermaid
sequenceDiagram
autonumber
participant C as Compute
participant SK1
participant SK2
participant SK3
participant PS1
participant PS2
participant O as Orchestrator
participant M as Metadata Service
PS1->>M: subscribe to updates to state of timeline N
C->>+SK1: WAL push
loop constantly update current lsns
SK1->>-M: I'm at lsn A
end
C->>+SK2: WAL push
loop constantly update current lsns
SK2->>-M: I'm at lsn B
end
C->>+SK3: WAL push
loop constantly update current lsns
SK3->>-M: I'm at lsn C
end
loop request pages
C->>+PS1: get_page@lsn
PS1->>-C: page image
end
M->>PS1: New compute appeared for timeline N. SK1 at A, SK2 at B, SK3 at C
note over PS1: Say SK1 at A=200, SK2 at B=150 SK3 at C=100 <br> so connect to SK1 because it is the most up to date one
PS1->>SK1: start replication
```
#### Behaviour of services during typical operations
```mermaid
sequenceDiagram
autonumber
participant C as Compute
participant SK1
participant SK2
participant SK3
participant PS1
participant PS2
participant O as Orchestrator
participant M as Metadata Service
note over C,M: Scenario 1: Pageserver checkpoint
note over PS1: Upload data to S3
PS1->>M: Update remote consistent lsn
M->>SK1: propagate remote consistent lsn update
note over SK1: truncate WAL up to remote consistent lsn
M->>SK2: propagate remote consistent lsn update
note over SK2: truncate WAL up to remote consistent lsn
M->>SK3: propagate remote consistent lsn update
note over SK3: truncate WAL up to remote consistent lsn
note over C,M: Scenario 2: SK1 finds itself lagging behind MAX(150 (SK2), 200 (SK3)) - 100 (SK1) > THRESHOLD
SK1->>SK2: Fetch WAL delta between 100 (SK1) and 200 (SK2)
note over C,M: Scenario 3: PS1 detects that SK1 is lagging behind: Connection from SK1 is broken or there are no messages from it for 30 seconds.
note over PS1: e.g. SK2 is at 150, SK3 is at 100, chose SK2 as a new replication source
PS1->>SK2: start replication
```
#### Behaviour during timeline relocation
```mermaid
sequenceDiagram
autonumber
participant C as Compute
participant SK1
participant SK2
participant SK3
participant PS1
participant PS2
participant O as Orchestrator
participant M as Metadata Service
note over C,M: Timeline is being relocated from PS1 to PS2
O->>+PS2: Attach timeline
PS2->>-O: 202 Accepted if timeline exists in S3
note over PS2: Download timeline from S3
note over O: Poll for timeline download (or subscribe to metadata service)
loop wait for attach to complete
O->>PS2: timeline detail should answer that timeline is ready
end
PS2->>M: Register downloaded timeline
PS2->>M: Get safekeepers for timeline, subscribe to changes
PS2->>SK1: Start replication to catch up
note over O: PS2 caught up, time to switch compute
O->>C: Restart compute with new pageserver url in config
note over C: WAL push is restarted
loop request pages
C->>+PS2: get_page@lsn
PS2->>-C: page image
end
O->>PS1: detach timeline
note over C,M: Scenario 1: Attach call failed
O--xPS2: Attach timeline
note over O: The operation can be safely retried, <br> if we hit some threshold we can try another pageserver
note over C,M: Scenario 2: Attach succeeded but pageserver failed to download the data or start replication
loop wait for attach to complete
O--xPS2: timeline detail should answer that timeline is ready
end
note over O: Can wait for a timeout, and then try another pageserver <br> there should be a limit on number of different pageservers to try
note over C,M: Scenario 3: Detach fails
O--xPS1: Detach timeline
note over O: can be retried, if continues to fail might lead to data duplication in s3
```
# Pros/cons
## Console broker/etcd vs gossip:
Gossip pros:
* gossip allows running storage without the console or etcd
Console broker/etcd pros:
* simpler
* solves "call me maybe" as well
* avoid possible N-to-N connection issues with gossip without grouping safekeepers in pre-defined triples
## Console broker vs. etcd:
Initially, I wanted to avoid etcd as a dependency, mostly because I've seen how painful the ZooKeeper dependency was for ClickHouse: in each chat, at each conference, people were complaining about configuration and maintenance barriers with ZooKeeper. It was so bad that ClickHouse re-implemented ZooKeeper to embed it: https://clickhouse.com/docs/en/operations/clickhouse-keeper/.
But with etcd we are in a bit of a different situation:
1. We don't need persistence and strong consistency guarantees for the data we store in etcd
2. etcd uses gRPC as a protocol, and messages are pretty simple
So it looks like implementing an in-memory store with an etcd interface is a straightforward thing _if we want that in the future_. At the same time, we can avoid implementing it right now, and we will be able to run a local zenith installation with etcd running somewhere in the background (as opposed to building and running the console, which in turn requires Postgres).


@@ -68,11 +68,11 @@ S3.
The unit is # of bytes.
#### checkpoint_period
#### compaction_period
The pageserver checks whether `checkpoint_distance` has been reached
every `checkpoint_period` seconds. Default is 1 s, which should be
fine.
Every `compaction_period` seconds, the page server checks if
maintenance operations, like compaction, are needed on the layer
files. Default is 1 s, which should be fine.
#### gc_horizon


@@ -67,6 +67,8 @@ For more detailed info, see `/walkeeper/README`
`/workspace_hack`:
The workspace_hack crate exists only to pin down some dependencies.
We use [cargo-hakari](https://crates.io/crates/cargo-hakari) for automation.
`/zenith`
Main entry point for the 'zenith' CLI utility.


@@ -4,20 +4,21 @@ version = "0.1.0"
edition = "2021"
[dependencies]
bookfile = { git = "https://github.com/zenithdb/bookfile.git", branch="generic-readext" }
chrono = "0.4.19"
rand = "0.8.3"
regex = "1.4.5"
bytes = { version = "1.0.1", features = ['serde'] }
byteorder = "1.4.3"
futures = "0.3.13"
hex = "0.4.3"
hyper = "0.14"
itertools = "0.10.3"
lazy_static = "1.4.0"
log = "0.4.14"
clap = "3.0"
daemonize = "0.4.1"
tokio = { version = "1.11", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] }
tokio = { version = "1.17", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] }
tokio-util = { version = "0.7", features = ["io"] }
postgres-types = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" }
postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" }
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" }
@@ -26,14 +27,14 @@ tokio-stream = "0.1.8"
anyhow = { version = "1.0", features = ["backtrace"] }
crc32c = "0.6.0"
thiserror = "1.0"
hex = { version = "0.4.3", features = ["serde"] }
tar = "0.4.33"
humantime = "2.1.0"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1"
serde_with = "1.12.0"
toml_edit = { version = "0.13", features = ["easy"] }
scopeguard = "1.1.0"
async-trait = "0.1"
const_format = "0.2.21"
tracing = "0.1.27"
tracing-futures = "0.2"
@@ -44,14 +45,15 @@ once_cell = "1.8.0"
crossbeam-utils = "0.8.5"
fail = "0.5.0"
rust-s3 = { version = "0.28", default-features = false, features = ["no-verify-ssl", "tokio-rustls-tls"] }
rusoto_core = "0.47"
rusoto_s3 = "0.47"
async-trait = "0.1"
async-compression = {version = "0.3", features = ["zstd", "tokio"]}
postgres_ffi = { path = "../postgres_ffi" }
zenith_metrics = { path = "../zenith_metrics" }
zenith_utils = { path = "../zenith_utils" }
workspace_hack = { path = "../workspace_hack" }
plotly = "0.7.0"
workspace_hack = { version = "0.1", path = "../workspace_hack" }
[dev-dependencies]
hex-literal = "0.3"


@@ -13,7 +13,7 @@ keeps track of WAL records which are not synced to S3 yet.
The Page Server consists of multiple threads that operate on a shared
repository of page versions:
```
| WAL
V
+--------------+
@@ -46,7 +46,7 @@ Legend:
---> Data flow
<---
```
Page Service
------------


@@ -10,7 +10,7 @@
//! This module is responsible for creation of such tarball
//! from data stored in object storage.
//!
use anyhow::{Context, Result};
use anyhow::{ensure, Context, Result};
use bytes::{BufMut, BytesMut};
use log::*;
use std::fmt::Write as FmtWrite;
@@ -20,7 +20,7 @@ use std::sync::Arc;
use std::time::SystemTime;
use tar::{Builder, EntryType, Header};
use crate::relish::*;
use crate::reltag::SlruKind;
use crate::repository::Timeline;
use crate::DatadirTimelineImpl;
use postgres_ffi::xlog_utils::*;
@@ -65,6 +65,7 @@ impl<'a> Basebackup<'a> {
// prev_lsn to Lsn(0) if we cannot provide the correct value.
let (backup_prev, backup_lsn) = if let Some(req_lsn) = req_lsn {
// Backup was requested at a particular LSN. Wait for it to arrive.
info!("waiting for {}", req_lsn);
timeline.tline.wait_lsn(req_lsn)?;
// If the requested point is the end of the timeline, we can
@@ -153,7 +154,7 @@ impl<'a> Basebackup<'a> {
let img = self
.timeline
.get_slru_page_at_lsn(slru, segno, blknum, self.lsn)?;
assert!(img.len() == pg_constants::BLCKSZ as usize);
ensure!(img.len() == pg_constants::BLCKSZ as usize);
slru_buf.extend_from_slice(&img);
}
@@ -180,7 +181,7 @@ impl<'a> Basebackup<'a> {
) -> anyhow::Result<()> {
let relmap_img = if has_relmap_file {
let img = self.timeline.get_relmap_file(spcnode, dbnode, self.lsn)?;
assert!(img.len() == 512);
ensure!(img.len() == 512);
Some(img)
} else {
None
@@ -220,7 +221,8 @@ impl<'a> Basebackup<'a> {
{
return Ok(());
}
assert!(spcnode == pg_constants::DEFAULTTABLESPACE_OID);
// User defined tablespaces are not supported
ensure!(spcnode == pg_constants::DEFAULTTABLESPACE_OID);
// Append dir path for each database
let path = format!("base/{}", dbnode);
@@ -314,7 +316,7 @@ impl<'a> Basebackup<'a> {
let wal_file_path = format!("pg_wal/{}", wal_file_name);
let header = new_tar_header(&wal_file_path, pg_constants::WAL_SEGMENT_SIZE as u64)?;
let wal_seg = generate_wal_segment(segno, pg_control.system_identifier);
assert!(wal_seg.len() == pg_constants::WAL_SEGMENT_SIZE);
ensure!(wal_seg.len() == pg_constants::WAL_SEGMENT_SIZE);
self.ar.append(&header, &wal_seg[..])?;
Ok(())
}

View File

@@ -4,6 +4,7 @@
use anyhow::Result;
use clap::{App, Arg};
use pageserver::layered_repository::dump_layerfile_from_path;
use pageserver::page_cache;
use pageserver::virtual_file;
use std::path::PathBuf;
use zenith_utils::GIT_VERSION;
@@ -24,8 +25,9 @@ fn main() -> Result<()> {
// Basic initialization of things that don't change after startup
virtual_file::init(10);
page_cache::init(100);
dump_layerfile_from_path(&path)?;
dump_layerfile_from_path(&path, true)?;
Ok(())
}


@@ -18,16 +18,18 @@ use daemonize::Daemonize;
use pageserver::{
config::{defaults::*, PageServerConf},
http, page_cache, page_service, remote_storage, tenant_mgr, thread_mgr,
http, page_cache, page_service,
remote_storage::{self, SyncStartupData},
repository::{Repository, TimelineSyncStatusUpdate},
tenant_mgr, thread_mgr,
thread_mgr::ThreadKind,
timelines, virtual_file, LOG_FILE_NAME,
};
use zenith_utils::http::endpoint;
use zenith_utils::postgres_backend;
use zenith_utils::shutdown::exit_now;
use zenith_utils::signals::{self, Signal};
fn main() -> Result<()> {
fn main() -> anyhow::Result<()> {
zenith_metrics::set_common_metrics_prefix("pageserver");
let arg_matches = App::new("Zenith page server")
.about("Materializes WAL stream to pages and serves them to the postgres")
@@ -113,7 +115,7 @@ fn main() -> Result<()> {
// We're initializing the repo, so there's no config file yet
DEFAULT_CONFIG_FILE
.parse::<toml_edit::Document>()
.expect("could not parse built-in config file")
.context("could not parse built-in config file")?
} else {
// Supplement the CLI arguments with the config file
let cfg_file_contents = std::fs::read_to_string(&cfg_file_path)
@@ -160,8 +162,7 @@ fn main() -> Result<()> {
// Basic initialization of things that don't change after startup
virtual_file::init(conf.max_file_descriptors);
page_cache::init(conf);
page_cache::init(conf.page_cache_size);
// Create repo and exit if init was requested
if init {
@@ -210,7 +211,9 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()
// There shouldn't be any logging to stdin/stdout. Redirect it to the main log so
// that we will see any accidental manual fprintf's or backtraces.
let stdout = log_file.try_clone().unwrap();
let stdout = log_file
.try_clone()
.with_context(|| format!("Failed to clone log file '{:?}'", log_file))?;
let stderr = log_file;
let daemonize = Daemonize::new()
@@ -230,11 +233,47 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()
}
let signals = signals::install_shutdown_handlers()?;
let sync_startup = remote_storage::start_local_timeline_sync(conf)
// Initialize repositories with locally available timelines.
// Timelines that are only partially available locally (remote storage has more data than this pageserver)
// are scheduled for download and added to the repository once download is completed.
let SyncStartupData {
remote_index,
local_timeline_init_statuses,
} = remote_storage::start_local_timeline_sync(conf)
.context("Failed to set up local files sync with external storage")?;
// Initialize tenant manager.
tenant_mgr::set_timeline_states(conf, sync_startup.initial_timeline_states);
for (tenant_id, local_timeline_init_statuses) in local_timeline_init_statuses {
// initialize local tenant
let repo = tenant_mgr::load_local_repo(conf, tenant_id, &remote_index);
for (timeline_id, init_status) in local_timeline_init_statuses {
match init_status {
remote_storage::LocalTimelineInitStatus::LocallyComplete => {
debug!("timeline {} for tenant {} is locally complete, registering it in repository", tenant_id, timeline_id);
// Lets fail here loudly to be on the safe side.
// XXX: It may be a better api to actually distinguish between repository startup
// and processing of newly downloaded timelines.
repo.apply_timeline_remote_sync_status_update(
timeline_id,
TimelineSyncStatusUpdate::Downloaded,
)
.with_context(|| {
format!(
"Failed to bootstrap timeline {} for tenant {}",
timeline_id, tenant_id
)
})?
}
remote_storage::LocalTimelineInitStatus::NeedsSync => {
debug!(
"timeline {} for tenant {} needs sync, \
so skipped for adding into repository until sync is finished",
tenant_id, timeline_id
);
}
}
}
}
// initialize authentication for incoming connections
let auth = match &conf.auth_type {
@@ -255,8 +294,9 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()
None,
None,
"http_endpoint_thread",
false,
move || {
let router = http::make_router(conf, auth_cloned);
let router = http::make_router(conf, auth_cloned, remote_index);
endpoint::serve_thread_main(router, http_listener, thread_mgr::shutdown_watcher())
},
)?;
@@ -268,6 +308,7 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()
None,
None,
"libpq endpoint thread",
false,
move || page_service::thread_main(conf, auth, pageserver_listener, conf.auth_type),
)?;
@@ -285,38 +326,8 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()
"Got {}. Terminating gracefully in fast shutdown mode",
signal.name()
);
shutdown_pageserver();
pageserver::shutdown_pageserver();
unreachable!()
}
})
}
fn shutdown_pageserver() {
// Shut down the libpq endpoint thread. This prevents new connections from
// being accepted.
thread_mgr::shutdown_threads(Some(ThreadKind::LibpqEndpointListener), None, None);
// Shut down any page service threads.
postgres_backend::set_pgbackend_shutdown_requested();
thread_mgr::shutdown_threads(Some(ThreadKind::PageRequestHandler), None, None);
// Shut down all the tenants. This flushes everything to disk and kills
// the checkpoint and GC threads.
tenant_mgr::shutdown_all_tenants();
// Stop syncing with remote storage.
//
// FIXME: Does this wait for the sync thread to finish syncing what's queued up?
// Should it?
thread_mgr::shutdown_threads(Some(ThreadKind::StorageSync), None, None);
// Shut down the HTTP endpoint last, so that you can still check the server's
// status while it's shutting down.
thread_mgr::shutdown_threads(Some(ThreadKind::HttpEndpointListener), None, None);
// There should be nothing left, but let's be sure
thread_mgr::shutdown_threads(None, None, None);
info!("Shut down successfully completed");
std::process::exit(0);
}

View File

@@ -30,8 +30,13 @@ pub mod defaults {
// FIXME: This current value is very low. I would imagine something like 1 GB or 10 GB
// would be more appropriate. But a low value forces the code to be exercised more,
// which is good for now to trigger bugs.
// This parameter actually determines L0 layer file size.
pub const DEFAULT_CHECKPOINT_DISTANCE: u64 = 256 * 1024 * 1024;
// Target file size, when creating image and delta layers.
// This parameter determines L1 layer file size.
pub const DEFAULT_COMPACTION_TARGET_SIZE: u64 = 128 * 1024 * 1024;
pub const DEFAULT_COMPACTION_PERIOD: &str = "1 s";
pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
@@ -41,7 +46,7 @@ pub mod defaults {
pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s";
pub const DEFAULT_SUPERUSER: &str = "zenith_admin";
pub const DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNC: usize = 100;
pub const DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNC: usize = 10;
pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10;
pub const DEFAULT_PAGE_CACHE_SIZE: usize = 8192;
@@ -58,6 +63,7 @@ pub mod defaults {
#listen_http_addr = '{DEFAULT_HTTP_LISTEN_ADDR}'
#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
#compaction_target_size = {DEFAULT_COMPACTION_TARGET_SIZE} # in bytes
#compaction_period = '{DEFAULT_COMPACTION_PERIOD}'
#gc_period = '{DEFAULT_GC_PERIOD}'
@@ -91,8 +97,13 @@ pub struct PageServerConf {
// Flush out an inmemory layer, if it's holding WAL older than this
// This puts a backstop on how much WAL needs to be re-digested if the
// page server crashes.
// This parameter actually determines L0 layer file size.
pub checkpoint_distance: u64,
// Target file size, when creating image and delta layers.
// This parameter determines L1 layer file size.
pub compaction_target_size: u64,
// How often to check if there's compaction work to be done.
pub compaction_period: Duration,
@@ -149,6 +160,7 @@ struct PageServerConfigBuilder {
checkpoint_distance: BuilderValue<u64>,
compaction_target_size: BuilderValue<u64>,
compaction_period: BuilderValue<Duration>,
gc_horizon: BuilderValue<u64>,
@@ -183,6 +195,7 @@ impl Default for PageServerConfigBuilder {
listen_pg_addr: Set(DEFAULT_PG_LISTEN_ADDR.to_string()),
listen_http_addr: Set(DEFAULT_HTTP_LISTEN_ADDR.to_string()),
checkpoint_distance: Set(DEFAULT_CHECKPOINT_DISTANCE),
compaction_target_size: Set(DEFAULT_COMPACTION_TARGET_SIZE),
compaction_period: Set(humantime::parse_duration(DEFAULT_COMPACTION_PERIOD)
.expect("cannot parse default compaction period")),
gc_horizon: Set(DEFAULT_GC_HORIZON),
@@ -220,6 +233,10 @@ impl PageServerConfigBuilder {
self.checkpoint_distance = BuilderValue::Set(checkpoint_distance)
}
pub fn compaction_target_size(&mut self, compaction_target_size: u64) {
self.compaction_target_size = BuilderValue::Set(compaction_target_size)
}
pub fn compaction_period(&mut self, compaction_period: Duration) {
self.compaction_period = BuilderValue::Set(compaction_period)
}
@@ -290,6 +307,9 @@ impl PageServerConfigBuilder {
checkpoint_distance: self
.checkpoint_distance
.ok_or(anyhow::anyhow!("missing checkpoint_distance"))?,
compaction_target_size: self
.compaction_target_size
.ok_or(anyhow::anyhow!("missing compaction_target_size"))?,
compaction_period: self
.compaction_period
.ok_or(anyhow::anyhow!("missing compaction_period"))?,
@@ -341,10 +361,10 @@ pub struct RemoteStorageConfig {
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum RemoteStorageKind {
/// Storage based on local file system.
/// Specify a root folder to place all stored relish data into.
/// Specify a root folder to place all stored files into.
LocalFs(PathBuf),
/// AWS S3 based storage, storing all relishes into the root
/// of the S3 bucket from the config.
/// AWS S3 based storage, storing all files in the S3 bucket
/// specified by the config
AwsS3(S3Config),
}
@@ -429,6 +449,9 @@ impl PageServerConf {
"listen_pg_addr" => builder.listen_pg_addr(parse_toml_string(key, item)?),
"listen_http_addr" => builder.listen_http_addr(parse_toml_string(key, item)?),
"checkpoint_distance" => builder.checkpoint_distance(parse_toml_u64(key, item)?),
"compaction_target_size" => {
builder.compaction_target_size(parse_toml_u64(key, item)?)
}
"compaction_period" => builder.compaction_period(parse_toml_duration(key, item)?),
"gc_horizon" => builder.gc_horizon(parse_toml_u64(key, item)?),
"gc_period" => builder.gc_period(parse_toml_duration(key, item)?),
@@ -565,6 +588,7 @@ impl PageServerConf {
PageServerConf {
id: ZNodeId(0),
checkpoint_distance: defaults::DEFAULT_CHECKPOINT_DISTANCE,
compaction_target_size: 4 * 1024 * 1024,
compaction_period: Duration::from_secs(10),
gc_horizon: defaults::DEFAULT_GC_HORIZON,
gc_period: Duration::from_secs(10),
@@ -636,6 +660,7 @@ listen_http_addr = '127.0.0.1:9898'
checkpoint_distance = 111 # in bytes
compaction_target_size = 111 # in bytes
compaction_period = '111 s'
gc_period = '222 s'
@@ -673,6 +698,7 @@ id = 10
listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(),
listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(),
checkpoint_distance: defaults::DEFAULT_CHECKPOINT_DISTANCE,
compaction_target_size: defaults::DEFAULT_COMPACTION_TARGET_SIZE,
compaction_period: humantime::parse_duration(defaults::DEFAULT_COMPACTION_PERIOD)?,
gc_horizon: defaults::DEFAULT_GC_HORIZON,
gc_period: humantime::parse_duration(defaults::DEFAULT_GC_PERIOD)?,
@@ -717,6 +743,7 @@ id = 10
listen_pg_addr: "127.0.0.1:64000".to_string(),
listen_http_addr: "127.0.0.1:9898".to_string(),
checkpoint_distance: 111,
compaction_target_size: 111,
compaction_period: Duration::from_secs(111),
gc_horizon: 222,
gc_period: Duration::from_secs(222),

View File

@@ -1,122 +1,36 @@
use crate::timelines::TimelineInfo;
use anyhow::{anyhow, bail, Context};
use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr};
use zenith_utils::{
lsn::Lsn,
zid::{HexZTenantId, HexZTimelineId, ZNodeId, ZTenantId, ZTimelineId},
zid::{ZNodeId, ZTenantId, ZTimelineId},
};
#[serde_as]
#[derive(Serialize, Deserialize)]
pub struct TimelineCreateRequest {
pub new_timeline_id: Option<HexZTimelineId>,
pub ancestor_timeline_id: Option<HexZTimelineId>,
#[serde(default)]
#[serde_as(as = "Option<DisplayFromStr>")]
pub new_timeline_id: Option<ZTimelineId>,
#[serde(default)]
#[serde_as(as = "Option<DisplayFromStr>")]
pub ancestor_timeline_id: Option<ZTimelineId>,
#[serde(default)]
#[serde_as(as = "Option<DisplayFromStr>")]
pub ancestor_start_lsn: Option<Lsn>,
}
#[serde_as]
#[derive(Serialize, Deserialize)]
pub struct TenantCreateRequest {
pub new_tenant_id: Option<HexZTenantId>,
#[serde(default)]
#[serde_as(as = "Option<DisplayFromStr>")]
pub new_tenant_id: Option<ZTenantId>,
}
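// A minimal usage sketch: with `serde_as(as = "Option<DisplayFromStr>")` the ids travel
// as plain hex strings in the JSON body, and `#[serde(default)]` makes them optional.
// Assumes `serde_json` is available; the id value below is only a placeholder.
fn parse_tenant_create_example() -> serde_json::Result<()> {
    let with_id: TenantCreateRequest =
        serde_json::from_str(r#"{ "new_tenant_id": "de200bd42b49cc1814412c7e592dd6e9" }"#)?;
    assert!(with_id.new_tenant_id.is_some());
    // Omitting the field also works, thanks to #[serde(default)].
    let without_id: TenantCreateRequest = serde_json::from_str("{}")?;
    assert!(without_id.new_tenant_id.is_none());
    Ok(())
}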
#[serde_as]
#[derive(Serialize, Deserialize)]
pub struct TimelineInfoResponse {
pub kind: String,
#[serde(with = "hex")]
timeline_id: ZTimelineId,
#[serde(with = "hex")]
tenant_id: ZTenantId,
disk_consistent_lsn: String,
last_record_lsn: Option<String>,
prev_record_lsn: Option<String>,
ancestor_timeline_id: Option<HexZTimelineId>,
ancestor_lsn: Option<String>,
current_logical_size: Option<usize>,
current_logical_size_non_incremental: Option<usize>,
}
impl From<TimelineInfo> for TimelineInfoResponse {
fn from(other: TimelineInfo) -> Self {
match other {
TimelineInfo::Local {
timeline_id,
tenant_id,
last_record_lsn,
prev_record_lsn,
ancestor_timeline_id,
ancestor_lsn,
disk_consistent_lsn,
current_logical_size,
current_logical_size_non_incremental,
} => TimelineInfoResponse {
kind: "Local".to_owned(),
timeline_id,
tenant_id,
disk_consistent_lsn: disk_consistent_lsn.to_string(),
last_record_lsn: Some(last_record_lsn.to_string()),
prev_record_lsn: Some(prev_record_lsn.to_string()),
ancestor_timeline_id: ancestor_timeline_id.map(HexZTimelineId::from),
ancestor_lsn: ancestor_lsn.map(|lsn| lsn.to_string()),
current_logical_size: Some(current_logical_size),
current_logical_size_non_incremental,
},
TimelineInfo::Remote {
timeline_id,
tenant_id,
disk_consistent_lsn,
} => TimelineInfoResponse {
kind: "Remote".to_owned(),
timeline_id,
tenant_id,
disk_consistent_lsn: disk_consistent_lsn.to_string(),
last_record_lsn: None,
prev_record_lsn: None,
ancestor_timeline_id: None,
ancestor_lsn: None,
current_logical_size: None,
current_logical_size_non_incremental: None,
},
}
}
}
impl TryFrom<TimelineInfoResponse> for TimelineInfo {
type Error = anyhow::Error;
fn try_from(other: TimelineInfoResponse) -> anyhow::Result<Self> {
let parse_lsn_hex_string = |lsn_string: String| {
lsn_string
.parse::<Lsn>()
.with_context(|| format!("Failed to parse Lsn as hex string from '{}'", lsn_string))
};
let disk_consistent_lsn = parse_lsn_hex_string(other.disk_consistent_lsn)?;
Ok(match other.kind.as_str() {
"Local" => TimelineInfo::Local {
timeline_id: other.timeline_id,
tenant_id: other.tenant_id,
last_record_lsn: other
.last_record_lsn
.ok_or(anyhow!("Local timeline should have last_record_lsn"))
.and_then(parse_lsn_hex_string)?,
prev_record_lsn: other
.prev_record_lsn
.ok_or(anyhow!("Local timeline should have prev_record_lsn"))
.and_then(parse_lsn_hex_string)?,
ancestor_timeline_id: other.ancestor_timeline_id.map(ZTimelineId::from),
ancestor_lsn: other.ancestor_lsn.map(parse_lsn_hex_string).transpose()?,
disk_consistent_lsn,
current_logical_size: other.current_logical_size.ok_or(anyhow!("No "))?,
current_logical_size_non_incremental: other.current_logical_size_non_incremental,
},
"Remote" => TimelineInfo::Remote {
timeline_id: other.timeline_id,
tenant_id: other.tenant_id,
disk_consistent_lsn,
},
unknown => bail!("Unknown timeline kind: {}", unknown),
})
}
}
#[serde_as]
#[derive(Serialize, Deserialize)]
#[serde(transparent)]
pub struct TenantCreateResponse(#[serde_as(as = "DisplayFromStr")] pub ZTenantId);
#[derive(Serialize)]
pub struct StatusResponse {

View File

@@ -18,7 +18,7 @@ paths:
schema:
type: object
required:
- id
- id
properties:
id:
type: integer
@@ -122,6 +122,110 @@ paths:
application/json:
schema:
$ref: "#/components/schemas/Error"
/v1/tenant/{tenant_id}/timeline/{timeline_id}/attach:
parameters:
- name: tenant_id
in: path
required: true
schema:
type: string
format: hex
- name: timeline_id
in: path
required: true
schema:
type: string
format: hex
post:
description: Attach remote timeline
responses:
"200":
description: Timeline attaching scheduled
"400":
          description: Error when no tenant id or timeline id is found in the request path
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"404":
description: Timeline not found
content:
application/json:
schema:
$ref: "#/components/schemas/NotFoundError"
"409":
description: Timeline download is already in progress
content:
application/json:
schema:
$ref: "#/components/schemas/ConflictError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
/v1/tenant/{tenant_id}/timeline/{timeline_id}/detach:
parameters:
- name: tenant_id
in: path
required: true
schema:
type: string
format: hex
- name: timeline_id
in: path
required: true
schema:
type: string
format: hex
post:
description: Detach local timeline
responses:
"200":
description: Timeline detached
"400":
          description: Error when no tenant id or timeline id is found in the request path
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
/v1/tenant/{tenant_id}/timeline/:
parameters:
- name: tenant_id
@@ -148,6 +252,7 @@ paths:
format: hex
ancestor_start_lsn:
type: string
format: hex
responses:
"201":
description: TimelineInfo
@@ -178,7 +283,7 @@ paths:
content:
application/json:
schema:
$ref: "#/components/schemas/AlreadyExistsError"
$ref: "#/components/schemas/ConflictError"
"500":
description: Generic operation error
content:
@@ -259,7 +364,7 @@ paths:
content:
application/json:
schema:
$ref: "#/components/schemas/AlreadyExistsError"
$ref: "#/components/schemas/ConflictError"
"500":
description: Generic operation error
content:
@@ -289,7 +394,6 @@ components:
required:
- timeline_id
- tenant_id
- disk_consistent_lsn
properties:
timeline_id:
type: string
@@ -297,17 +401,44 @@ components:
tenant_id:
type: string
format: hex
local:
$ref: "#/components/schemas/LocalTimelineInfo"
remote:
$ref: "#/components/schemas/RemoteTimelineInfo"
RemoteTimelineInfo:
type: object
required:
- awaits_download
properties:
awaits_download:
type: boolean
remote_consistent_lsn:
type: string
format: hex
LocalTimelineInfo:
type: object
required:
- last_record_lsn
- disk_consistent_lsn
- timeline_state
        properties:
          last_record_lsn:
            type: string
            format: hex
          prev_record_lsn:
            type: string
            format: hex
          disk_consistent_lsn:
            type: string
            format: hex
          timeline_state:
            type: string
          ancestor_timeline_id:
            type: string
            format: hex
          ancestor_lsn:
            type: string
            format: hex
current_logical_size:
type: integer
current_logical_size_non_incremental:
@@ -327,14 +458,21 @@ components:
properties:
msg:
type: string
AlreadyExistsError:
ForbiddenError:
type: object
required:
- msg
properties:
msg:
type: string
ForbiddenError:
NotFoundError:
type: object
required:
- msg
properties:
msg:
type: string
ConflictError:
type: object
required:
- msg

View File

@@ -16,25 +16,29 @@ use zenith_utils::http::{
request::parse_request_param,
};
use zenith_utils::http::{RequestExt, RouterBuilder};
use zenith_utils::zid::{HexZTenantId, ZTimelineId};
use zenith_utils::zid::{ZTenantTimelineId, ZTimelineId};
use super::models::{
StatusResponse, TenantCreateRequest, TimelineCreateRequest, TimelineInfoResponse,
StatusResponse, TenantCreateRequest, TenantCreateResponse, TimelineCreateRequest,
};
use crate::remote_storage::{schedule_timeline_download, RemoteIndex};
use crate::repository::Repository;
use crate::repository::RepositoryTimeline;
use crate::timelines::TimelineInfo;
use crate::timelines::{LocalTimelineInfo, RemoteTimelineInfo, TimelineInfo};
use crate::{config::PageServerConf, tenant_mgr, timelines, ZTenantId};
#[derive(Debug)]
struct State {
conf: &'static PageServerConf,
auth: Option<Arc<JwtAuth>>,
remote_index: RemoteIndex,
allowlist_routes: Vec<Uri>,
}
impl State {
fn new(conf: &'static PageServerConf, auth: Option<Arc<JwtAuth>>) -> Self {
fn new(
conf: &'static PageServerConf,
auth: Option<Arc<JwtAuth>>,
remote_index: RemoteIndex,
) -> Self {
let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml"]
.iter()
.map(|v| v.parse().unwrap())
@@ -43,6 +47,7 @@ impl State {
conf,
auth,
allowlist_routes,
remote_index,
}
}
}
@@ -63,10 +68,7 @@ fn get_config(request: &Request<Body>) -> &'static PageServerConf {
// healthcheck handler
async fn status_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let config = get_config(&request);
Ok(json_response(
StatusCode::OK,
StatusResponse { id: config.id },
)?)
json_response(StatusCode::OK, StatusResponse { id: config.id })
}
async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
@@ -89,7 +91,7 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<
.map_err(ApiError::from_err)??;
Ok(match new_timeline_info {
Some(info) => json_response(StatusCode::CREATED, TimelineInfoResponse::from(info))?,
Some(info) => json_response(StatusCode::CREATED, info)?,
None => json_response(StatusCode::CONFLICT, ())?,
})
}
@@ -98,16 +100,35 @@ async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>,
let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
let include_non_incremental_logical_size = get_include_non_incremental_logical_size(&request);
let response_data: Vec<TimelineInfoResponse> = tokio::task::spawn_blocking(move || {
let local_timeline_infos = tokio::task::spawn_blocking(move || {
let _enter = info_span!("timeline_list", tenant = %tenant_id).entered();
crate::timelines::get_timelines(tenant_id, include_non_incremental_logical_size)
crate::timelines::get_local_timelines(tenant_id, include_non_incremental_logical_size)
})
.await
.map_err(ApiError::from_err)??
.into_iter()
.map(TimelineInfoResponse::from)
.collect();
Ok(json_response(StatusCode::OK, response_data)?)
.map_err(ApiError::from_err)??;
let mut response_data = Vec::with_capacity(local_timeline_infos.len());
for (timeline_id, local_timeline_info) in local_timeline_infos {
response_data.push(TimelineInfo {
tenant_id,
timeline_id,
local: Some(local_timeline_info),
remote: get_state(&request)
.remote_index
.read()
.await
.timeline_entry(&ZTenantTimelineId {
tenant_id,
timeline_id,
})
.map(|remote_entry| RemoteTimelineInfo {
remote_consistent_lsn: remote_entry.disk_consistent_lsn(),
awaits_download: remote_entry.get_awaits_download(),
}),
})
}
json_response(StatusCode::OK, response_data)
}
// Gate non incremental logical size calculation behind a flag
@@ -130,20 +151,60 @@ async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body
check_permission(&request, Some(tenant_id))?;
let timeline_id: ZTimelineId = parse_request_param(&request, "timeline_id")?;
let include_non_incremental_logical_size = get_include_non_incremental_logical_size(&request);
let response_data = tokio::task::spawn_blocking(move || {
let _enter =
info_span!("timeline_detail_handler", tenant = %tenant_id, timeline = %timeline_id)
.entered();
let include_non_incremental_logical_size =
get_include_non_incremental_logical_size(&request);
TimelineInfo::from_ids(tenant_id, timeline_id, include_non_incremental_logical_size)
let span = info_span!("timeline_detail_handler", tenant = %tenant_id, timeline = %timeline_id);
let (local_timeline_info, span) = tokio::task::spawn_blocking(move || {
let entered = span.entered();
let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?;
let local_timeline = {
repo.get_timeline(timeline_id)
.as_ref()
.map(|timeline| {
LocalTimelineInfo::from_repo_timeline(
tenant_id,
timeline_id,
timeline,
include_non_incremental_logical_size,
)
})
.transpose()?
};
Ok::<_, anyhow::Error>((local_timeline, entered.exit()))
})
.await
.map_err(ApiError::from_err)?
.map(TimelineInfoResponse::from)?;
.map_err(ApiError::from_err)??;
Ok(json_response(StatusCode::OK, response_data)?)
let remote_timeline_info = {
let remote_index_read = get_state(&request).remote_index.read().await;
remote_index_read
.timeline_entry(&ZTenantTimelineId {
tenant_id,
timeline_id,
})
.map(|remote_entry| RemoteTimelineInfo {
remote_consistent_lsn: remote_entry.disk_consistent_lsn(),
awaits_download: remote_entry.get_awaits_download(),
})
};
let _enter = span.entered();
if local_timeline_info.is_none() && remote_timeline_info.is_none() {
return Err(ApiError::NotFound(
"Timeline is not found neither locally nor remotely".to_string(),
));
}
let timeline_info = TimelineInfo {
tenant_id,
timeline_id,
local: local_timeline_info,
remote: remote_timeline_info,
};
json_response(StatusCode::OK, timeline_info)
}
async fn timeline_attach_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
@@ -151,32 +212,39 @@ async fn timeline_attach_handler(request: Request<Body>) -> Result<Response<Body
check_permission(&request, Some(tenant_id))?;
let timeline_id: ZTimelineId = parse_request_param(&request, "timeline_id")?;
let span = info_span!("timeline_attach_handler", tenant = %tenant_id, timeline = %timeline_id);
tokio::task::spawn_blocking(move || {
let _enter =
info_span!("timeline_attach_handler", tenant = %tenant_id, timeline = %timeline_id)
.entered();
let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?;
match repo.get_timeline(timeline_id)? {
RepositoryTimeline::Local { .. } => {
anyhow::bail!("Timeline with id {} is already local", timeline_id)
}
RepositoryTimeline::Remote {
id: _,
disk_consistent_lsn: _,
} => {
// FIXME (rodionov) get timeline already schedules timeline for download, and duplicate tasks can cause errors
// first should be fixed in https://github.com/zenithdb/zenith/issues/997
// TODO (rodionov) change timeline state to awaits download (encapsulate it somewhere in the repo)
// TODO (rodionov) can we safely request replication on the timeline before sync is completed? (can be implemented on top of the #997)
Ok(())
}
}
let span = tokio::task::spawn_blocking(move || {
let entered = span.entered();
if tenant_mgr::get_timeline_for_tenant_load(tenant_id, timeline_id).is_ok() {
// TODO: maybe answer with 304 Not Modified here?
anyhow::bail!("Timeline is already present locally")
};
Ok(entered.exit())
})
.await
.map_err(ApiError::from_err)??;
Ok(json_response(StatusCode::ACCEPTED, ())?)
let mut remote_index_write = get_state(&request).remote_index.write().await;
let _enter = span.entered(); // entered guard cannot live across awaits (non Send)
let index_entry = remote_index_write
.timeline_entry_mut(&ZTenantTimelineId {
tenant_id,
timeline_id,
})
.ok_or_else(|| ApiError::NotFound("Unknown remote timeline".to_string()))?;
if index_entry.get_awaits_download() {
return Err(ApiError::Conflict(
"Timeline download is already in progress".to_string(),
));
}
index_entry.set_awaits_download(true);
schedule_timeline_download(tenant_id, timeline_id);
json_response(StatusCode::ACCEPTED, ())
}
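// A hypothetical client-side call for the attach endpoint served by the handler above.
// Assumes the `reqwest` blocking client; the address and the ids in the URL are placeholders.
fn attach_timeline_example(tenant_id: &str, timeline_id: &str) -> reqwest::Result<()> {
    let url = format!(
        "http://127.0.0.1:9898/v1/tenant/{}/timeline/{}/attach",
        tenant_id, timeline_id
    );
    let resp = reqwest::blocking::Client::new().post(&url).send()?;
    // The handler replies 202 Accepted once the download is scheduled,
    // 404 for a timeline unknown to the remote index, and 409 if a download is already running.
    assert_eq!(resp.status(), reqwest::StatusCode::ACCEPTED);
    Ok(())
}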
async fn timeline_detach_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
@@ -195,7 +263,7 @@ async fn timeline_detach_handler(request: Request<Body>) -> Result<Response<Body
.await
.map_err(ApiError::from_err)??;
Ok(json_response(StatusCode::OK, ())?)
json_response(StatusCode::OK, ())
}
async fn tenant_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
@@ -209,7 +277,7 @@ async fn tenant_list_handler(request: Request<Body>) -> Result<Response<Body>, A
.await
.map_err(ApiError::from_err)??;
Ok(json_response(StatusCode::OK, response_data)?)
json_response(StatusCode::OK, response_data)
}
async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
@@ -217,19 +285,23 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
check_permission(&request, None)?;
let request_data: TenantCreateRequest = json_request(&mut request).await?;
let remote_index = get_state(&request).remote_index.clone();
let target_tenant_id = request_data
.new_tenant_id
.map(ZTenantId::from)
.unwrap_or_else(ZTenantId::generate);
let new_tenant_id = tokio::task::spawn_blocking(move || {
let _enter = info_span!("tenant_create", tenant = ?request_data.new_tenant_id).entered();
tenant_mgr::create_tenant_repository(
get_config(&request),
request_data.new_tenant_id.map(ZTenantId::from),
)
let _enter = info_span!("tenant_create", tenant = ?target_tenant_id).entered();
tenant_mgr::create_tenant_repository(get_config(&request), target_tenant_id, remote_index)
})
.await
.map_err(ApiError::from_err)??;
Ok(match new_tenant_id {
Some(id) => json_response(StatusCode::CREATED, HexZTenantId::from(id))?,
Some(id) => json_response(StatusCode::CREATED, TenantCreateResponse(id))?,
None => json_response(StatusCode::CONFLICT, ())?,
})
}
@@ -244,6 +316,7 @@ async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
pub fn make_router(
conf: &'static PageServerConf,
auth: Option<Arc<JwtAuth>>,
remote_index: RemoteIndex,
) -> RouterBuilder<hyper::Body, ApiError> {
let spec = include_bytes!("openapi_spec.yml");
let mut router = attach_openapi_ui(endpoint::make_router(), spec, "/swagger.yml", "/v1/doc");
@@ -259,7 +332,7 @@ pub fn make_router(
}
router
.data(Arc::new(State::new(conf, auth)))
.data(Arc::new(State::new(conf, auth, remote_index)))
.get("/v1/status", status_handler)
.get("/v1/tenant", tenant_list_handler)
.post("/v1/tenant", tenant_create_handler)

View File

@@ -12,7 +12,7 @@ use bytes::Bytes;
use tracing::*;
use crate::pgdatadir_mapping::*;
use crate::relish::*;
use crate::reltag::{RelTag, SlruKind};
use crate::repository::Repository;
use crate::walingest::WalIngest;
use postgres_ffi::relfile_utils::*;
@@ -35,8 +35,8 @@ pub fn import_timeline_from_postgres_datadir<R: Repository>(
) -> Result<()> {
let mut pg_control: Option<ControlFileData> = None;
let mut writer = tline.begin_record(lsn);
writer.init_empty()?;
let mut modification = tline.begin_modification(lsn);
modification.init_empty()?;
// Scan 'global'
let mut relfiles: Vec<PathBuf> = Vec::new();
@@ -46,11 +46,11 @@ pub fn import_timeline_from_postgres_datadir<R: Repository>(
None => continue,
Some("pg_control") => {
pg_control = Some(import_control_file(&mut writer, &direntry.path())?);
pg_control = Some(import_control_file(&mut modification, &direntry.path())?);
}
Some("pg_filenode.map") => {
import_relmap_file(
&mut writer,
&mut modification,
pg_constants::GLOBALTABLESPACE_OID,
0,
&direntry.path(),
@@ -62,7 +62,12 @@ pub fn import_timeline_from_postgres_datadir<R: Repository>(
}
}
for relfile in relfiles {
import_relfile(&mut writer, &relfile, pg_constants::GLOBALTABLESPACE_OID, 0)?;
import_relfile(
&mut modification,
&relfile,
pg_constants::GLOBALTABLESPACE_OID,
0,
)?;
}
// Scan 'base'. It contains database dirs, the database OID is the filename.
@@ -71,11 +76,11 @@ pub fn import_timeline_from_postgres_datadir<R: Repository>(
let direntry = direntry?;
//skip all temporary files
if direntry.file_name().to_str().unwrap() == "pgsql_tmp" {
if direntry.file_name().to_string_lossy() == "pgsql_tmp" {
continue;
}
let dboid = direntry.file_name().to_str().unwrap().parse::<u32>()?;
let dboid = direntry.file_name().to_string_lossy().parse::<u32>()?;
let mut relfiles: Vec<PathBuf> = Vec::new();
for direntry in fs::read_dir(direntry.path())? {
@@ -84,10 +89,10 @@ pub fn import_timeline_from_postgres_datadir<R: Repository>(
None => continue,
Some("PG_VERSION") => {
//writer.put_dbdir_creation(pg_constants::DEFAULTTABLESPACE_OID, dboid)?;
//modification.put_dbdir_creation(pg_constants::DEFAULTTABLESPACE_OID, dboid)?;
}
Some("pg_filenode.map") => import_relmap_file(
&mut writer,
&mut modification,
pg_constants::DEFAULTTABLESPACE_OID,
dboid,
&direntry.path(),
@@ -99,7 +104,7 @@ pub fn import_timeline_from_postgres_datadir<R: Repository>(
}
for relfile in relfiles {
import_relfile(
&mut writer,
&mut modification,
&relfile,
pg_constants::DEFAULTTABLESPACE_OID,
dboid,
@@ -108,25 +113,25 @@ pub fn import_timeline_from_postgres_datadir<R: Repository>(
}
for entry in fs::read_dir(path.join("pg_xact"))? {
let entry = entry?;
import_slru_file(&mut writer, SlruKind::Clog, &entry.path())?;
import_slru_file(&mut modification, SlruKind::Clog, &entry.path())?;
}
for entry in fs::read_dir(path.join("pg_multixact").join("members"))? {
let entry = entry?;
import_slru_file(&mut writer, SlruKind::MultiXactMembers, &entry.path())?;
import_slru_file(&mut modification, SlruKind::MultiXactMembers, &entry.path())?;
}
for entry in fs::read_dir(path.join("pg_multixact").join("offsets"))? {
let entry = entry?;
import_slru_file(&mut writer, SlruKind::MultiXactOffsets, &entry.path())?;
import_slru_file(&mut modification, SlruKind::MultiXactOffsets, &entry.path())?;
}
for entry in fs::read_dir(path.join("pg_twophase"))? {
let entry = entry?;
let xid = u32::from_str_radix(entry.path().to_str().unwrap(), 16)?;
import_twophase_file(&mut writer, xid, &entry.path())?;
let xid = u32::from_str_radix(&entry.file_name().to_string_lossy(), 16)?;
import_twophase_file(&mut modification, xid, &entry.path())?;
}
// TODO: Scan pg_tblspc
// We're done importing all the data files.
writer.finish()?;
modification.commit()?;
// We expect the Postgres server to be shut down cleanly.
let pg_control = pg_control.context("pg_control file not found")?;
@@ -154,20 +159,19 @@ pub fn import_timeline_from_postgres_datadir<R: Repository>(
// subroutine of import_timeline_from_postgres_datadir(), to load one relation file.
fn import_relfile<R: Repository>(
timeline: &mut DatadirTimelineWriter<R>,
modification: &mut DatadirModification<R>,
path: &Path,
spcoid: Oid,
dboid: Oid,
) -> Result<()> {
) -> anyhow::Result<()> {
// Does it look like a relation file?
trace!("importing rel file {}", path.display());
let p = parse_relfilename(path.file_name().unwrap().to_str().unwrap());
if let Err(e) = p {
warn!("unrecognized file in postgres datadir: {:?} ({})", path, e);
return Err(e.into());
}
let (relnode, forknum, segno) = p.unwrap();
let (relnode, forknum, segno) = parse_relfilename(&path.file_name().unwrap().to_string_lossy())
.map_err(|e| {
warn!("unrecognized file in postgres datadir: {:?} ({})", path, e);
e
})?;
let mut file = File::open(path)?;
let mut buf: [u8; 8192] = [0u8; 8192];
@@ -186,14 +190,14 @@ fn import_relfile<R: Repository>(
relnode,
forknum,
};
timeline.put_rel_creation(rel, nblocks as u32)?;
modification.put_rel_creation(rel, nblocks as u32)?;
let mut blknum: u32 = segno * (1024 * 1024 * 1024 / pg_constants::BLCKSZ as u32);
loop {
let r = file.read_exact(&mut buf);
match r {
Ok(_) => {
timeline.put_rel_page_image(rel, blknum, Bytes::copy_from_slice(&buf))?;
modification.put_rel_page_image(rel, blknum, Bytes::copy_from_slice(&buf))?;
}
// TODO: UnexpectedEof is expected
@@ -216,7 +220,7 @@ fn import_relfile<R: Repository>(
/// Import a relmapper (pg_filenode.map) file into the repository
fn import_relmap_file<R: Repository>(
timeline: &mut DatadirTimelineWriter<R>,
modification: &mut DatadirModification<R>,
spcnode: Oid,
dbnode: Oid,
path: &Path,
@@ -228,13 +232,13 @@ fn import_relmap_file<R: Repository>(
trace!("importing relmap file {}", path.display());
timeline.put_relmap_file(spcnode, dbnode, Bytes::copy_from_slice(&buffer[..]))?;
modification.put_relmap_file(spcnode, dbnode, Bytes::copy_from_slice(&buffer[..]))?;
Ok(())
}
/// Import a twophase state file (pg_twophase/<xid>) into the repository
fn import_twophase_file<R: Repository>(
timeline: &mut DatadirTimelineWriter<R>,
modification: &mut DatadirModification<R>,
xid: TransactionId,
path: &Path,
) -> Result<()> {
@@ -245,7 +249,7 @@ fn import_twophase_file<R: Repository>(
trace!("importing non-rel file {}", path.display());
timeline.put_twophase_file(xid, Bytes::copy_from_slice(&buffer[..]))?;
modification.put_twophase_file(xid, Bytes::copy_from_slice(&buffer[..]))?;
Ok(())
}
@@ -255,7 +259,7 @@ fn import_twophase_file<R: Repository>(
/// The control file is imported as is, but we also extract the checkpoint record
/// from it and store it separated.
fn import_control_file<R: Repository>(
timeline: &mut DatadirTimelineWriter<R>,
modification: &mut DatadirModification<R>,
path: &Path,
) -> Result<ControlFileData> {
let mut file = File::open(path)?;
@@ -266,12 +270,12 @@ fn import_control_file<R: Repository>(
trace!("importing control file {}", path.display());
// Import it as ControlFile
timeline.put_control_file(Bytes::copy_from_slice(&buffer[..]))?;
modification.put_control_file(Bytes::copy_from_slice(&buffer[..]))?;
// Extract the checkpoint record and import it separately.
let pg_control = ControlFileData::decode(&buffer)?;
let checkpoint_bytes = pg_control.checkPointCopy.encode();
timeline.put_checkpoint(checkpoint_bytes)?;
modification.put_checkpoint(checkpoint_bytes)?;
Ok(pg_control)
}
@@ -280,7 +284,7 @@ fn import_control_file<R: Repository>(
/// Import an SLRU segment file
///
fn import_slru_file<R: Repository>(
timeline: &mut DatadirTimelineWriter<R>,
modification: &mut DatadirModification<R>,
slru: SlruKind,
path: &Path,
) -> Result<()> {
@@ -288,7 +292,7 @@ fn import_slru_file<R: Repository>(
let mut file = File::open(path)?;
let mut buf: [u8; 8192] = [0u8; 8192];
let segno = u32::from_str_radix(path.file_name().unwrap().to_str().unwrap(), 16)?;
let segno = u32::from_str_radix(&path.file_name().unwrap().to_string_lossy(), 16)?;
let len = file.metadata().unwrap().len();
ensure!(len % pg_constants::BLCKSZ as u64 == 0); // we assume SLRU block size is the same as BLCKSZ
@@ -296,14 +300,19 @@ fn import_slru_file<R: Repository>(
ensure!(nblocks <= pg_constants::SLRU_PAGES_PER_SEGMENT as u64);
timeline.put_slru_segment_creation(slru, segno, nblocks as u32)?;
modification.put_slru_segment_creation(slru, segno, nblocks as u32)?;
let mut rpageno = 0;
loop {
let r = file.read_exact(&mut buf);
match r {
Ok(_) => {
timeline.put_slru_page_image(slru, segno, rpageno, Bytes::copy_from_slice(&buf))?;
modification.put_slru_page_image(
slru,
segno,
rpageno,
Bytes::copy_from_slice(&buf),
)?;
}
// TODO: UnexpectedEof is expected

View File

@@ -2,23 +2,20 @@ use crate::repository::{key_range_size, singleton_range, Key};
use postgres_ffi::pg_constants;
use std::ops::Range;
// Target file size, when creating image and delta layers
pub const TARGET_FILE_SIZE_BYTES: u64 = 128 * 1024 * 1024; // 128 MB
///
/// Represents a set of Keys, in a compact form.
///
#[derive(Clone, Debug)]
pub struct KeySpace {
// Contiguous ranges of keys that belong to the key space. In key order, and
// with no overlap.
/// Contiguous ranges of keys that belong to the key space. In key order,
/// and with no overlap.
pub ranges: Vec<Range<Key>>,
}
impl KeySpace {
///
/// Partition a key space into roughly chunks of roughly 'target_size' bytes in
/// each patition.
/// Partition a key space into chunks of roughly 'target_size' bytes
/// in each partition.
///
pub fn partition(&self, target_size: u64) -> KeyPartitioning {
// Assume that each value is 8k in size.

File diff suppressed because it is too large

View File

@@ -1,40 +1,42 @@
# Overview
The on-disk format is based on immutable files. The page server receives a
stream of incoming WAL, parses the WAL records to determine which pages they
apply to, and accumulates the incoming changes in memory. Every now and then,
the accumulated changes are written out to new immutable files. This process is
called checkpointing. Old versions of on-disk files that are not needed by any
timeline are removed by GC process.
The main responsibility of the Page Server is to process the incoming WAL, and
reprocess it into a format that allows reasonably quick access to any page
version.
version. The page server slices the incoming WAL per relation and page, and
packages the sliced WAL into suitably-sized "layer files". The layer files
contain all the history of the database, back to some reasonable retention
period. This system replaces the base backups and the WAL archive used in a
traditional PostgreSQL installation. The layer files are immutable, they are not
modified in-place after creation. New layer files are created for new incoming
WAL, and old layer files are removed when they are no longer needed.
The on-disk format is based on immutable files. The page server receives a
stream of incoming WAL, parses the WAL records to determine which pages they
apply to, and accumulates the incoming changes in memory. Whenever enough WAL
has been accumulated in memory, it is written out to a new immutable file. That
process accumulates "L0 delta files" on disk. When enough L0 files have been
accumulated, they are merged and re-partitioned into L1 files, and old files
that are no longer needed are removed by Garbage Collection (GC).
The incoming WAL contains updates to arbitrary pages in the system. The
distribution depends on the workload: the updates could be totally random, or
there could be a long stream of updates to a single relation when data is bulk
loaded, for example, or something in between. The page server slices the
incoming WAL per relation and page, and packages the sliced WAL into
suitably-sized "layer files". The layer files contain all the history of the
database, back to some reasonable retention period. This system replaces the
base backups and the WAL archive used in a traditional PostgreSQL
installation. The layer files are immutable, they are not modified in-place
after creation. New layer files are created for new incoming WAL, and old layer
files are removed when they are no longer needed. We could also replace layer
files with new files that contain the same information, merging small files for
example, but that hasn't been implemented yet.
loaded, for example, or something in between.
Cloud Storage Page Server Safekeeper
L1 L0 Memory WAL
Cloud Storage Page Server Safekeeper
Local disk Memory WAL
|AAAA| |AAAA|AAAA| |AA
|BBBB| |BBBB|BBBB| |
|CCCC|CCCC| <---- |CCCC|CCCC|CCCC| <--- |CC <---- ADEBAABED
|DDDD|DDDD| |DDDD|DDDD| |DDD
|EEEE| |EEEE|EEEE|EEEE| |E
+----+ +----+----+
|AAAA| |AAAA|AAAA| +---+-----+ |
+----+ +----+----+ | | | |AA
|BBBB| |BBBB|BBBB| |BB | AA | |BB
+----+----+ +----+----+ |C | BB | |CC
|CCCC|CCCC| <---- |CCCC|CCCC| <--- |D | CC | <--- |DDD <---- ADEBAABED
+----+----+ +----+----+ | | DDD | |E
|DDDD|DDDD| |DDDD|DDDD| |E | | |
+----+----+ +----+----+ | | |
|EEEE| |EEEE|EEEE| +---+-----+
+----+ +----+----+
In this illustration, WAL is received as a stream from the Safekeeper, from the
right. It is immediately captured by the page server and stored quickly in
@@ -42,39 +44,29 @@ memory. The page server memory can be thought of as a quick "reorder buffer",
used to hold the incoming WAL and reorder it so that we keep the WAL records for
the same page and relation close to each other.
From the page server memory, whenever enough WAL has been accumulated for one
relation segment, it is moved to local disk, as a new layer file, and the memory
is released.
From the page server memory, whenever enough WAL has been accumulated, it is flushed
to disk into a new L0 layer file, and the memory is released.
When enough L0 files have been accumulated, they are merged together and sliced
per key-space, producing a new set of files where each file covers a narrower
key range but a larger LSN range.
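For example, with the default checkpoint_distance of 256 MB each L0 file holds
roughly 256 MB of reordered WAL covering the whole key space; compaction then
re-cuts that data into L1 files of roughly the 128 MB compaction_target_size,
each covering a narrower slice of the key space but the combined LSN range of
the merged L0 files.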
From the local disk, the layers are further copied to Cloud Storage, for
long-term archival. After a layer has been copied to Cloud Storage, it can be
removed from local disk, although we currently keep everything locally for fast
access. If a layer is needed that isn't found locally, it is fetched from Cloud
Storage and stored in local disk.
# Terms used in layered repository
- Relish - one PostgreSQL relation or similarly treated file.
- Segment - one slice of a Relish that is stored in a LayeredTimeline.
- Layer - specific version of a relish Segment in a range of LSNs.
Storage and stored in local disk. L0 and L1 files are both uploaded to Cloud
Storage.
# Layer map
The LayerMap tracks what layers exist for all the relishes in a timeline.
LayerMap consists of two data structures:
- segs - All the layers keyed by segment tag
- open_layers - data structure that hold all open layers ordered by oldest_pending_lsn for quick access during checkpointing. oldest_pending_lsn is the LSN of the oldest page version stored in this layer.
All operations that update InMemory Layers should update both structures to keep them up-to-date.
- LayeredTimeline - implements Timeline interface.
All methods of LayeredTimeline are aware of its ancestors and return data taking them into account.
TODO: Are there any exceptions to this?
For example, timeline.list_rels(lsn) will return all segments that are visible in this timeline at the LSN,
including ones that were not modified in this timeline and thus don't have a layer in the timeline's LayerMap.
The LayerMap tracks what layers exist in a timeline.
Currently, the layer map is just a resizeable array (Vec). On a GetPage@LSN or
other read request, the layer map scans through the array to find the right layer
that contains the data for the requested page. The read-code in LayeredTimeline
is aware of the ancestor, and returns data from the ancestor timeline if it's
not found on the current timeline.
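As a rough sketch (not the actual LayerMap code), that lookup amounts to scanning
the layers' key and LSN ranges; assuming each layer is described by such a pair of
ranges, it could look like this:

struct LayerDesc {
    key_range: std::ops::Range<u128>, // stand-in for the real Key type
    lsn_range: std::ops::Range<u64>,  // stand-in for the real Lsn type
}

// Pick the layer that covers the requested key and has the newest data at or below `lsn`.
fn find_layer(layers: &[LayerDesc], key: u128, lsn: u64) -> Option<&LayerDesc> {
    layers
        .iter()
        .filter(|l| l.key_range.contains(&key) && l.lsn_range.start <= lsn)
        .max_by_key(|l| l.lsn_range.end)
}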
# Different kinds of layers
@@ -92,11 +84,11 @@ To avoid OOM errors, InMemory layers can be spilled to disk into ephemeral file.
TODO: Clarify the difference between Closed, Historic and Frozen.
There are two kinds of OnDisk layers:
- ImageLayer represents an image or a snapshot of a 10 MB relish segment, at one particular LSN.
- DeltaLayer represents a collection of WAL records or page images in a range of LSNs, for one
relish segment.
Dropped segments are always represented on disk by DeltaLayer.
- ImageLayer represents a snapshot of all the keys in a particular range, at one
particular LSN. Any keys that are not present in the ImageLayer are known not
to exist at that LSN.
- DeltaLayer represents a collection of WAL records or page images in a range of
LSNs, for a range of keys.
# Layer life cycle
@@ -109,71 +101,71 @@ layer or a delta layer, it is a valid end bound. An image layer represents
snapshot at one LSN, so end_lsn is always the snapshot LSN + 1
Every layer starts its life as an Open In-Memory layer. When the page server
receives the first WAL record for a segment, it creates a new In-Memory layer
for it, and puts it to the layer map. Later, the layer is old enough, its
contents are written to disk, as On-Disk layers. This process is called
"evicting" a layer.
receives the first WAL record for a timeline, it creates a new In-Memory layer
for it, and puts it into the layer map. Later, when the layer becomes full, its
contents are written to disk as an on-disk layer.
Layer eviction is a two-step process: First, the layer is marked as closed, so
that it no longer accepts new WAL records, and the layer map is updated
accordingly. If a new WAL record for that segment arrives after this step, a new
Open layer is created to hold it. After this first step, the layer is a Closed
Flushing a layer is a two-step process: First, the layer is marked as closed, so
that it no longer accepts new WAL records, and a new in-memory layer is created
to hold any WAL after that point. After this first step, the layer is in a Closed
InMemory state. This first step is called "freezing" the layer.
In the second step, new Delta and Image layers are created, containing all the
data in the Frozen InMemory layer. When the new layers are ready, the original
frozen layer is replaced with the new layers in the layer map, and the original
frozen layer is dropped, releasing the memory.
In the second step, a new Delta layer is created, containing all the data from
the Frozen InMemory layer. When it has been created and flushed to disk, the
original frozen layer is replaced with the new layer in the layer map, and the
original frozen layer is dropped, releasing the memory.
# Layer files (On-disk layers)
The files are called "layer files". Each layer file corresponds
to one RELISH_SEG_SIZE slice of a PostgreSQL relation fork or
non-rel file in a range of LSNs. The layer files
for each timeline are stored in the timeline's subdirectory under
The files are called "layer files". Each layer file covers a range of keys, and
a range of LSNs (or a single LSN, in case of image layers). You can think of it
as a rectangle in the two-dimensional key-LSN space. The layer files for each
timeline are stored in the timeline's subdirectory under
.zenith/tenants/<tenantid>/timelines.
There are two kind of layer file: base images, and deltas. A base
image file contains a layer of a segment as it was at one LSN,
whereas a delta file contains modifications to a segment - mostly in
the form of WAL records - in a range of LSN
There are two kinds of layer files: images, and delta layers. An image file
contains a snapshot of all keys at a particular LSN, whereas a delta file
contains modifications - mostly in the form of WAL records - in a range of
LSNs, for a range of keys.
base image file:
image file:
rel_<spcnode>_<dbnode>_<relnode>_<forknum>_<segno>_<start LSN>
000000067F000032BE0000400000000070B6-000000067F000032BE0000400000000080B6__00000000346BC568
start key end key LSN
The first parts define the key range that the layer covers. See
pgdatadir_mapping.rs for how the key space is used. The last part is the LSN.
delta file:
rel_<spcnode>_<dbnode>_<relnode>_<forknum>_<segno>_<start LSN>_<end LSN>
Delta files are named similarly, but they cover a range of LSNs:
For example:
000000067F000032BE0000400000000020B6-000000067F000032BE0000400000000030B6__000000578C6B29-0000000057A50051
start key end key start LSN end LSN
rel_1663_13990_2609_0_10_000000000169C348
rel_1663_13990_2609_0_10_000000000169C348_0000000001702000
A delta file contains all the key-values in the key-range that were updated in
the LSN range. If a key has not been modified, there is no trace of it in the
delta layer.
In addition to the relations, with "rel_*" prefix, we use the same
format for storing various smaller files from the PostgreSQL data
directory. They will use different suffixes and the naming scheme up
to the LSNs vary. The Zenith source code uses the term "relish" to
mean "a relation, or other file that's treated like a relation in the
storage" For example, a base image of a CLOG segment would be named
like this:
pg_xact_0000_0_00000000198B06B0
A delta layer file can cover a part of the overall key space, as in the previous
example, or the whole key range like this:
There is no difference in how the relation and non-relation files are
managed, except that the first part of file names is different.
Internally, the relations and non-relation files that are managed in
the versioned store are together called "relishes".
000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000578C6B29-0000000057A50051
If a file has been dropped, the last layer file for it is created
with the _DROPPED suffix, e.g.
rel_1663_13990_2609_0_10_000000000169C348_0000000001702000_DROPPED
A file that covers the whole key range is called an L0 file (Level 0), while a
file that covers only part of the key range is called an L1 file. The "level" of
a file is not explicitly stored anywhere; you can only distinguish them by
looking at the key range that a file covers. The read-path doesn't need to
treat L0 and L1 files any differently.
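A minimal sketch of that check, assuming the two hex key boundaries have already
been split out of a layer file name (this is not the pageserver's actual filename
parsing):

// By convention, a layer is L0 if its key range spans the whole key space,
// i.e. the start key is all zeros and the end key is all 'F's.
fn looks_like_l0(key_start_hex: &str, key_end_hex: &str) -> bool {
    key_start_hex.chars().all(|c| c == '0')
        && key_end_hex.chars().all(|c| c.eq_ignore_ascii_case(&'F'))
}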
## Notation used in this document
FIXME: This is somewhat obsolete, the layer files cover a key-range rather than
a particular relation nowadays. However, the description of how you find a page
version, and of how branching and GC work, is still valid.
The full path of a delta file looks like this:
.zenith/tenants/941ddc8604413b88b3d208bddf90396c/timelines/4af489b06af8eed9e27a841775616962/rel_1663_13990_2609_0_10_000000000169C348_0000000001702000

View File

@@ -0,0 +1,139 @@
//!
//! Functions for reading and writing variable-sized "blobs".
//!
//! Each blob begins with a 4-byte length, followed by the actual data.
//!
use crate::layered_repository::block_io::{BlockCursor, BlockReader};
use crate::page_cache::PAGE_SZ;
use std::cmp::min;
use std::io::Error;
/// For reading
pub trait BlobCursor {
/// Read a blob into a new buffer.
fn read_blob(&mut self, offset: u64) -> Result<Vec<u8>, std::io::Error> {
let mut buf = Vec::new();
self.read_blob_into_buf(offset, &mut buf)?;
Ok(buf)
}
/// Read blob into the given buffer. Any previous contents in the buffer
/// are overwritten.
fn read_blob_into_buf(
&mut self,
offset: u64,
dstbuf: &mut Vec<u8>,
) -> Result<(), std::io::Error>;
}
impl<'a, R> BlobCursor for BlockCursor<R>
where
R: BlockReader,
{
fn read_blob_into_buf(
&mut self,
offset: u64,
dstbuf: &mut Vec<u8>,
) -> Result<(), std::io::Error> {
let mut blknum = (offset / PAGE_SZ as u64) as u32;
let mut off = (offset % PAGE_SZ as u64) as usize;
let mut buf = self.read_blk(blknum)?;
// read length
let mut len_buf = [0u8; 4];
let thislen = PAGE_SZ - off;
if thislen < 4 {
// it is split across two pages
len_buf[..thislen].copy_from_slice(&buf[off..PAGE_SZ]);
blknum += 1;
buf = self.read_blk(blknum)?;
len_buf[thislen..].copy_from_slice(&buf[0..4 - thislen]);
off = 4 - thislen;
} else {
len_buf.copy_from_slice(&buf[off..off + 4]);
off += 4;
}
let len = u32::from_ne_bytes(len_buf) as usize;
dstbuf.clear();
// Read the payload
let mut remain = len;
while remain > 0 {
let mut page_remain = PAGE_SZ - off;
if page_remain == 0 {
// continue on next page
blknum += 1;
buf = self.read_blk(blknum)?;
off = 0;
page_remain = PAGE_SZ;
}
let this_blk_len = min(remain, page_remain);
dstbuf.extend_from_slice(&buf[off..off + this_blk_len]);
remain -= this_blk_len;
off += this_blk_len;
}
Ok(())
}
}
///
/// Abstract trait for a data sink that you can write blobs to.
///
pub trait BlobWriter {
/// Write a blob of data. Returns the offset that it was written to,
/// which can be used to retrieve the data later.
fn write_blob(&mut self, srcbuf: &[u8]) -> Result<u64, Error>;
}
///
/// An implementation of BlobWriter to write blobs to anything that
/// implements std::io::Write.
///
pub struct WriteBlobWriter<W>
where
W: std::io::Write,
{
inner: W,
offset: u64,
}
impl<W> WriteBlobWriter<W>
where
W: std::io::Write,
{
pub fn new(inner: W, start_offset: u64) -> Self {
WriteBlobWriter {
inner,
offset: start_offset,
}
}
pub fn size(&self) -> u64 {
self.offset
}
/// Access the underlying Write object.
///
/// NOTE: WriteBlobWriter keeps track of the current write offset. If
/// you write something directly to the inner Write object, it makes the
/// internally tracked 'offset' go out of sync. So don't do that.
pub fn into_inner(self) -> W {
self.inner
}
}
impl<W> BlobWriter for WriteBlobWriter<W>
where
W: std::io::Write,
{
fn write_blob(&mut self, srcbuf: &[u8]) -> Result<u64, Error> {
let offset = self.offset;
self.inner
.write_all(&((srcbuf.len()) as u32).to_ne_bytes())?;
self.inner.write_all(srcbuf)?;
self.offset += 4 + srcbuf.len() as u64;
Ok(offset)
}
}
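// A minimal usage sketch for WriteBlobWriter: write two blobs into an in-memory Vec
// and keep the returned offsets, which a BlockReader + BlobCursor could later use to
// read the blobs back. The payloads here are arbitrary examples.
fn write_blobs_example() -> Result<Vec<u8>, std::io::Error> {
    let mut writer = WriteBlobWriter::new(Vec::new(), 0);
    let first = writer.write_blob(b"some value")?;
    let second = writer.write_blob(b"another, longer value")?;
    assert_eq!(first, 0);
    assert_eq!(second, 4 + "some value".len() as u64); // 4-byte length header + payload
    Ok(writer.into_inner())
}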

View File

@@ -0,0 +1,219 @@
//!
//! Low-level Block-oriented I/O functions
//!
use crate::page_cache;
use crate::page_cache::{ReadBufResult, PAGE_SZ};
use bytes::Bytes;
use lazy_static::lazy_static;
use std::ops::{Deref, DerefMut};
use std::os::unix::fs::FileExt;
use std::sync::atomic::AtomicU64;
/// This is implemented by anything that can read 8 kB (PAGE_SZ)
/// blocks, using the page cache
///
/// There are currently two implementations: EphemeralFile, and FileBlockReader
/// below.
pub trait BlockReader {
type BlockLease: Deref<Target = [u8; PAGE_SZ]> + 'static;
///
/// Read a block. Returns a "lease" object that can be used to
/// access the contents of the page. (For the page cache, the
/// lease object represents a lock on the buffer.)
///
fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, std::io::Error>;
///
/// Create a new "cursor" for reading from this reader.
///
/// A cursor caches the last accessed page, allowing for faster
/// access if the same block is accessed repeatedly.
fn block_cursor(&self) -> BlockCursor<&Self>
where
Self: Sized,
{
BlockCursor::new(self)
}
}
impl<B> BlockReader for &B
where
B: BlockReader,
{
type BlockLease = B::BlockLease;
fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, std::io::Error> {
(*self).read_blk(blknum)
}
}
///
/// A "cursor" for efficiently reading multiple pages from a BlockReader
///
/// A cursor caches the last accessed page, allowing for faster access if the
/// same block is accessed repeatedly.
///
/// You can access the last page with `*cursor`. 'read_blk' returns 'self', so
/// that in many cases you can use a BlockCursor as a drop-in replacement for
/// the underlying BlockReader. For example:
///
/// ```no_run
/// # use pageserver::layered_repository::block_io::{BlockReader, FileBlockReader};
/// # let reader: FileBlockReader<std::fs::File> = todo!();
/// let cursor = reader.block_cursor();
/// let buf = cursor.read_blk(1);
/// // do stuff with 'buf'
/// let buf = cursor.read_blk(2);
/// // do stuff with 'buf'
/// ```
///
pub struct BlockCursor<R>
where
R: BlockReader,
{
reader: R,
/// last accessed page
cache: Option<(u32, R::BlockLease)>,
}
impl<R> BlockCursor<R>
where
R: BlockReader,
{
pub fn new(reader: R) -> Self {
BlockCursor {
reader,
cache: None,
}
}
pub fn read_blk(&mut self, blknum: u32) -> Result<&Self, std::io::Error> {
// Fast return if this is the same block as before
if let Some((cached_blk, _buf)) = &self.cache {
if *cached_blk == blknum {
return Ok(self);
}
}
// Read the block from the underlying reader, and cache it
self.cache = None;
let buf = self.reader.read_blk(blknum)?;
self.cache = Some((blknum, buf));
Ok(self)
}
}
impl<R> Deref for BlockCursor<R>
where
R: BlockReader,
{
type Target = [u8; PAGE_SZ];
fn deref(&self) -> &<Self as Deref>::Target {
&self.cache.as_ref().unwrap().1
}
}
lazy_static! {
static ref NEXT_ID: AtomicU64 = AtomicU64::new(1);
}
/// An adapter for reading a (virtual) file using the page cache.
///
/// The file is assumed to be immutable. This doesn't provide any functions
/// for modifying the file, nor for invalidating the cache if it is modified.
pub struct FileBlockReader<F> {
pub file: F,
/// Unique ID of this file, used as key in the page cache.
file_id: u64,
}
impl<F> FileBlockReader<F>
where
F: FileExt,
{
pub fn new(file: F) -> Self {
let file_id = NEXT_ID.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
FileBlockReader { file_id, file }
}
/// Read a page from the underlying file into given buffer.
fn fill_buffer(&self, buf: &mut [u8], blkno: u32) -> Result<(), std::io::Error> {
assert!(buf.len() == PAGE_SZ);
self.file.read_exact_at(buf, blkno as u64 * PAGE_SZ as u64)
}
}
impl<F> BlockReader for FileBlockReader<F>
where
F: FileExt,
{
type BlockLease = page_cache::PageReadGuard<'static>;
fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, std::io::Error> {
// Look up the right page
let cache = page_cache::get();
loop {
match cache.read_immutable_buf(self.file_id, blknum) {
ReadBufResult::Found(guard) => break Ok(guard),
ReadBufResult::NotFound(mut write_guard) => {
// Read the page from disk into the buffer
self.fill_buffer(write_guard.deref_mut(), blknum)?;
write_guard.mark_valid();
// Swap for read lock
continue;
}
};
}
}
}
///
/// Trait for block-oriented output
///
pub trait BlockWriter {
///
/// Write a page to the underlying storage.
///
/// 'buf' must be of size PAGE_SZ. Returns the block number the page was
/// written to.
///
fn write_blk(&mut self, buf: Bytes) -> Result<u32, std::io::Error>;
}
///
/// A simple in-memory buffer of blocks.
///
pub struct BlockBuf {
pub blocks: Vec<Bytes>,
}
impl BlockWriter for BlockBuf {
fn write_blk(&mut self, buf: Bytes) -> Result<u32, std::io::Error> {
assert!(buf.len() == PAGE_SZ);
let blknum = self.blocks.len();
self.blocks.push(buf);
tracing::info!("buffered block {}", blknum);
Ok(blknum as u32)
}
}
impl BlockBuf {
pub fn new() -> Self {
BlockBuf { blocks: Vec::new() }
}
pub fn size(&self) -> u64 {
(self.blocks.len() * PAGE_SZ) as u64
}
}
impl Default for BlockBuf {
fn default() -> Self {
Self::new()
}
}
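// A minimal usage sketch for BlockBuf: buffer two zeroed pages in memory and check
// the returned block numbers and the total size.
fn block_buf_example() -> Result<(), std::io::Error> {
    let mut buf = BlockBuf::new();
    let page = Bytes::from(vec![0u8; PAGE_SZ]);
    assert_eq!(buf.write_blk(page.clone())?, 0);
    assert_eq!(buf.write_blk(page)?, 1);
    assert_eq!(buf.size(), 2 * PAGE_SZ as u64);
    Ok(())
}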

View File

@@ -1,23 +1,14 @@
//! A DeltaLayer represents a collection of WAL records or page images in a range of
//! LSNs, and in a range of Keys. It is stored on a file on disk.
//!
//! Usually a delta layer only contains differences - in the form of WAL records against
//! a base LSN. However, if a segment is newly created, by creating a new relation or
//! extending an old one, there might be no base image. In that case, all the entries in
//! the delta layer must be page images or WAL records with the 'will_init' flag set, so
//! that they can be replayed without referring to an older page version. Also in some
//! circumstances, the predecessor layer might actually be another delta layer. That
//! can happen when you create a new branch in the middle of a delta layer, and the WAL
//! records on the new branch are put in a new delta layer.
//! Usually a delta layer only contains differences, in the form of WAL records
//! against a base LSN. However, if a relation is extended or a whole new relation
//! is created, there would be no base for the new pages. The entries for them
//! must be page images or WAL records with the 'will_init' flag set, so that
//! they can be replayed without referring to an older page version.
//!
//! When a delta file needs to be accessed, we slurp the 'index' metadata
//! into memory, into the DeltaLayerInner struct. See load() and unload() functions.
//! To access a particular value, we search `index` for the given key.
//! The byte offset in the index can be used to find the value in
//! VALUES_CHAPTER.
//!
//! On disk, the delta files are stored in timelines/<timelineid> directory.
//! Currently, there are no subdirectories, and each delta file is named like this:
//! The delta files are stored in timelines/<timelineid> directory. Currently,
//! there are no subdirectories, and each delta file is named like this:
//!
//! <key start>-<key end>__<start LSN>-<end LSN>
//!
@@ -25,74 +16,154 @@
//!
//! 000000067F000032BE0000400000000020B6-000000067F000032BE0000400000000030B6__000000578C6B29-0000000057A50051
//!
//!
//! A delta file is constructed using the 'bookfile' crate. Each file consists of three
//! parts: the 'index', the values, and a short summary header. They are stored as
//! separate chapters.
//! Every delta file consists of three parts: "summary", "index", and
//! "values". The summary is a fixed size header at the beginning of the file,
//! and it contains basic information about the layer, and offsets to the other
//! parts. The "index" is a B-tree, mapping from Key and LSN to an offset in the
//! "values" part. The actual page images and WAL records are stored in the
//! "values" part.
//!
use crate::config::PageServerConf;
use crate::layered_repository::blob_io::{BlobCursor, BlobWriter, WriteBlobWriter};
use crate::layered_repository::block_io::{BlockBuf, BlockCursor, BlockReader, FileBlockReader};
use crate::layered_repository::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
use crate::layered_repository::filename::{DeltaFileName, PathOrConf};
use crate::layered_repository::storage_layer::{
BlobRef, Layer, ValueReconstructResult, ValueReconstructState,
Layer, ValueReconstructResult, ValueReconstructState,
};
use crate::repository::{Key, Value};
use crate::page_cache::{PageReadGuard, PAGE_SZ};
use crate::repository::{Key, Value, KEY_SIZE};
use crate::virtual_file::VirtualFile;
use crate::walrecord;
use crate::{ZTenantId, ZTimelineId};
use anyhow::{bail, Result};
use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
use anyhow::{bail, ensure, Context, Result};
use log::*;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use zenith_utils::vec_map::VecMap;
// avoid binding to Write (conflicts with std::io::Write)
// while being able to use std::fmt::Write's methods
use std::fmt::Write as _;
use std::fs;
use std::io::BufWriter;
use std::io::Write;
use std::io::{BufWriter, Write};
use std::io::{Seek, SeekFrom};
use std::ops::Range;
use std::os::unix::fs::FileExt;
use std::path::{Path, PathBuf};
use std::sync::{RwLock, RwLockReadGuard};
use bookfile::{Book, BookWriter, ChapterWriter};
use std::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
use zenith_utils::bin_ser::BeSer;
use zenith_utils::lsn::Lsn;
// Magic constant to identify a Zenith delta file
pub const DELTA_FILE_MAGIC: u32 = 0x5A616E11;
/// Mapping from (key, lsn) -> page/WAL record
/// byte ranges in VALUES_CHAPTER
static INDEX_CHAPTER: u64 = 1;
/// Page/WAL bytes - cannot be interpreted
/// without the page versions from the INDEX_CHAPTER
static VALUES_CHAPTER: u64 = 2;
/// Contains the [`Summary`] struct
static SUMMARY_CHAPTER: u64 = 3;
///
/// Header stored in the beginning of the file
///
/// After this comes the 'values' part, starting on block 1. After that,
/// the 'index' starts at the block indicated by 'index_start_blk'
///
#[derive(Debug, Serialize, Deserialize, PartialEq, Eq)]
struct Summary {
/// Magic value to identify this as a zenith delta file. Always DELTA_FILE_MAGIC.
magic: u16,
format_version: u16,
tenantid: ZTenantId,
timelineid: ZTimelineId,
key_range: Range<Key>,
lsn_range: Range<Lsn>,
/// Block number where the 'index' part of the file begins.
index_start_blk: u32,
/// Block within the 'index', where the B-tree root page is stored
index_root_blk: u32,
}
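// Illustrative file layout (not part of the original change), following the
// description above and the writer code further down:
//
//   block 0                      : Summary header (rest of the block unused)
//   blocks 1 .. index_start_blk  : "values" (blob-encoded page images / WAL records)
//   blocks index_start_blk ..    : "index"  (B-tree nodes, one node per PAGE_SZ block)
//
// Note that 'index_root_blk' is relative to 'index_start_blk': the root node
// lives at physical block index_start_blk + index_root_blk (DiskBtreeReader
// adds 'start_blk' to every block number it reads).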
impl From<&DeltaLayer> for Summary {
fn from(layer: &DeltaLayer) -> Self {
Self {
magic: DELTA_FILE_MAGIC,
format_version: STORAGE_FORMAT_VERSION,
tenantid: layer.tenantid,
timelineid: layer.timelineid,
key_range: layer.key_range.clone(),
lsn_range: layer.lsn_range.clone(),
index_start_blk: 0,
index_root_blk: 0,
}
}
}
// Flag indicating that this version initializes the page
const WILL_INIT: u64 = 1;
///
/// Struct representing reference to BLOB in layers. Reference contains BLOB
/// offset, and for WAL records it also contains `will_init` flag. The flag
/// helps to determine the range of records that needs to be applied, without
/// reading/deserializing records themselves.
///
#[derive(Debug, Serialize, Deserialize, Copy, Clone)]
struct BlobRef(u64);
impl BlobRef {
pub fn will_init(&self) -> bool {
(self.0 & WILL_INIT) != 0
}
pub fn pos(&self) -> u64 {
self.0 >> 1
}
pub fn new(pos: u64, will_init: bool) -> BlobRef {
let mut blob_ref = pos << 1;
if will_init {
blob_ref |= WILL_INIT;
}
BlobRef(blob_ref)
}
}
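// Illustrative example (not part of the original change): the low bit of the
// packed u64 carries 'will_init' and the remaining bits carry the byte offset,
// so both survive a round-trip through BlobRef.
#[cfg(test)]
#[test]
fn blob_ref_packing_example() {
    let r = BlobRef::new(8192, true);
    assert!(r.will_init());
    assert_eq!(r.pos(), 8192);
    let r = BlobRef::new(8192, false);
    assert!(!r.will_init());
    assert_eq!(r.pos(), 8192);
}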
const DELTA_KEY_SIZE: usize = KEY_SIZE + 8;
///
/// This is the key of the B-tree index stored in the delta layer. It consists
/// of the serialized representation of a Key and LSN.
///
struct DeltaKey([u8; DELTA_KEY_SIZE]);
impl DeltaKey {
fn from_slice(buf: &[u8]) -> Self {
let mut bytes: [u8; DELTA_KEY_SIZE] = [0u8; DELTA_KEY_SIZE];
bytes.copy_from_slice(buf);
DeltaKey(bytes)
}
fn from_key_lsn(key: &Key, lsn: Lsn) -> Self {
let mut bytes: [u8; DELTA_KEY_SIZE] = [0u8; DELTA_KEY_SIZE];
key.write_to_byte_slice(&mut bytes[0..KEY_SIZE]);
bytes[KEY_SIZE..].copy_from_slice(&u64::to_be_bytes(lsn.0));
DeltaKey(bytes)
}
fn key(&self) -> Key {
Key::from_slice(&self.0)
}
fn lsn(&self) -> Lsn {
Lsn(u64::from_be_bytes(self.0[KEY_SIZE..].try_into().unwrap()))
}
fn extract_key_from_buf(buf: &[u8]) -> Key {
Key::from_slice(&buf[..KEY_SIZE])
}
fn extract_lsn_from_buf(buf: &[u8]) -> Lsn {
let mut lsn_buf = [0u8; 8];
lsn_buf.copy_from_slice(&buf[KEY_SIZE..]);
Lsn(u64::from_be_bytes(lsn_buf))
}
}
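// Illustrative example (not part of the original change): because the Key
// bytes come first and the LSN is appended big-endian, comparing raw DeltaKey
// byte strings orders entries by (key, lsn). The backwards index scan used
// when reconstructing a value relies on exactly this ordering.
#[cfg(test)]
#[test]
fn delta_key_ordering_example() {
    let key = Key::from_slice(&[0xAAu8; KEY_SIZE]);
    let a = DeltaKey::from_key_lsn(&key, Lsn(0x10));
    let b = DeltaKey::from_key_lsn(&key, Lsn(0x11));
    // Same key, higher LSN sorts later.
    assert!(a.0 < b.0);
    assert_eq!(DeltaKey::extract_key_from_buf(&b.0), key);
    assert_eq!(DeltaKey::extract_lsn_from_buf(&b.0), Lsn(0x11));
}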
///
/// DeltaLayer is the in-memory data structure associated with an
/// on-disk delta file. We keep a DeltaLayer in memory for each
@@ -113,17 +184,15 @@ pub struct DeltaLayer {
}
pub struct DeltaLayerInner {
/// If false, the 'index' has not been loaded into memory yet.
/// If false, the fields below have not been loaded into memory yet.
loaded: bool,
///
/// All versions of all pages in the layer are kept here.
/// Indexed by block number and LSN. The value is an offset into the
/// chapter where the page version is stored.
///
index: HashMap<Key, VecMap<Lsn, BlobRef>>,
// values copied from summary
index_start_blk: u32,
index_root_blk: u32,
book: Option<Book<VirtualFile>>,
/// Reader object for reading blocks from the file. (None if not loaded yet)
file: Option<FileBlockReader<VirtualFile>>,
}
impl Layer for DeltaLayer {
@@ -152,53 +221,55 @@ impl Layer for DeltaLayer {
key: Key,
lsn_range: Range<Lsn>,
reconstruct_state: &mut ValueReconstructState,
) -> Result<ValueReconstructResult> {
) -> anyhow::Result<ValueReconstructResult> {
let mut need_image = true;
assert!(self.key_range.contains(&key));
ensure!(self.key_range.contains(&key));
{
// Open the file and lock the metadata in memory
let inner = self.load()?;
let values_reader = inner
.book
.as_ref()
.expect("should be loaded in load call above")
.chapter_reader(VALUES_CHAPTER)?;
// Scan the page versions backwards, starting from `lsn`.
if let Some(vec_map) = inner.index.get(&key) {
let slice = vec_map.slice_range(lsn_range);
let mut size = 0usize;
let mut first_pos = 0u64;
for (_entry_lsn, blob_ref) in slice.iter().rev() {
size += blob_ref.size();
first_pos = blob_ref.pos();
if blob_ref.will_init() {
let file = inner.file.as_ref().unwrap();
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
inner.index_start_blk,
inner.index_root_blk,
file,
);
let search_key = DeltaKey::from_key_lsn(&key, Lsn(lsn_range.end.0 - 1));
let mut offsets: Vec<(Lsn, u64)> = Vec::new();
tree_reader.visit(&search_key.0, VisitDirection::Backwards, |key, value| {
let blob_ref = BlobRef(value);
if key[..KEY_SIZE] != search_key.0[..KEY_SIZE] {
return false;
}
let entry_lsn = DeltaKey::extract_lsn_from_buf(key);
offsets.push((entry_lsn, blob_ref.pos()));
!blob_ref.will_init()
})?;
// Ok, 'offsets' now contains the offsets of all the entries we need to read
let mut cursor = file.block_cursor();
for (entry_lsn, pos) in offsets {
let buf = cursor.read_blob(pos)?;
let val = Value::des(&buf)?;
match val {
Value::Image(img) => {
reconstruct_state.img = Some((entry_lsn, img));
need_image = false;
break;
}
}
if size != 0 {
let mut buf = vec![0u8; size];
values_reader.read_exact_at(&mut buf, first_pos)?;
for (entry_lsn, blob_ref) in slice.iter().rev() {
let offs = (blob_ref.pos() - first_pos) as usize;
let val = Value::des(&buf[offs..offs + blob_ref.size()])?;
match val {
Value::Image(img) => {
reconstruct_state.img = Some((*entry_lsn, img));
need_image = false;
break;
}
Value::WalRecord(rec) => {
let will_init = rec.will_init();
reconstruct_state.records.push((*entry_lsn, rec));
if will_init {
// This WAL record initializes the page, so no need to go further back
need_image = false;
break;
}
}
Value::WalRecord(rec) => {
let will_init = rec.will_init();
reconstruct_state.records.push((entry_lsn, rec));
if will_init {
// This WAL record initializes the page, so no need to go further back
need_image = false;
break;
}
}
}
@@ -215,7 +286,7 @@ impl Layer for DeltaLayer {
}
}
fn iter(&self) -> Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>> + '_> {
fn iter<'a>(&'a self) -> Box<dyn Iterator<Item = anyhow::Result<(Key, Lsn, Value)>> + 'a> {
let inner = self.load().unwrap();
match DeltaValueIter::new(inner) {
@@ -224,33 +295,6 @@ impl Layer for DeltaLayer {
}
}
///
/// Release most of the memory used by this layer. If it's accessed again later,
/// it will need to be loaded back.
///
fn unload(&self) -> Result<()> {
// FIXME: In debug mode, loading and unloading the index slows
// things down so much that you get timeout errors. At least
// with the test_parallel_copy test. So as an even more ad hoc
// stopgap fix for that, only unload every on average 10
// checkpoint cycles.
use rand::RngCore;
if rand::thread_rng().next_u32() > (u32::MAX / 10) {
return Ok(());
}
if let Ok(mut inner) = self.inner.try_write() {
inner.index = HashMap::default();
inner.loaded = false;
// Note: we keep the Book open. Is that a good idea? The virtual file
// machinery has its own rules for closing the file descriptor if it's not
// needed, but the Book struct uses up some memory, too.
}
Ok(())
}
fn delete(&self) -> Result<()> {
// delete underlying file
fs::remove_file(self.path())?;
@@ -266,7 +310,7 @@ impl Layer for DeltaLayer {
}
/// debugging function to print out the contents of the layer
fn dump(&self) -> Result<()> {
fn dump(&self, verbose: bool) -> Result<()> {
println!(
"----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} ----",
self.tenantid,
@@ -277,44 +321,67 @@ impl Layer for DeltaLayer {
self.lsn_range.end
);
if !verbose {
return Ok(());
}
let inner = self.load()?;
let path = self.path();
let file = std::fs::File::open(&path)?;
let book = Book::new(file)?;
let chapter = book.chapter_reader(VALUES_CHAPTER)?;
println!(
"index_start_blk: {}, root {}",
inner.index_start_blk, inner.index_root_blk
);
let mut values: Vec<(&Key, &VecMap<Lsn, BlobRef>)> = inner.index.iter().collect();
values.sort_by_key(|k| k.0);
let file = inner.file.as_ref().unwrap();
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
inner.index_start_blk,
inner.index_root_blk,
file,
);
tree_reader.dump()?;
let mut cursor = file.block_cursor();
tree_reader.visit(
&[0u8; DELTA_KEY_SIZE],
VisitDirection::Forwards,
|delta_key, val| {
let blob_ref = BlobRef(val);
let key = DeltaKey::extract_key_from_buf(delta_key);
let lsn = DeltaKey::extract_lsn_from_buf(delta_key);
for (key, versions) in values {
for (lsn, blob_ref) in versions.as_slice() {
let mut desc = String::new();
let mut buf = vec![0u8; blob_ref.size()];
chapter.read_exact_at(&mut buf, blob_ref.pos())?;
let val = Value::des(&buf);
match val {
Ok(Value::Image(img)) => {
write!(&mut desc, " img {} bytes", img.len())?;
}
Ok(Value::WalRecord(rec)) => {
let wal_desc = walrecord::describe_wal_record(&rec);
write!(
&mut desc,
" rec {} bytes will_init: {} {}",
buf.len(),
rec.will_init(),
wal_desc
)?;
match cursor.read_blob(blob_ref.pos()) {
Ok(buf) => {
let val = Value::des(&buf);
match val {
Ok(Value::Image(img)) => {
write!(&mut desc, " img {} bytes", img.len()).unwrap();
}
Ok(Value::WalRecord(rec)) => {
let wal_desc = walrecord::describe_wal_record(&rec);
write!(
&mut desc,
" rec {} bytes will_init: {} {}",
buf.len(),
rec.will_init(),
wal_desc
)
.unwrap();
}
Err(err) => {
write!(&mut desc, " DESERIALIZATION ERROR: {}", err).unwrap();
}
}
}
Err(err) => {
write!(&mut desc, " DESERIALIZATION ERROR: {}", err)?;
write!(&mut desc, " READ ERROR: {}", err).unwrap();
}
}
println!(" key {} at {}: {}", key, lsn, desc);
}
}
true
},
)?;
Ok(())
}
@@ -336,65 +403,81 @@ impl DeltaLayer {
}
///
/// Load the contents of the file into memory
/// Open the underlying file and read the metadata into memory, if it's
/// not loaded already.
///
fn load(&self) -> Result<RwLockReadGuard<DeltaLayerInner>> {
loop {
// quick exit if already loaded
{
let inner = self.inner.read().unwrap();
if inner.loaded {
return Ok(inner);
}
}
// need to upgrade to write lock
let mut inner = self.inner.write().unwrap();
let path = self.path();
// Open the file if it's not open already.
if inner.book.is_none() {
let file = VirtualFile::open(&path)?;
inner.book = Some(Book::new(file)?);
}
let book = inner.book.as_ref().unwrap();
match &self.path_or_conf {
PathOrConf::Conf(_) => {
let chapter = book.read_chapter(SUMMARY_CHAPTER)?;
let actual_summary = Summary::des(&chapter)?;
let expected_summary = Summary::from(self);
if actual_summary != expected_summary {
bail!("in-file summary does not match expected summary. actual = {:?} expected = {:?}", actual_summary, expected_summary);
}
}
PathOrConf::Path(path) => {
let actual_filename = Path::new(path.file_name().unwrap());
let expected_filename = self.filename();
if actual_filename != expected_filename {
println!(
"warning: filename does not match what is expected from in-file summary"
);
println!("actual: {:?}", actual_filename);
println!("expected: {:?}", expected_filename);
}
}
// Quick exit if already loaded
let inner = self.inner.read().unwrap();
if inner.loaded {
return Ok(inner);
}
let chapter = book.read_chapter(INDEX_CHAPTER)?;
let index = HashMap::des(&chapter)?;
// Need to open the file and load the metadata. Upgrade our lock to
// a write lock. (Or rather, release and re-lock in write mode.)
drop(inner);
let inner = self.inner.write().unwrap();
if !inner.loaded {
self.load_inner(inner)?;
} else {
// Another thread loaded it while we were not holding the lock.
}
debug!("loaded from {}", &path.display());
inner.index = index;
inner.loaded = true;
// We now have the file open and loaded. The std library RwLock has no way
// to atomically downgrade a write lock to a read lock, so we have to
// release and re-lock in read mode. (To be precise, the lock guard was moved in the
// above call to `load_inner`, so it's already been released). And
// while we do that, another thread could unload again, so we have
// to re-check and retry if that happens.
}
}
fn load_inner(&self, mut inner: RwLockWriteGuard<DeltaLayerInner>) -> Result<()> {
let path = self.path();
// Open the file if it's not open already.
if inner.file.is_none() {
let file = VirtualFile::open(&path)
.with_context(|| format!("Failed to open file '{}'", path.display()))?;
inner.file = Some(FileBlockReader::new(file));
}
let file = inner.file.as_mut().unwrap();
let summary_blk = file.read_blk(0)?;
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
match &self.path_or_conf {
PathOrConf::Conf(_) => {
let mut expected_summary = Summary::from(self);
expected_summary.index_start_blk = actual_summary.index_start_blk;
expected_summary.index_root_blk = actual_summary.index_root_blk;
if actual_summary != expected_summary {
bail!("in-file summary does not match expected summary. actual = {:?} expected = {:?}", actual_summary, expected_summary);
}
}
PathOrConf::Path(path) => {
let actual_filename = Path::new(path.file_name().unwrap());
let expected_filename = self.filename();
if actual_filename != expected_filename {
println!(
"warning: filename does not match what is expected from in-file summary"
);
println!("actual: {:?}", actual_filename);
println!("expected: {:?}", expected_filename);
}
}
}
inner.index_start_blk = actual_summary.index_start_blk;
inner.index_root_blk = actual_summary.index_root_blk;
debug!("loaded from {}", &path.display());
inner.loaded = true;
Ok(())
}
/// Create a DeltaLayer struct representing an existing file on disk.
pub fn new(
conf: &'static PageServerConf,
@@ -410,8 +493,9 @@ impl DeltaLayer {
lsn_range: filename.lsn_range.clone(),
inner: RwLock::new(DeltaLayerInner {
loaded: false,
book: None,
index: HashMap::default(),
file: None,
index_start_blk: 0,
index_root_blk: 0,
}),
}
}
@@ -419,12 +503,14 @@ impl DeltaLayer {
/// Create a DeltaLayer struct representing an existing file on disk.
///
/// This variant is only used for debugging purposes, by the 'dump_layerfile' binary.
pub fn new_for_path<F>(path: &Path, book: &Book<F>) -> Result<Self>
pub fn new_for_path<F>(path: &Path, file: F) -> Result<Self>
where
F: FileExt,
{
let chapter = book.read_chapter(SUMMARY_CHAPTER)?;
let summary = Summary::des(&chapter)?;
let mut summary_buf = Vec::new();
summary_buf.resize(PAGE_SZ, 0);
file.read_exact_at(&mut summary_buf, 0)?;
let summary = Summary::des_prefix(&summary_buf)?;
Ok(DeltaLayer {
path_or_conf: PathOrConf::Path(path.to_path_buf()),
@@ -434,8 +520,9 @@ impl DeltaLayer {
lsn_range: summary.lsn_range,
inner: RwLock::new(DeltaLayerInner {
loaded: false,
book: None,
index: HashMap::default(),
file: None,
index_start_blk: 0,
index_root_blk: 0,
}),
})
}
@@ -478,10 +565,9 @@ pub struct DeltaLayerWriter {
key_start: Key,
lsn_range: Range<Lsn>,
index: HashMap<Key, VecMap<Lsn, BlobRef>>,
tree: DiskBtreeBuilder<BlockBuf, DELTA_KEY_SIZE>,
values_writer: ChapterWriter<BufWriter<VirtualFile>>,
end_offset: u64,
blob_writer: WriteBlobWriter<BufWriter<VirtualFile>>,
}
impl DeltaLayerWriter {
@@ -495,25 +581,27 @@ impl DeltaLayerWriter {
key_start: Key,
lsn_range: Range<Lsn>,
) -> Result<DeltaLayerWriter> {
// Create the file
// Create the file initially with a temporary filename. We don't know
// the end key yet, so we cannot form the final filename yet. We will
// rename it when we're done.
//
// Note: This overwrites any existing file. There shouldn't be any.
// FIXME: throw an error instead?
let path = conf.timeline_path(&timelineid, &tenantid).join(format!(
"{}-XXX__{:016X}-{:016X}.temp",
key_start,
u64::from(lsn_range.start),
u64::from(lsn_range.end)
));
info!("temp deltalayer path {}", path.display());
let file = VirtualFile::create(&path)?;
let mut file = VirtualFile::create(&path)?;
// make room for the header block
file.seek(SeekFrom::Start(PAGE_SZ as u64))?;
let buf_writer = BufWriter::new(file);
let book = BookWriter::new(buf_writer, DELTA_FILE_MAGIC)?;
let blob_writer = WriteBlobWriter::new(buf_writer, PAGE_SZ as u64);
// Open the page-versions chapter for writing. The calls to
// `put_value` will use this to write the contents.
let values_writer = book.new_chapter(VALUES_CHAPTER);
// Initialize the b-tree index builder
let block_buf = BlockBuf::new();
let tree_builder = DiskBtreeBuilder::new(block_buf);
Ok(DeltaLayerWriter {
conf,
@@ -522,9 +610,8 @@ impl DeltaLayerWriter {
tenantid,
key_start,
lsn_range,
index: HashMap::new(),
values_writer,
end_offset: 0,
tree: tree_builder,
blob_writer,
})
}
@@ -534,65 +621,56 @@ impl DeltaLayerWriter {
/// The values must be appended in key, lsn order.
///
pub fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> Result<()> {
//info!("DELTA: key {} at {} on {}", key, lsn, self.path.display());
assert!(self.lsn_range.start <= lsn);
// Remember the offset and size metadata. The metadata is written
// to a separate chapter, in `finish`.
let off = self.end_offset;
let buf = Value::ser(&val)?;
let len = buf.len();
self.values_writer.write_all(&buf)?;
self.end_offset += len as u64;
let vec_map = self.index.entry(key).or_default();
let blob_ref = BlobRef::new(off, len, val.will_init());
let old = vec_map.append_or_update_last(lsn, blob_ref).unwrap().0;
if old.is_some() {
// We already had an entry for this LSN. That's odd..
bail!(
"Value for {} at {} already exists in delta layer being built",
key,
lsn
);
}
let off = self.blob_writer.write_blob(&Value::ser(&val)?)?;
let blob_ref = BlobRef::new(off, val.will_init());
let delta_key = DeltaKey::from_key_lsn(&key, lsn);
self.tree.append(&delta_key.0, blob_ref.0)?;
Ok(())
}
pub fn size(&self) -> u64 {
self.end_offset
self.blob_writer.size() + self.tree.borrow_writer().size()
}
///
/// Finish writing the delta layer.
///
/// 'key_end' is the exclusive end of the key range covered by this layer.
///
pub fn finish(self, key_end: Key) -> Result<DeltaLayer> {
// Close the values chapter
let book = self.values_writer.close()?;
pub fn finish(self, key_end: Key) -> anyhow::Result<DeltaLayer> {
let index_start_blk =
((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
let buf_writer = self.blob_writer.into_inner();
let mut file = buf_writer.into_inner()?;
// Write out the index
let mut chapter = book.new_chapter(INDEX_CHAPTER);
let buf = HashMap::ser(&self.index)?;
chapter.write_all(&buf)?;
let book = chapter.close()?;
let (index_root_blk, block_buf) = self.tree.finish()?;
file.seek(SeekFrom::Start(index_start_blk as u64 * PAGE_SZ as u64))?;
for buf in block_buf.blocks {
file.write_all(buf.as_ref())?;
}
let mut chapter = book.new_chapter(SUMMARY_CHAPTER);
// Fill in the summary on blk 0
let summary = Summary {
magic: DELTA_FILE_MAGIC,
format_version: STORAGE_FORMAT_VERSION,
tenantid: self.tenantid,
timelineid: self.timelineid,
key_range: self.key_start..key_end,
lsn_range: self.lsn_range.clone(),
index_start_blk,
index_root_blk,
};
Summary::ser_into(&summary, &mut chapter)?;
let book = chapter.close()?;
// This flushes the underlying 'buf_writer'.
book.close()?;
file.seek(SeekFrom::Start(0))?;
Summary::ser_into(&summary, &mut file)?;
// Note: Because we opened the file in write-only mode, we cannot
// reuse the same VirtualFile for reading later. That's why we don't
// set inner.book here. The first read will have to re-open it.
// set inner.file here. The first read will have to re-open it.
let layer = DeltaLayer {
path_or_conf: PathOrConf::Conf(self.conf),
tenantid: self.tenantid,
@@ -601,8 +679,9 @@ impl DeltaLayerWriter {
lsn_range: self.lsn_range.clone(),
inner: RwLock::new(DeltaLayerInner {
loaded: false,
index: HashMap::new(),
book: None,
file: None,
index_start_blk,
index_root_blk,
}),
};
@@ -625,22 +704,6 @@ impl DeltaLayerWriter {
Ok(layer)
}
pub fn abort(self) {
match self.values_writer.close() {
Ok(book) => {
if let Err(err) = book.close() {
error!("error while closing delta layer file: {}", err);
}
}
Err(err) => {
error!("error while closing chapter writer: {}", err);
}
}
if let Err(err) = std::fs::remove_file(self.path) {
error!("error removing unfinished delta layer file: {}", err);
}
}
}
///
@@ -650,13 +713,23 @@ impl DeltaLayerWriter {
/// That takes up quite a lot of memory. Should do this in a more streaming
/// fashion.
///
struct DeltaValueIter {
all_offsets: Vec<(Key, Lsn, BlobRef)>,
struct DeltaValueIter<'a> {
all_offsets: Vec<(DeltaKey, BlobRef)>,
next_idx: usize,
data: Vec<u8>,
reader: BlockCursor<Adapter<'a>>,
}
impl Iterator for DeltaValueIter {
struct Adapter<'a>(RwLockReadGuard<'a, DeltaLayerInner>);
impl<'a> BlockReader for Adapter<'a> {
type BlockLease = PageReadGuard<'static>;
fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, std::io::Error> {
self.0.file.as_ref().unwrap().read_blk(blknum)
}
}
impl<'a> Iterator for DeltaValueIter<'a> {
type Item = Result<(Key, Lsn, Value)>;
fn next(&mut self) -> Option<Self::Item> {
@@ -664,40 +737,43 @@ impl Iterator for DeltaValueIter {
}
}
impl DeltaValueIter {
fn new(inner: RwLockReadGuard<DeltaLayerInner>) -> Result<Self> {
let mut index: Vec<(&Key, &VecMap<Lsn, BlobRef>)> = inner.index.iter().collect();
index.sort_by_key(|x| x.0);
impl<'a> DeltaValueIter<'a> {
fn new(inner: RwLockReadGuard<'a, DeltaLayerInner>) -> Result<Self> {
let file = inner.file.as_ref().unwrap();
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
inner.index_start_blk,
inner.index_root_blk,
file,
);
let mut all_offsets: Vec<(Key, Lsn, BlobRef)> = Vec::new();
for (key, vec_map) in index.iter() {
for (lsn, blob_ref) in vec_map.as_slice().iter() {
all_offsets.push((**key, *lsn, *blob_ref));
}
}
let mut all_offsets: Vec<(DeltaKey, BlobRef)> = Vec::new();
tree_reader.visit(
&[0u8; DELTA_KEY_SIZE],
VisitDirection::Forwards,
|key, value| {
all_offsets.push((DeltaKey::from_slice(key), BlobRef(value)));
true
},
)?;
let values_reader = inner
.book
.as_ref()
.expect("should be loaded in load call above")
.chapter_reader(VALUES_CHAPTER)?;
let file_size = values_reader.len() as usize;
let mut layer = DeltaValueIter {
let iter = DeltaValueIter {
all_offsets,
next_idx: 0,
data: vec![0u8; file_size],
reader: BlockCursor::new(Adapter(inner)),
};
values_reader.read_exact_at(&mut layer.data, 0)?;
Ok(layer)
Ok(iter)
}
fn next_res(&mut self) -> Result<Option<(Key, Lsn, Value)>> {
if self.next_idx < self.all_offsets.len() {
let (key, lsn, blob_ref) = self.all_offsets[self.next_idx];
let offs = blob_ref.pos() as usize;
let size = blob_ref.size();
let val = Value::des(&self.data[offs..offs + size])?;
let (delta_key, blob_ref) = &self.all_offsets[self.next_idx];
let key = delta_key.key();
let lsn = delta_key.lsn();
let buf = self.reader.read_blob(blob_ref.pos())?;
let val = Value::des(&buf)?;
self.next_idx += 1;
Ok(Some((key, lsn, val)))
} else {

View File

@@ -0,0 +1,979 @@
//!
//! Simple on-disk B-tree implementation
//!
//! This is used as the index structure within image and delta layers
//!
//! Features:
//! - Fixed-width keys
//! - Fixed-width values (VALUE_SZ)
//! - The tree is created in a bulk operation. Insert/deletion after creation
//! is not supported
//! - page-oriented
//!
//! TODO:
//! - better errors (e.g. with thiserror?)
//! - maybe something like an Adaptive Radix Tree would be more efficient?
//! - the values stored by image and delta layers are offsets into the file,
//! and they are in monotonically increasing order. Prefix compression would
//! be very useful for them, too.
//! - An Iterator interface would be more convenient for the callers than the
//! 'visit' function
//!
use anyhow;
use byteorder::{ReadBytesExt, BE};
use bytes::{BufMut, Bytes, BytesMut};
use hex;
use std::cmp::Ordering;
use crate::layered_repository::block_io::{BlockReader, BlockWriter};
// The maximum size of a value stored in the B-tree. 5 bytes is enough currently.
pub const VALUE_SZ: usize = 5;
pub const MAX_VALUE: u64 = 0x007f_ffff_ffff;
#[allow(dead_code)]
pub const PAGE_SZ: usize = 8192;
#[derive(Clone, Copy, Debug)]
struct Value([u8; VALUE_SZ]);
impl Value {
fn from_slice(slice: &[u8]) -> Value {
let mut b = [0u8; VALUE_SZ];
b.copy_from_slice(slice);
Value(b)
}
fn from_u64(x: u64) -> Value {
assert!(x <= 0x007f_ffff_ffff);
Value([
(x >> 32) as u8,
(x >> 24) as u8,
(x >> 16) as u8,
(x >> 8) as u8,
x as u8,
])
}
fn from_blknum(x: u32) -> Value {
Value([
0x80,
(x >> 24) as u8,
(x >> 16) as u8,
(x >> 8) as u8,
x as u8,
])
}
#[allow(dead_code)]
fn is_offset(self) -> bool {
self.0[0] & 0x80 != 0
}
fn to_u64(self) -> u64 {
let b = &self.0;
(b[0] as u64) << 32
| (b[1] as u64) << 24
| (b[2] as u64) << 16
| (b[3] as u64) << 8
| b[4] as u64
}
fn to_blknum(self) -> u32 {
let b = &self.0;
assert!(b[0] == 0x80);
(b[1] as u32) << 24 | (b[2] as u32) << 16 | (b[3] as u32) << 8 | b[4] as u32
}
}
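// Illustrative example (not part of the original change): leaf values store a
// byte offset of at most MAX_VALUE (39 bits) in the 5-byte encoding, while
// downlink values store a child block number tagged with 0x80 in the first
// byte so the two kinds can be told apart.
#[cfg(test)]
#[test]
fn value_encoding_example() {
    let off = Value::from_u64(0x0012_3456_789A);
    assert_eq!(off.to_u64(), 0x0012_3456_789A);
    let downlink = Value::from_blknum(42);
    assert_eq!(downlink.0[0], 0x80);
    assert_eq!(downlink.to_blknum(), 42);
}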
/// This is the on-disk representation.
struct OnDiskNode<'a, const L: usize> {
// Fixed-width fields
num_children: u16,
level: u8,
prefix_len: u8,
suffix_len: u8,
// Variable-length fields. These are stored on-disk after the fixed-width
// fields, in this order. In the in-memory representation, these point to
// the right parts in the page buffer.
prefix: &'a [u8],
keys: &'a [u8],
values: &'a [u8],
}
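// Illustrative on-disk layout of one node page (not part of the original
// change), matching deparse()/pack() below:
//
//   offset 0 : num_children (u16, big-endian)
//   offset 2 : level        (u8; 0 = leaf, >0 = internal)
//   offset 3 : prefix_len   (u8)
//   offset 4 : suffix_len   (u8)
//   offset 5 : prefix       (prefix_len bytes)
//            : keys         (num_children * suffix_len bytes)
//            : values       (num_children * VALUE_SZ bytes)
//            : zero padding up to PAGE_SZ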
impl<'a, const L: usize> OnDiskNode<'a, L> {
///
/// Interpret a PAGE_SZ page as a node.
///
fn deparse(buf: &[u8]) -> OnDiskNode<L> {
let mut cursor = std::io::Cursor::new(buf);
let num_children = cursor.read_u16::<BE>().unwrap();
let level = cursor.read_u8().unwrap();
let prefix_len = cursor.read_u8().unwrap();
let suffix_len = cursor.read_u8().unwrap();
let mut off = cursor.position();
let prefix_off = off as usize;
off += prefix_len as u64;
let keys_off = off as usize;
let keys_len = num_children as usize * suffix_len as usize;
off += keys_len as u64;
let values_off = off as usize;
let values_len = num_children as usize * VALUE_SZ as usize;
//off += values_len as u64;
let prefix = &buf[prefix_off..prefix_off + prefix_len as usize];
let keys = &buf[keys_off..keys_off + keys_len];
let values = &buf[values_off..values_off + values_len];
OnDiskNode {
num_children,
level,
prefix_len,
suffix_len,
prefix,
keys,
values,
}
}
///
/// Read a value at 'idx'
///
fn value(&self, idx: usize) -> Value {
let value_off = idx * VALUE_SZ;
let value_slice = &self.values[value_off..value_off + VALUE_SZ];
Value::from_slice(value_slice)
}
fn binary_search(&self, search_key: &[u8; L], keybuf: &mut [u8]) -> Result<usize, usize> {
let mut size = self.num_children as usize;
let mut low = 0;
let mut high = size;
while low < high {
let mid = low + size / 2;
let key_off = mid as usize * self.suffix_len as usize;
let suffix = &self.keys[key_off..key_off + self.suffix_len as usize];
// Does this match?
keybuf[self.prefix_len as usize..].copy_from_slice(suffix);
let cmp = keybuf[..].cmp(search_key);
if cmp == Ordering::Less {
low = mid + 1;
} else if cmp == Ordering::Greater {
high = mid;
} else {
return Ok(mid);
}
size = high - low;
}
Err(low)
}
}
///
/// Public reader object, to search the tree.
///
pub struct DiskBtreeReader<R, const L: usize>
where
R: BlockReader,
{
start_blk: u32,
root_blk: u32,
reader: R,
}
#[derive(Clone, Copy, Debug, PartialEq)]
pub enum VisitDirection {
Forwards,
Backwards,
}
impl<R, const L: usize> DiskBtreeReader<R, L>
where
R: BlockReader,
{
pub fn new(start_blk: u32, root_blk: u32, reader: R) -> Self {
DiskBtreeReader {
start_blk,
root_blk,
reader,
}
}
///
/// Read the value for given key. Returns the value, or None if it doesn't exist.
///
pub fn get(&self, search_key: &[u8; L]) -> anyhow::Result<Option<u64>> {
let mut result: Option<u64> = None;
self.visit(search_key, VisitDirection::Forwards, |key, value| {
if key == search_key {
result = Some(value);
}
false
})?;
Ok(result)
}
///
/// Scan the tree, starting from 'search_key', in the given direction. 'visitor'
/// will be called for every key >= 'search_key' (or <= 'search_key', if scanning
/// backwards)
///
pub fn visit<V>(
&self,
search_key: &[u8; L],
dir: VisitDirection,
mut visitor: V,
) -> anyhow::Result<bool>
where
V: FnMut(&[u8], u64) -> bool,
{
self.search_recurse(self.root_blk, search_key, dir, &mut visitor)
}
fn search_recurse<V>(
&self,
node_blknum: u32,
search_key: &[u8; L],
dir: VisitDirection,
visitor: &mut V,
) -> anyhow::Result<bool>
where
V: FnMut(&[u8], u64) -> bool,
{
// Locate the node.
let blk = self.reader.read_blk(self.start_blk + node_blknum)?;
// Search all entries on this node
self.search_node(blk.as_ref(), search_key, dir, visitor)
}
fn search_node<V>(
&self,
node_buf: &[u8],
search_key: &[u8; L],
dir: VisitDirection,
visitor: &mut V,
) -> anyhow::Result<bool>
where
V: FnMut(&[u8], u64) -> bool,
{
let node = OnDiskNode::deparse(node_buf);
let prefix_len = node.prefix_len as usize;
let suffix_len = node.suffix_len as usize;
assert!(node.num_children > 0);
let mut keybuf = Vec::new();
keybuf.extend(node.prefix);
keybuf.resize(prefix_len + suffix_len, 0);
if dir == VisitDirection::Forwards {
// Locate the first match
let mut idx = match node.binary_search(search_key, keybuf.as_mut_slice()) {
Ok(idx) => idx,
Err(idx) => {
if node.level == 0 {
// Imagine that the node contains the following keys:
//
// 1
// 3 <-- idx
// 5
//
// If the search key is '2' and there is no exact match,
// the binary search returns the index of key '3'.
// That's fine: '3' is the first key to return.
idx
} else {
// This is an internal page, so each key represents a lower
// bound for what's in the child page. If there is no exact
// match, we have to return the *previous* entry.
//
// 1 <-- return this
// 3 <-- idx
// 5
idx.saturating_sub(1)
}
}
};
// idx points to the first match now. Keep going from there
let mut key_off = idx * suffix_len;
while idx < node.num_children as usize {
let suffix = &node.keys[key_off..key_off + suffix_len];
keybuf[prefix_len..].copy_from_slice(suffix);
let value = node.value(idx as usize);
#[allow(clippy::collapsible_if)]
if node.level == 0 {
// leaf
if !visitor(&keybuf, value.to_u64()) {
return Ok(false);
}
} else {
#[allow(clippy::collapsible_if)]
if !self.search_recurse(value.to_blknum(), search_key, dir, visitor)? {
return Ok(false);
}
}
idx += 1;
key_off += suffix_len;
}
} else {
let mut idx = match node.binary_search(search_key, keybuf.as_mut_slice()) {
Ok(idx) => {
// Exact match. That's the first entry to return, and walk
// backwards from there. (The loop below starts from 'idx -
// 1', so add one here to compensate.)
idx + 1
}
Err(idx) => {
// No exact match. The binary search returned the index of the
// first key that's > search_key. Back off by one, and walk
// backwards from there. (The loop below starts from idx - 1,
// so we don't need to subtract one here)
idx
}
};
// idx points to the first match + 1 now. Keep going from there.
let mut key_off = idx * suffix_len;
while idx > 0 {
idx -= 1;
key_off -= suffix_len;
let suffix = &node.keys[key_off..key_off + suffix_len];
keybuf[prefix_len..].copy_from_slice(suffix);
let value = node.value(idx as usize);
#[allow(clippy::collapsible_if)]
if node.level == 0 {
// leaf
if !visitor(&keybuf, value.to_u64()) {
return Ok(false);
}
} else {
#[allow(clippy::collapsible_if)]
if !self.search_recurse(value.to_blknum(), search_key, dir, visitor)? {
return Ok(false);
}
}
if idx == 0 {
break;
}
}
}
Ok(true)
}
#[allow(dead_code)]
pub fn dump(&self) -> anyhow::Result<()> {
self.dump_recurse(self.root_blk, &[], 0)
}
fn dump_recurse(&self, blknum: u32, path: &[u8], depth: usize) -> anyhow::Result<()> {
let blk = self.reader.read_blk(self.start_blk + blknum)?;
let buf: &[u8] = blk.as_ref();
let node = OnDiskNode::<L>::deparse(buf);
print!("{:indent$}", "", indent = depth * 2);
println!(
"blk #{}: path {}: prefix {}, suffix_len {}",
blknum,
hex::encode(path),
hex::encode(node.prefix),
node.suffix_len
);
let mut idx = 0;
let mut key_off = 0;
while idx < node.num_children {
let key = &node.keys[key_off..key_off + node.suffix_len as usize];
let val = node.value(idx as usize);
print!("{:indent$}", "", indent = depth * 2 + 2);
println!("{}: {}", hex::encode(key), hex::encode(val.0));
if node.level > 0 {
let child_path = [path, node.prefix].concat();
self.dump_recurse(val.to_blknum(), &child_path, depth + 1)?;
}
idx += 1;
key_off += node.suffix_len as usize;
}
Ok(())
}
}
///
/// Public builder object, for creating a new tree.
///
/// Usage: Create a builder object by calling 'new', load all the data into the
/// tree by calling 'append' for each key-value pair, and then call 'finish'
///
/// 'L' is the key length in bytes
pub struct DiskBtreeBuilder<W, const L: usize>
where
W: BlockWriter,
{
writer: W,
///
/// stack[0] is the current root page, stack.last() is the leaf.
///
stack: Vec<BuildNode<L>>,
/// Last key that was appended to the tree. Used to sanity check that append
/// is called in increasing key order.
last_key: Option<[u8; L]>,
}
impl<W, const L: usize> DiskBtreeBuilder<W, L>
where
W: BlockWriter,
{
pub fn new(writer: W) -> Self {
DiskBtreeBuilder {
writer,
last_key: None,
stack: vec![BuildNode::new(0)],
}
}
pub fn append(&mut self, key: &[u8; L], value: u64) -> Result<(), anyhow::Error> {
assert!(value <= MAX_VALUE);
if let Some(last_key) = &self.last_key {
assert!(key > last_key, "unsorted input");
}
self.last_key = Some(*key);
Ok(self.append_internal(key, Value::from_u64(value))?)
}
fn append_internal(&mut self, key: &[u8; L], value: Value) -> Result<(), std::io::Error> {
// Try to append to the current leaf buffer
let last = self.stack.last_mut().unwrap();
let level = last.level;
if last.push(key, value) {
return Ok(());
}
// It did not fit. Try to compress, and if that succeeds in making some
// room on the node, try appending to it again.
#[allow(clippy::collapsible_if)]
if last.compress() {
if last.push(key, value) {
return Ok(());
}
}
// Could not append to the current leaf. Flush it and create a new one.
self.flush_node()?;
// Replace the node we flushed with an empty one and append the new
// key to it.
let mut last = BuildNode::new(level);
if !last.push(key, value) {
panic!("could not push to new leaf node");
}
self.stack.push(last);
Ok(())
}
fn flush_node(&mut self) -> Result<(), std::io::Error> {
let last = self.stack.pop().unwrap();
let buf = last.pack();
let downlink_key = last.first_key();
let downlink_ptr = self.writer.write_blk(buf)?;
// Append the downlink to the parent
if self.stack.is_empty() {
self.stack.push(BuildNode::new(last.level + 1));
}
self.append_internal(&downlink_key, Value::from_blknum(downlink_ptr))?;
Ok(())
}
///
/// Flushes everything to disk, and returns the block number of the root page.
/// The caller must store the root block number "out-of-band", and pass it
/// to DiskBtreeReader::new() when it wants to read the tree again.
/// (In the image and delta layers, it is stored in the beginning of the file,
/// in the summary header)
///
pub fn finish(mut self) -> Result<(u32, W), std::io::Error> {
// flush all levels, except the root.
while self.stack.len() > 1 {
self.flush_node()?;
}
let root = self.stack.first().unwrap();
let buf = root.pack();
let root_blknum = self.writer.write_blk(buf)?;
Ok((root_blknum, self.writer))
}
pub fn borrow_writer(&self) -> &W {
&self.writer
}
}
///
/// BuildNode represents an incomplete page that we are appending to.
///
#[derive(Clone, Debug)]
struct BuildNode<const L: usize> {
num_children: u16,
level: u8,
prefix: Vec<u8>,
suffix_len: usize,
keys: Vec<u8>,
values: Vec<u8>,
size: usize, // physical size of this node, if it was written to disk like this
}
const NODE_SIZE: usize = PAGE_SZ;
const NODE_HDR_SIZE: usize = 2 + 1 + 1 + 1;
impl<const L: usize> BuildNode<L> {
fn new(level: u8) -> Self {
BuildNode {
num_children: 0,
level,
prefix: Vec::new(),
suffix_len: 0,
keys: Vec::new(),
values: Vec::new(),
size: NODE_HDR_SIZE,
}
}
/// Try to append a key-value pair to this node. Returns 'true' on
/// success, 'false' if the page was full or the key was
/// incompatible with the prefix of the existing keys.
fn push(&mut self, key: &[u8; L], value: Value) -> bool {
// If we have already performed prefix-compression on the page,
// check that the incoming key has the same prefix.
if self.num_children > 0 {
// does the prefix allow it?
if !key.starts_with(&self.prefix) {
return false;
}
} else {
self.suffix_len = key.len();
}
// Is the node too full?
if self.size + self.suffix_len + VALUE_SZ >= NODE_SIZE {
return false;
}
// All clear
self.num_children += 1;
self.keys.extend(&key[self.prefix.len()..]);
self.values.extend(value.0);
assert!(self.keys.len() == self.num_children as usize * self.suffix_len as usize);
assert!(self.values.len() == self.num_children as usize * VALUE_SZ);
self.size += self.suffix_len + VALUE_SZ;
true
}
///
/// Perform prefix-compression.
///
/// Returns 'true' on success, 'false' if no compression was possible.
///
fn compress(&mut self) -> bool {
let first_suffix = self.first_suffix();
let last_suffix = self.last_suffix();
// Find the common prefix among all keys
let mut prefix_len = 0;
while prefix_len < self.suffix_len {
if first_suffix[prefix_len] != last_suffix[prefix_len] {
break;
}
prefix_len += 1;
}
if prefix_len == 0 {
return false;
}
// Can compress. Rewrite the keys without the common prefix.
self.prefix.extend(&self.keys[..prefix_len]);
let mut new_keys = Vec::new();
let mut key_off = 0;
while key_off < self.keys.len() {
let next_key_off = key_off + self.suffix_len;
new_keys.extend(&self.keys[key_off + prefix_len..next_key_off]);
key_off = next_key_off;
}
self.keys = new_keys;
self.suffix_len -= prefix_len;
self.size -= prefix_len * self.num_children as usize;
self.size += prefix_len;
assert!(self.keys.len() == self.num_children as usize * self.suffix_len as usize);
assert!(self.values.len() == self.num_children as usize * VALUE_SZ);
true
}
///
/// Serialize the node to on-disk format.
///
fn pack(&self) -> Bytes {
assert!(self.keys.len() == self.num_children as usize * self.suffix_len as usize);
assert!(self.values.len() == self.num_children as usize * VALUE_SZ);
assert!(self.num_children > 0);
let mut buf = BytesMut::new();
buf.put_u16(self.num_children);
buf.put_u8(self.level);
buf.put_u8(self.prefix.len() as u8);
buf.put_u8(self.suffix_len as u8);
buf.put(&self.prefix[..]);
buf.put(&self.keys[..]);
buf.put(&self.values[..]);
assert!(buf.len() == self.size);
assert!(buf.len() <= PAGE_SZ);
buf.resize(PAGE_SZ, 0);
buf.freeze()
}
fn first_suffix(&self) -> &[u8] {
&self.keys[..self.suffix_len]
}
fn last_suffix(&self) -> &[u8] {
&self.keys[self.keys.len() - self.suffix_len..]
}
/// Return the full first key of the page, including the prefix
fn first_key(&self) -> [u8; L] {
let mut key = [0u8; L];
key[..self.prefix.len()].copy_from_slice(&self.prefix);
key[self.prefix.len()..].copy_from_slice(self.first_suffix());
key
}
}
#[cfg(test)]
mod tests {
use super::*;
use rand::Rng;
use std::collections::BTreeMap;
use std::sync::atomic::{AtomicUsize, Ordering};
#[derive(Clone, Default)]
struct TestDisk {
blocks: Vec<Bytes>,
}
impl TestDisk {
fn new() -> Self {
Self::default()
}
}
impl BlockReader for TestDisk {
type BlockLease = std::rc::Rc<[u8; PAGE_SZ]>;
fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, std::io::Error> {
let mut buf = [0u8; PAGE_SZ];
buf.copy_from_slice(&self.blocks[blknum as usize]);
Ok(std::rc::Rc::new(buf))
}
}
impl BlockWriter for &mut TestDisk {
fn write_blk(&mut self, buf: Bytes) -> Result<u32, std::io::Error> {
let blknum = self.blocks.len();
self.blocks.push(buf);
Ok(blknum as u32)
}
}
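    // Illustrative example (not part of the original change): a worked view of
    // prefix compression in BuildNode. The keys "xaaaaa", "xaaaba" and "xaaaca"
    // share the prefix "xaaa"; after compress() only the 2-byte suffixes are
    // kept, and further keys must start with the same prefix (see push()).
    #[test]
    fn prefix_compression_example() {
        let mut node: BuildNode<6> = BuildNode::new(0);
        assert!(node.push(b"xaaaaa", Value::from_u64(1)));
        assert!(node.push(b"xaaaba", Value::from_u64(2)));
        assert!(node.push(b"xaaaca", Value::from_u64(3)));
        assert!(node.compress());
        assert_eq!(node.prefix, b"xaaa".to_vec());
        assert_eq!(node.suffix_len, 2);
        assert_eq!(node.keys, b"aabaca".to_vec());
    }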
#[test]
fn basic() -> anyhow::Result<()> {
let mut disk = TestDisk::new();
let mut writer = DiskBtreeBuilder::<_, 6>::new(&mut disk);
let all_keys: Vec<&[u8; 6]> = vec![
b"xaaaaa", b"xaaaba", b"xaaaca", b"xabaaa", b"xababa", b"xabaca", b"xabada", b"xabadb",
];
let all_data: Vec<(&[u8; 6], u64)> = all_keys
.iter()
.enumerate()
.map(|(idx, key)| (*key, idx as u64))
.collect();
for (key, val) in all_data.iter() {
writer.append(key, *val)?;
}
let (root_offset, _writer) = writer.finish()?;
let reader = DiskBtreeReader::new(0, root_offset, disk);
reader.dump()?;
// Test the `get` function on all the keys.
for (key, val) in all_data.iter() {
assert_eq!(reader.get(key)?, Some(*val));
}
// And on some keys that don't exist
assert_eq!(reader.get(b"aaaaaa")?, None);
assert_eq!(reader.get(b"zzzzzz")?, None);
assert_eq!(reader.get(b"xaaabx")?, None);
// Test search with `visit` function
let search_key = b"xabaaa";
let expected: Vec<(Vec<u8>, u64)> = all_data
.iter()
.filter(|(key, _value)| key[..] >= search_key[..])
.map(|(key, value)| (key.to_vec(), *value))
.collect();
let mut data = Vec::new();
reader.visit(search_key, VisitDirection::Forwards, |key, value| {
data.push((key.to_vec(), value));
true
})?;
assert_eq!(data, expected);
// Test a backwards scan
let mut expected: Vec<(Vec<u8>, u64)> = all_data
.iter()
.filter(|(key, _value)| key[..] <= search_key[..])
.map(|(key, value)| (key.to_vec(), *value))
.collect();
expected.reverse();
let mut data = Vec::new();
reader.visit(search_key, VisitDirection::Backwards, |key, value| {
data.push((key.to_vec(), value));
true
})?;
assert_eq!(data, expected);
// Backward scan where nothing matches
reader.visit(b"aaaaaa", VisitDirection::Backwards, |key, value| {
panic!("found unexpected key {}: {}", hex::encode(key), value);
})?;
// Full scan
let expected: Vec<(Vec<u8>, u64)> = all_data
.iter()
.map(|(key, value)| (key.to_vec(), *value))
.collect();
let mut data = Vec::new();
reader.visit(&[0u8; 6], VisitDirection::Forwards, |key, value| {
data.push((key.to_vec(), value));
true
})?;
assert_eq!(data, expected);
Ok(())
}
#[test]
fn lots_of_keys() -> anyhow::Result<()> {
let mut disk = TestDisk::new();
let mut writer = DiskBtreeBuilder::<_, 8>::new(&mut disk);
const NUM_KEYS: u64 = 1000;
let mut all_data: BTreeMap<u64, u64> = BTreeMap::new();
for idx in 0..NUM_KEYS {
let key_int: u64 = 1 + idx * 2;
let key = u64::to_be_bytes(key_int);
writer.append(&key, idx)?;
all_data.insert(key_int, idx);
}
let (root_offset, _writer) = writer.finish()?;
let reader = DiskBtreeReader::new(0, root_offset, disk);
reader.dump()?;
use std::sync::Mutex;
let result = Mutex::new(Vec::new());
let limit: AtomicUsize = AtomicUsize::new(10);
let take_ten = |key: &[u8], value: u64| {
let mut keybuf = [0u8; 8];
keybuf.copy_from_slice(key);
let key_int = u64::from_be_bytes(keybuf);
let mut result = result.lock().unwrap();
result.push((key_int, value));
// keep going until we have 10 matches
result.len() < limit.load(Ordering::Relaxed)
};
for search_key_int in 0..(NUM_KEYS * 2 + 10) {
let search_key = u64::to_be_bytes(search_key_int);
assert_eq!(
reader.get(&search_key)?,
all_data.get(&search_key_int).cloned()
);
// Test a forward scan starting with this key
result.lock().unwrap().clear();
reader.visit(&search_key, VisitDirection::Forwards, take_ten)?;
let expected = all_data
.range(search_key_int..)
.take(10)
.map(|(&key, &val)| (key, val))
.collect::<Vec<(u64, u64)>>();
assert_eq!(*result.lock().unwrap(), expected);
// And a backwards scan
result.lock().unwrap().clear();
reader.visit(&search_key, VisitDirection::Backwards, take_ten)?;
let expected = all_data
.range(..=search_key_int)
.rev()
.take(10)
.map(|(&key, &val)| (key, val))
.collect::<Vec<(u64, u64)>>();
assert_eq!(*result.lock().unwrap(), expected);
}
// full scan
let search_key = u64::to_be_bytes(0);
limit.store(usize::MAX, Ordering::Relaxed);
result.lock().unwrap().clear();
reader.visit(&search_key, VisitDirection::Forwards, take_ten)?;
let expected = all_data
.iter()
.map(|(&key, &val)| (key, val))
.collect::<Vec<(u64, u64)>>();
assert_eq!(*result.lock().unwrap(), expected);
// full scan
let search_key = u64::to_be_bytes(u64::MAX);
limit.store(usize::MAX, Ordering::Relaxed);
result.lock().unwrap().clear();
reader.visit(&search_key, VisitDirection::Backwards, take_ten)?;
let expected = all_data
.iter()
.rev()
.map(|(&key, &val)| (key, val))
.collect::<Vec<(u64, u64)>>();
assert_eq!(*result.lock().unwrap(), expected);
Ok(())
}
#[test]
fn random_data() -> anyhow::Result<()> {
// Generate random keys with exponential distribution, to
// exercise the prefix compression
const NUM_KEYS: usize = 100000;
let mut all_data: BTreeMap<u128, u64> = BTreeMap::new();
for idx in 0..NUM_KEYS {
let u: f64 = rand::thread_rng().gen_range(0.0..1.0);
let t = -(f64::ln(u));
let key_int = (t * 1000000.0) as u128;
all_data.insert(key_int as u128, idx as u64);
}
// Build a tree from it
let mut disk = TestDisk::new();
let mut writer = DiskBtreeBuilder::<_, 16>::new(&mut disk);
for (&key, &val) in all_data.iter() {
writer.append(&u128::to_be_bytes(key), val)?;
}
let (root_offset, _writer) = writer.finish()?;
let reader = DiskBtreeReader::new(0, root_offset, disk);
// Test get() operation on all the keys
for (&key, &val) in all_data.iter() {
let search_key = u128::to_be_bytes(key);
assert_eq!(reader.get(&search_key)?, Some(val));
}
// Test get() operations on random keys, most of which will not exist
for _ in 0..100000 {
let key_int = rand::thread_rng().gen::<u128>();
let search_key = u128::to_be_bytes(key_int);
assert!(reader.get(&search_key)? == all_data.get(&key_int).cloned());
}
// Test boundary cases
assert!(reader.get(&u128::to_be_bytes(u128::MIN))? == all_data.get(&u128::MIN).cloned());
assert!(reader.get(&u128::to_be_bytes(u128::MAX))? == all_data.get(&u128::MAX).cloned());
Ok(())
}
#[test]
#[should_panic(expected = "unsorted input")]
fn unsorted_input() {
let mut disk = TestDisk::new();
let mut writer = DiskBtreeBuilder::<_, 2>::new(&mut disk);
let _ = writer.append(b"ba", 1);
let _ = writer.append(b"bb", 2);
let _ = writer.append(b"aa", 3);
}
///
/// This test contains a particular data set, see disk_btree_test_data.rs
///
#[test]
fn particular_data() -> anyhow::Result<()> {
// Build a tree from it
let mut disk = TestDisk::new();
let mut writer = DiskBtreeBuilder::<_, 26>::new(&mut disk);
for (key, val) in disk_btree_test_data::TEST_DATA {
writer.append(&key, val)?;
}
let (root_offset, writer) = writer.finish()?;
println!("SIZE: {} blocks", writer.blocks.len());
let reader = DiskBtreeReader::new(0, root_offset, disk);
// Test get() operation on all the keys
for (key, val) in disk_btree_test_data::TEST_DATA {
assert_eq!(reader.get(&key)?, Some(val));
}
// Test full scan
let mut count = 0;
reader.visit(&[0u8; 26], VisitDirection::Forwards, |_key, _value| {
count += 1;
true
})?;
assert_eq!(count, disk_btree_test_data::TEST_DATA.len());
reader.dump()?;
Ok(())
}
}
#[cfg(test)]
#[path = "disk_btree_test_data.rs"]
mod disk_btree_test_data;

File diff suppressed because it is too large

View File

@@ -2,6 +2,8 @@
//! used to keep in-memory layers spilled on disk.
use crate::config::PageServerConf;
use crate::layered_repository::blob_io::BlobWriter;
use crate::layered_repository::block_io::BlockReader;
use crate::page_cache;
use crate::page_cache::PAGE_SZ;
use crate::page_cache::{ReadBufResult, WriteBufResult};
@@ -10,7 +12,7 @@ use lazy_static::lazy_static;
use std::cmp::min;
use std::collections::HashMap;
use std::fs::OpenOptions;
use std::io::{Error, ErrorKind, Seek, SeekFrom, Write};
use std::io::{Error, ErrorKind};
use std::ops::DerefMut;
use std::path::PathBuf;
use std::sync::{Arc, RwLock};
@@ -41,7 +43,7 @@ pub struct EphemeralFile {
_timelineid: ZTimelineId,
file: Arc<VirtualFile>,
pos: u64,
size: u64,
}
impl EphemeralFile {
@@ -70,11 +72,11 @@ impl EphemeralFile {
_tenantid: tenantid,
_timelineid: timelineid,
file: file_rc,
pos: 0,
size: 0,
})
}
pub fn fill_buffer(&self, buf: &mut [u8], blkno: u32) -> Result<(), Error> {
fn fill_buffer(&self, buf: &mut [u8], blkno: u32) -> Result<(), Error> {
let mut off = 0;
while off < PAGE_SZ {
let n = self
@@ -93,6 +95,26 @@ impl EphemeralFile {
}
Ok(())
}
fn get_buf_for_write(&self, blkno: u32) -> Result<page_cache::PageWriteGuard, Error> {
// Look up the right page
let cache = page_cache::get();
let mut write_guard = match cache.write_ephemeral_buf(self.file_id, blkno) {
WriteBufResult::Found(guard) => guard,
WriteBufResult::NotFound(mut guard) => {
// Read the page from disk into the buffer
// TODO: if we're overwriting the whole page, no need to read it in first
self.fill_buffer(guard.deref_mut(), blkno)?;
guard.mark_valid();
// And then fall through to modify it.
guard
}
};
write_guard.mark_dirty();
Ok(write_guard)
}
}
/// Does the given filename look like an ephemeral file?
@@ -167,48 +189,49 @@ impl FileExt for EphemeralFile {
}
}
impl Write for EphemeralFile {
fn write(&mut self, buf: &[u8]) -> Result<usize, Error> {
let n = self.write_at(buf, self.pos)?;
self.pos += n as u64;
Ok(n)
}
impl BlobWriter for EphemeralFile {
fn write_blob(&mut self, srcbuf: &[u8]) -> Result<u64, Error> {
let pos = self.size;
fn flush(&mut self) -> Result<(), std::io::Error> {
// we don't need to flush data:
// * we either write input bytes or not, not keeping any intermediate data buffered
// * rust unix file `flush` impl does not flush things either, returning `Ok(())`
Ok(())
}
}
let mut blknum = (self.size / PAGE_SZ as u64) as u32;
let mut off = (pos % PAGE_SZ as u64) as usize;
impl Seek for EphemeralFile {
fn seek(&mut self, pos: SeekFrom) -> Result<u64, Error> {
match pos {
SeekFrom::Start(offset) => {
self.pos = offset;
}
SeekFrom::End(_offset) => {
return Err(Error::new(
ErrorKind::Other,
"SeekFrom::End not supported by EphemeralFile",
));
}
SeekFrom::Current(offset) => {
let pos = self.pos as i128 + offset as i128;
if pos < 0 {
return Err(Error::new(
ErrorKind::InvalidInput,
"offset would be negative",
));
}
if pos > u64::MAX as i128 {
return Err(Error::new(ErrorKind::InvalidInput, "offset overflow"));
}
self.pos = pos as u64;
}
let mut buf = self.get_buf_for_write(blknum)?;
// Write the length field
let len_buf = u32::to_ne_bytes(srcbuf.len() as u32);
let thislen = PAGE_SZ - off;
if thislen < 4 {
// it needs to be split across pages
buf[off..(off + thislen)].copy_from_slice(&len_buf[..thislen]);
blknum += 1;
buf = self.get_buf_for_write(blknum)?;
buf[0..4 - thislen].copy_from_slice(&len_buf[thislen..]);
off = 4 - thislen;
} else {
buf[off..off + 4].copy_from_slice(&len_buf);
off += 4;
}
Ok(self.pos)
// Write the payload
let mut buf_remain = srcbuf;
while !buf_remain.is_empty() {
let mut page_remain = PAGE_SZ - off;
if page_remain == 0 {
blknum += 1;
buf = self.get_buf_for_write(blknum)?;
off = 0;
page_remain = PAGE_SZ;
}
let this_blk_len = min(page_remain, buf_remain.len());
buf[off..(off + this_blk_len)].copy_from_slice(&buf_remain[..this_blk_len]);
off += this_blk_len;
buf_remain = &buf_remain[this_blk_len..];
}
drop(buf);
self.size += 4 + srcbuf.len() as u64;
Ok(pos)
}
}
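// Illustrative layout of a single blob written by write_blob() above (not part
// of the original change): a 4-byte native-endian length header followed by
// the payload. Both may straddle PAGE_SZ page boundaries; the pages are
// modified through the page cache via get_buf_for_write().
//
//   [len: u32, native-endian][payload: len bytes]
//
// The returned value is the byte offset of the length header within the file;
// BlobCursor::read_blob() takes that offset to read the blob back.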
@@ -239,11 +262,34 @@ pub fn writeback(file_id: u64, blkno: u32, buf: &[u8]) -> Result<(), std::io::Er
}
}
impl BlockReader for EphemeralFile {
type BlockLease = page_cache::PageReadGuard<'static>;
fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, std::io::Error> {
// Look up the right page
let cache = page_cache::get();
loop {
match cache.read_ephemeral_buf(self.file_id, blknum) {
ReadBufResult::Found(guard) => return Ok(guard),
ReadBufResult::NotFound(mut write_guard) => {
// Read the page from disk into the buffer
self.fill_buffer(write_guard.deref_mut(), blknum)?;
write_guard.mark_valid();
// Swap for read lock
continue;
}
};
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use rand::seq::SliceRandom;
use rand::thread_rng;
use crate::layered_repository::blob_io::{BlobCursor, BlobWriter};
use crate::layered_repository::block_io::BlockCursor;
use rand::{seq::SliceRandom, thread_rng, RngCore};
use std::fs;
use std::str::FromStr;
@@ -281,19 +327,19 @@ mod tests {
fn test_ephemeral_files() -> Result<(), Error> {
let (conf, tenantid, timelineid) = repo_harness("ephemeral_files")?;
let mut file_a = EphemeralFile::create(conf, tenantid, timelineid)?;
let file_a = EphemeralFile::create(conf, tenantid, timelineid)?;
file_a.write_all(b"foo")?;
file_a.write_all_at(b"foo", 0)?;
assert_eq!("foo", read_string(&file_a, 0, 20)?);
file_a.write_all(b"bar")?;
file_a.write_all_at(b"bar", 3)?;
assert_eq!("foobar", read_string(&file_a, 0, 20)?);
// Open a lot of files, enough to cause some page evictions.
let mut efiles = Vec::new();
for fileno in 0..100 {
let mut efile = EphemeralFile::create(conf, tenantid, timelineid)?;
efile.write_all(format!("file {}", fileno).as_bytes())?;
let efile = EphemeralFile::create(conf, tenantid, timelineid)?;
efile.write_all_at(format!("file {}", fileno).as_bytes(), 0)?;
assert_eq!(format!("file {}", fileno), read_string(&efile, 0, 10)?);
efiles.push((fileno, efile));
}
@@ -307,4 +353,41 @@ mod tests {
Ok(())
}
#[test]
fn test_ephemeral_blobs() -> Result<(), Error> {
let (conf, tenantid, timelineid) = repo_harness("ephemeral_blobs")?;
let mut file = EphemeralFile::create(conf, tenantid, timelineid)?;
let pos_foo = file.write_blob(b"foo")?;
assert_eq!(b"foo", file.block_cursor().read_blob(pos_foo)?.as_slice());
let pos_bar = file.write_blob(b"bar")?;
assert_eq!(b"foo", file.block_cursor().read_blob(pos_foo)?.as_slice());
assert_eq!(b"bar", file.block_cursor().read_blob(pos_bar)?.as_slice());
let mut blobs = Vec::new();
for i in 0..10000 {
let data = Vec::from(format!("blob{}", i).as_bytes());
let pos = file.write_blob(&data)?;
blobs.push((pos, data));
}
let mut cursor = BlockCursor::new(&file);
for (pos, expected) in blobs {
let actual = cursor.read_blob(pos)?;
assert_eq!(actual, expected);
}
drop(cursor);
// Test a large blob that spans multiple pages
let mut large_data = Vec::new();
large_data.resize(20000, 0);
thread_rng().fill_bytes(&mut large_data);
let pos_large = file.write_blob(&large_data)?;
let result = file.block_cursor().read_blob(pos_large)?;
assert_eq!(result, large_data);
Ok(())
}
}

View File

@@ -25,9 +25,7 @@ impl PartialOrd for DeltaFileName {
impl Ord for DeltaFileName {
fn cmp(&self, other: &Self) -> Ordering {
let mut cmp;
cmp = self.key_range.start.cmp(&other.key_range.start);
let mut cmp = self.key_range.start.cmp(&other.key_range.start);
if cmp != Ordering::Equal {
return cmp;
}
@@ -117,9 +115,7 @@ impl PartialOrd for ImageFileName {
impl Ord for ImageFileName {
fn cmp(&self, other: &Self) -> Ordering {
let mut cmp;
cmp = self.key_range.start.cmp(&other.key_range.start);
let mut cmp = self.key_range.start.cmp(&other.key_range.start);
if cmp != Ordering::Equal {
return cmp;
}

View File

@@ -13,65 +13,76 @@
//!
//! 000000067F000032BE0000400000000070B6-000000067F000032BE0000400000000080B6__00000000346BC568
//!
//! An image file is constructed using the 'bookfile' crate.
//!
//! Only metadata is loaded into memory by the load function.
//! When images are needed, they are read directly from disk.
//!
//! Every image layer file consists of three parts: "summary",
//! "index", and "values". The summary is a fixed size header at the
//! beginning of the file, and it contains basic information about the
//! layer, and offsets to the other parts. The "index" is a B-tree,
//! mapping from Key to an offset in the "values" part. The
//! actual page images are stored in the "values" part.
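//!
//! A rough sketch of the file layout implied by the code below (block numbers
//! are PAGE_SZ-sized blocks, not byte offsets):
//!
//!   block 0:                    Summary header (magic, version, tenant/timeline,
//!                               key range, LSN, index_start_blk, index_root_blk)
//!   blocks 1..index_start_blk:  the "values" part, written via WriteBlobWriter
//!   blocks index_start_blk..:   the "index" part, a DiskBtree keyed by Key and
//!                               rooted at index_root_blk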
use crate::config::PageServerConf;
use crate::layered_repository::blob_io::{BlobCursor, BlobWriter, WriteBlobWriter};
use crate::layered_repository::block_io::{BlockBuf, BlockReader, FileBlockReader};
use crate::layered_repository::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
use crate::layered_repository::filename::{ImageFileName, PathOrConf};
use crate::layered_repository::storage_layer::{
BlobRef, Layer, ValueReconstructResult, ValueReconstructState,
Layer, ValueReconstructResult, ValueReconstructState,
};
use crate::repository::{Key, Value};
use crate::page_cache::PAGE_SZ;
use crate::repository::{Key, Value, KEY_SIZE};
use crate::virtual_file::VirtualFile;
use crate::{ZTenantId, ZTimelineId};
use anyhow::{bail, Context, Result};
use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION};
use anyhow::{bail, ensure, Context, Result};
use bytes::Bytes;
use hex;
use log::*;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::fs;
use std::io::{BufWriter, Write};
use std::io::Write;
use std::io::{Seek, SeekFrom};
use std::ops::Range;
use std::path::{Path, PathBuf};
use std::sync::{Mutex, MutexGuard};
use bookfile::{Book, BookWriter, ChapterWriter};
use std::sync::{RwLock, RwLockReadGuard};
use zenith_utils::bin_ser::BeSer;
use zenith_utils::lsn::Lsn;
// Magic constant to identify a Zenith image layer file
pub const IMAGE_FILE_MAGIC: u32 = 0x5A616E11 + 1;
/// Mapping from (key, lsn) -> page/WAL record
/// byte ranges in VALUES_CHAPTER
static INDEX_CHAPTER: u64 = 1;
/// Contains each block in block # order
const VALUES_CHAPTER: u64 = 2;
/// Contains the [`Summary`] struct
const SUMMARY_CHAPTER: u64 = 3;
///
/// Header stored in the beginning of the file
///
/// After this comes the 'values' part, starting on block 1. After that,
/// the 'index' starts at the block indicated by 'index_start_blk'
///
#[derive(Debug, Serialize, Deserialize, PartialEq, Eq)]
struct Summary {
/// Magic value to identify this as a zenith image file. Always IMAGE_FILE_MAGIC.
magic: u16,
format_version: u16,
tenantid: ZTenantId,
timelineid: ZTimelineId,
key_range: Range<Key>,
lsn: Lsn,
/// Block number where the 'index' part of the file begins.
index_start_blk: u32,
/// Block within the 'index', where the B-tree root page is stored
index_root_blk: u32,
// the 'values' part starts after the summary header, on block 1.
}
impl From<&ImageLayer> for Summary {
fn from(layer: &ImageLayer) -> Self {
Self {
magic: IMAGE_FILE_MAGIC,
format_version: STORAGE_FORMAT_VERSION,
tenantid: layer.tenantid,
timelineid: layer.timelineid,
key_range: layer.key_range.clone(),
lsn: layer.lsn,
index_start_blk: 0,
index_root_blk: 0,
}
}
}
@@ -92,19 +103,19 @@ pub struct ImageLayer {
// This entry contains an image of all pages as of this LSN
pub lsn: Lsn,
inner: Mutex<ImageLayerInner>,
inner: RwLock<ImageLayerInner>,
}
pub struct ImageLayerInner {
/// If false, the 'index' has not been loaded into memory yet.
loaded: bool,
/// The underlying (virtual) file handle. None if the layer hasn't been loaded
/// yet.
book: Option<Book<VirtualFile>>,
// values copied from summary
index_start_blk: u32,
index_root_blk: u32,
/// offset of each value
index: HashMap<Key, BlobRef>,
/// Reader object for reading blocks from the file. (None if not loaded yet)
file: Option<FileBlockReader<VirtualFile>>,
}
impl Layer for ImageLayer {
@@ -135,30 +146,25 @@ impl Layer for ImageLayer {
key: Key,
lsn_range: Range<Lsn>,
reconstruct_state: &mut ValueReconstructState,
) -> Result<ValueReconstructResult> {
) -> anyhow::Result<ValueReconstructResult> {
assert!(self.key_range.contains(&key));
assert!(lsn_range.end >= self.lsn);
let inner = self.load()?;
if let Some(blob_ref) = inner.index.get(&key) {
let chapter = inner
.book
.as_ref()
.unwrap()
.chapter_reader(VALUES_CHAPTER)?;
let file = inner.file.as_ref().unwrap();
let tree_reader = DiskBtreeReader::new(inner.index_start_blk, inner.index_root_blk, file);
let mut blob = vec![0; blob_ref.size()];
chapter
.read_exact_at(&mut blob, blob_ref.pos())
.with_context(|| {
format!(
"failed to read {} bytes from data file {} at offset {}",
blob_ref.size(),
self.filename().display(),
blob_ref.pos()
)
})?;
let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
key.write_to_byte_slice(&mut keybuf);
if let Some(offset) = tree_reader.get(&keybuf)? {
let blob = file.block_cursor().read_blob(offset).with_context(|| {
format!(
"failed to read value from data file {} at offset {}",
self.filename().display(),
offset
)
})?;
let value = Bytes::from(blob);
reconstruct_state.img = Some((self.lsn, value));
@@ -172,29 +178,6 @@ impl Layer for ImageLayer {
todo!();
}
fn unload(&self) -> Result<()> {
// Unload the index.
//
// TODO: we should access the index directly from pages on the disk,
// using the buffer cache. This load/unload mechanism is really ad hoc.
// FIXME: In debug mode, loading and unloading the index slows
// things down so much that you get timeout errors. At least
// with the test_parallel_copy test. So as an even more ad hoc
// stopgap fix for that, only unload every on average 10
// checkpoint cycles.
use rand::RngCore;
if rand::thread_rng().next_u32() > (u32::MAX / 10) {
return Ok(());
}
let mut inner = self.inner.lock().unwrap();
inner.index = HashMap::default();
inner.loaded = false;
Ok(())
}
fn delete(&self) -> Result<()> {
// delete underlying file
fs::remove_file(self.path())?;
@@ -210,26 +193,28 @@ impl Layer for ImageLayer {
}
/// debugging function to print out the contents of the layer
fn dump(&self) -> Result<()> {
fn dump(&self, verbose: bool) -> Result<()> {
println!(
"----- image layer for ten {} tli {} key {}-{} at {} ----",
self.tenantid, self.timelineid, self.key_range.start, self.key_range.end, self.lsn
);
let inner = self.load()?;
let mut index_vec: Vec<(&Key, &BlobRef)> = inner.index.iter().collect();
index_vec.sort_by_key(|x| x.1.pos());
for (key, blob_ref) in index_vec {
println!(
"key: {} size {} offset {}",
key,
blob_ref.size(),
blob_ref.pos()
);
if !verbose {
return Ok(());
}
let inner = self.load()?;
let file = inner.file.as_ref().unwrap();
let tree_reader =
DiskBtreeReader::<_, KEY_SIZE>::new(inner.index_start_blk, inner.index_root_blk, file);
tree_reader.dump()?;
tree_reader.visit(&[0u8; KEY_SIZE], VisitDirection::Forwards, |key, value| {
println!("key: {} offset {}", hex::encode(key), value);
true
})?;
Ok(())
}
}
@@ -250,34 +235,55 @@ impl ImageLayer {
}
///
/// Load the contents of the file into memory
/// Open the underlying file and read the metadata into memory, if it's
/// not loaded already.
///
fn load(&self) -> Result<MutexGuard<ImageLayerInner>> {
// quick exit if already loaded
let mut inner = self.inner.lock().unwrap();
fn load(&self) -> Result<RwLockReadGuard<ImageLayerInner>> {
loop {
// Quick exit if already loaded
let inner = self.inner.read().unwrap();
if inner.loaded {
return Ok(inner);
}
if inner.loaded {
return Ok(inner);
// Need to open the file and load the metadata. Upgrade our lock to
// a write lock. (Or rather, release and re-lock in write mode.)
drop(inner);
let mut inner = self.inner.write().unwrap();
if !inner.loaded {
self.load_inner(&mut inner)?;
} else {
// Another thread loaded it while we were not holding the lock.
}
// We now have the file open and loaded. We need to return a read guard,
// but the std library RwLock has no way to downgrade a write lock to a
// read lock, so we have to release the write lock and re-lock in read
// mode. And while we do that, another thread could unload the layer
// again, so we have to re-check and retry if that happens.
drop(inner);
}
}
fn load_inner(&self, inner: &mut ImageLayerInner) -> Result<()> {
let path = self.path();
// Open the file if it's not open already.
if inner.book.is_none() {
if inner.file.is_none() {
let file = VirtualFile::open(&path)
.with_context(|| format!("Failed to open file '{}'", path.display()))?;
inner.book = Some(Book::new(file).with_context(|| {
format!("Failed to open file '{}' as a bookfile", path.display())
})?);
inner.file = Some(FileBlockReader::new(file));
}
let book = inner.book.as_ref().unwrap();
let file = inner.file.as_mut().unwrap();
let summary_blk = file.read_blk(0)?;
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
match &self.path_or_conf {
PathOrConf::Conf(_) => {
let chapter = book.read_chapter(SUMMARY_CHAPTER)?;
let actual_summary = Summary::des(&chapter)?;
let expected_summary = Summary::from(self);
let mut expected_summary = Summary::from(self);
expected_summary.index_start_blk = actual_summary.index_start_blk;
expected_summary.index_root_blk = actual_summary.index_root_blk;
if actual_summary != expected_summary {
bail!("in-file summary does not match expected summary. actual = {:?} expected = {:?}", actual_summary, expected_summary);
@@ -297,15 +303,10 @@ impl ImageLayer {
}
}
let chapter = book.read_chapter(INDEX_CHAPTER)?;
let index = HashMap::des(&chapter)?;
info!("loaded from {}", &path.display());
inner.index = index;
inner.index_start_blk = actual_summary.index_start_blk;
inner.index_root_blk = actual_summary.index_root_blk;
inner.loaded = true;
Ok(inner)
Ok(())
}
/// Create an ImageLayer struct representing an existing file on disk
@@ -321,10 +322,11 @@ impl ImageLayer {
tenantid,
key_range: filename.key_range.clone(),
lsn: filename.lsn,
inner: Mutex::new(ImageLayerInner {
book: None,
index: HashMap::new(),
inner: RwLock::new(ImageLayerInner {
loaded: false,
file: None,
index_start_blk: 0,
index_root_blk: 0,
}),
}
}
@@ -332,12 +334,14 @@ impl ImageLayer {
/// Create an ImageLayer struct representing an existing file on disk.
///
/// This variant is only used for debugging purposes, by the 'dump_layerfile' binary.
pub fn new_for_path<F>(path: &Path, book: &Book<F>) -> Result<ImageLayer>
pub fn new_for_path<F>(path: &Path, file: F) -> Result<ImageLayer>
where
F: std::os::unix::prelude::FileExt,
{
let chapter = book.read_chapter(SUMMARY_CHAPTER)?;
let summary = Summary::des(&chapter)?;
let mut summary_buf = Vec::new();
summary_buf.resize(PAGE_SZ, 0);
file.read_exact_at(&mut summary_buf, 0)?;
let summary = Summary::des_prefix(&summary_buf)?;
Ok(ImageLayer {
path_or_conf: PathOrConf::Path(path.to_path_buf()),
@@ -345,10 +349,11 @@ impl ImageLayer {
tenantid: summary.tenantid,
key_range: summary.key_range,
lsn: summary.lsn,
inner: Mutex::new(ImageLayerInner {
book: None,
index: HashMap::new(),
inner: RwLock::new(ImageLayerInner {
file: None,
loaded: false,
index_start_blk: 0,
index_root_blk: 0,
}),
})
}
@@ -377,25 +382,21 @@ impl ImageLayer {
///
/// 1. Create the ImageLayerWriter by calling ImageLayerWriter::new(...)
///
/// 2. Write the contents by calling `put_page_image` for every page
/// in the segment.
/// 2. Write the contents by calling `put_image` for every key-value
/// pair in the key range.
///
/// 3. Call `finish`.
///
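/// A minimal usage sketch (hypothetical caller; the argument values and the
/// source of the images are assumed, not prescribed by this module):
///
///     let mut writer = ImageLayerWriter::new(conf, timelineid, tenantid, &key_range, lsn)?;
///     for (key, img) in images_in_key_order {
///         writer.put_image(key, &img)?;
///     }
///     let image_layer = writer.finish()?;
///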
pub struct ImageLayerWriter {
conf: &'static PageServerConf,
path: PathBuf,
_path: PathBuf,
timelineid: ZTimelineId,
tenantid: ZTenantId,
key_range: Range<Key>,
lsn: Lsn,
values_writer: Option<ChapterWriter<BufWriter<VirtualFile>>>,
end_offset: u64,
index: HashMap<Key, BlobRef>,
finished: bool,
blob_writer: WriteBlobWriter<VirtualFile>,
tree: DiskBtreeBuilder<BlockBuf, KEY_SIZE>,
}
impl ImageLayerWriter {
@@ -405,7 +406,7 @@ impl ImageLayerWriter {
tenantid: ZTenantId,
key_range: &Range<Key>,
lsn: Lsn,
) -> Result<ImageLayerWriter> {
) -> anyhow::Result<ImageLayerWriter> {
// Create the file
//
// Note: This overwrites any existing file. There shouldn't be any.
@@ -420,25 +421,24 @@ impl ImageLayerWriter {
},
);
info!("new image layer {}", path.display());
let file = VirtualFile::create(&path)?;
let buf_writer = BufWriter::new(file);
let book = BookWriter::new(buf_writer, IMAGE_FILE_MAGIC)?;
let mut file = VirtualFile::create(&path)?;
// make room for the header block
file.seek(SeekFrom::Start(PAGE_SZ as u64))?;
let blob_writer = WriteBlobWriter::new(file, PAGE_SZ as u64);
// Open the page-images chapter for writing. The calls to
// `put_image` will use this to write the contents.
let chapter = book.new_chapter(VALUES_CHAPTER);
// Initialize the b-tree index builder
let block_buf = BlockBuf::new();
let tree_builder = DiskBtreeBuilder::new(block_buf);
let writer = ImageLayerWriter {
conf,
path,
_path: path,
timelineid,
tenantid,
key_range: key_range.clone(),
lsn,
values_writer: Some(chapter),
index: HashMap::new(),
end_offset: 0,
finished: false,
tree: tree_builder,
blob_writer,
};
Ok(writer)
@@ -450,79 +450,61 @@ impl ImageLayerWriter {
/// The page versions must be appended in blknum order.
///
pub fn put_image(&mut self, key: Key, img: &[u8]) -> Result<()> {
assert!(self.key_range.contains(&key));
let off = self.end_offset;
ensure!(self.key_range.contains(&key));
let off = self.blob_writer.write_blob(img)?;
if let Some(writer) = &mut self.values_writer {
let len = img.len();
writer.write_all(img)?;
self.end_offset += len as u64;
let old = self.index.insert(key, BlobRef::new(off, len, true));
assert!(old.is_none());
} else {
panic!()
}
let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
key.write_to_byte_slice(&mut keybuf);
self.tree.append(&keybuf, off)?;
Ok(())
}
pub fn finish(&mut self) -> Result<ImageLayer> {
// Close the values chapter
let book = self.values_writer.take().unwrap().close()?;
pub fn finish(self) -> anyhow::Result<ImageLayer> {
let index_start_blk =
((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
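// (For example, assuming PAGE_SZ is 8192: if blob_writer.size() == 20000,
// this rounds up to index_start_blk = (20000 + 8191) / 8192 = 3, so the index
// is written starting at byte offset 3 * PAGE_SZ = 24576.)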
let mut file = self.blob_writer.into_inner();
// Write out the index
let mut chapter = book.new_chapter(INDEX_CHAPTER);
let buf = HashMap::ser(&self.index)?;
chapter.write_all(&buf)?;
let book = chapter.close()?;
file.seek(SeekFrom::Start(index_start_blk as u64 * PAGE_SZ as u64))?;
let (index_root_blk, block_buf) = self.tree.finish()?;
for buf in block_buf.blocks {
file.write_all(buf.as_ref())?;
}
// Write out the summary chapter
let mut chapter = book.new_chapter(SUMMARY_CHAPTER);
// Fill in the summary on blk 0
let summary = Summary {
magic: IMAGE_FILE_MAGIC,
format_version: STORAGE_FORMAT_VERSION,
tenantid: self.tenantid,
timelineid: self.timelineid,
key_range: self.key_range.clone(),
lsn: self.lsn,
index_start_blk,
index_root_blk,
};
Summary::ser_into(&summary, &mut chapter)?;
let book = chapter.close()?;
// This flushes the underlying 'buf_writer'.
book.close()?;
file.seek(SeekFrom::Start(0))?;
Summary::ser_into(&summary, &mut file)?;
// Note: Because we open the file in write-only mode, we cannot
// reuse the same VirtualFile for reading later. That's why we don't
// set inner.book here. The first read will have to re-open it.
// set inner.file here. The first read will have to re-open it.
let layer = ImageLayer {
path_or_conf: PathOrConf::Conf(self.conf),
timelineid: self.timelineid,
tenantid: self.tenantid,
key_range: self.key_range.clone(),
lsn: self.lsn,
inner: Mutex::new(ImageLayerInner {
book: None,
inner: RwLock::new(ImageLayerInner {
loaded: false,
index: HashMap::new(),
file: None,
index_start_blk,
index_root_blk,
}),
};
trace!("created image layer {}", layer.path().display());
self.finished = true;
Ok(layer)
}
}
impl Drop for ImageLayerWriter {
fn drop(&mut self) {
if let Some(page_image_writer) = self.values_writer.take() {
if let Ok(book) = page_image_writer.close() {
let _ = book.close();
}
}
if !self.finished {
let _ = fs::remove_file(&self.path);
}
}
}

View File

@@ -5,23 +5,23 @@
//! its position in the file, is kept in memory, though.
//!
use crate::config::PageServerConf;
use crate::layered_repository::blob_io::{BlobCursor, BlobWriter};
use crate::layered_repository::block_io::BlockReader;
use crate::layered_repository::delta_layer::{DeltaLayer, DeltaLayerWriter};
use crate::layered_repository::ephemeral_file::EphemeralFile;
use crate::layered_repository::storage_layer::{
BlobRef, Layer, ValueReconstructResult, ValueReconstructState,
Layer, ValueReconstructResult, ValueReconstructState,
};
use crate::repository::{Key, Value};
use crate::walrecord;
use crate::{ZTenantId, ZTimelineId};
use anyhow::Result;
use anyhow::{bail, ensure, Result};
use log::*;
use std::collections::HashMap;
// avoid binding to Write (conflicts with std::io::Write)
// while being able to use std::fmt::Write's methods
use std::fmt::Write as _;
use std::io::Write;
use std::ops::Range;
use std::os::unix::fs::FileExt;
use std::path::PathBuf;
use std::sync::RwLock;
use zenith_utils::bin_ser::BeSer;
@@ -54,14 +54,12 @@ pub struct InMemoryLayerInner {
/// by block number and LSN. The value is an offset into the
/// ephemeral file where the page version is stored.
///
index: HashMap<Key, VecMap<Lsn, BlobRef>>,
index: HashMap<Key, VecMap<Lsn, u64>>,
/// The values are stored in a serialized format in this file.
/// Each serialized Value is preceded by a 'u32' length field.
/// PerSeg::page_versions map stores offsets into this file.
file: EphemeralFile,
end_offset: u64,
}
impl InMemoryLayerInner {
@@ -114,16 +112,18 @@ impl Layer for InMemoryLayer {
key: Key,
lsn_range: Range<Lsn>,
reconstruct_state: &mut ValueReconstructState,
) -> Result<ValueReconstructResult> {
assert!(lsn_range.start <= self.start_lsn);
) -> anyhow::Result<ValueReconstructResult> {
ensure!(lsn_range.start <= self.start_lsn);
let mut need_image = true;
let inner = self.inner.read().unwrap();
let mut reader = inner.file.block_cursor();
// Scan the page versions backwards, starting from `lsn`.
if let Some(vec_map) = inner.index.get(&key) {
let slice = vec_map.slice_range(lsn_range);
for (entry_lsn, blob_ref) in slice.iter().rev() {
for (entry_lsn, pos) in slice.iter().rev() {
match &reconstruct_state.img {
Some((cached_lsn, _)) if entry_lsn <= cached_lsn => {
return Ok(ValueReconstructResult::Complete)
@@ -131,8 +131,7 @@ impl Layer for InMemoryLayer {
_ => {}
}
let mut buf = vec![0u8; blob_ref.size()];
inner.file.read_exact_at(&mut buf, blob_ref.pos())?;
let buf = reader.read_blob(*pos)?;
let value = Value::des(&buf)?;
match value {
Value::Image(img) => {
@@ -167,17 +166,10 @@ impl Layer for InMemoryLayer {
todo!();
}
/// Cannot unload anything in an in-memory layer, since there's no backing
/// store. To release memory used by an in-memory layer, use 'freeze' to turn
/// it into an on-disk layer.
fn unload(&self) -> Result<()> {
Ok(())
}
/// Nothing to do here. When you drop the last reference to the layer, it will
/// be deallocated.
fn delete(&self) -> Result<()> {
panic!("can't delete an InMemoryLayer")
bail!("can't delete an InMemoryLayer")
}
fn is_incremental(&self) -> bool {
@@ -190,7 +182,7 @@ impl Layer for InMemoryLayer {
}
/// debugging function to print out the contents of the layer
fn dump(&self) -> Result<()> {
fn dump(&self, verbose: bool) -> Result<()> {
let inner = self.inner.read().unwrap();
let end_str = inner
@@ -204,12 +196,16 @@ impl Layer for InMemoryLayer {
self.timelineid, self.start_lsn, end_str,
);
if !verbose {
return Ok(());
}
let mut cursor = inner.file.block_cursor();
let mut buf = Vec::new();
for (key, vec_map) in inner.index.iter() {
for (lsn, blob_ref) in vec_map.as_slice() {
for (lsn, pos) in vec_map.as_slice() {
let mut desc = String::new();
buf.resize(blob_ref.size(), 0);
inner.file.read_exact_at(&mut buf, blob_ref.pos())?;
cursor.read_blob_into_buf(*pos, &mut buf)?;
let val = Value::des(&buf);
match val {
Ok(Value::Image(img)) => {
@@ -264,7 +260,6 @@ impl InMemoryLayer {
end_lsn: None,
index: HashMap::new(),
file,
end_offset: 0,
}),
})
}
@@ -279,15 +274,10 @@ impl InMemoryLayer {
inner.assert_writeable();
let off = inner.end_offset;
let buf = Value::ser(&val)?;
let len = buf.len();
inner.file.write_all(&buf)?;
inner.end_offset += len as u64;
let off = inner.file.write_blob(&Value::ser(&val)?)?;
let vec_map = inner.index.entry(key).or_default();
let blob_ref = BlobRef::new(off, len, val.will_init());
let old = vec_map.append_or_update_last(lsn, blob_ref).unwrap().0;
let old = vec_map.append_or_update_last(lsn, off).unwrap().0;
if old.is_some() {
// We already had an entry for this LSN. That's odd..
warn!("Key {} at {} already exists", key, lsn);
@@ -311,30 +301,16 @@ impl InMemoryLayer {
assert!(self.start_lsn < end_lsn);
inner.end_lsn = Some(end_lsn);
// FIXME
/*
for perseg in inner.segs.values() {
if let Some((lsn, _)) = perseg.seg_sizes.as_slice().last() {
assert!(lsn < &end_lsn, "{:?} {:?}", lsn, end_lsn);
}
for (_blk, vec_map) in perseg.page_versions.iter() {
for (lsn, _pos) in vec_map.as_slice() {
assert!(*lsn < end_lsn);
}
}
}
*/
for vec_map in inner.index.values() {
for (lsn, _pos) in vec_map.as_slice() {
assert!(*lsn < end_lsn);
}
}
}
/// Write this frozen in-memory layer to disk.
///
/// Returns new layers that replace this one.
/// If not dropped and reconstruct_pages is true, returns a new image layer containing the page versions
/// at the `end_lsn`. Can also return a DeltaLayer that includes all the
/// WAL records between start and end LSN. (The delta layer is not needed
/// when a new relish is created with a single LSN, so that the start and
/// end LSN are the same.)
/// Returns a new delta layer with all the same data as this in-memory layer
pub fn write_to_disk(&self) -> Result<DeltaLayer> {
// Grab the lock in read-mode. We hold it over the I/O, but because this
// layer is not writeable anymore, no one should be trying to acquire the
@@ -355,21 +331,21 @@ impl InMemoryLayer {
self.start_lsn..inner.end_lsn.unwrap(),
)?;
let mut do_steps = || -> Result<()> {
for (key, vec_map) in inner.index.iter() {
// Write all page versions
for (lsn, blob_ref) in vec_map.as_slice() {
let mut buf = vec![0u8; blob_ref.size()];
inner.file.read_exact_at(&mut buf, blob_ref.pos())?;
let val = Value::des(&buf)?;
delta_layer_writer.put_value(*key, *lsn, val)?;
}
let mut buf = Vec::new();
let mut cursor = inner.file.block_cursor();
let mut keys: Vec<(&Key, &VecMap<Lsn, u64>)> = inner.index.iter().collect();
keys.sort_by_key(|k| k.0);
for (key, vec_map) in keys.iter() {
let key = **key;
// Write all page versions
for (lsn, pos) in vec_map.as_slice() {
cursor.read_blob_into_buf(*pos, &mut buf)?;
let val = Value::des(&buf)?;
delta_layer_writer.put_value(key, *lsn, val)?;
}
Ok(())
};
if let Err(err) = do_steps() {
delta_layer_writer.abort();
return Err(err);
}
let delta_layer = delta_layer_writer.finish(Key::MAX)?;

View File

@@ -1,5 +1,5 @@
//!
//! The layer map tracks what layers exist for all the relishes in a timeline.
//! The layer map tracks what layers exist in a timeline.
//!
//! When the timeline is first accessed, the server lists of all layer files
//! in the timelines/<timelineid> directory, and populates this map with
@@ -61,12 +61,24 @@ pub struct LayerMap {
historic_layers: Vec<Arc<dyn Layer>>,
}
/// Return value of LayerMap::search
pub struct SearchResult {
pub layer: Arc<dyn Layer>,
pub lsn_floor: Lsn,
}
impl LayerMap {
///
/// Find the latest layer that covers the given 'key', with lsn <
/// 'end_lsn'.
///
/// Returns the layer, if any, and an 'lsn_floor' value that
/// indicates which portion of the layer the caller should
/// check. 'lsn_floor' is normally the start-LSN of the layer, but
/// can be greater if there is an overlapping layer that might
/// contain the version, even if it's missing from the returned
/// layer.
///
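/// A sketch of the intended call pattern (the caller code is illustrative
/// only):
///
///     if let Some(SearchResult { layer, lsn_floor }) = layer_map.search(key, end_lsn)? {
///         // look for 'key' in 'layer', but only in the LSN range
///         // lsn_floor..end_lsn; older versions must be searched for in
///         // other, older layers.
///     }
///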
pub fn search(&self, key: Key, end_lsn: Lsn) -> Result<Option<SearchResult>> {
// linear search
// Find the latest image layer that covers the given key
@@ -195,11 +207,11 @@ impl LayerMap {
NUM_ONDISK_LAYERS.dec();
}
/// Is there a newer image layer for given segment?
/// Is there a newer image layer for given key-range?
///
/// This is used for garbage collection, to determine if an old layer can
/// be deleted.
/// We ignore segments newer than disk_consistent_lsn because they will be removed at restart
/// We ignore layers newer than disk_consistent_lsn because they will be removed at restart
/// We also only look at historic layers
//#[allow(dead_code)]
pub fn newer_image_layer_exists(
@@ -238,28 +250,13 @@ impl LayerMap {
}
}
/// Is there any layer for given segment that is alive at the lsn?
///
/// This is a public wrapper for a SegEntry function,
/// used for garbage collection, to determine if some alive layer
/// exists at the lsn. If so, we shouldn't delete a newer dropped layer
/// to avoid incorrectly making it visible.
/*
pub fn layer_exists_at_lsn(&self, seg: SegmentTag, lsn: Lsn) -> Result<bool> {
Ok(if let Some(segentry) = self.historic_layers.get(&seg) {
segentry.exists_at_lsn(seg, lsn)?.unwrap_or(false)
} else {
false
})
}
*/
pub fn iter_historic_layers(&self) -> std::slice::Iter<Arc<dyn Layer>> {
self.historic_layers.iter()
}
/// Find the last image layer that covers 'key', ignoring any image layers
/// newer than 'lsn'.
fn find_latest_image(&self, key: Key, lsn: Lsn) -> Option<Arc<dyn Layer>> {
// Find the last image layer that covers the key
let mut candidate_lsn = Lsn(0);
let mut candidate = None;
for l in self.historic_layers.iter() {
@@ -299,9 +296,7 @@ impl LayerMap {
key_range: &Range<Key>,
lsn: Lsn,
) -> Result<Vec<(Range<Key>, Option<Arc<dyn Layer>>)>> {
let mut points: Vec<Key>;
points = vec![key_range.start];
let mut points = vec![key_range.start];
for l in self.historic_layers.iter() {
if l.get_lsn_range().start > lsn {
continue;
@@ -334,6 +329,8 @@ impl LayerMap {
Ok(ranges)
}
/// Count how many L1 delta layers there are that overlap with the
/// given key and LSN range.
pub fn count_deltas(&self, key_range: &Range<Key>, lsn_range: &Range<Lsn>) -> Result<usize> {
let mut result = 0;
for l in self.historic_layers.iter() {
@@ -360,6 +357,7 @@ impl LayerMap {
Ok(result)
}
/// Return all L0 delta layers
pub fn get_level0_deltas(&self) -> Result<Vec<Arc<dyn Layer>>> {
let mut deltas = Vec::new();
for l in self.historic_layers.iter() {
@@ -376,10 +374,22 @@ impl LayerMap {
/// debugging function to print out the contents of the layer map
#[allow(unused)]
pub fn dump(&self) -> Result<()> {
pub fn dump(&self, verbose: bool) -> Result<()> {
println!("Begin dump LayerMap");
println!("open_layer:");
if let Some(open_layer) = &self.open_layer {
open_layer.dump(verbose)?;
}
println!("frozen_layers:");
for frozen_layer in self.frozen_layers.iter() {
frozen_layer.dump(verbose)?;
}
println!("historic_layers:");
for layer in self.historic_layers.iter() {
layer.dump()?;
layer.dump(verbose)?;
}
println!("End dump LayerMap");
Ok(())

View File

@@ -6,9 +6,10 @@
//!
//! The module contains all structs and related helper methods related to timeline metadata.
use std::{convert::TryInto, path::PathBuf};
use std::path::PathBuf;
use anyhow::ensure;
use serde::{Deserialize, Serialize};
use zenith_utils::{
bin_ser::BeSer,
lsn::Lsn,
@@ -16,11 +17,13 @@ use zenith_utils::{
};
use crate::config::PageServerConf;
use crate::STORAGE_FORMAT_VERSION;
// Taken from PG_CONTROL_MAX_SAFE_SIZE
const METADATA_MAX_SAFE_SIZE: usize = 512;
const METADATA_CHECKSUM_SIZE: usize = std::mem::size_of::<u32>();
const METADATA_MAX_DATA_SIZE: usize = METADATA_MAX_SAFE_SIZE - METADATA_CHECKSUM_SIZE;
/// We assume that a write of up to METADATA_MAX_SIZE bytes is atomic.
///
/// This is the same assumption that PostgreSQL makes with the control file,
/// see PG_CONTROL_MAX_SAFE_SIZE
const METADATA_MAX_SIZE: usize = 512;
/// The name of the metadata file pageserver creates per timeline.
pub const METADATA_FILE_NAME: &str = "metadata";
@@ -28,8 +31,22 @@ pub const METADATA_FILE_NAME: &str = "metadata";
/// Metadata stored on disk for each timeline
///
/// The fields correspond to the values we hold in memory, in LayeredTimeline.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TimelineMetadata {
hdr: TimelineMetadataHeader,
body: TimelineMetadataBody,
}
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
struct TimelineMetadataHeader {
checksum: u32, // CRC of serialized metadata body
size: u16, // size of serialized metadata
format_version: u16, // storage format version (used for compatibility checks)
}
const METADATA_HDR_SIZE: usize = std::mem::size_of::<TimelineMetadataHeader>();
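// (With the field types above, u32 + u16 + u16, METADATA_HDR_SIZE is expected
// to work out to 8 bytes, so the on-disk layout is roughly: header at bytes
// 0..8, the serialized body immediately after it, and zero padding up to
// METADATA_MAX_SIZE; see `to_bytes` and `from_bytes` below.)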
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
struct TimelineMetadataBody {
disk_consistent_lsn: Lsn,
// This is only set if we know it. We track it in memory when the page
// server is running, but we only track the value corresponding to
@@ -69,130 +86,90 @@ impl TimelineMetadata {
initdb_lsn: Lsn,
) -> Self {
Self {
disk_consistent_lsn,
prev_record_lsn,
ancestor_timeline,
ancestor_lsn,
latest_gc_cutoff_lsn,
initdb_lsn,
hdr: TimelineMetadataHeader {
checksum: 0,
size: 0,
format_version: STORAGE_FORMAT_VERSION,
},
body: TimelineMetadataBody {
disk_consistent_lsn,
prev_record_lsn,
ancestor_timeline,
ancestor_lsn,
latest_gc_cutoff_lsn,
initdb_lsn,
},
}
}
pub fn from_bytes(metadata_bytes: &[u8]) -> anyhow::Result<Self> {
ensure!(
metadata_bytes.len() == METADATA_MAX_SAFE_SIZE,
metadata_bytes.len() == METADATA_MAX_SIZE,
"metadata bytes size is wrong"
);
let data = &metadata_bytes[..METADATA_MAX_DATA_SIZE];
let calculated_checksum = crc32c::crc32c(data);
let checksum_bytes: &[u8; METADATA_CHECKSUM_SIZE] =
metadata_bytes[METADATA_MAX_DATA_SIZE..].try_into()?;
let expected_checksum = u32::from_le_bytes(*checksum_bytes);
let hdr = TimelineMetadataHeader::des(&metadata_bytes[0..METADATA_HDR_SIZE])?;
ensure!(
calculated_checksum == expected_checksum,
hdr.format_version == STORAGE_FORMAT_VERSION,
"format version mismatch"
);
let metadata_size = hdr.size as usize;
ensure!(
metadata_size <= METADATA_MAX_SIZE,
"corrupted metadata file"
);
let calculated_checksum = crc32c::crc32c(&metadata_bytes[METADATA_HDR_SIZE..metadata_size]);
ensure!(
hdr.checksum == calculated_checksum,
"metadata checksum mismatch"
);
let body = TimelineMetadataBody::des(&metadata_bytes[METADATA_HDR_SIZE..metadata_size])?;
ensure!(
body.disk_consistent_lsn.is_aligned(),
"disk_consistent_lsn is not aligned"
);
let data = TimelineMetadata::from(serialize::DeTimelineMetadata::des_prefix(data)?);
assert!(data.disk_consistent_lsn.is_aligned());
Ok(data)
Ok(TimelineMetadata { hdr, body })
}
pub fn to_bytes(&self) -> anyhow::Result<Vec<u8>> {
let serializeable_metadata = serialize::SeTimelineMetadata::from(self);
let mut metadata_bytes = serialize::SeTimelineMetadata::ser(&serializeable_metadata)?;
assert!(metadata_bytes.len() <= METADATA_MAX_DATA_SIZE);
metadata_bytes.resize(METADATA_MAX_SAFE_SIZE, 0u8);
let checksum = crc32c::crc32c(&metadata_bytes[..METADATA_MAX_DATA_SIZE]);
metadata_bytes[METADATA_MAX_DATA_SIZE..].copy_from_slice(&u32::to_le_bytes(checksum));
let body_bytes = self.body.ser()?;
let metadata_size = METADATA_HDR_SIZE + body_bytes.len();
let hdr = TimelineMetadataHeader {
size: metadata_size as u16,
format_version: STORAGE_FORMAT_VERSION,
checksum: crc32c::crc32c(&body_bytes),
};
let hdr_bytes = hdr.ser()?;
let mut metadata_bytes = vec![0u8; METADATA_MAX_SIZE];
metadata_bytes[0..METADATA_HDR_SIZE].copy_from_slice(&hdr_bytes);
metadata_bytes[METADATA_HDR_SIZE..metadata_size].copy_from_slice(&body_bytes);
Ok(metadata_bytes)
}
/// [`Lsn`] that corresponds to the contents of the timeline directory,
/// stored locally in the pageserver workdir.
pub fn disk_consistent_lsn(&self) -> Lsn {
self.disk_consistent_lsn
self.body.disk_consistent_lsn
}
pub fn prev_record_lsn(&self) -> Option<Lsn> {
self.prev_record_lsn
self.body.prev_record_lsn
}
pub fn ancestor_timeline(&self) -> Option<ZTimelineId> {
self.ancestor_timeline
self.body.ancestor_timeline
}
pub fn ancestor_lsn(&self) -> Lsn {
self.ancestor_lsn
self.body.ancestor_lsn
}
pub fn latest_gc_cutoff_lsn(&self) -> Lsn {
self.latest_gc_cutoff_lsn
self.body.latest_gc_cutoff_lsn
}
pub fn initdb_lsn(&self) -> Lsn {
self.initdb_lsn
}
}
/// This module is for direct conversion of metadata to bytes and back.
/// For a certain metadata, besides the conversion a few verification steps have to
/// be done, so all serde derives are hidden from the user, to avoid accidental
/// verification-less metadata creation.
mod serialize {
use serde::{Deserialize, Serialize};
use zenith_utils::{lsn::Lsn, zid::ZTimelineId};
use super::TimelineMetadata;
#[derive(Serialize)]
pub(super) struct SeTimelineMetadata<'a> {
disk_consistent_lsn: &'a Lsn,
prev_record_lsn: &'a Option<Lsn>,
ancestor_timeline: &'a Option<ZTimelineId>,
ancestor_lsn: &'a Lsn,
latest_gc_cutoff_lsn: &'a Lsn,
initdb_lsn: &'a Lsn,
}
impl<'a> From<&'a TimelineMetadata> for SeTimelineMetadata<'a> {
fn from(other: &'a TimelineMetadata) -> Self {
Self {
disk_consistent_lsn: &other.disk_consistent_lsn,
prev_record_lsn: &other.prev_record_lsn,
ancestor_timeline: &other.ancestor_timeline,
ancestor_lsn: &other.ancestor_lsn,
latest_gc_cutoff_lsn: &other.latest_gc_cutoff_lsn,
initdb_lsn: &other.initdb_lsn,
}
}
}
#[derive(Deserialize)]
pub(super) struct DeTimelineMetadata {
disk_consistent_lsn: Lsn,
prev_record_lsn: Option<Lsn>,
ancestor_timeline: Option<ZTimelineId>,
ancestor_lsn: Lsn,
latest_gc_cutoff_lsn: Lsn,
initdb_lsn: Lsn,
}
impl From<DeTimelineMetadata> for TimelineMetadata {
fn from(other: DeTimelineMetadata) -> Self {
Self {
disk_consistent_lsn: other.disk_consistent_lsn,
prev_record_lsn: other.prev_record_lsn,
ancestor_timeline: other.ancestor_timeline,
ancestor_lsn: other.ancestor_lsn,
latest_gc_cutoff_lsn: other.latest_gc_cutoff_lsn,
initdb_lsn: other.initdb_lsn,
}
}
self.body.initdb_lsn
}
}
@@ -204,14 +181,14 @@ mod tests {
#[test]
fn metadata_serializes_correctly() {
let original_metadata = TimelineMetadata {
disk_consistent_lsn: Lsn(0x200),
prev_record_lsn: Some(Lsn(0x100)),
ancestor_timeline: Some(TIMELINE_ID),
ancestor_lsn: Lsn(0),
latest_gc_cutoff_lsn: Lsn(0),
initdb_lsn: Lsn(0),
};
let original_metadata = TimelineMetadata::new(
Lsn(0x200),
Some(Lsn(0x100)),
Some(TIMELINE_ID),
Lsn(0),
Lsn(0),
Lsn(0),
);
let metadata_bytes = original_metadata
.to_bytes()
@@ -221,7 +198,7 @@ mod tests {
.expect("Should deserialize its own bytes");
assert_eq!(
deserialized_metadata, original_metadata,
deserialized_metadata.body, original_metadata.body,
"Metadata that was serialized to bytes and deserialized back should not change"
);
}

View File

@@ -7,7 +7,6 @@ use crate::walrecord::ZenithWalRecord;
use crate::{ZTenantId, ZTimelineId};
use anyhow::Result;
use bytes::Bytes;
use serde::{Deserialize, Serialize};
use std::ops::Range;
use std::path::PathBuf;
@@ -85,10 +84,10 @@ pub enum ValueReconstructResult {
pub trait Layer: Send + Sync {
fn get_tenant_id(&self) -> ZTenantId;
/// Identify the timeline this relish belongs to
/// Identify the timeline this layer belongs to
fn get_timeline_id(&self) -> ZTimelineId;
/// Range of segments that this layer covers
/// Range of keys that this layer covers
fn get_key_range(&self) -> Range<Key>;
/// Inclusive start bound of the LSN range that this layer holds
@@ -123,7 +122,7 @@ pub trait Layer: Send + Sync {
reconstruct_data: &mut ValueReconstructState,
) -> Result<ValueReconstructResult>;
/// Does this layer only contain some data for the segment (incremental),
/// Does this layer only contain some data for the key-range (incremental),
/// or does it contain a version of every page? This is important to know
/// for garbage collecting old layers: an incremental layer depends on
/// the previous non-incremental layer.
@@ -135,46 +134,9 @@ pub trait Layer: Send + Sync {
/// Iterate through all keys and values stored in the layer
fn iter(&self) -> Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>> + '_>;
/// Release memory used by this layer. There is no corresponding 'load'
/// function, that's done implicitly when you call one of the get-functions.
fn unload(&self) -> Result<()>;
/// Permanently remove this layer from disk.
fn delete(&self) -> Result<()>;
/// Dump summary of the contents of the layer to stdout
fn dump(&self) -> Result<()>;
}
// Flag indicating that this version initialize the page
const WILL_INIT: u64 = 1;
///
/// Struct representing reference to BLOB in layers. Reference contains BLOB offset and size.
/// For WAL records (delta layer) it also contains `will_init` flag which helps to determine range of records
/// which needs to be applied without reading/deserializing records themselves.
///
#[derive(Debug, Serialize, Deserialize, Copy, Clone)]
pub struct BlobRef(u64);
impl BlobRef {
pub fn will_init(&self) -> bool {
(self.0 & WILL_INIT) != 0
}
pub fn pos(&self) -> u64 {
self.0 >> 32
}
pub fn size(&self) -> usize {
((self.0 & 0xFFFFFFFF) >> 1) as usize
}
pub fn new(pos: u64, size: usize, will_init: bool) -> BlobRef {
let mut blob_ref = (pos << 32) | ((size as u64) << 1);
if will_init {
blob_ref |= WILL_INIT;
}
BlobRef(blob_ref)
}
fn dump(&self, verbose: bool) -> Result<()>;
}

View File

@@ -7,7 +7,7 @@ pub mod layered_repository;
pub mod page_cache;
pub mod page_service;
pub mod pgdatadir_mapping;
pub mod relish;
pub mod reltag;
pub mod remote_storage;
pub mod repository;
pub mod tenant_mgr;
@@ -22,12 +22,29 @@ pub mod walredo;
pub mod wal_metadata;
use lazy_static::lazy_static;
use tracing::info;
use zenith_metrics::{register_int_gauge_vec, IntGaugeVec};
use zenith_utils::zid::{ZTenantId, ZTimelineId};
use zenith_utils::{
postgres_backend,
zid::{ZTenantId, ZTimelineId},
};
use crate::thread_mgr::ThreadKind;
use layered_repository::LayeredRepository;
use pgdatadir_mapping::DatadirTimeline;
/// Current storage format version
///
/// This is embedded in the metadata file, and also in the header of all the
/// layer files. If you make any backwards-incompatible changes to the storage
/// format, bump this!
pub const STORAGE_FORMAT_VERSION: u16 = 3;
// Magic constants used to identify different kinds of files
pub const IMAGE_FILE_MAGIC: u16 = 0x5A60;
pub const DELTA_FILE_MAGIC: u16 = 0x5A61;
lazy_static! {
static ref LIVE_CONNECTIONS_COUNT: IntGaugeVec = register_int_gauge_vec!(
"pageserver_live_connections_count",
@@ -51,3 +68,33 @@ pub enum CheckpointConfig {
pub type RepositoryImpl = LayeredRepository;
pub type DatadirTimelineImpl = DatadirTimeline<RepositoryImpl>;
pub fn shutdown_pageserver() {
// Shut down the libpq endpoint thread. This prevents new connections from
// being accepted.
thread_mgr::shutdown_threads(Some(ThreadKind::LibpqEndpointListener), None, None);
// Shut down any page service threads.
postgres_backend::set_pgbackend_shutdown_requested();
thread_mgr::shutdown_threads(Some(ThreadKind::PageRequestHandler), None, None);
// Shut down all the tenants. This flushes everything to disk and kills
// the checkpoint and GC threads.
tenant_mgr::shutdown_all_tenants();
// Stop syncing with remote storage.
//
// FIXME: Does this wait for the sync thread to finish syncing what's queued up?
// Should it?
thread_mgr::shutdown_threads(Some(ThreadKind::StorageSync), None, None);
// Shut down the HTTP endpoint last, so that you can still check the server's
// status while it's shutting down.
thread_mgr::shutdown_threads(Some(ThreadKind::HttpEndpointListener), None, None);
// There should be nothing left, but let's be sure
thread_mgr::shutdown_threads(None, None, None);
info!("Shut down successfully completed");
std::process::exit(0);
}

View File

@@ -41,7 +41,7 @@ use std::{
convert::TryInto,
sync::{
atomic::{AtomicU8, AtomicUsize, Ordering},
RwLock, RwLockReadGuard, RwLockWriteGuard,
RwLock, RwLockReadGuard, RwLockWriteGuard, TryLockError,
},
};
@@ -57,16 +57,13 @@ use crate::layered_repository::writeback_ephemeral_file;
use crate::repository::Key;
static PAGE_CACHE: OnceCell<PageCache> = OnceCell::new();
const TEST_PAGE_CACHE_SIZE: usize = 10;
const TEST_PAGE_CACHE_SIZE: usize = 50;
///
/// Initialize the page cache. This must be called once at page server startup.
///
pub fn init(conf: &'static PageServerConf) {
if PAGE_CACHE
.set(PageCache::new(conf.page_cache_size))
.is_err()
{
pub fn init(size: usize) {
if PAGE_CACHE.set(PageCache::new(size)).is_err() {
panic!("page cache already initialized");
}
}
@@ -94,6 +91,7 @@ const MAX_USAGE_COUNT: u8 = 5;
/// CacheKey uniquely identifies a "thing" to cache in the page cache.
///
#[derive(Debug, PartialEq, Eq, Clone)]
#[allow(clippy::enum_variant_names)]
enum CacheKey {
MaterializedPage {
hash_key: MaterializedPageHashKey,
@@ -103,6 +101,10 @@ enum CacheKey {
file_id: u64,
blkno: u32,
},
ImmutableFilePage {
file_id: u64,
blkno: u32,
},
}
#[derive(Debug, PartialEq, Eq, Hash, Clone)]
@@ -177,6 +179,8 @@ pub struct PageCache {
ephemeral_page_map: RwLock<HashMap<(u64, u32), usize>>,
immutable_page_map: RwLock<HashMap<(u64, u32), usize>>,
/// The actual buffers with their metadata.
slots: Box<[Slot]>,
@@ -199,6 +203,12 @@ impl std::ops::Deref for PageReadGuard<'_> {
}
}
impl AsRef<[u8; PAGE_SZ]> for PageReadGuard<'_> {
fn as_ref(&self) -> &[u8; PAGE_SZ] {
self.0.buf
}
}
///
/// PageWriteGuard is a lease on a buffer for modifying it. The page is kept locked
/// until the guard is dropped.
@@ -230,6 +240,12 @@ impl std::ops::Deref for PageWriteGuard<'_> {
}
}
impl AsMut<[u8; PAGE_SZ]> for PageWriteGuard<'_> {
fn as_mut(&mut self) -> &mut [u8; PAGE_SZ] {
self.inner.buf
}
}
impl PageWriteGuard<'_> {
/// Mark that the buffer contents are now valid.
pub fn mark_valid(&mut self) {
@@ -385,6 +401,36 @@ impl PageCache {
}
}
// Section 1.3: Public interface functions for working with immutable file pages.
pub fn read_immutable_buf(&self, file_id: u64, blkno: u32) -> ReadBufResult {
let mut cache_key = CacheKey::ImmutableFilePage { file_id, blkno };
self.lock_for_read(&mut cache_key)
}
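// A caller is expected to handle both ReadBufResult variants, along the same
// lines as EphemeralFile::read_blk: on Found the page is served straight from
// the cache, on NotFound the caller fills the returned write guard from disk,
// calls mark_valid(), and retries the lookup.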
/// Immediately drop all buffers belonging to given file, without writeback
pub fn drop_buffers_for_immutable(&self, drop_file_id: u64) {
for slot_idx in 0..self.slots.len() {
let slot = &self.slots[slot_idx];
let mut inner = slot.inner.write().unwrap();
if let Some(key) = &inner.key {
match key {
CacheKey::ImmutableFilePage { file_id, blkno: _ }
if *file_id == drop_file_id =>
{
// remove mapping for old buffer
self.remove_mapping(key);
inner.key = None;
inner.dirty = false;
}
_ => {}
}
}
}
}
//
// Section 2: Internal interface functions for lookup/update.
//
@@ -582,6 +628,10 @@ impl PageCache {
let map = self.ephemeral_page_map.read().unwrap();
Some(*map.get(&(*file_id, *blkno))?)
}
CacheKey::ImmutableFilePage { file_id, blkno } => {
let map = self.immutable_page_map.read().unwrap();
Some(*map.get(&(*file_id, *blkno))?)
}
}
}
@@ -605,6 +655,10 @@ impl PageCache {
let map = self.ephemeral_page_map.read().unwrap();
Some(*map.get(&(*file_id, *blkno))?)
}
CacheKey::ImmutableFilePage { file_id, blkno } => {
let map = self.immutable_page_map.read().unwrap();
Some(*map.get(&(*file_id, *blkno))?)
}
}
}
@@ -636,6 +690,11 @@ impl PageCache {
map.remove(&(*file_id, *blkno))
.expect("could not find old key in mapping");
}
CacheKey::ImmutableFilePage { file_id, blkno } => {
let mut map = self.immutable_page_map.write().unwrap();
map.remove(&(*file_id, *blkno))
.expect("could not find old key in mapping");
}
}
}
@@ -676,6 +735,16 @@ impl PageCache {
}
}
}
CacheKey::ImmutableFilePage { file_id, blkno } => {
let mut map = self.immutable_page_map.write().unwrap();
match map.entry((*file_id, *blkno)) {
Entry::Occupied(entry) => Some(*entry.get()),
Entry::Vacant(entry) => {
entry.insert(slot_idx);
None
}
}
}
}
}
@@ -687,16 +756,33 @@ impl PageCache {
///
/// On return, the slot is empty and write-locked.
fn find_victim(&self) -> (usize, RwLockWriteGuard<SlotInner>) {
let iter_limit = self.slots.len() * 2;
let iter_limit = self.slots.len() * 10;
let mut iters = 0;
loop {
iters += 1;
let slot_idx = self.next_evict_slot.fetch_add(1, Ordering::Relaxed) % self.slots.len();
let slot = &self.slots[slot_idx];
if slot.dec_usage_count() == 0 || iters >= iter_limit {
let mut inner = slot.inner.write().unwrap();
if slot.dec_usage_count() == 0 {
let mut inner = match slot.inner.try_write() {
Ok(inner) => inner,
Err(TryLockError::Poisoned(err)) => {
panic!("buffer lock was poisoned: {:?}", err)
}
Err(TryLockError::WouldBlock) => {
// If we have looped through the whole buffer pool 10 times
// and still haven't found a victim buffer, something's wrong.
// Maybe all the buffers were locked. That could happen in
// theory, if you have more threads holding buffers locked than
// there are buffers in the pool. In practice, with a reasonably
// large buffer pool it really shouldn't happen.
if iters > iter_limit {
panic!("could not find a victim buffer to evict");
}
continue;
}
};
if let Some(old_key) = &inner.key {
if inner.dirty {
if let Err(err) = Self::writeback(old_key, inner.buf) {
@@ -721,8 +807,6 @@ impl PageCache {
}
return (slot_idx, inner);
}
iters += 1;
}
}
@@ -731,12 +815,20 @@ impl PageCache {
CacheKey::MaterializedPage {
hash_key: _,
lsn: _,
} => {
panic!("unexpected dirty materialized page");
}
} => Err(std::io::Error::new(
std::io::ErrorKind::Other,
"unexpected dirty materialized page",
)),
CacheKey::EphemeralPage { file_id, blkno } => {
writeback_ephemeral_file(*file_id, *blkno, buf)
}
CacheKey::ImmutableFilePage {
file_id: _,
blkno: _,
} => Err(std::io::Error::new(
std::io::ErrorKind::Other,
"unexpected dirty immutable page",
)),
}
}
@@ -767,6 +859,7 @@ impl PageCache {
Self {
materialized_page_map: Default::default(),
ephemeral_page_map: Default::default(),
immutable_page_map: Default::default(),
slots,
next_evict_slot: AtomicUsize::new(0),
}

View File

@@ -33,7 +33,7 @@ use zenith_utils::zid::{ZTenantId, ZTimelineId};
use crate::basebackup;
use crate::config::PageServerConf;
use crate::pgdatadir_mapping::DatadirTimeline;
use crate::relish::*;
use crate::reltag::RelTag;
use crate::repository::Repository;
use crate::repository::Timeline;
use crate::tenant_mgr;
@@ -230,6 +230,7 @@ pub fn thread_main(
None,
None,
"serving Page Service thread",
false,
move || page_service_conn_main(conf, local_auth, socket, auth_type),
) {
// Thread creation failed. Log the error and continue.
@@ -324,8 +325,8 @@ impl PageServerHandler {
let _enter = info_span!("pagestream", timeline = %timelineid, tenant = %tenantid).entered();
// Check that the timeline exists
let timeline = tenant_mgr::get_timeline_for_tenant(tenantid, timelineid)
.context("Cannot handle pagerequests for a remote timeline")?;
let timeline = tenant_mgr::get_timeline_for_tenant_load(tenantid, timelineid)
.context("Cannot load local timeline")?;
/* switch client to COPYBOTH */
pgb.write_message(&BeMessage::CopyBothResponse)?;
@@ -513,10 +514,11 @@ impl PageServerHandler {
) -> anyhow::Result<()> {
let span = info_span!("basebackup", timeline = %timelineid, tenant = %tenantid, lsn = field::Empty);
let _enter = span.enter();
info!("starting");
// check that the timeline exists
let timeline = tenant_mgr::get_timeline_for_tenant(tenantid, timelineid)
.context("Cannot handle basebackup request for a remote timeline")?;
let timeline = tenant_mgr::get_timeline_for_tenant_load(tenantid, timelineid)
.context("Cannot load local timeline")?;
let latest_gc_cutoff_lsn = timeline.tline.get_latest_gc_cutoff_lsn();
if let Some(lsn) = lsn {
timeline
@@ -535,7 +537,7 @@ impl PageServerHandler {
basebackup.send_tarball()?;
}
pgb.write_message(&BeMessage::CopyDone)?;
debug!("CopyDone sent!");
info!("done");
Ok(())
}
@@ -569,7 +571,6 @@ impl postgres_backend::Handler for PageServerHandler {
let data = self
.auth
.as_ref()
.as_ref()
.unwrap()
.decode(str::from_utf8(jwt_response)?)?;
@@ -650,8 +651,8 @@ impl postgres_backend::Handler for PageServerHandler {
info_span!("callmemaybe", timeline = %timelineid, tenant = %tenantid).entered();
// Check that the timeline exists
tenant_mgr::get_timeline_for_tenant(tenantid, timelineid)
.context("Failed to fetch local timeline for callmemaybe requests")?;
tenant_mgr::get_timeline_for_tenant_load(tenantid, timelineid)
.context("Cannot load local timeline")?;
walreceiver::launch_wal_receiver(self.conf, tenantid, timelineid, &connstr)?;
@@ -725,8 +726,8 @@ impl postgres_backend::Handler for PageServerHandler {
let tenantid = ZTenantId::from_str(caps.get(1).unwrap().as_str())?;
let timelineid = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?;
let timeline = tenant_mgr::get_timeline_for_tenant(tenantid, timelineid)
.context("Failed to fetch local timeline for checkpoint request")?;
let timeline = tenant_mgr::get_timeline_for_tenant_load(tenantid, timelineid)
.context("Cannot load local timeline")?;
timeline.tline.checkpoint(CheckpointConfig::Forced)?;

View File

@@ -6,22 +6,21 @@
//! walingest.rs handles a few things like implicit relation creation and extension.
//! Clarify that)
//!
use crate::keyspace::{KeySpace, KeySpaceAccum, TARGET_FILE_SIZE_BYTES};
use crate::relish::*;
use crate::keyspace::{KeyPartitioning, KeySpace, KeySpaceAccum};
use crate::reltag::{RelTag, SlruKind};
use crate::repository::*;
use crate::repository::{Repository, Timeline};
use crate::walrecord::ZenithWalRecord;
use anyhow::{bail, Result};
use anyhow::{bail, ensure, Result};
use bytes::{Buf, Bytes};
use postgres_ffi::{pg_constants, Oid, TransactionId};
use serde::{Deserialize, Serialize};
use std::collections::{HashMap, HashSet};
use std::ops::Range;
use std::sync::atomic::{AtomicIsize, Ordering};
use std::sync::{Arc, RwLockReadGuard};
use std::sync::{Arc, Mutex, RwLockReadGuard};
use tracing::{debug, error, trace, warn};
use zenith_utils::bin_ser::BeSer;
use zenith_utils::lsn::AtomicLsn;
use zenith_utils::lsn::Lsn;
/// Block number within a relation or SLRU. This matches PostgreSQL's BlockNumber type.
@@ -38,7 +37,7 @@ where
pub tline: Arc<R::Timeline>,
/// When did we last calculate the partitioning?
last_partitioning: AtomicLsn,
partitioning: Mutex<(KeyPartitioning, Lsn)>,
/// Configuration: how often should the partitioning be recalculated.
repartition_threshold: u64,
@@ -51,7 +50,7 @@ impl<R: Repository> DatadirTimeline<R> {
pub fn new(tline: Arc<R::Timeline>, repartition_threshold: u64) -> Self {
DatadirTimeline {
tline,
last_partitioning: AtomicLsn::new(0),
partitioning: Mutex::new((KeyPartitioning::new(), Lsn(0))),
current_logical_size: AtomicIsize::new(0),
repartition_threshold,
}
@@ -69,23 +68,25 @@ impl<R: Repository> DatadirTimeline<R> {
Ok(())
}
/// Start updating a WAL record
/// Start ingesting a WAL record, or other atomic modification of
/// the timeline.
///
/// This provides a transaction-like interface to perform a bunch
/// of modifications atomically, with one LSN.
/// of modifications atomically, all stamped with one LSN.
///
/// To ingest a WAL record, call begin_record(lsn) to get a writer
/// object. Use the functions in the writer-object to modify the
/// repository state, updating all the pages and metadata that the
/// WAL record affects. When you're done, call writer.finish() to
/// To ingest a WAL record, call begin_modification(lsn) to get a
/// DatadirModification object. Use the functions in the object to
/// modify the repository state, updating all the pages and metadata
/// that the WAL record affects. When you're done, call commit() to
/// commit the changes.
///
/// Note that any pending modifications you make through the writer
/// won't be visible to calls to the get functions until you finish!
/// If you update the same page twice, the last update wins.
/// Note that any pending modifications you make through the
/// modification object won't be visible to calls to the 'get' and list
/// functions of the timeline until you finish! And if you update the
/// same page twice, the last update wins.
///
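///
/// A minimal sketch of the call pattern (the specific put-call is
/// illustrative, not prescribed):
///
///     let mut modification = tline.begin_modification(lsn);
///     modification.put_rel_creation(rel, 0)?;
///     modification.commit()?;
///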
pub fn begin_record(&self, lsn: Lsn) -> DatadirTimelineWriter<R> {
DatadirTimelineWriter {
pub fn begin_modification(&self, lsn: Lsn) -> DatadirModification<R> {
DatadirModification {
tline: self,
lsn,
pending_updates: HashMap::new(),
@@ -100,6 +101,8 @@ impl<R: Repository> DatadirTimeline<R> {
/// Look up given page version.
pub fn get_rel_page_at_lsn(&self, tag: RelTag, blknum: BlockNumber, lsn: Lsn) -> Result<Bytes> {
ensure!(tag.relnode != 0, "invalid relnode");
let nblocks = self.get_rel_size(tag, lsn)?;
if blknum >= nblocks {
debug!(
@@ -115,14 +118,16 @@ impl<R: Repository> DatadirTimeline<R> {
/// Get size of a relation file
pub fn get_rel_size(&self, tag: RelTag, lsn: Lsn) -> Result<BlockNumber> {
ensure!(tag.relnode != 0, "invalid relnode");
if (tag.forknum == pg_constants::FSM_FORKNUM
|| tag.forknum == pg_constants::VISIBILITYMAP_FORKNUM)
&& !self.get_rel_exists(tag, lsn)?
{
// FIXME: Postgres sometimes calls calls smgrcreate() to
// create FSM, and smgrnblocks() on it immediately
// afterwards, without extending it. Tolerate that by
// claiming that any non-existent FSM fork has size 0.
// FIXME: Postgres sometimes calls smgrcreate() to create
// FSM, and smgrnblocks() on it immediately afterwards,
// without extending it. Tolerate that by claiming that
// any non-existent FSM fork has size 0.
return Ok(0);
}
@@ -133,6 +138,8 @@ impl<R: Repository> DatadirTimeline<R> {
/// Does relation exist?
pub fn get_rel_exists(&self, tag: RelTag, lsn: Lsn) -> Result<bool> {
ensure!(tag.relnode != 0, "invalid relnode");
// fetch directory listing
let key = rel_dir_to_key(tag.spcnode, tag.dbnode);
let buf = self.tline.get(key, lsn)?;
@@ -380,14 +387,30 @@ impl<R: Repository> DatadirTimeline<R> {
Ok(result.to_keyspace())
}
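/// Recompute the key-space partitioning if no partitioning has been computed
/// yet, or if more than 'repartition_threshold' bytes of WAL have been
/// processed since the last time. Otherwise return the cached partitioning
/// and the LSN it was computed at.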
pub fn repartition(&self, lsn: Lsn, partition_size: u64) -> Result<(KeyPartitioning, Lsn)> {
let mut partitioning_guard = self.partitioning.lock().unwrap();
if partitioning_guard.1 == Lsn(0)
|| lsn.0 - partitioning_guard.1 .0 > self.repartition_threshold
{
let keyspace = self.collect_keyspace(lsn)?;
let partitioning = keyspace.partition(partition_size);
*partitioning_guard = (partitioning, lsn);
return Ok((partitioning_guard.0.clone(), lsn));
}
Ok((partitioning_guard.0.clone(), partitioning_guard.1))
}
}
/// DatadirTimelineWriter represents an operation to ingest an atomic set of
/// DatadirModification represents an operation to ingest an atomic set of
/// updates to the repository. It is created by the 'begin_modification'
/// function. It is called for each WAL record, so that all the modifications
/// by a one WAL record appear atomic
pub struct DatadirTimelineWriter<'a, R: Repository> {
tline: &'a DatadirTimeline<R>,
/// by one WAL record appear atomic.
pub struct DatadirModification<'a, R: Repository> {
/// The timeline this modification applies to. You can access this to
/// read the state, but note that any pending updates are *not* reflected
/// in the state in 'tline' yet.
pub tline: &'a DatadirTimeline<R>,
lsn: Lsn,
@@ -399,19 +422,7 @@ pub struct DatadirTimelineWriter<'a, R: Repository> {
pending_nblocks: isize,
}
// TODO Currently, Deref is used to allow easy access to read methods from this trait.
// This is probably considered a bad practice in Rust and should be fixed eventually,
// but will cause large code changes.
impl<'a, R: Repository> std::ops::Deref for DatadirTimelineWriter<'a, R> {
type Target = DatadirTimeline<R>;
fn deref(&self) -> &Self::Target {
self.tline
}
}
/// Various functions to mutate the repository state.
impl<'a, R: Repository> DatadirTimelineWriter<'a, R> {
impl<'a, R: Repository> DatadirModification<'a, R> {
/// Initialize a completely new repository.
///
/// This inserts the directory metadata entries that are assumed to
@@ -450,6 +461,7 @@ impl<'a, R: Repository> DatadirTimelineWriter<'a, R> {
blknum: BlockNumber,
rec: ZenithWalRecord,
) -> Result<()> {
ensure!(rel.relnode != 0, "invalid relnode");
self.put(rel_block_to_key(rel, blknum), Value::WalRecord(rec));
Ok(())
}
@@ -476,6 +488,7 @@ impl<'a, R: Repository> DatadirTimelineWriter<'a, R> {
blknum: BlockNumber,
img: Bytes,
) -> Result<()> {
ensure!(rel.relnode != 0, "invalid relnode");
self.put(rel_block_to_key(rel, blknum), Value::Image(img));
Ok(())
}
@@ -491,6 +504,7 @@ impl<'a, R: Repository> DatadirTimelineWriter<'a, R> {
Ok(())
}
/// Store a relmapper file (pg_filenode.map) in the repository
pub fn put_relmap_file(&mut self, spcnode: Oid, dbnode: Oid, img: Bytes) -> Result<()> {
// Add it to the directory (if it doesn't exist already)
let buf = self.get(DBDIR_KEY)?;
@@ -566,22 +580,11 @@ impl<'a, R: Repository> DatadirTimelineWriter<'a, R> {
Ok(())
}
// When a new relation is created:
// - create/update the directory entry to remember that it exists
// - create relish header to indicate the size (0)
// When a relation is extended:
// - update relish header with new size
// - insert the block
// when a relation is truncated:
// - delete truncated blocks
// - update relish header with size
/// Create a relation fork.
///
/// 'nblocks' is the initial size.
pub fn put_rel_creation(&mut self, rel: RelTag, nblocks: BlockNumber) -> Result<()> {
ensure!(rel.relnode != 0, "invalid relnode");
// It's possible that this is the first rel for this db in this
// tablespace. Create the reldir entry for it if so.
let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY)?)?;
@@ -623,6 +626,7 @@ impl<'a, R: Repository> DatadirTimelineWriter<'a, R> {
/// Truncate relation
pub fn put_rel_truncation(&mut self, rel: RelTag, nblocks: BlockNumber) -> Result<()> {
ensure!(rel.relnode != 0, "invalid relnode");
let size_key = rel_size_to_key(rel);
// Fetch the old size first
@@ -639,6 +643,8 @@ impl<'a, R: Repository> DatadirTimelineWriter<'a, R> {
/// Extend relation
pub fn put_rel_extend(&mut self, rel: RelTag, nblocks: BlockNumber) -> Result<()> {
ensure!(rel.relnode != 0, "invalid relnode");
// Put size
let size_key = rel_size_to_key(rel);
let old_size = self.get(size_key)?.get_u32_le();
@@ -652,6 +658,8 @@ impl<'a, R: Repository> DatadirTimelineWriter<'a, R> {
/// Drop a relation.
pub fn put_rel_drop(&mut self, rel: RelTag) -> Result<()> {
ensure!(rel.relnode != 0, "invalid relnode");
// Remove it from the directory entry
let dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode);
let buf = self.get(dir_key)?;
@@ -738,7 +746,7 @@ impl<'a, R: Repository> DatadirTimelineWriter<'a, R> {
Ok(())
}
/// This method is used for marking dropped relations and truncated SLRU files and aborted two phase records
/// Drop a relmapper file (pg_filenode.map)
pub fn drop_relmap_file(&mut self, _spcnode: Oid, _dbnode: Oid) -> Result<()> {
// TODO
Ok(())
@@ -764,10 +772,13 @@ impl<'a, R: Repository> DatadirTimelineWriter<'a, R> {
Ok(())
}
pub fn finish(self) -> Result<()> {
///
/// Finish this atomic update, writing all the updated keys to the
/// underlying timeline.
///
pub fn commit(self) -> Result<()> {
let writer = self.tline.tline.writer();
let last_partitioning = self.last_partitioning.load();
let pending_nblocks = self.pending_nblocks;
for (key, value) in self.pending_updates {
@@ -779,15 +790,6 @@ impl<'a, R: Repository> DatadirTimelineWriter<'a, R> {
writer.finish_write(self.lsn);
if last_partitioning == Lsn(0)
|| self.lsn.0 - last_partitioning.0 > self.tline.repartition_threshold
{
let keyspace = self.tline.collect_keyspace(self.lsn)?;
let partitioning = keyspace.partition(TARGET_FILE_SIZE_BYTES);
self.tline.tline.hint_partitioning(partitioning, self.lsn)?;
self.tline.last_partitioning.store(self.lsn);
}
if pending_nblocks != 0 {
self.tline.current_logical_size.fetch_add(
pending_nblocks * pg_constants::BLCKSZ as isize,
@@ -915,7 +917,7 @@ static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; pg_constants::BLCKSZ as usiz
// 00 SPCNODE DBNODE 00000000 00 00000000
//
// RelDir:
// 00 SPCNODE DBNODE 00000000 00 00000001
// 00 SPCNODE DBNODE 00000000 00 00000001 (Postgres never uses relfilenode 0)
//
// RelBlock:
// 00 SPCNODE DBNODE RELNODE FORK BLKNUM
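// Editorial illustration: the key helpers used throughout this file produce
// the layouts documented above (the `rel` and `blknum` bindings are illustrative):
//
//   rel_dir_to_key(rel.spcnode, rel.dbnode)  -> 00 SPCNODE DBNODE 00000000 00 00000001
//   rel_size_to_key(rel)                     -> per-relation size entry
//   rel_block_to_key(rel, blknum)            -> 00 SPCNODE DBNODE RELNODE FORK BLKNUM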
@@ -1213,11 +1215,10 @@ pub fn create_test_timeline<R: Repository>(
timeline_id: zenith_utils::zid::ZTimelineId,
) -> Result<Arc<crate::DatadirTimeline<R>>> {
let tline = repo.create_empty_timeline(timeline_id, Lsn(8))?;
let tline = DatadirTimeline::new(tline, crate::layered_repository::tests::TEST_FILE_SIZE / 10);
let mut writer = tline.begin_record(Lsn(8));
writer.init_empty()?;
writer.finish()?;
let tline = DatadirTimeline::new(tline, 256 * 1024);
let mut m = tline.begin_modification(Lsn(8));
m.init_empty()?;
m.commit()?;
Ok(Arc::new(tline))
}

View File

@@ -39,9 +39,7 @@ impl PartialOrd for RelTag {
impl Ord for RelTag {
fn cmp(&self, other: &Self) -> Ordering {
let mut cmp;
cmp = self.spcnode.cmp(&other.spcnode);
let mut cmp = self.spcnode.cmp(&other.spcnode);
if cmp != Ordering::Equal {
return cmp;
}

View File

@@ -5,7 +5,7 @@
//! There are a few components the storage machinery consists of:
//! * [`RemoteStorage`] trait a CRUD-like generic abstraction to use for adapting external storages with a few implementations:
//! * [`local_fs`] allows to use local file system as an external storage
//! * [`rust_s3`] uses AWS S3 bucket as an external storage
//! * [`s3_bucket`] uses AWS S3 bucket as an external storage
//!
//! * synchronization logic at [`storage_sync`] module that keeps pageserver state (both runtime one and the workdir files) and storage state in sync.
//! Synchronization internals are split into submodules
@@ -82,7 +82,7 @@
//! The sync queue processing also happens in batches, so the sync tasks can wait in the queue for some time.
mod local_fs;
mod rust_s3;
mod s3_bucket;
mod storage_sync;
use std::{
@@ -93,28 +93,34 @@ use std::{
use anyhow::{bail, Context};
use tokio::io;
use tracing::{error, info};
use tracing::{debug, error, info};
use zenith_utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId};
pub use self::storage_sync::index::{RemoteIndex, TimelineIndexEntry};
pub use self::storage_sync::{schedule_timeline_checkpoint_upload, schedule_timeline_download};
use self::{local_fs::LocalFs, rust_s3::S3};
use self::{local_fs::LocalFs, s3_bucket::S3Bucket};
use crate::layered_repository::ephemeral_file::is_ephemeral_file;
use crate::{
config::{PageServerConf, RemoteStorageKind},
layered_repository::metadata::{TimelineMetadata, METADATA_FILE_NAME},
repository::TimelineSyncState,
};
pub use storage_sync::compression;
#[derive(Clone, Copy, Debug)]
pub enum LocalTimelineInitStatus {
LocallyComplete,
NeedsSync,
}
type LocalTimelineInitStatuses = HashMap<ZTenantId, HashMap<ZTimelineId, LocalTimelineInitStatus>>;
/// A structure to combine all synchronization data to share with pageserver after a successful sync loop initialization.
/// Successful initialization includes the case when the sync loop is not started, in which case the startup data is still returned
/// to simplify the calling code.
pub struct SyncStartupData {
/// A sync state, derived from initial comparison of local timeline files and the remote archives,
/// before any sync tasks are executed.
/// To reuse the local file scan logic, the timeline states are returned even if no sync loop get started during init:
/// in this case, no remote files exist and all local timelines with correct metadata files are considered ready.
pub initial_timeline_states: HashMap<ZTenantId, HashMap<ZTimelineId, TimelineSyncState>>,
pub remote_index: RemoteIndex,
pub local_timeline_init_statuses: LocalTimelineInitStatuses,
}
/// Based on the config, initiates the remote storage connection and starts a separate thread
@@ -145,7 +151,7 @@ pub fn start_local_timeline_sync(
storage_sync::spawn_storage_sync_thread(
config,
local_timeline_files,
S3::new(s3_config, &config.workdir)?,
S3Bucket::new(s3_config, &config.workdir)?,
storage_config.max_concurrent_sync,
storage_config.max_sync_errors,
)
@@ -154,23 +160,18 @@ pub fn start_local_timeline_sync(
.context("Failed to spawn the storage sync thread"),
None => {
info!("No remote storage configured, skipping storage sync, considering all local timelines with correct metadata files enabled");
let mut initial_timeline_states: HashMap<
ZTenantId,
HashMap<ZTimelineId, TimelineSyncState>,
> = HashMap::new();
for (ZTenantTimelineId{tenant_id, timeline_id}, (timeline_metadata, _)) in
let mut local_timeline_init_statuses = LocalTimelineInitStatuses::new();
for (ZTenantTimelineId { tenant_id, timeline_id }, _) in
local_timeline_files
{
initial_timeline_states
local_timeline_init_statuses
.entry(tenant_id)
.or_default()
.insert(
timeline_id,
TimelineSyncState::Ready(timeline_metadata.disk_consistent_lsn()),
);
.insert(timeline_id, LocalTimelineInitStatus::LocallyComplete);
}
Ok(SyncStartupData {
initial_timeline_states,
local_timeline_init_statuses,
remote_index: RemoteIndex::empty(),
})
}
}
@@ -260,6 +261,8 @@ fn collect_timelines_for_tenant(
Ok(timelines)
}
// discover timeline files and extract timeline metadata
// NOTE: ephemeral files are excluded from the list
fn collect_timeline_files(
timeline_dir: &Path,
) -> anyhow::Result<(ZTimelineId, TimelineMetadata, Vec<PathBuf>)> {
@@ -279,6 +282,9 @@ fn collect_timeline_files(
if entry_path.is_file() {
if entry_path.file_name().and_then(ffi::OsStr::to_str) == Some(METADATA_FILE_NAME) {
timeline_metadata_path = Some(entry_path);
} else if is_ephemeral_file(&entry_path.file_name().unwrap().to_string_lossy()) {
debug!("skipping ephemeral file {}", entry_path.display());
continue;
} else {
timeline_files.push(entry_path);
}
@@ -319,27 +325,35 @@ trait RemoteStorage: Send + Sync {
&self,
from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
to: &Self::StoragePath,
metadata: Option<StorageMetadata>,
) -> anyhow::Result<()>;
/// Streams the remote storage entry contents into the given buffered writer.
/// Returns the metadata, if any was stored with the file previously.
async fn download(
&self,
from: &Self::StoragePath,
to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
) -> anyhow::Result<()>;
) -> anyhow::Result<Option<StorageMetadata>>;
/// Streams a given byte range of the remote storage entry contents into the given buffered writer.
/// Returns the metadata, if any was stored with the file previously.
async fn download_range(
&self,
from: &Self::StoragePath,
start_inclusive: u64,
end_exclusive: Option<u64>,
to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
) -> anyhow::Result<()>;
) -> anyhow::Result<Option<StorageMetadata>>;
async fn delete(&self, path: &Self::StoragePath) -> anyhow::Result<()>;
}
/// Extra set of key-value pairs that contain arbitrary metadata about the storage entry.
/// Immutable, cannot be changed once the file is created.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct StorageMetadata(HashMap<String, String>);
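To illustrate the new `metadata` parameter, here is a minimal editorial sketch of a caller generic over `RemoteStorage`. The function name and the "checksum" key are invented for the example; the trait calls match the signatures above.

// Editorial sketch: attach arbitrary metadata to an upload through the trait.
async fn upload_with_checksum<S: RemoteStorage>(
    storage: &S,
    local_path: &Path,
) -> anyhow::Result<()> {
    let remote_path = storage.storage_path(local_path)?;
    let source = tokio::fs::File::open(local_path).await?;
    // The "checksum" key is hypothetical; any string pairs can be stored.
    let metadata = StorageMetadata(
        [("checksum".to_string(), "deadbeef".to_string())]
            .into_iter()
            .collect(),
    );
    storage.upload(source, &remote_path, Some(metadata)).await?;
    Ok(())
}

A later `download` of the same path would return `Some(StorageMetadata)` with these pairs, as exercised by the `file_with_metadata` test further down.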
fn strip_path_prefix<'a>(prefix: &'a Path, path: &'a Path) -> anyhow::Result<&'a Path> {
if prefix == path {
anyhow::bail!(

View File

@@ -17,7 +17,7 @@ This way, the backups are managed in background, not affecting directly other pa
Current implementation
* provides remote storage wrappers for AWS S3 and local FS
* synchronizes the differences with local timelines and remote states as fast as possible
* uploads new relishes, frozen by pageserver checkpoint thread
* uploads new layer files
* downloads and registers timelines that are found on the remote storage but missing locally, if those are requested via pageserver (e.g. http api, gc)
* uses compression when dealing with files, for better S3 usage
* maintains an index of what's stored remotely
@@ -46,18 +46,6 @@ This could be avoided by a background thread/future storing the serialized index
No file checksum assertion is done currently, but should be (AWS S3 returns file checksums during the `list` operation)
* sad rust-s3 api
rust-s3 is not very pleasant to use:
1. it returns `anyhow::Result` and it's hard to distinguish "missing file" cases from "no connection" one, for instance
2. at least one function in its API that we need (`get_object_stream`) has the `async` keyword and blocks (!), see details [here](https://github.com/zenithdb/zenith/pull/752#discussion_r728373091)
3. it's a prerelease library with unclear maintenance status
4. noisy on debug level
But it's already used in the project, so for now it's reused to avoid bloating the dependency tree.
Based on previous evaluation, even `rusoto-s3` could be a better choice over this library, but needs further benchmarking.
* gc is ignored
So far, we don't adjust the remote storage based on GC thread loop results; only the checkpointer loop affects the remote storage.

View File

@@ -17,7 +17,7 @@ use tokio::{
};
use tracing::*;
use super::{strip_path_prefix, RemoteStorage};
use super::{strip_path_prefix, RemoteStorage, StorageMetadata};
pub struct LocalFs {
pageserver_workdir: &'static Path,
@@ -53,6 +53,32 @@ impl LocalFs {
)
}
}
async fn read_storage_metadata(
&self,
file_path: &Path,
) -> anyhow::Result<Option<StorageMetadata>> {
let metadata_path = storage_metadata_path(file_path);
if metadata_path.exists() && metadata_path.is_file() {
let metadata_string = fs::read_to_string(&metadata_path).await.with_context(|| {
format!(
"Failed to read metadata from the local storage at '{}'",
metadata_path.display()
)
})?;
serde_json::from_str(&metadata_string)
.with_context(|| {
format!(
"Failed to deserialize metadata from the local storage at '{}'",
metadata_path.display()
)
})
.map(|metadata| Some(StorageMetadata(metadata)))
} else {
Ok(None)
}
}
}
#[async_trait::async_trait]
@@ -80,14 +106,19 @@ impl RemoteStorage for LocalFs {
&self,
mut from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
to: &Self::StoragePath,
metadata: Option<StorageMetadata>,
) -> anyhow::Result<()> {
let target_file_path = self.resolve_in_storage(to)?;
create_target_directory(&target_file_path).await?;
// We need this dance with a sort-of-durable rename (without fsyncs)
// to prevent partial uploads. This was actually hit when a pageserver shutdown
// cancelled the upload and left a partial file on the filesystem.
let temp_file_path = path_with_suffix_extension(&target_file_path, ".temp");
let mut destination = io::BufWriter::new(
fs::OpenOptions::new()
.write(true)
.create(true)
.open(&target_file_path)
.open(&temp_file_path)
.await
.with_context(|| {
format!(
@@ -101,16 +132,43 @@ impl RemoteStorage for LocalFs {
.await
.with_context(|| {
format!(
"Failed to upload file to the local storage at '{}'",
"Failed to upload file (write temp) to the local storage at '{}'",
temp_file_path.display()
)
})?;
destination.flush().await.with_context(|| {
format!(
"Failed to upload (flush temp) file to the local storage at '{}'",
temp_file_path.display()
)
})?;
fs::rename(temp_file_path, &target_file_path)
.await
.with_context(|| {
format!(
"Failed to upload (rename) file to the local storage at '{}'",
target_file_path.display()
)
})?;
destination.flush().await.with_context(|| {
format!(
"Failed to upload file to the local storage at '{}'",
target_file_path.display()
if let Some(storage_metadata) = metadata {
let storage_metadata_path = storage_metadata_path(&target_file_path);
fs::write(
&storage_metadata_path,
serde_json::to_string(&storage_metadata.0)
.context("Failed to serialize storage metadata as json")?,
)
})?;
.await
.with_context(|| {
format!(
"Failed to write metadata to the local storage at '{}'",
storage_metadata_path.display()
)
})?;
}
Ok(())
}
@@ -118,7 +176,7 @@ impl RemoteStorage for LocalFs {
&self,
from: &Self::StoragePath,
to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
) -> anyhow::Result<()> {
) -> anyhow::Result<Option<StorageMetadata>> {
let file_path = self.resolve_in_storage(from)?;
if file_path.exists() && file_path.is_file() {
@@ -141,7 +199,8 @@ impl RemoteStorage for LocalFs {
)
})?;
source.flush().await?;
Ok(())
self.read_storage_metadata(&file_path).await
} else {
bail!(
"File '{}' either does not exist or is not a file",
@@ -156,7 +215,7 @@ impl RemoteStorage for LocalFs {
start_inclusive: u64,
end_exclusive: Option<u64>,
to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
) -> anyhow::Result<()> {
) -> anyhow::Result<Option<StorageMetadata>> {
if let Some(end_exclusive) = end_exclusive {
ensure!(
end_exclusive > start_inclusive,
@@ -165,7 +224,7 @@ impl RemoteStorage for LocalFs {
end_exclusive
);
if start_inclusive == end_exclusive.saturating_sub(1) {
return Ok(());
return Ok(None);
}
}
let file_path = self.resolve_in_storage(from)?;
@@ -199,7 +258,8 @@ impl RemoteStorage for LocalFs {
file_path.display()
)
})?;
Ok(())
self.read_storage_metadata(&file_path).await
} else {
bail!(
"File '{}' either does not exist or is not a file",
@@ -221,6 +281,17 @@ impl RemoteStorage for LocalFs {
}
}
fn path_with_suffix_extension(original_path: &Path, suffix: &str) -> PathBuf {
let mut extension_with_suffix = original_path.extension().unwrap_or_default().to_os_string();
extension_with_suffix.push(suffix);
original_path.with_extension(extension_with_suffix)
}
fn storage_metadata_path(original_path: &Path) -> PathBuf {
path_with_suffix_extension(original_path, ".metadata")
}
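// Editorial worked example (illustrative file name): the suffix is appended to
// the existing extension, so
//   path_with_suffix_extension(Path::new("blob.tar.gz"), ".temp") == PathBuf::from("blob.tar.gz.temp")
//   storage_metadata_path(Path::new("blob.tar.gz"))               == PathBuf::from("blob.tar.gz.metadata")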
fn get_all_files<'a, P>(
directory_path: P,
) -> Pin<Box<dyn Future<Output = anyhow::Result<Vec<PathBuf>>> + Send + Sync + 'a>>
@@ -430,7 +501,7 @@ mod fs_tests {
use super::*;
use crate::repository::repo_harness::{RepoHarness, TIMELINE_ID};
use std::io::Write;
use std::{collections::HashMap, io::Write};
use tempfile::tempdir;
#[tokio::test]
@@ -444,7 +515,7 @@ mod fs_tests {
)
.await?;
let target_path = PathBuf::from("/").join("somewhere").join("else");
match storage.upload(source, &target_path).await {
match storage.upload(source, &target_path, None).await {
Ok(()) => panic!("Should not allow storing files with wrong target path"),
Err(e) => {
let message = format!("{:?}", e);
@@ -454,14 +525,14 @@ mod fs_tests {
}
assert!(storage.list().await?.is_empty());
let target_path_1 = upload_dummy_file(&repo_harness, &storage, "upload_1").await?;
let target_path_1 = upload_dummy_file(&repo_harness, &storage, "upload_1", None).await?;
assert_eq!(
storage.list().await?,
vec![target_path_1.clone()],
"Should list a single file after first upload"
);
let target_path_2 = upload_dummy_file(&repo_harness, &storage, "upload_2").await?;
let target_path_2 = upload_dummy_file(&repo_harness, &storage, "upload_2", None).await?;
assert_eq!(
list_files_sorted(&storage).await?,
vec![target_path_1.clone(), target_path_2.clone()],
@@ -482,12 +553,16 @@ mod fs_tests {
let repo_harness = RepoHarness::create("download_file")?;
let storage = create_storage()?;
let upload_name = "upload_1";
let upload_target = upload_dummy_file(&repo_harness, &storage, upload_name).await?;
let upload_target = upload_dummy_file(&repo_harness, &storage, upload_name, None).await?;
let mut content_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
storage.download(&upload_target, &mut content_bytes).await?;
content_bytes.flush().await?;
let metadata = storage.download(&upload_target, &mut content_bytes).await?;
assert!(
metadata.is_none(),
"No metadata should be returned for no metadata upload"
);
content_bytes.flush().await?;
let contents = String::from_utf8(content_bytes.into_inner().into_inner())?;
assert_eq!(
dummy_contents(upload_name),
@@ -512,12 +587,16 @@ mod fs_tests {
let repo_harness = RepoHarness::create("download_file_range_positive")?;
let storage = create_storage()?;
let upload_name = "upload_1";
let upload_target = upload_dummy_file(&repo_harness, &storage, upload_name).await?;
let upload_target = upload_dummy_file(&repo_harness, &storage, upload_name, None).await?;
let mut full_range_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
storage
let metadata = storage
.download_range(&upload_target, 0, None, &mut full_range_bytes)
.await?;
assert!(
metadata.is_none(),
"No metadata should be returned for no metadata upload"
);
full_range_bytes.flush().await?;
assert_eq!(
dummy_contents(upload_name),
@@ -527,7 +606,7 @@ mod fs_tests {
let mut zero_range_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
let same_byte = 1_000_000_000;
storage
let metadata = storage
.download_range(
&upload_target,
same_byte,
@@ -535,6 +614,10 @@ mod fs_tests {
&mut zero_range_bytes,
)
.await?;
assert!(
metadata.is_none(),
"No metadata should be returned for no metadata upload"
);
zero_range_bytes.flush().await?;
assert!(
zero_range_bytes.into_inner().into_inner().is_empty(),
@@ -545,7 +628,7 @@ mod fs_tests {
let (first_part_local, second_part_local) = uploaded_bytes.split_at(3);
let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
storage
let metadata = storage
.download_range(
&upload_target,
0,
@@ -553,6 +636,11 @@ mod fs_tests {
&mut first_part_remote,
)
.await?;
assert!(
metadata.is_none(),
"No metadata should be returned for no metadata upload"
);
first_part_remote.flush().await?;
let first_part_remote = first_part_remote.into_inner().into_inner();
assert_eq!(
@@ -562,7 +650,7 @@ mod fs_tests {
);
let mut second_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
storage
let metadata = storage
.download_range(
&upload_target,
first_part_local.len() as u64,
@@ -570,6 +658,11 @@ mod fs_tests {
&mut second_part_remote,
)
.await?;
assert!(
metadata.is_none(),
"No metadata should be returned for no metadata upload"
);
second_part_remote.flush().await?;
let second_part_remote = second_part_remote.into_inner().into_inner();
assert_eq!(
@@ -586,7 +679,7 @@ mod fs_tests {
let repo_harness = RepoHarness::create("download_file_range_negative")?;
let storage = create_storage()?;
let upload_name = "upload_1";
let upload_target = upload_dummy_file(&repo_harness, &storage, upload_name).await?;
let upload_target = upload_dummy_file(&repo_harness, &storage, upload_name, None).await?;
let start = 10000;
let end = 234;
@@ -624,7 +717,7 @@ mod fs_tests {
let repo_harness = RepoHarness::create("delete_file")?;
let storage = create_storage()?;
let upload_name = "upload_1";
let upload_target = upload_dummy_file(&repo_harness, &storage, upload_name).await?;
let upload_target = upload_dummy_file(&repo_harness, &storage, upload_name, None).await?;
storage.delete(&upload_target).await?;
assert!(storage.list().await?.is_empty());
@@ -640,10 +733,69 @@ mod fs_tests {
Ok(())
}
#[tokio::test]
async fn file_with_metadata() -> anyhow::Result<()> {
let repo_harness = RepoHarness::create("download_file")?;
let storage = create_storage()?;
let upload_name = "upload_1";
let metadata = StorageMetadata(HashMap::from([
("one".to_string(), "1".to_string()),
("two".to_string(), "2".to_string()),
]));
let upload_target =
upload_dummy_file(&repo_harness, &storage, upload_name, Some(metadata.clone())).await?;
let mut content_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
let full_download_metadata = storage.download(&upload_target, &mut content_bytes).await?;
content_bytes.flush().await?;
let contents = String::from_utf8(content_bytes.into_inner().into_inner())?;
assert_eq!(
dummy_contents(upload_name),
contents,
"We should upload and download the same contents"
);
assert_eq!(
full_download_metadata.as_ref(),
Some(&metadata),
"We should get the same metadata back for full download"
);
let uploaded_bytes = dummy_contents(upload_name).into_bytes();
let (first_part_local, _) = uploaded_bytes.split_at(3);
let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
let partial_download_metadata = storage
.download_range(
&upload_target,
0,
Some(first_part_local.len() as u64),
&mut first_part_remote,
)
.await?;
first_part_remote.flush().await?;
let first_part_remote = first_part_remote.into_inner().into_inner();
assert_eq!(
first_part_local,
first_part_remote.as_slice(),
"First part bytes should be returned when requested"
);
assert_eq!(
partial_download_metadata.as_ref(),
Some(&metadata),
"We should get the same metadata back for partial download"
);
Ok(())
}
async fn upload_dummy_file(
harness: &RepoHarness<'_>,
storage: &LocalFs,
name: &str,
metadata: Option<StorageMetadata>,
) -> anyhow::Result<PathBuf> {
let timeline_path = harness.timeline_path(&TIMELINE_ID);
let relative_timeline_path = timeline_path.strip_prefix(&harness.conf.workdir)?;
@@ -656,6 +808,7 @@ mod fs_tests {
)
.await?,
&storage_path,
metadata,
)
.await?;
Ok(storage_path)

View File

@@ -1,4 +1,4 @@
//! AWS S3 storage wrapper around `rust_s3` library.
//! AWS S3 storage wrapper around the `rusoto` library.
//!
//! Respects `prefix_in_bucket` property from [`S3Config`],
//! allowing multiple pageservers to independently work with the same S3 bucket, if
@@ -7,15 +7,25 @@
use std::path::{Path, PathBuf};
use anyhow::Context;
use s3::{bucket::Bucket, creds::Credentials, region::Region};
use tokio::io::{self, AsyncWriteExt};
use tracing::debug;
use rusoto_core::{
credential::{InstanceMetadataProvider, StaticProvider},
HttpClient, Region,
};
use rusoto_s3::{
DeleteObjectRequest, GetObjectRequest, ListObjectsV2Request, PutObjectRequest, S3Client,
StreamingBody, S3,
};
use tokio::io;
use tokio_util::io::ReaderStream;
use tracing::{debug, trace};
use crate::{
config::S3Config,
remote_storage::{strip_path_prefix, RemoteStorage},
};
use super::StorageMetadata;
const S3_FILE_SEPARATOR: char = '/';
#[derive(Debug, Eq, PartialEq)]
@@ -50,38 +60,50 @@ impl S3ObjectKey {
}
/// AWS S3 storage.
pub struct S3 {
pub struct S3Bucket {
pageserver_workdir: &'static Path,
bucket: Bucket,
client: S3Client,
bucket_name: String,
prefix_in_bucket: Option<String>,
}
impl S3 {
/// Creates the storage, errors if incorrect AWS S3 configuration provided.
impl S3Bucket {
/// Creates the S3 storage; errors if an incorrect AWS S3 configuration is provided.
pub fn new(aws_config: &S3Config, pageserver_workdir: &'static Path) -> anyhow::Result<Self> {
// TODO kb check this
// Keeping a single client may cause issues due to timeouts.
// https://github.com/rusoto/rusoto/issues/1686
debug!(
"Creating s3 remote storage around bucket {}",
"Creating s3 remote storage for S3 bucket {}",
aws_config.bucket_name
);
let region = match aws_config.endpoint.clone() {
Some(endpoint) => Region::Custom {
endpoint,
region: aws_config.bucket_region.clone(),
Some(custom_endpoint) => Region::Custom {
name: aws_config.bucket_region.clone(),
endpoint: custom_endpoint,
},
None => aws_config
.bucket_region
.parse::<Region>()
.context("Failed to parse the s3 region from config")?,
};
let credentials = Credentials::new(
aws_config.access_key_id.as_deref(),
aws_config.secret_access_key.as_deref(),
None,
None,
None,
)
.context("Failed to create the s3 credentials")?;
let request_dispatcher = HttpClient::new().context("Failed to create S3 http client")?;
let client = if aws_config.access_key_id.is_none() && aws_config.secret_access_key.is_none()
{
trace!("Using IAM-based AWS access");
S3Client::new_with(request_dispatcher, InstanceMetadataProvider::new(), region)
} else {
trace!("Using credentials-based AWS access");
S3Client::new_with(
request_dispatcher,
StaticProvider::new_minimal(
aws_config.access_key_id.clone().unwrap_or_default(),
aws_config.secret_access_key.clone().unwrap_or_default(),
),
region,
)
};
let prefix_in_bucket = aws_config.prefix_in_bucket.as_deref().map(|prefix| {
let mut prefix = prefix;
@@ -97,20 +119,16 @@ impl S3 {
});
Ok(Self {
bucket: Bucket::new_with_path_style(
aws_config.bucket_name.as_str(),
region,
credentials,
)
.context("Failed to create the s3 bucket")?,
client,
pageserver_workdir,
bucket_name: aws_config.bucket_name.clone(),
prefix_in_bucket,
})
}
}
#[async_trait::async_trait]
impl RemoteStorage for S3 {
impl RemoteStorage for S3Bucket {
type StoragePath = S3ObjectKey;
fn storage_path(&self, local_path: &Path) -> anyhow::Result<Self::StoragePath> {
@@ -129,74 +147,74 @@ impl RemoteStorage for S3 {
}
async fn list(&self) -> anyhow::Result<Vec<Self::StoragePath>> {
let list_response = self
.bucket
.list(self.prefix_in_bucket.clone().unwrap_or_default(), None)
.await
.context("Failed to list s3 objects")?;
let mut document_keys = Vec::new();
Ok(list_response
.into_iter()
.flat_map(|response| response.contents)
.map(|s3_object| S3ObjectKey(s3_object.key))
.collect())
let mut continuation_token = None;
loop {
let fetch_response = self
.client
.list_objects_v2(ListObjectsV2Request {
bucket: self.bucket_name.clone(),
prefix: self.prefix_in_bucket.clone(),
continuation_token,
..ListObjectsV2Request::default()
})
.await?;
document_keys.extend(
fetch_response
.contents
.unwrap_or_default()
.into_iter()
.filter_map(|o| Some(S3ObjectKey(o.key?))),
);
match fetch_response.continuation_token {
Some(new_token) => continuation_token = Some(new_token),
None => break,
}
}
Ok(document_keys)
}
async fn upload(
&self,
mut from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
to: &Self::StoragePath,
metadata: Option<StorageMetadata>,
) -> anyhow::Result<()> {
let mut upload_contents = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
io::copy(&mut from, &mut upload_contents)
.await
.context("Failed to read the upload contents")?;
upload_contents
.flush()
.await
.context("Failed to read the upload contents")?;
let upload_contents = upload_contents.into_inner().into_inner();
let (_, code) = self
.bucket
.put_object(to.key(), &upload_contents)
.await
.with_context(|| format!("Failed to create s3 object with key {}", to.key()))?;
if code != 200 {
Err(anyhow::format_err!(
"Received non-200 exit code during creating object with key '{}', code: {}",
to.key(),
code
))
} else {
Ok(())
}
self.client
.put_object(PutObjectRequest {
body: Some(StreamingBody::new(ReaderStream::new(from))),
bucket: self.bucket_name.clone(),
key: to.key().to_owned(),
metadata: metadata.map(|m| m.0),
..PutObjectRequest::default()
})
.await?;
Ok(())
}
async fn download(
&self,
from: &Self::StoragePath,
to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
) -> anyhow::Result<()> {
let (data, code) = self
.bucket
.get_object(from.key())
.await
.with_context(|| format!("Failed to download s3 object with key {}", from.key()))?;
if code != 200 {
Err(anyhow::format_err!(
"Received non-200 exit code during downloading object, code: {}",
code
))
} else {
// we don't have to write vector into the destination this way, `to_write_all` would be enough.
// but we want to prepare for migration on `rusoto`, that has a streaming HTTP body instead here, with
// which it makes more sense to use `io::copy`.
io::copy(&mut data.as_slice(), to)
.await
.context("Failed to write downloaded data into the destination buffer")?;
Ok(())
) -> anyhow::Result<Option<StorageMetadata>> {
let object_output = self
.client
.get_object(GetObjectRequest {
bucket: self.bucket_name.clone(),
key: from.key().to_owned(),
..GetObjectRequest::default()
})
.await?;
if let Some(body) = object_output.body {
let mut from = io::BufReader::new(body.into_async_read());
io::copy(&mut from, to).await?;
}
Ok(object_output.metadata.map(StorageMetadata))
}
async fn download_range(
@@ -205,44 +223,41 @@ impl RemoteStorage for S3 {
start_inclusive: u64,
end_exclusive: Option<u64>,
to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
) -> anyhow::Result<()> {
) -> anyhow::Result<Option<StorageMetadata>> {
// S3 accepts ranges as https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.35
// and needs both ends to be inclusive
let end_inclusive = end_exclusive.map(|end| end.saturating_sub(1));
let (data, code) = self
.bucket
.get_object_range(from.key(), start_inclusive, end_inclusive)
.await
.with_context(|| format!("Failed to download s3 object with key {}", from.key()))?;
if code != 206 {
Err(anyhow::format_err!(
"Received non-206 exit code during downloading object range, code: {}",
code
))
} else {
// see `download` function above for the comment on why `Vec<u8>` buffer is copied this way
io::copy(&mut data.as_slice(), to)
.await
.context("Failed to write downloaded range into the destination buffer")?;
Ok(())
let range = Some(match end_inclusive {
Some(end_inclusive) => format!("bytes={}-{}", start_inclusive, end_inclusive),
None => format!("bytes={}-", start_inclusive),
});
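// Editorial worked example: start_inclusive = 0 with end_exclusive = Some(5)
// produces "bytes=0-4"; start_inclusive = 100 with end_exclusive = None
// produces "bytes=100-".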
let object_output = self
.client
.get_object(GetObjectRequest {
bucket: self.bucket_name.clone(),
key: from.key().to_owned(),
range,
..GetObjectRequest::default()
})
.await?;
if let Some(body) = object_output.body {
let mut from = io::BufReader::new(body.into_async_read());
io::copy(&mut from, to).await?;
}
Ok(object_output.metadata.map(StorageMetadata))
}
async fn delete(&self, path: &Self::StoragePath) -> anyhow::Result<()> {
let (_, code) = self
.bucket
.delete_object(path.key())
.await
.with_context(|| format!("Failed to delete s3 object with key {}", path.key()))?;
if code != 204 {
Err(anyhow::format_err!(
"Received non-204 exit code during deleting object with key '{}', code: {}",
path.key(),
code
))
} else {
Ok(())
}
self.client
.delete_object(DeleteObjectRequest {
bucket: self.bucket_name.clone(),
key: path.key().to_owned(),
..DeleteObjectRequest::default()
})
.await?;
Ok(())
}
}
@@ -314,7 +329,7 @@ mod tests {
#[test]
fn storage_path_negatives() -> anyhow::Result<()> {
#[track_caller]
fn storage_path_error(storage: &S3, mismatching_path: &Path) -> String {
fn storage_path_error(storage: &S3Bucket, mismatching_path: &Path) -> String {
match storage.storage_path(mismatching_path) {
Ok(wrong_key) => panic!(
"Expected path '{}' to error, but got S3 key: {:?}",
@@ -412,15 +427,11 @@ mod tests {
Ok(())
}
fn dummy_storage(pageserver_workdir: &'static Path) -> S3 {
S3 {
fn dummy_storage(pageserver_workdir: &'static Path) -> S3Bucket {
S3Bucket {
pageserver_workdir,
bucket: Bucket::new(
"dummy-bucket",
"us-east-1".parse().unwrap(),
Credentials::anonymous().unwrap(),
)
.unwrap(),
client: S3Client::new("us-east-1".parse().unwrap()),
bucket_name: "dummy-bucket".to_string(),
prefix_in_bucket: Some("dummy_prefix/".to_string()),
}
}

View File

@@ -25,8 +25,9 @@
//! * all newer local state gets scheduled for upload; such timelines are "local" and fully operational
//! * the rest of the remote timelines are reported to pageserver, but not downloaded before they are actually accessed in pageserver,
//! which may schedule the download on such occasions.
//! Then, the index is shared across pageserver under [`RemoteIndex`] guard to ensure proper synchronization.
//!
//! The synchronization unit is an archive: a set of timeline files (or relishes) and a special metadata file, all compressed into a blob.
//! The synchronization unit is an archive: a set of layer files and a special metadata file, all compressed into a blob.
//! Currently, there's no way to process an archive partially; if the archive processing fails, it has to be restarted from scratch next time.
//! An archive contains a set of files of a certain timeline, added during checkpoint(s), and the timeline metadata at that moment.
//! The archive contains that metadata's `disk_consistent_lsn` in its name, to be able to restore partial index information from just a remote storage file list.
@@ -58,7 +59,7 @@
//! Synchronization never removes any local files from the pageserver workdir or remote files from the remote storage, yet there could be overwrites of the same files (metadata file updates; future checksum mismatch fixes).
//! NOTE: No real contents or checksum check happens right now and is a subject to improve later.
//!
//! After the whole timeline is downloaded, [`crate::tenant_mgr::set_timeline_states`] function is used to update pageserver memory stage for the timeline processed.
//! After the whole timeline is downloaded, the [`crate::tenant_mgr::apply_timeline_sync_status_updates`] function is used to update the pageserver memory state for the processed timeline.
//!
//! When pageserver signals shutdown, the current sync task gets finished and the loop exits.
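To make the `RemoteIndex` guard mentioned above concrete, here is a minimal editorial sketch of a read-side lookup. The helper name is invented; `read()`, `timeline_entry()` and `inner()` are the accessors used later in this module.

// Editorial sketch: consult the shared remote index under its read guard.
async fn peek_index_entry(remote_index: &RemoteIndex, sync_id: ZTenantTimelineId) {
    let index_read = remote_index.read().await;
    match index_read.timeline_entry(&sync_id).map(|entry| entry.inner()) {
        Some(TimelineIndexEntryInner::Description(_)) => {
            // Only archive descriptions are known; headers are fetched lazily.
        }
        Some(TimelineIndexEntryInner::Full(_)) => {
            // Full archive contents are already cached in the index.
        }
        None => {
            // Nothing is known remotely for this tenant and timeline yet.
        }
    }
}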
@@ -80,10 +81,7 @@ use futures::stream::{FuturesUnordered, StreamExt};
use lazy_static::lazy_static;
use tokio::{
runtime::Runtime,
sync::{
mpsc::{self, UnboundedReceiver},
RwLock,
},
sync::mpsc::{self, UnboundedReceiver},
time::{Duration, Instant},
};
use tracing::*;
@@ -92,18 +90,26 @@ use self::{
compression::ArchiveHeader,
download::{download_timeline, DownloadedTimeline},
index::{
ArchiveDescription, ArchiveId, RemoteTimeline, RemoteTimelineIndex, TimelineIndexEntry,
ArchiveDescription, ArchiveId, RemoteIndex, RemoteTimeline, RemoteTimelineIndex,
TimelineIndexEntry, TimelineIndexEntryInner,
},
upload::upload_timeline_checkpoint,
};
use super::{RemoteStorage, SyncStartupData, ZTenantTimelineId};
use super::{
LocalTimelineInitStatus, LocalTimelineInitStatuses, RemoteStorage, SyncStartupData,
ZTenantTimelineId,
};
use crate::{
config::PageServerConf, layered_repository::metadata::TimelineMetadata,
remote_storage::storage_sync::compression::read_archive_header, repository::TimelineSyncState,
tenant_mgr::set_timeline_states, thread_mgr, thread_mgr::ThreadKind,
remote_storage::storage_sync::compression::read_archive_header,
repository::TimelineSyncStatusUpdate, tenant_mgr::apply_timeline_sync_status_updates,
thread_mgr, thread_mgr::ThreadKind,
};
use zenith_metrics::{register_histogram_vec, register_int_gauge, HistogramVec, IntGauge};
use zenith_metrics::{
register_histogram_vec, register_int_counter, register_int_gauge, HistogramVec, IntCounter,
IntGauge,
};
use zenith_utils::zid::{ZTenantId, ZTimelineId};
lazy_static! {
@@ -112,6 +118,11 @@ lazy_static! {
"Number of storage sync items left in the queue"
)
.expect("failed to register pageserver remote storage remaining sync items int gauge");
static ref FATAL_TASK_FAILURES: IntCounter = register_int_counter!(
"pageserver_remote_storage_fatal_task_failures",
"Number of critically failed tasks"
)
.expect("failed to register pageserver remote storage remaining sync items int gauge");
static ref IMAGE_SYNC_TIME: HistogramVec = register_histogram_vec!(
"pageserver_remote_storage_image_sync_time",
"Time took to synchronize (download or upload) a whole pageserver image. \
@@ -129,7 +140,7 @@ lazy_static! {
/// mpsc approach was picked to allow blocking the sync loop if no tasks are present, to avoid meaningless spinning.
mod sync_queue {
use std::{
collections::{BTreeSet, HashMap},
collections::HashMap,
sync::atomic::{AtomicUsize, Ordering},
};
@@ -192,9 +203,9 @@ mod sync_queue {
pub async fn next_task_batch(
receiver: &mut UnboundedReceiver<SyncTask>,
mut max_batch_size: usize,
) -> BTreeSet<SyncTask> {
) -> Vec<SyncTask> {
if max_batch_size == 0 {
return BTreeSet::new();
return Vec::new();
}
let mut tasks = HashMap::with_capacity(max_batch_size);
@@ -231,7 +242,7 @@ mod sync_queue {
/// A task to run in the async download/upload loop.
/// Limited by the number of retries, after certain threshold the failing task gets evicted and the timeline disabled.
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)]
#[derive(Debug, Clone)]
pub struct SyncTask {
sync_id: ZTenantTimelineId,
retries: u32,
@@ -248,7 +259,7 @@ impl SyncTask {
}
}
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)]
#[derive(Debug, Clone)]
enum SyncKind {
/// A certain amount of images (archive files) to download.
Download(TimelineDownload),
@@ -268,15 +279,15 @@ impl SyncKind {
/// Local timeline files for upload that appeared after the new checkpoint.
/// The current checkpoint design assumes only new files are added; no deletions or amendments happen.
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)]
#[derive(Debug, Clone)]
pub struct NewCheckpoint {
/// Relish file paths in the pageserver workdir, that were added for the corresponding checkpoint.
/// Layer file paths in the pageserver workdir that were added for the corresponding checkpoint.
layers: Vec<PathBuf>,
metadata: TimelineMetadata,
}
/// Info about the remote image files.
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)]
#[derive(Debug, Clone)]
struct TimelineDownload {
files_to_skip: Arc<BTreeSet<PathBuf>>,
archives_to_skip: BTreeSet<ArchiveId>,
@@ -310,8 +321,8 @@ pub fn schedule_timeline_checkpoint_upload(
tenant_id, timeline_id
)
} else {
warn!(
"Could not send an upload task for tenant {}, timeline {}: the sync queue is not initialized",
debug!(
"Upload task for tenant {}, timeline {} sent",
tenant_id, timeline_id
)
}
@@ -379,35 +390,42 @@ pub(super) fn spawn_storage_sync_thread<
None
}
});
let remote_index = RemoteTimelineIndex::try_parse_descriptions_from_paths(conf, download_paths);
let remote_index = RemoteIndex::try_parse_descriptions_from_paths(conf, download_paths);
let initial_timeline_states = schedule_first_sync_tasks(&remote_index, local_timeline_files);
let local_timeline_init_statuses = schedule_first_sync_tasks(
&mut runtime.block_on(remote_index.write()),
local_timeline_files,
);
let loop_index = remote_index.clone();
thread_mgr::spawn(
ThreadKind::StorageSync,
None,
None,
"Remote storage sync thread",
false,
move || {
storage_sync_loop(
runtime,
conf,
receiver,
remote_index,
loop_index,
storage,
max_concurrent_sync,
max_sync_errors,
)
);
Ok(())
},
)
.context("Failed to spawn remote storage sync thread")?;
Ok(SyncStartupData {
initial_timeline_states,
remote_index,
local_timeline_init_statuses,
})
}
enum LoopStep {
NewStates(HashMap<ZTenantId, HashMap<ZTimelineId, TimelineSyncState>>),
SyncStatusUpdates(HashMap<ZTenantId, HashMap<ZTimelineId, TimelineSyncStatusUpdate>>),
Shutdown,
}
@@ -419,41 +437,48 @@ fn storage_sync_loop<
runtime: Runtime,
conf: &'static PageServerConf,
mut receiver: UnboundedReceiver<SyncTask>,
index: RemoteTimelineIndex,
index: RemoteIndex,
storage: S,
max_concurrent_sync: NonZeroUsize,
max_sync_errors: NonZeroU32,
) -> anyhow::Result<()> {
let remote_assets = Arc::new((storage, RwLock::new(index)));
) {
let remote_assets = Arc::new((storage, index.clone()));
info!("Starting remote storage sync loop");
loop {
let index = index.clone();
let loop_step = runtime.block_on(async {
tokio::select! {
new_timeline_states = loop_step(
step = loop_step(
conf,
&mut receiver,
Arc::clone(&remote_assets),
max_concurrent_sync,
max_sync_errors,
)
.instrument(debug_span!("storage_sync_loop_step")) => LoopStep::NewStates(new_timeline_states),
.instrument(info_span!("storage_sync_loop_step")) => step,
_ = thread_mgr::shutdown_watcher() => LoopStep::Shutdown,
}
});
match loop_step {
LoopStep::NewStates(new_timeline_states) => {
// Batch timeline download registration to ensure that the external registration code won't block any running tasks before.
set_timeline_states(conf, new_timeline_states);
debug!("Sync loop step completed");
LoopStep::SyncStatusUpdates(new_timeline_states) => {
if new_timeline_states.is_empty() {
debug!("Sync loop step completed, no new timeline states");
} else {
info!(
"Sync loop step completed, {} new timeline state update(s)",
new_timeline_states.len()
);
// Batch timeline download registration to ensure that the external registration code won't block any running tasks before.
apply_timeline_sync_status_updates(conf, index, new_timeline_states);
}
}
LoopStep::Shutdown => {
debug!("Shutdown requested, stopping");
info!("Shutdown requested, stopping");
break;
}
}
}
Ok(())
}
async fn loop_step<
@@ -462,19 +487,18 @@ async fn loop_step<
>(
conf: &'static PageServerConf,
receiver: &mut UnboundedReceiver<SyncTask>,
remote_assets: Arc<(S, RwLock<RemoteTimelineIndex>)>,
remote_assets: Arc<(S, RemoteIndex)>,
max_concurrent_sync: NonZeroUsize,
max_sync_errors: NonZeroU32,
) -> HashMap<ZTenantId, HashMap<ZTimelineId, TimelineSyncState>> {
) -> LoopStep {
let max_concurrent_sync = max_concurrent_sync.get();
let mut next_tasks = BTreeSet::new();
let mut next_tasks = Vec::new();
// request the first task in blocking fashion to do less meaningless work
if let Some(first_task) = sync_queue::next_task(receiver).await {
next_tasks.insert(first_task);
next_tasks.push(first_task);
} else {
debug!("Shutdown requested, stopping");
return HashMap::new();
return LoopStep::Shutdown;
};
next_tasks.extend(
sync_queue::next_task_batch(receiver, max_concurrent_sync - 1)
@@ -483,12 +507,17 @@ async fn loop_step<
);
let remaining_queue_length = sync_queue::len();
debug!(
"Processing {} tasks in batch, more tasks left to process: {}",
next_tasks.len(),
remaining_queue_length
);
REMAINING_SYNC_ITEMS.set(remaining_queue_length as i64);
if remaining_queue_length > 0 || !next_tasks.is_empty() {
info!(
"Processing {} tasks in batch, more tasks left to process: {}",
next_tasks.len(),
remaining_queue_length
);
} else {
debug!("No tasks to process");
return LoopStep::SyncStatusUpdates(HashMap::new());
}
let mut task_batch = next_tasks
.into_iter()
@@ -498,8 +527,9 @@ async fn loop_step<
let sync_name = task.kind.sync_name();
let extra_step = match tokio::spawn(
process_task(conf, Arc::clone(&remote_assets), task, max_sync_errors)
.instrument(debug_span!("", sync_id = %sync_id, attempt, sync_name)),
process_task(conf, Arc::clone(&remote_assets), task, max_sync_errors).instrument(
info_span!("process_sync_task", sync_id = %sync_id, attempt, sync_name),
),
)
.await
{
@@ -516,8 +546,10 @@ async fn loop_step<
})
.collect::<FuturesUnordered<_>>();
let mut new_timeline_states: HashMap<ZTenantId, HashMap<ZTimelineId, TimelineSyncState>> =
HashMap::with_capacity(max_concurrent_sync);
let mut new_timeline_states: HashMap<
ZTenantId,
HashMap<ZTimelineId, TimelineSyncStatusUpdate>,
> = HashMap::with_capacity(max_concurrent_sync);
while let Some((sync_id, state_update)) = task_batch.next().await {
debug!("Finished storage sync task for sync id {}", sync_id);
if let Some(state_update) = state_update {
@@ -532,7 +564,7 @@ async fn loop_step<
}
}
new_timeline_states
LoopStep::SyncStatusUpdates(new_timeline_states)
}
async fn process_task<
@@ -540,24 +572,19 @@ async fn process_task<
S: RemoteStorage<StoragePath = P> + Send + Sync + 'static,
>(
conf: &'static PageServerConf,
remote_assets: Arc<(S, RwLock<RemoteTimelineIndex>)>,
remote_assets: Arc<(S, RemoteIndex)>,
task: SyncTask,
max_sync_errors: NonZeroU32,
) -> Option<TimelineSyncState> {
) -> Option<TimelineSyncStatusUpdate> {
if task.retries > max_sync_errors.get() {
error!(
"Evicting task {:?} that failed {} times, exceeding the error threshold",
task.kind, task.retries
);
return Some(TimelineSyncState::Evicted(
remote_assets
.as_ref()
.1
.read()
.await
.timeline_entry(&task.sync_id)
.and_then(TimelineIndexEntry::disk_consistent_lsn),
));
FATAL_TASK_FAILURES.inc();
// FIXME (rodionov) this can potentially leave holes in timeline uploads
// planned to be fixed as part of https://github.com/zenithdb/zenith/issues/977
return None;
}
if task.retries > 0 {
@@ -569,13 +596,15 @@ async fn process_task<
tokio::time::sleep(Duration::from_secs_f64(seconds_to_wait)).await;
}
let remote_index = &remote_assets.1;
let sync_start = Instant::now();
let sync_name = task.kind.sync_name();
match task.kind {
SyncKind::Download(download_data) => {
let download_result = download_timeline(
conf,
remote_assets,
remote_assets.clone(),
task.sync_id,
download_data,
task.retries + 1,
@@ -585,19 +614,25 @@ async fn process_task<
match download_result {
DownloadedTimeline::Abort => {
register_sync_status(sync_start, sync_name, None);
remote_index
.write()
.await
.set_awaits_download(&task.sync_id, false)
.expect("timeline should be present in remote index");
None
}
DownloadedTimeline::FailedAndRescheduled {
disk_consistent_lsn,
} => {
DownloadedTimeline::FailedAndRescheduled => {
register_sync_status(sync_start, sync_name, Some(false));
Some(TimelineSyncState::AwaitsDownload(disk_consistent_lsn))
None
}
DownloadedTimeline::Successful {
disk_consistent_lsn,
} => {
DownloadedTimeline::Successful => {
register_sync_status(sync_start, sync_name, Some(true));
Some(TimelineSyncState::Ready(disk_consistent_lsn))
remote_index
.write()
.await
.set_awaits_download(&task.sync_id, false)
.expect("timeline should be present in remote index");
Some(TimelineSyncStatusUpdate::Downloaded)
}
}
}
@@ -617,45 +652,45 @@ async fn process_task<
}
fn schedule_first_sync_tasks(
index: &RemoteTimelineIndex,
index: &mut RemoteTimelineIndex,
local_timeline_files: HashMap<ZTenantTimelineId, (TimelineMetadata, Vec<PathBuf>)>,
) -> HashMap<ZTenantId, HashMap<ZTimelineId, TimelineSyncState>> {
let mut initial_timeline_statuses: HashMap<ZTenantId, HashMap<ZTimelineId, TimelineSyncState>> =
HashMap::new();
) -> LocalTimelineInitStatuses {
let mut local_timeline_init_statuses = LocalTimelineInitStatuses::new();
let mut new_sync_tasks =
VecDeque::with_capacity(local_timeline_files.len().max(local_timeline_files.len()));
for (sync_id, (local_metadata, local_files)) in local_timeline_files {
let local_disk_consistent_lsn = local_metadata.disk_consistent_lsn();
let ZTenantTimelineId {
tenant_id,
timeline_id,
} = sync_id;
match index.timeline_entry(&sync_id) {
match index.timeline_entry_mut(&sync_id) {
Some(index_entry) => {
let timeline_status = compare_local_and_remote_timeline(
let (timeline_status, awaits_download) = compare_local_and_remote_timeline(
&mut new_sync_tasks,
sync_id,
local_metadata,
local_files,
index_entry,
);
match timeline_status {
Some(timeline_status) => {
initial_timeline_statuses
.entry(tenant_id)
.or_default()
.insert(timeline_id, timeline_status);
}
None => error!(
"Failed to compare local and remote timeline for task {}",
sync_id
),
let was_there = local_timeline_init_statuses
.entry(tenant_id)
.or_default()
.insert(timeline_id, timeline_status);
if was_there.is_some() {
// defensive check
warn!(
"Overwriting timeline init sync status. Status {:?} Timeline {}",
timeline_status, timeline_id
);
}
index_entry.set_awaits_download(awaits_download);
}
None => {
// TODO (rodionov) does this mean that we've crashed during tenant creation?
// is it safe to upload this checkpoint? could it be half broken?
new_sync_tasks.push_back(SyncTask::new(
sync_id,
0,
@@ -664,56 +699,18 @@ fn schedule_first_sync_tasks(
metadata: local_metadata,
}),
));
initial_timeline_statuses
local_timeline_init_statuses
.entry(tenant_id)
.or_default()
.insert(
timeline_id,
TimelineSyncState::Ready(local_disk_consistent_lsn),
);
.insert(timeline_id, LocalTimelineInitStatus::LocallyComplete);
}
}
}
let unprocessed_remote_ids = |remote_id: &ZTenantTimelineId| {
initial_timeline_statuses
.get(&remote_id.tenant_id)
.and_then(|timelines| timelines.get(&remote_id.timeline_id))
.is_none()
};
for unprocessed_remote_id in index
.all_sync_ids()
.filter(unprocessed_remote_ids)
.collect::<Vec<_>>()
{
let ZTenantTimelineId {
tenant_id: cloud_only_tenant_id,
timeline_id: cloud_only_timeline_id,
} = unprocessed_remote_id;
match index
.timeline_entry(&unprocessed_remote_id)
.and_then(TimelineIndexEntry::disk_consistent_lsn)
{
Some(remote_disk_consistent_lsn) => {
initial_timeline_statuses
.entry(cloud_only_tenant_id)
.or_default()
.insert(
cloud_only_timeline_id,
TimelineSyncState::CloudOnly(remote_disk_consistent_lsn),
);
}
None => error!(
"Failed to find disk consistent LSN for remote timeline {}",
unprocessed_remote_id
),
}
}
new_sync_tasks.into_iter().for_each(|task| {
sync_queue::push(task);
});
initial_timeline_statuses
local_timeline_init_statuses
}
fn compare_local_and_remote_timeline(
@@ -722,10 +719,21 @@ fn compare_local_and_remote_timeline(
local_metadata: TimelineMetadata,
local_files: Vec<PathBuf>,
remote_entry: &TimelineIndexEntry,
) -> Option<TimelineSyncState> {
) -> (LocalTimelineInitStatus, bool) {
let local_lsn = local_metadata.disk_consistent_lsn();
let uploads = remote_entry.uploaded_checkpoints();
let mut initial_timeline_status = LocalTimelineInitStatus::LocallyComplete;
let mut awaits_download = false;
// TODO: we probably need more sophisticated logic here.
// If more data is available remotely, can we just download what's there
// without trying to upload anything? It may be tricky and needs further investigation.
// For now it looks strange that we can request an upload
// and a download for the same timeline simultaneously
// (the upload should cover only previously unsynced files, not the whole timeline dir).
// If one of the tasks fails, they will be reordered in the queue, which can leave
// the timeline stuck in the evicted state.
if !uploads.contains(&local_lsn) {
new_sync_tasks.push_back(SyncTask::new(
sync_id,
@@ -735,6 +743,7 @@ fn compare_local_and_remote_timeline(
metadata: local_metadata,
}),
));
// Note that the status does not change here.
}
let uploads_count = uploads.len();
@@ -743,7 +752,7 @@ fn compare_local_and_remote_timeline(
.filter(|upload_lsn| upload_lsn <= &local_lsn)
.map(ArchiveId)
.collect();
Some(if archives_to_skip.len() != uploads_count {
if archives_to_skip.len() != uploads_count {
new_sync_tasks.push_back(SyncTask::new(
sync_id,
0,
@@ -752,10 +761,12 @@ fn compare_local_and_remote_timeline(
archives_to_skip,
}),
));
TimelineSyncState::AwaitsDownload(remote_entry.disk_consistent_lsn()?)
} else {
TimelineSyncState::Ready(remote_entry.disk_consistent_lsn().unwrap_or(local_lsn))
})
initial_timeline_status = LocalTimelineInitStatus::NeedsSync;
awaits_download = true;
// We do not need to manipulate the remote consistent lsn here
// because it will be updated when the sync completes.
}
(initial_timeline_status, awaits_download)
}
fn register_sync_status(sync_start: Instant, sync_name: &str, sync_status: Option<bool>) {
@@ -769,21 +780,23 @@ fn register_sync_status(sync_start: Instant, sync_name: &str, sync_status: Optio
.observe(secs_elapsed)
}
async fn update_index_description<
async fn fetch_full_index<
P: Send + Sync + 'static,
S: RemoteStorage<StoragePath = P> + Send + Sync + 'static,
>(
(storage, index): &(S, RwLock<RemoteTimelineIndex>),
(storage, index): &(S, RemoteIndex),
timeline_dir: &Path,
id: ZTenantTimelineId,
) -> anyhow::Result<RemoteTimeline> {
let mut index_write = index.write().await;
let full_index = match index_write.timeline_entry(&id) {
let index_read = index.read().await;
let full_index = match index_read.timeline_entry(&id).map(|e| e.inner()) {
None => bail!("Timeline not found for sync id {}", id),
Some(TimelineIndexEntry::Full(_)) => bail!("Index is already populated for sync id {}", id),
Some(TimelineIndexEntry::Description(description)) => {
Some(TimelineIndexEntryInner::Full(_)) => {
bail!("Index is already populated for sync id {}", id)
}
Some(TimelineIndexEntryInner::Description(description)) => {
let mut archive_header_downloads = FuturesUnordered::new();
for (&archive_id, description) in description {
for (archive_id, description) in description {
archive_header_downloads.push(async move {
let header = download_archive_header(storage, timeline_dir, description)
.await
@@ -795,18 +808,23 @@ async fn update_index_description<
let mut full_index = RemoteTimeline::empty();
while let Some(header_data) = archive_header_downloads.next().await {
match header_data {
Ok((archive_id, header_size, header)) => full_index.update_archive_contents(archive_id.0, header, header_size),
Err((e, archive_id)) => bail!(
"Failed to download archive header for tenant {}, timeline {}, archive for Lsn {}: {}",
id.tenant_id, id.timeline_id, archive_id.0,
e
),
}
Ok((archive_id, header_size, header)) => full_index.update_archive_contents(archive_id.0, header, header_size),
Err((e, archive_id)) => bail!(
"Failed to download archive header for tenant {}, timeline {}, archive for Lsn {}: {}",
id.tenant_id, id.timeline_id, archive_id.0,
e
),
}
}
full_index
}
};
index_write.add_timeline_entry(id, TimelineIndexEntry::Full(full_index.clone()));
drop(index_read); // tokio rw lock is not upgradeable
index
.write()
.await
.upgrade_timeline_entry(&id, full_index.clone())
.context("cannot upgrade timeline entry in remote index")?;
Ok(full_index)
}
@@ -850,7 +868,7 @@ mod test_utils {
#[track_caller]
pub async fn ensure_correct_timeline_upload(
harness: &RepoHarness<'_>,
remote_assets: Arc<(LocalFs, RwLock<RemoteTimelineIndex>)>,
remote_assets: Arc<(LocalFs, RemoteIndex)>,
timeline_id: ZTimelineId,
new_upload: NewCheckpoint,
) {
@@ -867,7 +885,7 @@ mod test_utils {
let (storage, index) = remote_assets.as_ref();
assert_index_descriptions(
index,
RemoteTimelineIndex::try_parse_descriptions_from_paths(
&RemoteIndex::try_parse_descriptions_from_paths(
harness.conf,
remote_assets
.0
@@ -909,11 +927,14 @@ mod test_utils {
}
pub async fn expect_timeline(
index: &RwLock<RemoteTimelineIndex>,
index: &RemoteIndex,
sync_id: ZTenantTimelineId,
) -> RemoteTimeline {
if let Some(TimelineIndexEntry::Full(remote_timeline)) =
index.read().await.timeline_entry(&sync_id)
if let Some(TimelineIndexEntryInner::Full(remote_timeline)) = index
.read()
.await
.timeline_entry(&sync_id)
.map(|e| e.inner())
{
remote_timeline.clone()
} else {
@@ -926,9 +947,11 @@ mod test_utils {
#[track_caller]
pub async fn assert_index_descriptions(
index: &RwLock<RemoteTimelineIndex>,
expected_index_with_descriptions: RemoteTimelineIndex,
index: &RemoteIndex,
expected_index_with_descriptions: &RemoteIndex,
) {
let expected_index_with_descriptions = expected_index_with_descriptions.read().await;
let index_read = index.read().await;
let actual_sync_ids = index_read.all_sync_ids().collect::<BTreeSet<_>>();
let expected_sync_ids = expected_index_with_descriptions
@@ -965,26 +988,26 @@ mod test_utils {
sync_id
)
});
let expected_timeline_description = match expected_timeline_description {
TimelineIndexEntry::Description(description) => description,
TimelineIndexEntry::Full(_) => panic!("Expected index entry for sync id {} is a full entry, while a description was expected", sync_id),
let expected_timeline_description = match expected_timeline_description.inner() {
TimelineIndexEntryInner::Description(description) => description,
TimelineIndexEntryInner::Full(_) => panic!("Expected index entry for sync id {} is a full entry, while a description was expected", sync_id),
};
match actual_timeline_entry {
TimelineIndexEntry::Description(actual_descriptions) => {
match actual_timeline_entry.inner() {
TimelineIndexEntryInner::Description(description) => {
assert_eq!(
actual_descriptions, expected_timeline_description,
description, expected_timeline_description,
"Index contains unexpected descriptions entry for sync id {}",
sync_id
)
}
TimelineIndexEntry::Full(actual_full_entry) => {
TimelineIndexEntryInner::Full(remote_timeline) => {
let expected_lsns = expected_timeline_description
.values()
.map(|description| description.disk_consistent_lsn)
.collect::<BTreeSet<_>>();
assert_eq!(
actual_full_entry.checkpoints().collect::<BTreeSet<_>>(),
remote_timeline.checkpoints().collect::<BTreeSet<_>>(),
expected_lsns,
"Timeline {} should have the same checkpoints uploaded",
sync_id,

View File

@@ -10,7 +10,7 @@
//! Archiving is almost agnostic to timeline file types, with the exception of the metadata file, which is currently distinguished in the [un]compression code.
//! The metadata file is treated separately when [de]compression is involved, to reduce the risk of corrupting the metadata file.
//! When compressed, the metadata file is always required and stored as the last file in the archive stream.
//! When uncompressed, the metadata file gets naturally uncompressed last, to ensure that all other relishes are decompressed successfully first.
//! When uncompressed, the metadata file gets naturally uncompressed last, to ensure that all other layer files are decompressed successfully first.
//!
//! Archive structure:
//! +----------------------------------------+
@@ -201,8 +201,7 @@ pub async fn read_archive_header<A: io::AsyncRead + Send + Sync + Unpin>(
.await
.context("Failed to decompress a header from the archive")?;
Ok(ArchiveHeader::des(&header_bytes)
.context("Failed to deserialize a header from the archive")?)
ArchiveHeader::des(&header_bytes).context("Failed to deserialize a header from the archive")
}
/// Reads the archive metadata out of the archive name:

View File

@@ -3,16 +3,16 @@
use std::{borrow::Cow, collections::BTreeSet, path::PathBuf, sync::Arc};
use anyhow::{ensure, Context};
use tokio::{fs, sync::RwLock};
use tokio::fs;
use tracing::{debug, error, trace, warn};
use zenith_utils::{lsn::Lsn, zid::ZTenantId};
use zenith_utils::zid::ZTenantId;
use crate::{
config::PageServerConf,
layered_repository::metadata::{metadata_path, TimelineMetadata},
remote_storage::{
storage_sync::{
compression, index::TimelineIndexEntry, sync_queue, update_index_description, SyncKind,
compression, fetch_full_index, index::TimelineIndexEntryInner, sync_queue, SyncKind,
SyncTask,
},
RemoteStorage, ZTenantTimelineId,
@@ -20,8 +20,8 @@ use crate::{
};
use super::{
index::{ArchiveId, RemoteTimeline, RemoteTimelineIndex},
TimelineDownload,
index::{ArchiveId, RemoteTimeline},
RemoteIndex, TimelineDownload,
};
/// Timeline download result, with extra data, needed for downloading.
@@ -30,10 +30,10 @@ pub(super) enum DownloadedTimeline {
Abort,
/// Remote timeline data is found, its latest checkpoint's metadata contents (disk_consistent_lsn) is known.
/// Initial download failed due to some error, the download task is rescheduled for another retry.
FailedAndRescheduled { disk_consistent_lsn: Lsn },
FailedAndRescheduled,
/// Remote timeline data is found, its latest checkpoint's metadata contents (disk_consistent_lsn) is known.
/// Initial download successful.
Successful { disk_consistent_lsn: Lsn },
Successful,
}
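// A minimal sketch (not part of this diff) of handling the slimmed-down result: the
// disk_consistent_lsn payloads were dropped, so callers now read the Lsn back from the
// remote index entry when they need it.
fn handle_download_result_sketch(result: DownloadedTimeline) {
    match result {
        DownloadedTimeline::Abort => { /* nothing usable on the remote storage */ }
        DownloadedTimeline::FailedAndRescheduled => { /* a retry SyncTask was queued */ }
        DownloadedTimeline::Successful => { /* archives are unpacked on local disk */ }
    }
}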
/// Attempts to download and uncompress files from all remote archives for the timeline given.
@@ -47,7 +47,7 @@ pub(super) async fn download_timeline<
S: RemoteStorage<StoragePath = P> + Send + Sync + 'static,
>(
conf: &'static PageServerConf,
remote_assets: Arc<(S, RwLock<RemoteTimelineIndex>)>,
remote_assets: Arc<(S, RemoteIndex)>,
sync_id: ZTenantTimelineId,
mut download: TimelineDownload,
retries: u32,
@@ -58,19 +58,26 @@ pub(super) async fn download_timeline<
tenant_id,
timeline_id,
} = sync_id;
let index_read = remote_assets.1.read().await;
let index = &remote_assets.1;
let index_read = index.read().await;
let remote_timeline = match index_read.timeline_entry(&sync_id) {
None => {
error!("Cannot download: no timeline is present in the index for given ids");
error!("Cannot download: no timeline is present in the index for given id");
return DownloadedTimeline::Abort;
}
Some(index_entry) => match index_entry {
TimelineIndexEntry::Full(remote_timeline) => Cow::Borrowed(remote_timeline),
TimelineIndexEntry::Description(_) => {
Some(index_entry) => match index_entry.inner() {
TimelineIndexEntryInner::Full(remote_timeline) => Cow::Borrowed(remote_timeline),
TimelineIndexEntryInner::Description(_) => {
// We do not check awaits_download here because it is ok to call this function
// while the download is in progress: it is not a concurrent download, it is the same one.
let remote_disk_consistent_lsn = index_entry.disk_consistent_lsn();
drop(index_read);
debug!("Found timeline description for the given ids, downloading the full index");
match update_index_description(
match fetch_full_index(
remote_assets.as_ref(),
&conf.timeline_path(&timeline_id, &tenant_id),
sync_id,
@@ -80,16 +87,15 @@ pub(super) async fn download_timeline<
Ok(remote_timeline) => Cow::Owned(remote_timeline),
Err(e) => {
error!("Failed to download full timeline index: {:?}", e);
return match remote_disk_consistent_lsn {
Some(disk_consistent_lsn) => {
Some(_) => {
sync_queue::push(SyncTask::new(
sync_id,
retries,
SyncKind::Download(download),
));
DownloadedTimeline::FailedAndRescheduled {
disk_consistent_lsn,
}
DownloadedTimeline::FailedAndRescheduled
}
None => {
error!("Cannot download: no disk consistent Lsn is present for the index entry");
@@ -101,12 +107,9 @@ pub(super) async fn download_timeline<
}
},
};
let disk_consistent_lsn = match remote_timeline.checkpoints().max() {
Some(lsn) => lsn,
None => {
debug!("Cannot download: no disk consistent Lsn is present for the remote timeline");
return DownloadedTimeline::Abort;
}
if remote_timeline.checkpoints().max().is_none() {
debug!("Cannot download: no disk consistent Lsn is present for the remote timeline");
return DownloadedTimeline::Abort;
};
debug!("Downloading timeline archives");
@@ -125,7 +128,7 @@ pub(super) async fn download_timeline<
conf,
sync_id,
Arc::clone(&remote_assets),
remote_timeline.as_ref(),
&remote_timeline,
archive_id,
Arc::clone(&download.files_to_skip),
)
@@ -142,9 +145,7 @@ pub(super) async fn download_timeline<
retries,
SyncKind::Download(download),
));
return DownloadedTimeline::FailedAndRescheduled {
disk_consistent_lsn,
};
return DownloadedTimeline::FailedAndRescheduled;
}
Ok(()) => {
debug!("Successfully downloaded archive {:?}", archive_id);
@@ -154,9 +155,7 @@ pub(super) async fn download_timeline<
}
debug!("Finished downloading all timeline's archives");
DownloadedTimeline::Successful {
disk_consistent_lsn,
}
DownloadedTimeline::Successful
}
async fn try_download_archive<
@@ -168,7 +167,7 @@ async fn try_download_archive<
tenant_id,
timeline_id,
}: ZTenantTimelineId,
remote_assets: Arc<(S, RwLock<RemoteTimelineIndex>)>,
remote_assets: Arc<(S, RemoteIndex)>,
remote_timeline: &RemoteTimeline,
archive_id: ArchiveId,
files_to_skip: Arc<BTreeSet<PathBuf>>,
@@ -226,8 +225,8 @@ async fn read_local_metadata(
let local_metadata_bytes = fs::read(&local_metadata_path)
.await
.context("Failed to read local metadata file bytes")?;
Ok(TimelineMetadata::from_bytes(&local_metadata_bytes)
.context("Failed to read local metadata files bytes")?)
TimelineMetadata::from_bytes(&local_metadata_bytes)
.context("Failed to read local metadata files bytes")
}
#[cfg(test)]
@@ -256,14 +255,14 @@ mod tests {
let repo_harness = RepoHarness::create("test_download_timeline")?;
let sync_id = ZTenantTimelineId::new(repo_harness.tenant_id, TIMELINE_ID);
let storage = LocalFs::new(tempdir()?.path().to_owned(), &repo_harness.conf.workdir)?;
let index = RwLock::new(RemoteTimelineIndex::try_parse_descriptions_from_paths(
let index = RemoteIndex::try_parse_descriptions_from_paths(
repo_harness.conf,
storage
.list()
.await?
.into_iter()
.map(|storage_path| storage.local_path(&storage_path).unwrap()),
));
);
let remote_assets = Arc::new((storage, index));
let storage = &remote_assets.0;
let index = &remote_assets.1;
@@ -313,7 +312,7 @@ mod tests {
.await;
assert_index_descriptions(
index,
RemoteTimelineIndex::try_parse_descriptions_from_paths(
&RemoteIndex::try_parse_descriptions_from_paths(
repo_harness.conf,
remote_assets
.0

View File

@@ -7,11 +7,13 @@
use std::{
collections::{BTreeMap, BTreeSet, HashMap},
path::{Path, PathBuf},
sync::Arc,
};
use anyhow::{bail, ensure, Context};
use serde::{Deserialize, Serialize};
use tracing::debug;
use tokio::sync::RwLock;
use tracing::*;
use zenith_utils::{
lsn::Lsn,
zid::{ZTenantId, ZTimelineId},
@@ -52,10 +54,19 @@ impl RelativePath {
/// Currently, only timeline archive files are tracked.
#[derive(Debug, Clone)]
pub struct RemoteTimelineIndex {
timeline_files: HashMap<ZTenantTimelineId, TimelineIndexEntry>,
timeline_entries: HashMap<ZTenantTimelineId, TimelineIndexEntry>,
}
impl RemoteTimelineIndex {
/// A wrapper to synchronize access to the index; it should be created and used before dealing with any [`RemoteTimelineIndex`].
pub struct RemoteIndex(Arc<RwLock<RemoteTimelineIndex>>);
impl RemoteIndex {
pub fn empty() -> Self {
Self(Arc::new(RwLock::new(RemoteTimelineIndex {
timeline_entries: HashMap::new(),
})))
}
/// Attempts to parse file paths (not checking the file contents) and find files
/// that can be tracked with the index.
/// On parse failures, logs the error and continues, so an empty index can be created from unsuitable paths.
@@ -63,8 +74,8 @@ impl RemoteTimelineIndex {
conf: &'static PageServerConf,
paths: impl Iterator<Item = P>,
) -> Self {
let mut index = Self {
timeline_files: HashMap::new(),
let mut index = RemoteTimelineIndex {
timeline_entries: HashMap::new(),
};
for path in paths {
if let Err(e) = try_parse_index_entry(&mut index, conf, path.as_ref()) {
@@ -75,44 +86,121 @@ impl RemoteTimelineIndex {
);
}
}
index
Self(Arc::new(RwLock::new(index)))
}
pub async fn read(&self) -> tokio::sync::RwLockReadGuard<'_, RemoteTimelineIndex> {
self.0.read().await
}
pub async fn write(&self) -> tokio::sync::RwLockWriteGuard<'_, RemoteTimelineIndex> {
self.0.write().await
}
}
impl Clone for RemoteIndex {
fn clone(&self) -> Self {
Self(Arc::clone(&self.0))
}
}
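// A minimal sketch (not part of this diff): RemoteIndex is an Arc-backed handle, so
// cloning it only bumps a reference count and each task can hold its own copy while
// sharing the same underlying RemoteTimelineIndex.
async fn share_index_sketch(index: RemoteIndex) {
    let index_for_task = index.clone();
    tokio::spawn(async move {
        let index_read = index_for_task.read().await;
        // e.g. list which timelines the index currently tracks
        let _sync_ids: Vec<_> = index_read.all_sync_ids().collect();
    });
}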
impl RemoteTimelineIndex {
pub fn timeline_entry(&self, id: &ZTenantTimelineId) -> Option<&TimelineIndexEntry> {
self.timeline_files.get(id)
self.timeline_entries.get(id)
}
pub fn timeline_entry_mut(
&mut self,
id: &ZTenantTimelineId,
) -> Option<&mut TimelineIndexEntry> {
self.timeline_files.get_mut(id)
self.timeline_entries.get_mut(id)
}
pub fn add_timeline_entry(&mut self, id: ZTenantTimelineId, entry: TimelineIndexEntry) {
self.timeline_files.insert(id, entry);
self.timeline_entries.insert(id, entry);
}
pub fn upgrade_timeline_entry(
&mut self,
id: &ZTenantTimelineId,
remote_timeline: RemoteTimeline,
) -> anyhow::Result<()> {
let entry = self.timeline_entries.get_mut(id).ok_or_else(|| {
    anyhow::anyhow!("timeline is unexpectedly missing from remote index")
})?;
if !matches!(entry.inner, TimelineIndexEntryInner::Description(_)) {
anyhow::bail!("timeline entry is not a description entry")
};
entry.inner = TimelineIndexEntryInner::Full(remote_timeline);
Ok(())
}
pub fn all_sync_ids(&self) -> impl Iterator<Item = ZTenantTimelineId> + '_ {
self.timeline_files.keys().copied()
self.timeline_entries.keys().copied()
}
pub fn set_awaits_download(
&mut self,
id: &ZTenantTimelineId,
awaits_download: bool,
) -> anyhow::Result<()> {
self.timeline_entry_mut(id)
.ok_or_else(|| anyhow::anyhow!("unknown timeline sync {}", id))?
.set_awaits_download(awaits_download);
Ok(())
}
}
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct DescriptionTimelineIndexEntry {
pub description: BTreeMap<ArchiveId, ArchiveDescription>,
pub awaits_download: bool,
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum TimelineIndexEntry {
/// An archive found on the remote storage, but not yet downloaded: only metadata from its storage path is available, without archive contents.
pub struct FullTimelineIndexEntry {
pub remote_timeline: RemoteTimeline,
pub awaits_download: bool,
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum TimelineIndexEntryInner {
Description(BTreeMap<ArchiveId, ArchiveDescription>),
/// Full archive metadata, including the file list, parsed from the archive header.
Full(RemoteTimeline),
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TimelineIndexEntry {
inner: TimelineIndexEntryInner,
awaits_download: bool,
}
impl TimelineIndexEntry {
pub fn new(inner: TimelineIndexEntryInner, awaits_download: bool) -> Self {
Self {
inner,
awaits_download,
}
}
pub fn inner(&self) -> &TimelineIndexEntryInner {
&self.inner
}
pub fn inner_mut(&mut self) -> &mut TimelineIndexEntryInner {
&mut self.inner
}
pub fn uploaded_checkpoints(&self) -> BTreeSet<Lsn> {
match self {
Self::Description(description) => {
match &self.inner {
TimelineIndexEntryInner::Description(description) => {
description.keys().map(|archive_id| archive_id.0).collect()
}
Self::Full(remote_timeline) => remote_timeline
TimelineIndexEntryInner::Full(remote_timeline) => remote_timeline
.checkpoint_archives
.keys()
.map(|archive_id| archive_id.0)
@@ -122,17 +210,25 @@ impl TimelineIndexEntry {
/// Gets the latest uploaded checkpoint's disk consistent Lsn for the corresponding timeline.
pub fn disk_consistent_lsn(&self) -> Option<Lsn> {
match self {
Self::Description(description) => {
match &self.inner {
TimelineIndexEntryInner::Description(description) => {
description.keys().map(|archive_id| archive_id.0).max()
}
Self::Full(remote_timeline) => remote_timeline
TimelineIndexEntryInner::Full(remote_timeline) => remote_timeline
.checkpoint_archives
.keys()
.map(|archive_id| archive_id.0)
.max(),
}
}
pub fn get_awaits_download(&self) -> bool {
self.awaits_download
}
pub fn set_awaits_download(&mut self, awaits_download: bool) {
self.awaits_download = awaits_download;
}
}
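// A minimal sketch (not part of this diff): the entry now pairs the archive data
// (inner) with the awaits_download flag, instead of encoding everything in one enum.
fn entry_flag_sketch() {
    let mut entry = TimelineIndexEntry::new(
        TimelineIndexEntryInner::Description(BTreeMap::new()),
        false, // awaits_download
    );
    entry.set_awaits_download(true);
    assert!(entry.get_awaits_download());
    if let TimelineIndexEntryInner::Description(descriptions) = entry.inner() {
        assert!(descriptions.is_empty());
    }
}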
/// Checkpoint archive's id, corresponding to the `disk_consistent_lsn` from the timeline's metadata file during checkpointing.
@@ -181,7 +277,7 @@ impl RemoteTimeline {
.map(CheckpointArchive::disk_consistent_lsn)
}
/// Lists all relish files in the given remote timeline. Omits the metadata file.
/// Lists all layer files in the given remote timeline. Omits the metadata file.
pub fn stored_files(&self, timeline_dir: &Path) -> BTreeSet<PathBuf> {
self.timeline_files
.values()
@@ -331,13 +427,15 @@ fn try_parse_index_entry(
tenant_id,
timeline_id,
};
let timeline_index_entry = index
.timeline_files
.entry(sync_id)
.or_insert_with(|| TimelineIndexEntry::Description(BTreeMap::new()));
match timeline_index_entry {
TimelineIndexEntry::Description(descriptions) => {
descriptions.insert(
let timeline_index_entry = index.timeline_entries.entry(sync_id).or_insert_with(|| {
TimelineIndexEntry::new(
TimelineIndexEntryInner::Description(BTreeMap::default()),
false,
)
});
match timeline_index_entry.inner_mut() {
TimelineIndexEntryInner::Description(description) => {
description.insert(
ArchiveId(disk_consistent_lsn),
ArchiveDescription {
header_size,
@@ -346,7 +444,7 @@ fn try_parse_index_entry(
},
);
}
TimelineIndexEntry::Full(_) => {
TimelineIndexEntryInner::Full(_) => {
bail!("Cannot add parsed archive description to its full context in index with sync id {}", sync_id)
}
}

View File

@@ -2,23 +2,21 @@
use std::{borrow::Cow, collections::BTreeSet, path::PathBuf, sync::Arc};
use anyhow::ensure;
use tokio::sync::RwLock;
use tracing::{debug, error, warn};
use crate::{
config::PageServerConf,
remote_storage::{
storage_sync::{
compression,
index::{RemoteTimeline, TimelineIndexEntry},
sync_queue, update_index_description, SyncKind, SyncTask,
compression, fetch_full_index,
index::{RemoteTimeline, TimelineIndexEntry, TimelineIndexEntryInner},
sync_queue, SyncKind, SyncTask,
},
RemoteStorage, ZTenantTimelineId,
},
};
use super::{compression::ArchiveHeader, index::RemoteTimelineIndex, NewCheckpoint};
use super::{compression::ArchiveHeader, NewCheckpoint, RemoteIndex};
/// Attempts to compress and upload given checkpoint files.
/// No extra checks for overlapping files is made: download takes care of that, ensuring no non-metadata local timeline files are overwritten.
@@ -30,7 +28,7 @@ pub(super) async fn upload_timeline_checkpoint<
S: RemoteStorage<StoragePath = P> + Send + Sync + 'static,
>(
config: &'static PageServerConf,
remote_assets: Arc<(S, RwLock<RemoteTimelineIndex>)>,
remote_assets: Arc<(S, RemoteIndex)>,
sync_id: ZTenantTimelineId,
new_checkpoint: NewCheckpoint,
retries: u32,
@@ -49,22 +47,24 @@ pub(super) async fn upload_timeline_checkpoint<
let index_read = index.read().await;
let remote_timeline = match index_read.timeline_entry(&sync_id) {
None => None,
Some(TimelineIndexEntry::Full(remote_timeline)) => Some(Cow::Borrowed(remote_timeline)),
Some(TimelineIndexEntry::Description(_)) => {
debug!("Found timeline description for the given ids, downloading the full index");
match update_index_description(remote_assets.as_ref(), &timeline_dir, sync_id).await {
Ok(remote_timeline) => Some(Cow::Owned(remote_timeline)),
Err(e) => {
error!("Failed to download full timeline index: {:?}", e);
sync_queue::push(SyncTask::new(
sync_id,
retries,
SyncKind::Upload(new_checkpoint),
));
return Some(false);
Some(entry) => match entry.inner() {
TimelineIndexEntryInner::Full(remote_timeline) => Some(Cow::Borrowed(remote_timeline)),
TimelineIndexEntryInner::Description(_) => {
debug!("Found timeline description for the given ids, downloading the full index");
match fetch_full_index(remote_assets.as_ref(), &timeline_dir, sync_id).await {
Ok(remote_timeline) => Some(Cow::Owned(remote_timeline)),
Err(e) => {
error!("Failed to download full timeline index: {:?}", e);
sync_queue::push(SyncTask::new(
sync_id,
retries,
SyncKind::Upload(new_checkpoint),
));
return Some(false);
}
}
}
}
},
};
let already_contains_upload_lsn = remote_timeline
@@ -93,30 +93,48 @@ pub(super) async fn upload_timeline_checkpoint<
)
.await
{
Ok((archive_header, header_size)) => {
Some(Ok((archive_header, header_size))) => {
let mut index_write = index.write().await;
match index_write.timeline_entry_mut(&sync_id) {
Some(TimelineIndexEntry::Full(remote_timeline)) => {
remote_timeline.update_archive_contents(
new_checkpoint.metadata.disk_consistent_lsn(),
archive_header,
header_size,
);
}
None | Some(TimelineIndexEntry::Description(_)) => {
match index_write
.timeline_entry_mut(&sync_id)
.map(|e| e.inner_mut())
{
None => {
let mut new_timeline = RemoteTimeline::empty();
new_timeline.update_archive_contents(
new_checkpoint.metadata.disk_consistent_lsn(),
archive_header,
header_size,
);
index_write.add_timeline_entry(sync_id, TimelineIndexEntry::Full(new_timeline));
index_write.add_timeline_entry(
sync_id,
TimelineIndexEntry::new(TimelineIndexEntryInner::Full(new_timeline), false),
)
}
Some(TimelineIndexEntryInner::Full(remote_timeline)) => {
remote_timeline.update_archive_contents(
new_checkpoint.metadata.disk_consistent_lsn(),
archive_header,
header_size,
);
}
Some(TimelineIndexEntryInner::Description(_)) => {
let mut new_timeline = RemoteTimeline::empty();
new_timeline.update_archive_contents(
new_checkpoint.metadata.disk_consistent_lsn(),
archive_header,
header_size,
);
index_write.add_timeline_entry(
sync_id,
TimelineIndexEntry::new(TimelineIndexEntryInner::Full(new_timeline), false),
)
}
}
debug!("Checkpoint uploaded successfully");
Some(true)
}
Err(e) => {
Some(Err(e)) => {
error!(
"Failed to upload checkpoint: {:?}, requeueing the upload",
e
@@ -128,6 +146,7 @@ pub(super) async fn upload_timeline_checkpoint<
));
Some(false)
}
None => Some(true),
}
}
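// A minimal sketch (not part of this diff) of the new tri-state result from
// try_upload_checkpoint: None means there was nothing new to upload (treated as a
// success), Some(Ok(..)) carries the archive header, Some(Err(..)) requeues the upload.
fn interpret_upload_outcome_sketch(
    outcome: Option<anyhow::Result<(ArchiveHeader, u64)>>,
) -> bool {
    match outcome {
        None => true, // nothing to do, still counts as a successful sync
        Some(Ok((_header, _header_size))) => true,
        Some(Err(_e)) => false, // the caller reschedules the SyncTask
    }
}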
@@ -136,11 +155,11 @@ async fn try_upload_checkpoint<
S: RemoteStorage<StoragePath = P> + Send + Sync + 'static,
>(
config: &'static PageServerConf,
remote_assets: Arc<(S, RwLock<RemoteTimelineIndex>)>,
remote_assets: Arc<(S, RemoteIndex)>,
sync_id: ZTenantTimelineId,
new_checkpoint: &NewCheckpoint,
files_to_skip: BTreeSet<PathBuf>,
) -> anyhow::Result<(ArchiveHeader, u64)> {
) -> Option<anyhow::Result<(ArchiveHeader, u64)>> {
let ZTenantTimelineId {
tenant_id,
timeline_id,
@@ -152,7 +171,7 @@ async fn try_upload_checkpoint<
.iter()
.filter(|&path_to_upload| {
if files_to_skip.contains(path_to_upload) {
error!(
warn!(
"Skipping file upload '{}', since it was already uploaded",
path_to_upload.display()
);
@@ -162,9 +181,16 @@ async fn try_upload_checkpoint<
}
})
.collect::<Vec<_>>();
ensure!(!files_to_upload.is_empty(), "No files to upload");
compression::archive_files_as_stream(
if files_to_upload.is_empty() {
warn!(
"No files to upload. Upload request was: {:?}, already uploaded files: {:?}",
new_checkpoint.layers, files_to_skip
);
return None;
}
let upload_result = compression::archive_files_as_stream(
&timeline_dir,
files_to_upload.into_iter(),
&new_checkpoint.metadata,
@@ -175,12 +201,15 @@ async fn try_upload_checkpoint<
.upload(
archive_streamer,
&remote_storage.storage_path(&timeline_dir.join(&archive_name))?,
None,
)
.await
},
)
.await
.map(|(header, header_size, _)| (header, header_size))
.map(|(header, header_size, _)| (header, header_size));
Some(upload_result)
}
#[cfg(test)]
@@ -209,14 +238,14 @@ mod tests {
let repo_harness = RepoHarness::create("reupload_timeline")?;
let sync_id = ZTenantTimelineId::new(repo_harness.tenant_id, TIMELINE_ID);
let storage = LocalFs::new(tempdir()?.path().to_owned(), &repo_harness.conf.workdir)?;
let index = RwLock::new(RemoteTimelineIndex::try_parse_descriptions_from_paths(
let index = RemoteIndex::try_parse_descriptions_from_paths(
repo_harness.conf,
storage
.list()
.await?
.into_iter()
.map(|storage_path| storage.local_path(&storage_path).unwrap()),
));
);
let remote_assets = Arc::new((storage, index));
let index = &remote_assets.1;
@@ -405,14 +434,14 @@ mod tests {
let repo_harness = RepoHarness::create("reupload_timeline_rejected")?;
let sync_id = ZTenantTimelineId::new(repo_harness.tenant_id, TIMELINE_ID);
let storage = LocalFs::new(tempdir()?.path().to_owned(), &repo_harness.conf.workdir)?;
let index = RwLock::new(RemoteTimelineIndex::try_parse_descriptions_from_paths(
let index = RemoteIndex::try_parse_descriptions_from_paths(
repo_harness.conf,
storage
.list()
.await?
.into_iter()
.map(|storage_path| storage.local_path(&storage_path).unwrap()),
));
);
let remote_assets = Arc::new((storage, index));
let storage = &remote_assets.0;
let index = &remote_assets.1;
@@ -431,7 +460,7 @@ mod tests {
first_checkpoint,
)
.await;
let after_first_uploads = RemoteTimelineIndex::try_parse_descriptions_from_paths(
let after_first_uploads = RemoteIndex::try_parse_descriptions_from_paths(
repo_harness.conf,
remote_assets
.0
@@ -462,7 +491,7 @@ mod tests {
0,
)
.await;
assert_index_descriptions(index, after_first_uploads.clone()).await;
assert_index_descriptions(index, &after_first_uploads).await;
let checkpoint_with_uploaded_lsn = create_local_timeline(
&repo_harness,
@@ -478,7 +507,7 @@ mod tests {
0,
)
.await;
assert_index_descriptions(index, after_first_uploads.clone()).await;
assert_index_descriptions(index, &after_first_uploads).await;
Ok(())
}

View File

@@ -1,10 +1,13 @@
use crate::keyspace::KeyPartitioning;
use crate::layered_repository::metadata::TimelineMetadata;
use crate::remote_storage::RemoteIndex;
use crate::walrecord::ZenithWalRecord;
use crate::CheckpointConfig;
use anyhow::{bail, Result};
use byteorder::{ByteOrder, BE};
use bytes::Bytes;
use serde::{Deserialize, Serialize};
use std::fmt;
use std::fmt::Display;
use std::ops::{AddAssign, Range};
use std::sync::{Arc, RwLockReadGuard};
use std::time::Duration;
@@ -25,6 +28,8 @@ pub struct Key {
pub field6: u32,
}
pub const KEY_SIZE: usize = 18;
impl Key {
pub fn next(&self) -> Key {
self.add(1)
@@ -59,7 +64,7 @@ impl Key {
key
}
pub fn from_array(b: [u8; 18]) -> Self {
pub fn from_slice(b: &[u8]) -> Self {
Key {
field1: b[0],
field2: u32::from_be_bytes(b[1..5].try_into().unwrap()),
@@ -69,6 +74,15 @@ impl Key {
field6: u32::from_be_bytes(b[14..18].try_into().unwrap()),
}
}
pub fn write_to_byte_slice(&self, buf: &mut [u8]) {
buf[0] = self.field1;
BE::write_u32(&mut buf[1..5], self.field2);
BE::write_u32(&mut buf[5..9], self.field3);
BE::write_u32(&mut buf[9..13], self.field4);
buf[13] = self.field5;
BE::write_u32(&mut buf[14..18], self.field6);
}
}
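// A minimal sketch (not part of this diff): from_slice and write_to_byte_slice are
// intended to round-trip through a KEY_SIZE (18-byte) buffer.
fn key_roundtrip_sketch(key: &Key) {
    let mut buf = [0u8; KEY_SIZE];
    key.write_to_byte_slice(&mut buf);
    let reparsed = Key::from_slice(&buf);
    let mut buf2 = [0u8; KEY_SIZE];
    reparsed.write_to_byte_slice(&mut buf2);
    assert_eq!(buf, buf2, "the two encodings should be identical");
}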
pub fn key_range_size(key_range: &Range<Key>) -> u32 {
@@ -141,12 +155,15 @@ impl Key {
}
}
//
// There are two kinds of values: incremental and non-incremental
//
/// A 'value' stored for one Key.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum Value {
/// An Image value contains a full copy of the value
Image(Bytes),
/// A WalRecord value contains a WAL record that needs to be
/// replayed to get the full value. Replaying the WAL record
/// might need a previous version of the value (if will_init()
/// returns false), or it may be replayed stand-alone (if it returns true).
WalRecord(ZenithWalRecord),
}
@@ -163,32 +180,45 @@ impl Value {
}
}
#[derive(Clone, Copy, Debug)]
pub enum TimelineSyncStatusUpdate {
Uploaded,
Downloaded,
}
impl Display for TimelineSyncStatusUpdate {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
let s = match self {
TimelineSyncStatusUpdate::Uploaded => "Uploaded",
TimelineSyncStatusUpdate::Downloaded => "Downloaded",
};
f.write_str(s)
}
}
///
/// A repository corresponds to one .zenith directory. One repository holds multiple
/// timelines, forked off from the same initial call to 'initdb'.
pub trait Repository: Send + Sync {
type Timeline: Timeline;
fn detach_timeline(&self, timeline_id: ZTimelineId) -> Result<()>;
/// Updates timeline based on the new sync state, received from the remote storage synchronization.
/// Updates timeline based on the `TimelineSyncStatusUpdate`, received from the remote storage synchronization.
/// See [`crate::remote_storage`] for more details about the synchronization.
fn set_timeline_state(
fn apply_timeline_remote_sync_status_update(
&self,
timeline_id: ZTimelineId,
new_state: TimelineSyncState,
timeline_sync_status_update: TimelineSyncStatusUpdate,
) -> Result<()>;
/// Gets current synchronization state of the timeline.
/// See [`crate::remote_storage`] for more details about the synchronization.
fn get_timeline_state(&self, timeline_id: ZTimelineId) -> Option<TimelineSyncState>;
/// Get Timeline handle for given zenith timeline ID.
fn get_timeline(&self, timelineid: ZTimelineId) -> Result<RepositoryTimeline<Self::Timeline>>;
/// This function is idempotent. It doesn't change internal state in any way.
fn get_timeline(&self, timelineid: ZTimelineId) -> Option<RepositoryTimeline<Self::Timeline>>;
/// Get Timeline handle for locally available timeline. Load it into memory if it is not loaded.
fn get_timeline_load(&self, timelineid: ZTimelineId) -> Result<Arc<Self::Timeline>>;
/// Lists timelines the repository contains.
/// It is up to the repository's implementation to omit certain timelines that are not considered ready for use.
fn list_timelines(&self) -> Result<Vec<RepositoryTimeline<Self::Timeline>>>;
fn list_timelines(&self) -> Vec<(ZTimelineId, RepositoryTimeline<Self::Timeline>)>;
/// Create a new, empty timeline. The caller is responsible for loading data into it.
/// The initdb LSN is provided so the timeline impl can perform checks for some operations against it.
@@ -225,69 +255,44 @@ pub trait Repository: Send + Sync {
/// perform one compaction iteration.
/// this function is periodically called by compactor thread.
fn compaction_iteration(&self) -> Result<()>;
/// detaches locally available timeline by stopping all threads and removing all the data.
fn detach_timeline(&self, timeline_id: ZTimelineId) -> Result<()>;
// Allows retrieving the remote timeline index from the repo. Used in walreceiver to grab the remote consistent lsn.
fn get_remote_index(&self) -> &RemoteIndex;
}
/// A timeline, that belongs to the current repository.
pub enum RepositoryTimeline<T> {
/// Timeline, with its files present locally in pageserver's working directory.
/// Loaded into pageserver's memory and ready to be used.
Local { id: ZTimelineId, timeline: Arc<T> },
/// Timeline, found on the pageserver's remote storage, but not yet downloaded locally.
Remote {
id: ZTimelineId,
/// metadata contents of the latest successfully uploaded checkpoint
disk_consistent_lsn: Lsn,
Loaded(Arc<T>),
/// All the data is available locally, but not loaded into memory, so loading has to be done before actually using the timeline.
Unloaded {
// It is ok to keep metadata here, because it is not changed when the timeline is unloaded.
// FIXME: can s3 sync actually change it? It can change it when the timeline is in the awaiting-download state,
// but we currently do not download anything for a timeline once it is local (even if there are new checkpoints). Is that correct?
// Also, it is not ideal to keep TimelineMetadata here, because it is a layered repo implementation detail.
metadata: TimelineMetadata,
},
}
impl<T> RepositoryTimeline<T> {
pub fn local_timeline(&self) -> Option<Arc<T>> {
if let Self::Local { timeline, .. } = self {
Some(Arc::clone(timeline))
} else {
None
}
}
pub fn id(&self) -> ZTimelineId {
match self {
Self::Local { id, .. } => *id,
Self::Remote { id, .. } => *id,
}
}
}
/// A state of the timeline synchronization with the remote storage.
/// Contains `disk_consistent_lsn` of the corresponding remote timeline (latest checkpoint's disk_consistent_lsn).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum TimelineSyncState {
/// No further downloads from the remote storage are needed.
/// The timeline state is up-to-date or ahead of the remote storage one,
/// ready to be used in any pageserver operation.
Ready(Lsn),
/// Timeline is scheduled for downloading, but its current local state is not up to date with the remote storage.
/// The timeline is not ready to be used in any pageserver operations, otherwise it might diverge its local state from the remote version,
/// making it impossible to sync it further.
AwaitsDownload(Lsn),
/// Timeline was not in the pageserver's local working directory, but was found on the remote storage, ready to be downloaded.
/// Cannot be used in any pageserver operations due to complete absence locally.
CloudOnly(Lsn),
/// Timeline was evicted from the pageserver's local working directory due to conflicting remote and local states or too many errors during the synchronization.
/// Such timelines cannot have their state synchronized further and may not have the data about remote timeline's disk_consistent_lsn, since eviction may happen
/// due to errors before the remote timeline contents is known.
Evicted(Option<Lsn>),
pub enum LocalTimelineState {
// timeline is loaded into memory (with layer map and all the bits).
Loaded,
// timeline is on disk locally and ready to be loaded into memory.
Unloaded,
}
impl TimelineSyncState {
pub fn remote_disk_consistent_lsn(&self) -> Option<Lsn> {
Some(match self {
TimelineSyncState::Evicted(None) => return None,
TimelineSyncState::Ready(lsn) => lsn,
TimelineSyncState::AwaitsDownload(lsn) => lsn,
TimelineSyncState::CloudOnly(lsn) => lsn,
TimelineSyncState::Evicted(Some(lsn)) => lsn,
})
.copied()
impl<'a, T> From<&'a RepositoryTimeline<T>> for LocalTimelineState {
fn from(local_timeline_entry: &'a RepositoryTimeline<T>) -> Self {
match local_timeline_entry {
RepositoryTimeline::Loaded(_) => LocalTimelineState::Loaded,
RepositoryTimeline::Unloaded { .. } => LocalTimelineState::Unloaded,
}
}
}
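// A minimal sketch (not part of this diff) of consuming the new two-state handle:
// a Loaded timeline hands out the in-memory object, an Unloaded one only exposes its
// on-disk metadata.
fn describe_timeline_sketch<T>(timeline: &RepositoryTimeline<T>) -> String {
    match timeline {
        RepositoryTimeline::Loaded(_) => "loaded into memory".to_string(),
        RepositoryTimeline::Unloaded { metadata } => format!(
            "unloaded, disk_consistent_lsn = {}",
            metadata.disk_consistent_lsn()
        ),
    }
}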
@@ -378,19 +383,6 @@ pub trait Timeline: Send + Sync {
/// know anything about them here in the repository.
fn checkpoint(&self, cconf: CheckpointConfig) -> Result<()>;
///
/// Tell the implementation how the keyspace should be partitioned.
///
/// FIXME: This is quite a hack. The code in pgdatadir_mapping.rs knows
/// which keys exist and what is the logical grouping of them. That's why
/// the code there (and in keyspace.rs) decides the partitioning, not the
/// layered_repository.rs implementation. That's a layering violation:
/// the Repository implementation ought to be responsible for the physical
/// layout, but currently it's more convenient to do it in pgdatadir_mapping.rs
/// rather than in layered_repository.rs.
///
fn hint_partitioning(&self, partitioning: KeyPartitioning, lsn: Lsn) -> Result<()>;
///
/// Check that it is valid to request operations with that lsn.
fn check_lsn_is_in_scope(
@@ -433,7 +425,7 @@ pub mod repo_harness {
use crate::RepositoryImpl;
use crate::{
config::PageServerConf,
layered_repository::{LayeredRepository, TIMELINES_SEGMENT_NAME},
layered_repository::LayeredRepository,
walredo::{WalRedoError, WalRedoManager},
};
@@ -487,7 +479,6 @@ pub mod repo_harness {
let repo_dir = PageServerConf::test_repo_dir(test_name);
let _ = fs::remove_dir_all(&repo_dir);
fs::create_dir_all(&repo_dir)?;
fs::create_dir_all(&repo_dir.join(TIMELINES_SEGMENT_NAME))?;
let conf = PageServerConf::dummy_conf(repo_dir);
// Make a static copy of the config. This can never be free'd, but that's
@@ -496,6 +487,7 @@ pub mod repo_harness {
let tenant_id = ZTenantId::generate();
fs::create_dir_all(conf.tenant_path(&tenant_id))?;
fs::create_dir_all(conf.timelines_path(&tenant_id))?;
Ok(Self {
conf,
@@ -505,9 +497,39 @@ pub mod repo_harness {
}
pub fn load(&self) -> RepositoryImpl {
self.try_load().expect("failed to load test repo")
}
pub fn try_load(&self) -> Result<RepositoryImpl> {
let walredo_mgr = Arc::new(TestRedoManager);
LayeredRepository::new(self.conf, walredo_mgr, self.tenant_id, false)
let repo = LayeredRepository::new(
self.conf,
walredo_mgr,
self.tenant_id,
RemoteIndex::empty(),
false,
);
// populate repo with locally available timelines
for timeline_dir_entry in fs::read_dir(self.conf.timelines_path(&self.tenant_id))
.expect("should be able to read timelines dir")
{
let timeline_dir_entry = timeline_dir_entry.unwrap();
let timeline_id: ZTimelineId = timeline_dir_entry
.path()
.file_name()
.unwrap()
.to_string_lossy()
.parse()
.unwrap();
repo.apply_timeline_remote_sync_status_update(
timeline_id,
TimelineSyncStatusUpdate::Downloaded,
)?;
}
Ok(repo)
}
pub fn timeline_path(&self, timeline_id: &ZTimelineId) -> PathBuf {
@@ -559,7 +581,7 @@ mod tests {
use lazy_static::lazy_static;
lazy_static! {
static ref TEST_KEY: Key = Key::from_array(hex!("112222222233333333444444445500000001"));
static ref TEST_KEY: Key = Key::from_slice(&hex!("112222222233333333444444445500000001"));
}
#[test]
@@ -620,10 +642,9 @@ mod tests {
// Branch the history, modify relation differently on the new timeline
repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x30))?;
let newtline = match repo.get_timeline(NEW_TIMELINE_ID)?.local_timeline() {
Some(timeline) => timeline,
None => panic!("Should have a local timeline"),
};
let newtline = repo
.get_timeline_load(NEW_TIMELINE_ID)
.expect("Should have a local timeline");
let new_writer = newtline.writer();
new_writer.put(TEST_KEY_A, Lsn(0x40), test_value("bar at 0x40"))?;
new_writer.finish_write(Lsn(0x40));
@@ -769,11 +790,9 @@ mod tests {
make_some_layers(tline.as_ref(), Lsn(0x20))?;
repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x40))?;
let newtline = match repo.get_timeline(NEW_TIMELINE_ID)?.local_timeline() {
Some(timeline) => timeline,
None => panic!("Should have a local timeline"),
};
let newtline = repo
.get_timeline_load(NEW_TIMELINE_ID)
.expect("Should have a local timeline");
// this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50
repo.gc_iteration(Some(TIMELINE_ID), 0x10, false)?;
assert!(newtline.get(*TEST_KEY, Lsn(0x25)).is_ok());
@@ -787,10 +806,9 @@ mod tests {
make_some_layers(tline.as_ref(), Lsn(0x20))?;
repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x40))?;
let newtline = match repo.get_timeline(NEW_TIMELINE_ID)?.local_timeline() {
Some(timeline) => timeline,
None => panic!("Should have a local timeline"),
};
let newtline = repo
.get_timeline_load(NEW_TIMELINE_ID)
.expect("Should have a local timeline");
make_some_layers(newtline.as_ref(), Lsn(0x60))?;
@@ -805,4 +823,81 @@ mod tests {
Ok(())
}
#[test]
fn timeline_load() -> Result<()> {
const TEST_NAME: &str = "timeline_load";
let harness = RepoHarness::create(TEST_NAME)?;
{
let repo = harness.load();
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0x8000))?;
make_some_layers(tline.as_ref(), Lsn(0x8000))?;
tline.checkpoint(CheckpointConfig::Forced)?;
}
let repo = harness.load();
let tline = repo
.get_timeline(TIMELINE_ID)
.expect("cannot load timeline");
assert!(matches!(tline, RepositoryTimeline::Unloaded { .. }));
assert!(repo.get_timeline_load(TIMELINE_ID).is_ok());
let tline = repo
.get_timeline(TIMELINE_ID)
.expect("cannot load timeline");
assert!(matches!(tline, RepositoryTimeline::Loaded(_)));
Ok(())
}
#[test]
fn timeline_load_with_ancestor() -> Result<()> {
const TEST_NAME: &str = "timeline_load_with_ancestor";
let harness = RepoHarness::create(TEST_NAME)?;
// create two timelines
{
let repo = harness.load();
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
make_some_layers(tline.as_ref(), Lsn(0x20))?;
tline.checkpoint(CheckpointConfig::Forced)?;
repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x40))?;
let newtline = repo
.get_timeline_load(NEW_TIMELINE_ID)
.expect("Should have a local timeline");
make_some_layers(newtline.as_ref(), Lsn(0x60))?;
tline.checkpoint(CheckpointConfig::Forced)?;
}
// check that both of them are initially unloaded
let repo = harness.load();
{
let tline = repo.get_timeline(TIMELINE_ID).expect("cannot get timeline");
assert!(matches!(tline, RepositoryTimeline::Unloaded { .. }));
let tline = repo
.get_timeline(NEW_TIMELINE_ID)
.expect("cannot get timeline");
assert!(matches!(tline, RepositoryTimeline::Unloaded { .. }));
}
// load only child timeline
let _ = repo
.get_timeline_load(NEW_TIMELINE_ID)
.expect("cannot load timeline");
// check that both, child and ancestor are loaded
let tline = repo
.get_timeline(NEW_TIMELINE_ID)
.expect("cannot get timeline");
assert!(matches!(tline, RepositoryTimeline::Loaded(_)));
let tline = repo.get_timeline(TIMELINE_ID).expect("cannot get timeline");
assert!(matches!(tline, RepositoryTimeline::Loaded(_)));
Ok(())
}
}

View File

@@ -3,17 +3,20 @@
use crate::config::PageServerConf;
use crate::layered_repository::LayeredRepository;
use crate::repository::Repository;
use crate::repository::TimelineSyncState;
use crate::remote_storage::RemoteIndex;
use crate::repository::{Repository, TimelineSyncStatusUpdate};
use crate::thread_mgr;
use crate::thread_mgr::ThreadKind;
use crate::timelines;
use crate::timelines::CreateRepo;
use crate::walredo::PostgresRedoManager;
use crate::{DatadirTimelineImpl, RepositoryImpl};
use anyhow::{Context, Result};
use lazy_static::lazy_static;
use log::*;
use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr};
use std::collections::hash_map::Entry;
use std::collections::HashMap;
use std::fmt;
use std::sync::{Arc, Mutex, MutexGuard};
@@ -60,81 +63,68 @@ fn access_tenants() -> MutexGuard<'static, HashMap<ZTenantId, Tenant>> {
TENANTS.lock().unwrap()
}
/// Updates tenants' repositories, changing their timelines state in memory.
pub fn set_timeline_states(
// Sets up the WAL redo manager and repository for a tenant. Reduces code duplication.
// Used during pageserver startup, or when a new tenant is attached to the pageserver.
pub fn load_local_repo(
conf: &'static PageServerConf,
timeline_states: HashMap<ZTenantId, HashMap<ZTimelineId, TimelineSyncState>>,
) {
if timeline_states.is_empty() {
debug!("no timeline state updates to perform");
return;
}
info!("Updating states for {} timelines", timeline_states.len());
trace!("States: {:?}", timeline_states);
tenant_id: ZTenantId,
remote_index: &RemoteIndex,
) -> Arc<RepositoryImpl> {
let mut m = access_tenants();
for (tenant_id, timeline_states) in timeline_states {
let tenant = m.entry(tenant_id).or_insert_with(|| {
// TODO (rodionov) reuse one of the initialisation routines
// Set up a WAL redo manager, for applying WAL records.
let walredo_mgr = PostgresRedoManager::new(conf, tenant_id);
let tenant = m.entry(tenant_id).or_insert_with(|| {
// Set up a WAL redo manager, for applying WAL records.
let walredo_mgr = PostgresRedoManager::new(conf, tenant_id);
// Set up an object repository, for actual data storage.
let repo = LayeredRepository::new(
conf,
Arc::new(walredo_mgr),
tenant_id,
conf.remote_storage_config.is_some(),
);
Tenant {
state: TenantState::Idle,
repo: Arc::new(repo),
timelines: HashMap::new(),
}
});
if let Err(e) = put_timelines_into_tenant(tenant, tenant_id, timeline_states) {
error!(
"Failed to update timeline states for tenant {}: {:?}",
tenant_id, e
);
// Set up an object repository, for actual data storage.
let repo: Arc<LayeredRepository> = Arc::new(LayeredRepository::new(
conf,
Arc::new(walredo_mgr),
tenant_id,
remote_index.clone(),
conf.remote_storage_config.is_some(),
));
Tenant {
state: TenantState::Idle,
repo,
timelines: HashMap::new(),
}
}
});
Arc::clone(&tenant.repo)
}
fn put_timelines_into_tenant(
tenant: &mut Tenant,
tenant_id: ZTenantId,
timeline_states: HashMap<ZTimelineId, TimelineSyncState>,
) -> anyhow::Result<()> {
for (timeline_id, timeline_state) in timeline_states {
// If the timeline is being put into any other state than Ready,
// stop any threads operating on it.
//
// FIXME: This is racy. A page service thread could just get
// handle on the Timeline, before we call set_timeline_state()
if !matches!(timeline_state, TimelineSyncState::Ready(_)) {
thread_mgr::shutdown_threads(None, Some(tenant_id), Some(timeline_id));
// Should we run a final checkpoint to flush all the data to
// disk? Doesn't seem necessary; all of the states other than
// Ready imply that the data on local disk is corrupt or incomplete,
// and we don't want to flush that to disk.
}
tenant
.repo
.set_timeline_state(timeline_id, timeline_state)
.with_context(|| {
format!(
"Failed to update timeline {} state to {:?}",
timeline_id, timeline_state
)
})?;
/// Updates tenants' repositories, changing their timelines' state in memory.
pub fn apply_timeline_sync_status_updates(
conf: &'static PageServerConf,
remote_index: RemoteIndex,
sync_status_updates: HashMap<ZTenantId, HashMap<ZTimelineId, TimelineSyncStatusUpdate>>,
) {
if sync_status_updates.is_empty() {
debug!("no sync status updates to apply");
return;
}
info!(
"Applying sync status updates for {} timelines",
sync_status_updates.len()
);
trace!("Sync status updates: {:?}", sync_status_updates);
Ok(())
for (tenant_id, tenant_timelines_sync_status_updates) in sync_status_updates {
let repo = load_local_repo(conf, tenant_id, &remote_index);
for (timeline_id, timeline_sync_status_update) in tenant_timelines_sync_status_updates {
match repo.apply_timeline_remote_sync_status_update(timeline_id, timeline_sync_status_update)
{
Ok(_) => debug!(
"successfully applied timeline sync status update: {} -> {}",
timeline_id, timeline_sync_status_update
),
Err(e) => error!(
"Failed to apply timeline sync status update for tenant {}. timeline {} update {} Error: {:#}",
tenant_id, timeline_id, timeline_sync_status_update, e
),
}
}
}
}
///
@@ -184,25 +174,31 @@ pub fn shutdown_all_tenants() {
pub fn create_tenant_repository(
conf: &'static PageServerConf,
new_tenant_id: Option<ZTenantId>,
tenantid: ZTenantId,
remote_index: RemoteIndex,
) -> Result<Option<ZTenantId>> {
let new_tenant_id = new_tenant_id.unwrap_or_else(ZTenantId::generate);
let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, new_tenant_id));
match timelines::create_repo(conf, new_tenant_id, wal_redo_manager)? {
Some(repo) => {
access_tenants()
.entry(new_tenant_id)
.or_insert_with(|| Tenant {
state: TenantState::Idle,
repo,
timelines: HashMap::new(),
});
Ok(Some(new_tenant_id))
}
None => {
debug!("repository already exists for tenant {}", new_tenant_id);
match access_tenants().entry(tenantid) {
Entry::Occupied(_) => {
debug!("tenant {} already exists", tenantid);
Ok(None)
}
Entry::Vacant(v) => {
let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenantid));
let repo = timelines::create_repo(
conf,
tenantid,
CreateRepo::Real {
wal_redo_manager,
remote_index,
},
)?;
v.insert(Tenant {
state: TenantState::Idle,
repo,
timelines: HashMap::new(),
});
Ok(Some(tenantid))
}
}
}
@@ -214,13 +210,13 @@ pub fn get_tenant_state(tenantid: ZTenantId) -> Option<TenantState> {
/// Change the state of a tenant to Active and launch its compactor and GC
/// threads. If the tenant was already in Active state or Stopping, does nothing.
///
pub fn activate_tenant(conf: &'static PageServerConf, tenantid: ZTenantId) -> Result<()> {
pub fn activate_tenant(conf: &'static PageServerConf, tenant_id: ZTenantId) -> Result<()> {
let mut m = access_tenants();
let tenant = m
.get_mut(&tenantid)
.with_context(|| format!("Tenant not found for id {}", tenantid))?;
.get_mut(&tenant_id)
.with_context(|| format!("Tenant not found for id {}", tenant_id))?;
info!("activating tenant {}", tenantid);
info!("activating tenant {}", tenant_id);
match tenant.state {
// If the tenant is already active, nothing to do.
@@ -230,22 +226,31 @@ pub fn activate_tenant(conf: &'static PageServerConf, tenantid: ZTenantId) -> Re
TenantState::Idle => {
thread_mgr::spawn(
ThreadKind::Compactor,
Some(tenantid),
Some(tenant_id),
None,
"Compactor thread",
move || crate::tenant_threads::compact_loop(tenantid, conf),
true,
move || crate::tenant_threads::compact_loop(tenant_id, conf),
)?;
// FIXME: if we fail to launch the GC thread, but already launched the
// compactor, we're in a strange state.
thread_mgr::spawn(
let gc_spawn_result = thread_mgr::spawn(
ThreadKind::GarbageCollector,
Some(tenantid),
Some(tenant_id),
None,
"GC thread",
move || crate::tenant_threads::gc_loop(tenantid, conf),
)?;
true,
move || crate::tenant_threads::gc_loop(tenant_id, conf),
)
.with_context(|| format!("Failed to launch GC thread for tenant {}", tenant_id));
if let Err(e) = &gc_spawn_result {
error!(
"Failed to start GC thread for tenant {}, stopping its checkpointer thread: {:?}",
tenant_id, e
);
thread_mgr::shutdown_threads(Some(ThreadKind::Compactor), Some(tenant_id), None);
return gc_spawn_result;
}
tenant.state = TenantState::Active;
}
@@ -261,19 +266,20 @@ pub fn get_repository_for_tenant(tenantid: ZTenantId) -> Result<Arc<RepositoryIm
let m = access_tenants();
let tenant = m
.get(&tenantid)
.with_context(|| format!("Tenant not found for tenant {}", tenantid))?;
.with_context(|| format!("Tenant {} not found", tenantid))?;
Ok(Arc::clone(&tenant.repo))
}
pub fn get_timeline_for_tenant(
// Retrieve timeline for tenant. Load it into memory if it is not already loaded
pub fn get_timeline_for_tenant_load(
tenantid: ZTenantId,
timelineid: ZTimelineId,
) -> Result<Arc<DatadirTimelineImpl>> {
let mut m = access_tenants();
let tenant = m
.get_mut(&tenantid)
.with_context(|| format!("Tenant not found for tenant {}", tenantid))?;
.with_context(|| format!("Tenant {} not found", tenantid))?;
if let Some(page_tline) = tenant.timelines.get(&timelineid) {
return Ok(Arc::clone(page_tline));
@@ -281,9 +287,8 @@ pub fn get_timeline_for_tenant(
// First access to this timeline. Create a DatadirTimeline wrapper for it
let tline = tenant
.repo
.get_timeline(timelineid)?
.local_timeline()
.with_context(|| format!("cannot fetch timeline {}", timelineid))?;
.get_timeline_load(timelineid)
.with_context(|| format!("Timeline {} not found for tenant {}", timelineid, tenantid))?;
let repartition_distance = tenant.repo.conf.checkpoint_distance / 10;
@@ -293,9 +298,10 @@ pub fn get_timeline_for_tenant(
Ok(page_tline)
}
#[serde_as]
#[derive(Serialize, Deserialize, Clone)]
pub struct TenantInfo {
#[serde(with = "hex")]
#[serde_as(as = "DisplayFromStr")]
pub id: ZTenantId,
pub state: TenantState,
}

View File

@@ -57,7 +57,7 @@ pub fn gc_loop(tenantid: ZTenantId, conf: &'static PageServerConf) -> Result<()>
// Garbage collect old files that are not needed for PITR anymore
if conf.gc_horizon > 0 {
let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
repo.gc_iteration(None, conf.gc_horizon, false).unwrap();
repo.gc_iteration(None, conf.gc_horizon, false)?;
}
// TODO Write it in more adequate way using

View File

@@ -43,12 +43,14 @@ use std::thread::JoinHandle;
use tokio::sync::watch;
use tracing::{info, warn};
use tracing::{debug, error, info, warn};
use lazy_static::lazy_static;
use zenith_utils::zid::{ZTenantId, ZTimelineId};
use crate::shutdown_pageserver;
lazy_static! {
/// Each thread that we track is associated with a "thread ID". It's just
/// an increasing number that we assign, not related to any system thread
@@ -101,7 +103,7 @@ pub enum ThreadKind {
// Thread that flushes frozen in-memory layers to disk
LayerFlushThread,
// Thread for synchronizing pageserver relish data with the remote storage.
// Thread for synchronizing pageserver layer files with the remote storage.
// Shared by all tenants.
StorageSync,
}
@@ -128,15 +130,16 @@ struct PageServerThread {
}
/// Launch a new thread
pub fn spawn<F, E>(
pub fn spawn<F>(
kind: ThreadKind,
tenant_id: Option<ZTenantId>,
timeline_id: Option<ZTimelineId>,
name: &str,
fail_on_error: bool,
f: F,
) -> std::io::Result<()>
where
F: FnOnce() -> Result<(), E> + Send + 'static,
F: FnOnce() -> anyhow::Result<()> + Send + 'static,
{
let (shutdown_tx, shutdown_rx) = watch::channel(());
let thread_id = NEXT_THREAD_ID.fetch_add(1, Ordering::Relaxed);
@@ -163,12 +166,22 @@ where
.insert(thread_id, Arc::clone(&thread_rc));
let thread_rc2 = Arc::clone(&thread_rc);
let thread_name = name.to_string();
let join_handle = match thread::Builder::new()
.name(name.to_string())
.spawn(move || thread_wrapper(thread_id, thread_rc2, shutdown_rx, f))
{
.spawn(move || {
thread_wrapper(
thread_name,
thread_id,
thread_rc2,
shutdown_rx,
fail_on_error,
f,
)
}) {
Ok(handle) => handle,
Err(err) => {
error!("Failed to spawn thread '{}': {}", name, err);
// Could not spawn the thread. Remove the entry
THREADS.lock().unwrap().remove(&thread_id);
return Err(err);
@@ -183,13 +196,15 @@ where
/// This wrapper function runs in a newly-spawned thread. It initializes the
/// thread-local variables and calls the payload function
fn thread_wrapper<F, E>(
fn thread_wrapper<F>(
thread_name: String,
thread_id: u64,
thread: Arc<PageServerThread>,
shutdown_rx: watch::Receiver<()>,
fail_on_error: bool,
f: F,
) where
F: FnOnce() -> Result<(), E> + Send + 'static,
F: FnOnce() -> anyhow::Result<()> + Send + 'static,
{
SHUTDOWN_RX.with(|rx| {
*rx.borrow_mut() = Some(shutdown_rx);
@@ -198,6 +213,8 @@ fn thread_wrapper<F, E>(
*ct.borrow_mut() = Some(thread);
});
debug!("Starting thread '{}'", thread_name);
// We use AssertUnwindSafe here so that the payload function
// doesn't need to be UnwindSafe. We don't do anything after the
// unwinding that would expose us to unwind-unsafe behavior.
@@ -206,9 +223,26 @@ fn thread_wrapper<F, E>(
// Remove our entry from the global hashmap.
THREADS.lock().unwrap().remove(&thread_id);
// If the thread payload panic'd, exit with the panic.
if let Err(err) = result {
panic::resume_unwind(err);
match result {
Ok(Ok(())) => debug!("Thread '{}' exited normally", thread_name),
Ok(Err(err)) => {
if fail_on_error {
error!(
"Shutting down: thread '{}' exited with error: {:?}",
thread_name, err
);
shutdown_pageserver();
} else {
error!("Thread '{}' exited with error: {:?}", thread_name, err);
}
}
Err(err) => {
error!(
"Shutting down: thread '{}' panicked: {:?}",
thread_name, err
);
shutdown_pageserver();
}
}
}
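// A minimal call sketch (not part of this diff) for the reworked spawn(): the payload
// must now return anyhow::Result<()>, and fail_on_error decides whether an Err return
// shuts down the whole pageserver or is only logged.
fn spawn_call_sketch(tenant_id: ZTenantId) -> std::io::Result<()> {
    spawn(
        ThreadKind::Compactor,
        Some(tenant_id),
        None,
        "Example worker thread",
        true, // fail_on_error: an Err(_) from the closure triggers shutdown_pageserver()
        move || -> anyhow::Result<()> {
            // ... periodic work goes here ...
            Ok(())
        },
    )
}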
@@ -253,7 +287,7 @@ pub fn shutdown_threads(
let _ = join_handle.join();
} else {
// The thread had not even fully started yet. Or it was shut down
// concurrently and alrady exited
// concurrently and already exited
}
}
}

View File

@@ -2,8 +2,10 @@
//! Timeline management code
//
use anyhow::{anyhow, bail, Context, Result};
use anyhow::{bail, Context, Result};
use postgres_ffi::ControlFileData;
use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr};
use std::{
fs,
path::Path,
@@ -16,110 +18,116 @@ use zenith_utils::lsn::Lsn;
use zenith_utils::zid::{ZTenantId, ZTimelineId};
use zenith_utils::{crashsafe_dir, logging};
use crate::DatadirTimeline;
use crate::RepositoryImpl;
use crate::{config::PageServerConf, repository::Repository};
use crate::{
config::PageServerConf,
layered_repository::metadata::TimelineMetadata,
remote_storage::RemoteIndex,
repository::{LocalTimelineState, Repository},
DatadirTimeline, RepositoryImpl,
};
use crate::{import_datadir, LOG_FILE_NAME};
use crate::{layered_repository::LayeredRepository, walredo::WalRedoManager};
use crate::{repository::RepositoryTimeline, tenant_mgr};
use crate::{repository::Timeline, CheckpointConfig};
#[derive(Clone)]
pub enum TimelineInfo {
Local {
timeline_id: ZTimelineId,
tenant_id: ZTenantId,
last_record_lsn: Lsn,
prev_record_lsn: Lsn,
ancestor_timeline_id: Option<ZTimelineId>,
ancestor_lsn: Option<Lsn>,
disk_consistent_lsn: Lsn,
current_logical_size: usize,
current_logical_size_non_incremental: Option<usize>,
},
Remote {
timeline_id: ZTimelineId,
tenant_id: ZTenantId,
disk_consistent_lsn: Lsn,
},
#[serde_as]
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct LocalTimelineInfo {
#[serde_as(as = "Option<DisplayFromStr>")]
pub ancestor_timeline_id: Option<ZTimelineId>,
#[serde_as(as = "Option<DisplayFromStr>")]
pub ancestor_lsn: Option<Lsn>,
#[serde_as(as = "DisplayFromStr")]
pub last_record_lsn: Lsn,
#[serde_as(as = "Option<DisplayFromStr>")]
pub prev_record_lsn: Option<Lsn>,
#[serde_as(as = "DisplayFromStr")]
pub disk_consistent_lsn: Lsn,
pub current_logical_size: Option<usize>, // is None when timeline is Unloaded
pub current_logical_size_non_incremental: Option<usize>,
pub timeline_state: LocalTimelineState,
}
impl TimelineInfo {
pub fn from_ids(
tenant_id: ZTenantId,
timeline_id: ZTimelineId,
impl LocalTimelineInfo {
pub fn from_loaded_timeline<R: Repository>(
datadir_tline: &DatadirTimeline<R>,
include_non_incremental_logical_size: bool,
) -> Result<Self> {
let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?;
let result = match repo.get_timeline(timeline_id)? {
RepositoryTimeline::Local { id, timeline } => {
let ancestor_timeline_id = timeline.get_ancestor_timeline_id();
let ancestor_lsn = if ancestor_timeline_id.is_some() {
Some(timeline.get_ancestor_lsn())
} else {
None
};
let tline = tenant_mgr::get_timeline_for_tenant(tenant_id, timeline_id)?;
let current_logical_size = tline.get_current_logical_size();
let current_logical_size_non_incremental = get_current_logical_size_non_incremental(
include_non_incremental_logical_size,
tline.as_ref(),
);
Self::Local {
timeline_id: id,
tenant_id,
last_record_lsn: timeline.get_last_record_lsn(),
prev_record_lsn: timeline.get_prev_record_lsn(),
ancestor_timeline_id,
ancestor_lsn,
disk_consistent_lsn: timeline.get_disk_consistent_lsn(),
current_logical_size,
current_logical_size_non_incremental,
) -> anyhow::Result<Self> {
let last_record_lsn = datadir_tline.tline.get_last_record_lsn();
let info = LocalTimelineInfo {
ancestor_timeline_id: datadir_tline.tline.get_ancestor_timeline_id(),
ancestor_lsn: {
match datadir_tline.tline.get_ancestor_lsn() {
Lsn(0) => None,
lsn @ Lsn(_) => Some(lsn),
}
}
RepositoryTimeline::Remote {
id,
disk_consistent_lsn,
} => Self::Remote {
timeline_id: id,
tenant_id,
disk_consistent_lsn,
},
disk_consistent_lsn: datadir_tline.tline.get_disk_consistent_lsn(),
last_record_lsn,
prev_record_lsn: Some(datadir_tline.tline.get_prev_record_lsn()),
timeline_state: LocalTimelineState::Loaded,
current_logical_size: Some(datadir_tline.get_current_logical_size()),
current_logical_size_non_incremental: if include_non_incremental_logical_size {
Some(datadir_tline.get_current_logical_size_non_incremental(last_record_lsn)?)
} else {
None
},
};
Ok(result)
Ok(info)
}
pub fn timeline_id(&self) -> ZTimelineId {
match *self {
TimelineInfo::Local { timeline_id, .. } => timeline_id,
TimelineInfo::Remote { timeline_id, .. } => timeline_id,
pub fn from_unloaded_timeline(metadata: &TimelineMetadata) -> Self {
LocalTimelineInfo {
ancestor_timeline_id: metadata.ancestor_timeline(),
ancestor_lsn: {
match metadata.ancestor_lsn() {
Lsn(0) => None,
lsn @ Lsn(_) => Some(lsn),
}
},
disk_consistent_lsn: metadata.disk_consistent_lsn(),
last_record_lsn: metadata.disk_consistent_lsn(),
prev_record_lsn: metadata.prev_record_lsn(),
timeline_state: LocalTimelineState::Unloaded,
current_logical_size: None,
current_logical_size_non_incremental: None,
}
}
pub fn tenant_id(&self) -> ZTenantId {
match *self {
TimelineInfo::Local { tenant_id, .. } => tenant_id,
TimelineInfo::Remote { tenant_id, .. } => tenant_id,
pub fn from_repo_timeline<T>(
tenant_id: ZTenantId,
timeline_id: ZTimelineId,
repo_timeline: &RepositoryTimeline<T>,
include_non_incremental_logical_size: bool,
) -> anyhow::Result<Self> {
match repo_timeline {
RepositoryTimeline::Loaded(_) => {
let datadir_tline =
tenant_mgr::get_timeline_for_tenant_load(tenant_id, timeline_id)?;
Self::from_loaded_timeline(&datadir_tline, include_non_incremental_logical_size)
}
RepositoryTimeline::Unloaded { metadata } => Ok(Self::from_unloaded_timeline(metadata)),
}
}
}
fn get_current_logical_size_non_incremental<R: Repository>(
include_non_incremental_logical_size: bool,
timeline: &DatadirTimeline<R>,
) -> Option<usize> {
if !include_non_incremental_logical_size {
return None;
}
match timeline.get_current_logical_size_non_incremental(timeline.get_last_record_lsn()) {
Ok(size) => Some(size),
Err(e) => {
error!("Failed to get non-incremental logical size: {:?}", e);
None
}
}
#[serde_as]
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct RemoteTimelineInfo {
#[serde_as(as = "Option<DisplayFromStr>")]
pub remote_consistent_lsn: Option<Lsn>,
pub awaits_download: bool,
}
#[serde_as]
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct TimelineInfo {
#[serde_as(as = "DisplayFromStr")]
pub tenant_id: ZTenantId,
#[serde_as(as = "DisplayFromStr")]
pub timeline_id: ZTimelineId,
pub local: Option<LocalTimelineInfo>,
pub remote: Option<RemoteTimelineInfo>,
}
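// Illustrative note (not part of this change): with the serde_as/DisplayFromStr
// annotations above, a loaded timeline is expected to serialize roughly as
//   {
//     "tenant_id": "<hex id>",
//     "timeline_id": "<hex id>",
//     "local": { "timeline_state": "Loaded", "last_record_lsn": "...", ... },
//     "remote": null
//   }
// which is the shape the updated python tests later in this change rely on
// (they read timeline_details['local']['timeline_state']).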
#[derive(Debug, Clone, Copy)]
@@ -137,25 +145,12 @@ pub fn init_pageserver(
// use true as the daemonize parameter because otherwise we pollute the zenith cli output with several pages of info messages
let _log_file = logging::init(LOG_FILE_NAME, true)?;
// We don't use the real WAL redo manager, because we don't want to spawn the WAL redo
// process during repository initialization.
//
// FIXME: That caused trouble, because the WAL redo manager spawned a thread that launched
// initdb in the background, and it kept running even after the "zenith init" had exited.
// In tests, we started the page server immediately after that, so that initdb was still
// running in the background, and we failed to run initdb again in the same directory. This
// has been solved for the rapid init+start case now, but the general race condition remains
// if you restart the server quickly. The WAL redo manager doesn't use a separate thread
// anymore, but I think that could still happen.
let dummy_redo_mgr = Arc::new(crate::walredo::DummyRedoManager {});
crashsafe_dir::create_dir_all(conf.tenants_path())?;
if let Some(tenant_id) = create_tenant {
println!("initializing tenantid {}", tenant_id);
let repo = create_repo(conf, tenant_id, dummy_redo_mgr)
.context("failed to create repo")?
.ok_or_else(|| anyhow!("For newely created pageserver, found already existing repository for tenant {}", tenant_id))?;
let repo =
create_repo(conf, tenant_id, CreateRepo::Dummy).context("failed to create repo")?;
let new_timeline_id = initial_timeline_id.unwrap_or_else(ZTimelineId::generate);
bootstrap_timeline(conf, tenant_id, new_timeline_id, repo.as_ref())
.context("failed to create initial timeline")?;
@@ -168,15 +163,44 @@ pub fn init_pageserver(
Ok(())
}
pub enum CreateRepo {
Real {
wal_redo_manager: Arc<dyn WalRedoManager + Send + Sync>,
remote_index: RemoteIndex,
},
Dummy,
}
pub fn create_repo(
conf: &'static PageServerConf,
tenant_id: ZTenantId,
wal_redo_manager: Arc<dyn WalRedoManager + Send + Sync>,
) -> Result<Option<Arc<RepositoryImpl>>> {
create_repo: CreateRepo,
) -> Result<Arc<RepositoryImpl>> {
let (wal_redo_manager, remote_index) = match create_repo {
CreateRepo::Real {
wal_redo_manager,
remote_index,
} => (wal_redo_manager, remote_index),
CreateRepo::Dummy => {
// We don't use the real WAL redo manager, because we don't want to spawn the WAL redo
// process during repository initialization.
//
// FIXME: That caused trouble, because the WAL redo manager spawned a thread that launched
// initdb in the background, and it kept running even after the "zenith init" had exited.
// In tests, we started the page server immediately after that, so that initdb was still
// running in the background, and we failed to run initdb again in the same directory. This
// has been solved for the rapid init+start case now, but the general race condition remains
// if you restart the server quickly. The WAL redo manager doesn't use a separate thread
// anymore, but I think that could still happen.
let wal_redo_manager = Arc::new(crate::walredo::DummyRedoManager {});
(wal_redo_manager as _, RemoteIndex::empty())
}
};
let repo_dir = conf.tenant_path(&tenant_id);
if repo_dir.exists() {
debug!("repo for {} already exists", tenant_id);
return Ok(None);
bail!("tenant {} directory already exists", tenant_id);
}
// top-level dir may exist if we are creating it through CLI
@@ -185,12 +209,13 @@ pub fn create_repo(
crashsafe_dir::create_dir(conf.timelines_path(&tenant_id))?;
info!("created directory structure in {}", repo_dir.display());
Ok(Some(Arc::new(LayeredRepository::new(
Ok(Arc::new(LayeredRepository::new(
conf,
wal_redo_manager,
tenant_id,
remote_index,
conf.remote_storage_config.is_some(),
))))
)))
}
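// Illustrative sketch (not part of this change): how a normal startup path is
// expected to call create_repo() with the Real variant, while "zenith init" above
// uses CreateRepo::Dummy. The helper name is hypothetical; only create_repo(),
// CreateRepo and the types in its signature come from this file.
#[allow(dead_code)]
fn sketch_create_repo_at_startup(
    conf: &'static PageServerConf,
    tenant_id: ZTenantId,
    wal_redo_manager: Arc<dyn WalRedoManager + Send + Sync>,
    remote_index: RemoteIndex,
) -> Result<Arc<RepositoryImpl>> {
    create_repo(
        conf,
        tenant_id,
        CreateRepo::Real {
            wal_redo_manager,
            remote_index,
        },
    )
    .context("failed to create repo")
}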
// Returns checkpoint LSN from controlfile
@@ -211,7 +236,7 @@ fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path) -> Result<()> {
let initdb_path = conf.pg_bin_dir().join("initdb");
let initdb_output = Command::new(initdb_path)
.args(&["-D", initdbpath.to_str().unwrap()])
.args(&["-D", &initdbpath.to_string_lossy()])
.args(&["-U", &conf.superuser])
.args(&["-E", "utf8"])
.arg("--no-instructions")
@@ -219,8 +244,8 @@ fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path) -> Result<()> {
// so no need to fsync it
.arg("--no-sync")
.env_clear()
.env("LD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap())
.env("DYLD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap())
.env("LD_LIBRARY_PATH", conf.pg_lib_dir())
.env("DYLD_LIBRARY_PATH", conf.pg_lib_dir())
.stdout(Stdio::null())
.output()
.context("failed to execute initdb")?;
@@ -275,33 +300,27 @@ fn bootstrap_timeline<R: Repository>(
Ok(())
}
pub(crate) fn get_timelines(
pub(crate) fn get_local_timelines(
tenant_id: ZTenantId,
include_non_incremental_logical_size: bool,
) -> Result<Vec<TimelineInfo>> {
) -> Result<Vec<(ZTimelineId, LocalTimelineInfo)>> {
let repo = tenant_mgr::get_repository_for_tenant(tenant_id)
.with_context(|| format!("Failed to get repo for tenant {}", tenant_id))?;
let repo_timelines = repo.list_timelines();
let mut result = Vec::new();
for timeline in repo
.list_timelines()
.with_context(|| format!("Failed to list timelines for tenant {}", tenant_id))?
{
match timeline {
RepositoryTimeline::Local {
timeline: _,
id: timeline_id,
} => {
result.push(TimelineInfo::from_ids(
tenant_id,
timeline_id,
include_non_incremental_logical_size,
)?);
}
RepositoryTimeline::Remote { .. } => continue,
}
let mut local_timeline_info = Vec::with_capacity(repo_timelines.len());
for (timeline_id, repository_timeline) in repo_timelines {
local_timeline_info.push((
timeline_id,
LocalTimelineInfo::from_repo_timeline(
tenant_id,
timeline_id,
&repository_timeline,
include_non_incremental_logical_size,
)?,
))
}
Ok(result)
Ok(local_timeline_info)
}
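// Illustrative sketch (not part of this change): pairing the local entries returned
// above with the TimelineInfo wrapper defined earlier. A real caller (e.g. an HTTP
// handler) would also fill in the `remote` side; the helper name is hypothetical.
#[allow(dead_code)]
fn sketch_local_only_timeline_infos(tenant_id: ZTenantId) -> Result<Vec<TimelineInfo>> {
    Ok(get_local_timelines(tenant_id, false)?
        .into_iter()
        .map(|(timeline_id, local)| TimelineInfo {
            tenant_id,
            timeline_id,
            local: Some(local),
            remote: None,
        })
        .collect())
}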
pub(crate) fn create_timeline(
@@ -315,32 +334,17 @@ pub(crate) fn create_timeline(
let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?;
if conf.timeline_path(&new_timeline_id, &tenant_id).exists() {
match repo.get_timeline(new_timeline_id)? {
RepositoryTimeline::Local { id, .. } => {
debug!("timeline {} already exists", id);
return Ok(None);
}
RepositoryTimeline::Remote { id, .. } => bail!(
"timeline {} already exists in pageserver's remote storage",
id
),
}
debug!("timeline {} already exists", new_timeline_id);
return Ok(None);
}
let mut start_lsn = ancestor_start_lsn.unwrap_or(Lsn(0));
match ancestor_timeline_id {
let new_timeline_info = match ancestor_timeline_id {
Some(ancestor_timeline_id) => {
let ancestor_timeline = repo
.get_timeline(ancestor_timeline_id)
.with_context(|| format!("Cannot get ancestor timeline {}", ancestor_timeline_id))?
.local_timeline()
.with_context(|| {
format!(
"Cannot branch off the timeline {} that's not present locally",
ancestor_timeline_id
)
})?;
.get_timeline_load(ancestor_timeline_id)
.context("Cannot branch off the timeline that's not present locally")?;
if start_lsn == Lsn(0) {
// Find end of WAL on the old timeline
@@ -369,13 +373,25 @@ pub(crate) fn create_timeline(
);
}
repo.branch_timeline(ancestor_timeline_id, new_timeline_id, start_lsn)?;
// load the timeline into memory
let loaded_timeline =
tenant_mgr::get_timeline_for_tenant_load(tenant_id, new_timeline_id)?;
LocalTimelineInfo::from_loaded_timeline(&loaded_timeline, false)
.context("cannot fill timeline info")?
}
None => {
bootstrap_timeline(conf, tenant_id, new_timeline_id, repo.as_ref())?;
// load the timeline into memory
let new_timeline =
tenant_mgr::get_timeline_for_tenant_load(tenant_id, new_timeline_id)?;
LocalTimelineInfo::from_loaded_timeline(&new_timeline, false)
.context("cannot fill timeline info")?
}
}
let new_timeline_info = TimelineInfo::from_ids(tenant_id, new_timeline_id, false)?;
Ok(Some(new_timeline_info))
};
Ok(Some(TimelineInfo {
tenant_id,
timeline_id: new_timeline_id,
local: Some(new_timeline_info),
remote: None,
}))
}

View File

@@ -65,6 +65,7 @@ lazy_static! {
/// currently open, the 'handle' can still point to the slot where it was last kept. The
/// 'tag' field is used to detect whether the handle still is valid or not.
///
#[derive(Debug)]
pub struct VirtualFile {
/// Lazy handle to the global file descriptor cache. The slot that this points to
/// might contain our File, or it may be empty, or it may contain a File that
@@ -88,7 +89,7 @@ pub struct VirtualFile {
timelineid: String,
}
#[derive(PartialEq, Clone, Copy)]
#[derive(Debug, PartialEq, Clone, Copy)]
struct SlotHandle {
/// Index into OPEN_FILES.slots
index: usize,
@@ -226,7 +227,8 @@ impl VirtualFile {
path: &Path,
open_options: &OpenOptions,
) -> Result<VirtualFile, std::io::Error> {
let parts = path.to_str().unwrap().split('/').collect::<Vec<&str>>();
let path_str = path.to_string_lossy();
let parts = path_str.split('/').collect::<Vec<&str>>();
let tenantid;
let timelineid;
if parts.len() > 5 && parts[parts.len() - 5] == "tenants" {

View File

@@ -21,7 +21,6 @@
//! redo Postgres process, but some records it can handle directly with
//! bespoke Rust code.
use chrono::format::format;
use postgres_ffi::nonrelfile_utils::clogpage_precedes;
use postgres_ffi::nonrelfile_utils::slru_may_delete_clogsegment;
@@ -32,9 +31,8 @@ use tracing::*;
use std::collections::HashMap;
use crate::pgdatadir_mapping::*;
use crate::relish::*;
use crate::reltag::{RelTag, SlruKind};
use crate::repository::Repository;
use crate::wal_metadata::WalEntryMetadata;
use crate::walrecord::*;
use postgres_ffi::nonrelfile_utils::mx_offset_to_member_segment;
use postgres_ffi::xlog_utils::*;
@@ -82,7 +80,7 @@ impl<'a, R: Repository> WalIngest<'a, R> {
recdata: Bytes,
lsn: Lsn,
) -> Result<()> {
let mut writer = timeline.begin_record(lsn);
let mut modification = timeline.begin_modification(lsn);
let recdata_len = recdata.len();
let mut decoded = decode_wal_record(recdata);
@@ -99,7 +97,7 @@ impl<'a, R: Repository> WalIngest<'a, R> {
if decoded.xl_rmid == pg_constants::RM_HEAP_ID
|| decoded.xl_rmid == pg_constants::RM_HEAP2_ID
{
self.ingest_heapam_record(&mut buf, &mut writer, &mut decoded)?;
self.ingest_heapam_record(&mut buf, &mut modification, &mut decoded)?;
}
// Handle other special record types
if decoded.xl_rmid == pg_constants::RM_SMGR_ID
@@ -107,26 +105,26 @@ impl<'a, R: Repository> WalIngest<'a, R> {
== pg_constants::XLOG_SMGR_CREATE
{
let create = XlSmgrCreate::decode(&mut buf);
self.ingest_xlog_smgr_create(&mut writer, &create)?;
self.ingest_xlog_smgr_create(&mut modification, &create)?;
} else if decoded.xl_rmid == pg_constants::RM_SMGR_ID
&& (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
== pg_constants::XLOG_SMGR_TRUNCATE
{
let truncate = XlSmgrTruncate::decode(&mut buf);
self.ingest_xlog_smgr_truncate(&mut writer, &truncate)?;
self.ingest_xlog_smgr_truncate(&mut modification, &truncate)?;
} else if decoded.xl_rmid == pg_constants::RM_DBASE_ID {
if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
== pg_constants::XLOG_DBASE_CREATE
{
let createdb = XlCreateDatabase::decode(&mut buf);
self.ingest_xlog_dbase_create(&mut writer, &createdb)?;
self.ingest_xlog_dbase_create(&mut modification, &createdb)?;
} else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
== pg_constants::XLOG_DBASE_DROP
{
let dropdb = XlDropDatabase::decode(&mut buf);
for tablespace_id in dropdb.tablespace_ids {
trace!("Drop db {}, {}", tablespace_id, dropdb.db_id);
writer.drop_dbdir(tablespace_id, dropdb.db_id)?;
modification.drop_dbdir(tablespace_id, dropdb.db_id)?;
}
}
} else if decoded.xl_rmid == pg_constants::RM_TBLSPC_ID {
@@ -138,7 +136,7 @@ impl<'a, R: Repository> WalIngest<'a, R> {
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
self.put_slru_page_image(
&mut writer,
&mut modification,
SlruKind::Clog,
segno,
rpageno,
@@ -147,7 +145,7 @@ impl<'a, R: Repository> WalIngest<'a, R> {
} else {
assert!(info == pg_constants::CLOG_TRUNCATE);
let xlrec = XlClogTruncate::decode(&mut buf);
self.ingest_clog_truncate_record(&mut writer, &xlrec)?;
self.ingest_clog_truncate_record(&mut modification, &xlrec)?;
}
} else if decoded.xl_rmid == pg_constants::RM_XACT_ID {
let info = decoded.xl_info & pg_constants::XLOG_XACT_OPMASK;
@@ -155,7 +153,7 @@ impl<'a, R: Repository> WalIngest<'a, R> {
let parsed_xact =
XlXactParsedRecord::decode(&mut buf, decoded.xl_xid, decoded.xl_info);
self.ingest_xact_record(
&mut writer,
&mut modification,
&parsed_xact,
info == pg_constants::XLOG_XACT_COMMIT,
)?;
@@ -165,7 +163,7 @@ impl<'a, R: Repository> WalIngest<'a, R> {
let parsed_xact =
XlXactParsedRecord::decode(&mut buf, decoded.xl_xid, decoded.xl_info);
self.ingest_xact_record(
&mut writer,
&mut modification,
&parsed_xact,
info == pg_constants::XLOG_XACT_COMMIT_PREPARED,
)?;
@@ -176,9 +174,9 @@ impl<'a, R: Repository> WalIngest<'a, R> {
parsed_xact.xid,
lsn,
);
writer.drop_twophase_file(parsed_xact.xid)?;
modification.drop_twophase_file(parsed_xact.xid)?;
} else if info == pg_constants::XLOG_XACT_PREPARE {
writer.put_twophase_file(decoded.xl_xid, Bytes::copy_from_slice(&buf[..]))?;
modification.put_twophase_file(decoded.xl_xid, Bytes::copy_from_slice(&buf[..]))?;
}
} else if decoded.xl_rmid == pg_constants::RM_MULTIXACT_ID {
let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
@@ -188,7 +186,7 @@ impl<'a, R: Repository> WalIngest<'a, R> {
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
self.put_slru_page_image(
&mut writer,
&mut modification,
SlruKind::MultiXactOffsets,
segno,
rpageno,
@@ -199,7 +197,7 @@ impl<'a, R: Repository> WalIngest<'a, R> {
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
self.put_slru_page_image(
&mut writer,
&mut modification,
SlruKind::MultiXactMembers,
segno,
rpageno,
@@ -207,14 +205,14 @@ impl<'a, R: Repository> WalIngest<'a, R> {
)?;
} else if info == pg_constants::XLOG_MULTIXACT_CREATE_ID {
let xlrec = XlMultiXactCreate::decode(&mut buf);
self.ingest_multixact_create_record(&mut writer, &xlrec)?;
self.ingest_multixact_create_record(&mut modification, &xlrec)?;
} else if info == pg_constants::XLOG_MULTIXACT_TRUNCATE_ID {
let xlrec = XlMultiXactTruncate::decode(&mut buf);
self.ingest_multixact_truncate_record(&mut writer, &xlrec)?;
self.ingest_multixact_truncate_record(&mut modification, &xlrec)?;
}
} else if decoded.xl_rmid == pg_constants::RM_RELMAP_ID {
let xlrec = XlRelmapUpdate::decode(&mut buf);
self.ingest_relmap_page(&mut writer, &xlrec, &decoded)?;
self.ingest_relmap_page(&mut modification, &xlrec, &decoded)?;
} else if decoded.xl_rmid == pg_constants::RM_XLOG_ID {
let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
if info == pg_constants::XLOG_NEXTOID {
@@ -228,7 +226,7 @@ impl<'a, R: Repository> WalIngest<'a, R> {
{
let mut checkpoint_bytes = [0u8; SIZEOF_CHECKPOINT];
buf.copy_to_slice(&mut checkpoint_bytes);
let xlog_checkpoint = CheckPoint::decode(&checkpoint_bytes).unwrap();
let xlog_checkpoint = CheckPoint::decode(&checkpoint_bytes)?;
trace!(
"xlog_checkpoint.oldestXid={}, checkpoint.oldestXid={}",
xlog_checkpoint.oldestXid,
@@ -249,31 +247,11 @@ impl<'a, R: Repository> WalIngest<'a, R> {
// Iterate through all the blocks that the record modifies, and
// "put" a separate copy of the record for each block.
for blk in decoded.blocks.iter() {
let lsn_hex = {
use bytes::BufMut;
let mut bytes = BytesMut::new();
bytes.put_u64(lsn.0);
hex::encode(bytes.freeze())
};
let page_hex = {
let foo: DecodedBkpBlock;
use bytes::BufMut;
let mut page = BytesMut::new();
page.put_u32(blk.rnode_spcnode);
page.put_u32(blk.rnode_dbnode);
page.put_u32(blk.rnode_relnode);
page.put_u8(blk.forknum);
page.put_u32(blk.blkno);
hex::encode(page.freeze())
};
println!("wal-at-lsn-modified-page {} {} {}", lsn_hex, page_hex, recdata_len);
self.ingest_decoded_block(&mut writer, lsn, &decoded, blk)?;
self.ingest_decoded_block(&mut modification, lsn, &decoded, blk)?;
}
// Emit wal entry metadata, if configured to do so
crate::wal_metadata::write(WalEntryMetadata {
crate::wal_metadata::write(crate::wal_metadata::WalEntryMetadata {
lsn,
size: recdata_len,
affected_pages: decoded.blocks.iter().map(|blk| blk.into()).collect()
@@ -283,20 +261,20 @@ impl<'a, R: Repository> WalIngest<'a, R> {
if self.checkpoint_modified {
let new_checkpoint_bytes = self.checkpoint.encode();
writer.put_checkpoint(new_checkpoint_bytes)?;
modification.put_checkpoint(new_checkpoint_bytes)?;
self.checkpoint_modified = false;
}
// Now that this record has been fully handled, including updating the
// checkpoint data, let the repository know that it is up-to-date to this LSN
writer.finish()?;
modification.commit()?;
Ok(())
}
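// Illustrative sketch (not part of this change): the begin_modification/commit
// lifecycle used throughout this file. All puts are buffered on the
// DatadirModification and only become visible at `lsn` once commit() is called.
// The helper is hypothetical and only uses APIs referenced elsewhere in this file.
#[allow(dead_code)]
fn sketch_put_single_page(
    timeline: &DatadirTimeline<R>,
    lsn: Lsn,
    rel: RelTag,
    blknum: BlockNumber,
    img: Bytes,
) -> Result<()> {
    let mut modification = timeline.begin_modification(lsn);
    modification.put_rel_page_image(rel, blknum, img)?;
    modification.commit()?;
    Ok(())
}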
fn ingest_decoded_block(
&mut self,
timeline: &mut DatadirTimelineWriter<R>,
modification: &mut DatadirModification<R>,
lsn: Lsn,
decoded: &DecodedWALRecord,
blk: &DecodedBkpBlock,
@@ -336,13 +314,13 @@ impl<'a, R: Repository> WalIngest<'a, R> {
image[0..4].copy_from_slice(&((lsn.0 >> 32) as u32).to_le_bytes());
image[4..8].copy_from_slice(&(lsn.0 as u32).to_le_bytes());
assert_eq!(image.len(), pg_constants::BLCKSZ as usize);
self.put_rel_page_image(timeline, rel, blk.blkno, image.freeze())?;
self.put_rel_page_image(modification, rel, blk.blkno, image.freeze())?;
} else {
let rec = ZenithWalRecord::Postgres {
will_init: blk.will_init || blk.apply_image,
rec: decoded.record.clone(),
};
self.put_rel_wal_record(timeline, rel, blk.blkno, rec)?;
self.put_rel_wal_record(modification, rel, blk.blkno, rec)?;
}
Ok(())
}
@@ -350,7 +328,7 @@ impl<'a, R: Repository> WalIngest<'a, R> {
fn ingest_heapam_record(
&mut self,
buf: &mut Bytes,
timeline: &mut DatadirTimelineWriter<R>,
modification: &mut DatadirModification<R>,
decoded: &mut DecodedWALRecord,
) -> Result<()> {
// Handle VM bit updates that are implicitly part of heap records.
@@ -448,7 +426,7 @@ impl<'a, R: Repository> WalIngest<'a, R> {
// An UPDATE record that needs to clear the bits for both old and the
// new page, both of which reside on the same VM page.
self.put_rel_wal_record(
timeline,
modification,
vm_rel,
new_vm_blk.unwrap(),
ZenithWalRecord::ClearVisibilityMapFlags {
@@ -462,7 +440,7 @@ impl<'a, R: Repository> WalIngest<'a, R> {
// different VM pages.
if let Some(new_vm_blk) = new_vm_blk {
self.put_rel_wal_record(
timeline,
modification,
vm_rel,
new_vm_blk,
ZenithWalRecord::ClearVisibilityMapFlags {
@@ -474,7 +452,7 @@ impl<'a, R: Repository> WalIngest<'a, R> {
}
if let Some(old_vm_blk) = old_vm_blk {
self.put_rel_wal_record(
timeline,
modification,
vm_rel,
old_vm_blk,
ZenithWalRecord::ClearVisibilityMapFlags {
@@ -494,7 +472,7 @@ impl<'a, R: Repository> WalIngest<'a, R> {
/// Subroutine of ingest_record(), to handle an XLOG_DBASE_CREATE record.
fn ingest_xlog_dbase_create(
&mut self,
timeline: &mut DatadirTimelineWriter<R>,
modification: &mut DatadirModification<R>,
rec: &XlCreateDatabase,
) -> Result<()> {
let db_id = rec.db_id;
@@ -507,15 +485,19 @@ impl<'a, R: Repository> WalIngest<'a, R> {
// cannot pass 'lsn' to the Timeline.get_* functions, or they will block waiting for
// the last valid LSN to advance up to it. So we use the previous record's LSN in the
// get calls instead.
let req_lsn = timeline.get_last_record_lsn();
let req_lsn = modification.tline.get_last_record_lsn();
let rels = timeline.list_rels(src_tablespace_id, src_db_id, req_lsn)?;
let rels = modification
.tline
.list_rels(src_tablespace_id, src_db_id, req_lsn)?;
debug!("ingest_xlog_dbase_create: {} rels", rels.len());
// Copy relfilemap
let filemap = timeline.get_relmap_file(src_tablespace_id, src_db_id, req_lsn)?;
timeline.put_relmap_file(tablespace_id, db_id, filemap)?;
let filemap = modification
.tline
.get_relmap_file(src_tablespace_id, src_db_id, req_lsn)?;
modification.put_relmap_file(tablespace_id, db_id, filemap)?;
let mut num_rels_copied = 0;
let mut num_blocks_copied = 0;
@@ -523,7 +505,7 @@ impl<'a, R: Repository> WalIngest<'a, R> {
assert_eq!(src_rel.spcnode, src_tablespace_id);
assert_eq!(src_rel.dbnode, src_db_id);
let nblocks = timeline.get_rel_size(src_rel, req_lsn)?;
let nblocks = modification.tline.get_rel_size(src_rel, req_lsn)?;
let dst_rel = RelTag {
spcnode: tablespace_id,
dbnode: db_id,
@@ -531,15 +513,17 @@ impl<'a, R: Repository> WalIngest<'a, R> {
forknum: src_rel.forknum,
};
timeline.put_rel_creation(dst_rel, nblocks)?;
modification.put_rel_creation(dst_rel, nblocks)?;
// Copy content
debug!("copying rel {} to {}, {} blocks", src_rel, dst_rel, nblocks);
for blknum in 0..nblocks {
debug!("copying block {} from {} to {}", blknum, src_rel, dst_rel);
let content = timeline.get_rel_page_at_lsn(src_rel, blknum, req_lsn)?;
timeline.put_rel_page_image(dst_rel, blknum, content)?;
let content = modification
.tline
.get_rel_page_at_lsn(src_rel, blknum, req_lsn)?;
modification.put_rel_page_image(dst_rel, blknum, content)?;
num_blocks_copied += 1;
}
@@ -555,7 +539,7 @@ impl<'a, R: Repository> WalIngest<'a, R> {
fn ingest_xlog_smgr_create(
&mut self,
writer: &mut DatadirTimelineWriter<R>,
modification: &mut DatadirModification<R>,
rec: &XlSmgrCreate,
) -> Result<()> {
let rel = RelTag {
@@ -564,7 +548,7 @@ impl<'a, R: Repository> WalIngest<'a, R> {
relnode: rec.rnode.relnode,
forknum: rec.forknum,
};
self.put_rel_creation(writer, rel)?;
self.put_rel_creation(modification, rel)?;
Ok(())
}
@@ -573,7 +557,7 @@ impl<'a, R: Repository> WalIngest<'a, R> {
/// This is the same logic as in PostgreSQL's smgr_redo() function.
fn ingest_xlog_smgr_truncate(
&mut self,
writer: &mut DatadirTimelineWriter<R>,
modification: &mut DatadirModification<R>,
rec: &XlSmgrTruncate,
) -> Result<()> {
let spcnode = rec.rnode.spcnode;
@@ -587,7 +571,7 @@ impl<'a, R: Repository> WalIngest<'a, R> {
relnode,
forknum: pg_constants::MAIN_FORKNUM,
};
self.put_rel_truncation(writer, rel, rec.blkno)?;
self.put_rel_truncation(modification, rel, rec.blkno)?;
}
if (rec.flags & pg_constants::SMGR_TRUNCATE_FSM) != 0 {
let rel = RelTag {
@@ -610,7 +594,7 @@ impl<'a, R: Repository> WalIngest<'a, R> {
info!("Partial truncation of FSM is not supported");
}
let num_fsm_blocks = 0;
self.put_rel_truncation(writer, rel, num_fsm_blocks)?;
self.put_rel_truncation(modification, rel, num_fsm_blocks)?;
}
if (rec.flags & pg_constants::SMGR_TRUNCATE_VM) != 0 {
let rel = RelTag {
@@ -629,7 +613,7 @@ impl<'a, R: Repository> WalIngest<'a, R> {
info!("Partial truncation of VM is not supported");
}
let num_vm_blocks = 0;
self.put_rel_truncation(writer, rel, num_vm_blocks)?;
self.put_rel_truncation(modification, rel, num_vm_blocks)?;
}
Ok(())
}
@@ -638,7 +622,7 @@ impl<'a, R: Repository> WalIngest<'a, R> {
///
fn ingest_xact_record(
&mut self,
writer: &mut DatadirTimelineWriter<R>,
modification: &mut DatadirModification<R>,
parsed: &XlXactParsedRecord,
is_commit: bool,
) -> Result<()> {
@@ -654,7 +638,7 @@ impl<'a, R: Repository> WalIngest<'a, R> {
// This subxact goes to a different page. Write the record
// for all the XIDs on the previous page, and continue
// accumulating XIDs on this new page.
writer.put_slru_wal_record(
modification.put_slru_wal_record(
SlruKind::Clog,
segno,
rpageno,
@@ -671,7 +655,7 @@ impl<'a, R: Repository> WalIngest<'a, R> {
rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
page_xids.push(*subxact);
}
writer.put_slru_wal_record(
modification.put_slru_wal_record(
SlruKind::Clog,
segno,
rpageno,
@@ -691,8 +675,8 @@ impl<'a, R: Repository> WalIngest<'a, R> {
relnode: xnode.relnode,
};
let last_lsn = self.timeline.get_last_record_lsn();
if writer.get_rel_exists(rel, last_lsn)? {
self.put_rel_drop(writer, rel)?;
if modification.tline.get_rel_exists(rel, last_lsn)? {
self.put_rel_drop(modification, rel)?;
}
}
}
@@ -701,7 +685,7 @@ impl<'a, R: Repository> WalIngest<'a, R> {
fn ingest_clog_truncate_record(
&mut self,
timeline: &mut DatadirTimelineWriter<R>,
modification: &mut DatadirModification<R>,
xlrec: &XlClogTruncate,
) -> Result<()> {
info!(
@@ -742,11 +726,14 @@ impl<'a, R: Repository> WalIngest<'a, R> {
// will block waiting for the last valid LSN to advance up to
// it. So we use the previous record's LSN in the get calls
// instead.
let req_lsn = timeline.get_last_record_lsn();
for segno in timeline.list_slru_segments(SlruKind::Clog, req_lsn)? {
let req_lsn = modification.tline.get_last_record_lsn();
for segno in modification
.tline
.list_slru_segments(SlruKind::Clog, req_lsn)?
{
let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT;
if slru_may_delete_clogsegment(segpage, xlrec.pageno) {
timeline.drop_slru_segment(SlruKind::Clog, segno)?;
modification.drop_slru_segment(SlruKind::Clog, segno)?;
trace!("Drop CLOG segment {:>04X}", segno);
}
}
@@ -756,7 +743,7 @@ impl<'a, R: Repository> WalIngest<'a, R> {
fn ingest_multixact_create_record(
&mut self,
timeline: &mut DatadirTimelineWriter<R>,
modification: &mut DatadirModification<R>,
xlrec: &XlMultiXactCreate,
) -> Result<()> {
// Create WAL record for updating the multixact-offsets page
@@ -764,7 +751,7 @@ impl<'a, R: Repository> WalIngest<'a, R> {
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
timeline.put_slru_wal_record(
modification.put_slru_wal_record(
SlruKind::MultiXactOffsets,
segno,
rpageno,
@@ -798,7 +785,7 @@ impl<'a, R: Repository> WalIngest<'a, R> {
}
let n_this_page = this_page_members.len();
timeline.put_slru_wal_record(
modification.put_slru_wal_record(
SlruKind::MultiXactMembers,
pageno / pg_constants::SLRU_PAGES_PER_SEGMENT,
pageno % pg_constants::SLRU_PAGES_PER_SEGMENT,
@@ -835,7 +822,7 @@ impl<'a, R: Repository> WalIngest<'a, R> {
fn ingest_multixact_truncate_record(
&mut self,
timeline: &mut DatadirTimelineWriter<R>,
modification: &mut DatadirModification<R>,
xlrec: &XlMultiXactTruncate,
) -> Result<()> {
self.checkpoint.oldestMulti = xlrec.end_trunc_off;
@@ -851,7 +838,7 @@ impl<'a, R: Repository> WalIngest<'a, R> {
// Delete all the segments except the last one. The last segment can still
// contain, possibly partially, valid data.
while segment != endsegment {
timeline.drop_slru_segment(SlruKind::MultiXactMembers, segment as u32)?;
modification.drop_slru_segment(SlruKind::MultiXactMembers, segment as u32)?;
/* move to next segment, handling wraparound correctly */
if segment == maxsegment {
@@ -869,7 +856,7 @@ impl<'a, R: Repository> WalIngest<'a, R> {
fn ingest_relmap_page(
&mut self,
timeline: &mut DatadirTimelineWriter<R>,
modification: &mut DatadirModification<R>,
xlrec: &XlRelmapUpdate,
decoded: &DecodedWALRecord,
) -> Result<()> {
@@ -878,58 +865,62 @@ impl<'a, R: Repository> WalIngest<'a, R> {
// skip xl_relmap_update
buf.advance(12);
timeline.put_relmap_file(xlrec.tsid, xlrec.dbid, Bytes::copy_from_slice(&buf[..]))?;
modification.put_relmap_file(xlrec.tsid, xlrec.dbid, Bytes::copy_from_slice(&buf[..]))?;
Ok(())
}
fn put_rel_creation(
&mut self,
writer: &mut DatadirTimelineWriter<R>,
modification: &mut DatadirModification<R>,
rel: RelTag,
) -> Result<()> {
self.relsize_cache.insert(rel, 0);
writer.put_rel_creation(rel, 0)?;
modification.put_rel_creation(rel, 0)?;
Ok(())
}
fn put_rel_page_image(
&mut self,
writer: &mut DatadirTimelineWriter<R>,
modification: &mut DatadirModification<R>,
rel: RelTag,
blknum: BlockNumber,
img: Bytes,
) -> Result<()> {
self.handle_rel_extend(writer, rel, blknum)?;
writer.put_rel_page_image(rel, blknum, img)?;
self.handle_rel_extend(modification, rel, blknum)?;
modification.put_rel_page_image(rel, blknum, img)?;
Ok(())
}
fn put_rel_wal_record(
&mut self,
writer: &mut DatadirTimelineWriter<R>,
modification: &mut DatadirModification<R>,
rel: RelTag,
blknum: BlockNumber,
rec: ZenithWalRecord,
) -> Result<()> {
self.handle_rel_extend(writer, rel, blknum)?;
writer.put_rel_wal_record(rel, blknum, rec)?;
self.handle_rel_extend(modification, rel, blknum)?;
modification.put_rel_wal_record(rel, blknum, rec)?;
Ok(())
}
fn put_rel_truncation(
&mut self,
writer: &mut DatadirTimelineWriter<R>,
modification: &mut DatadirModification<R>,
rel: RelTag,
nblocks: BlockNumber,
) -> Result<()> {
writer.put_rel_truncation(rel, nblocks)?;
modification.put_rel_truncation(rel, nblocks)?;
self.relsize_cache.insert(rel, nblocks);
Ok(())
}
fn put_rel_drop(&mut self, writer: &mut DatadirTimelineWriter<R>, rel: RelTag) -> Result<()> {
writer.put_rel_drop(rel)?;
fn put_rel_drop(
&mut self,
modification: &mut DatadirModification<R>,
rel: RelTag,
) -> Result<()> {
modification.put_rel_drop(rel)?;
self.relsize_cache.remove(&rel);
Ok(())
}
@@ -951,7 +942,7 @@ impl<'a, R: Repository> WalIngest<'a, R> {
fn handle_rel_extend(
&mut self,
writer: &mut DatadirTimelineWriter<R>,
modification: &mut DatadirModification<R>,
rel: RelTag,
blknum: BlockNumber,
) -> Result<()> {
@@ -965,7 +956,7 @@ impl<'a, R: Repository> WalIngest<'a, R> {
let last_lsn = self.timeline.get_last_record_lsn();
let nblocks = if !self.timeline.get_rel_exists(rel, last_lsn)? {
// create it with 0 size initially, the logic below will extend it
writer.put_rel_creation(rel, 0)?;
modification.put_rel_creation(rel, 0)?;
0
} else {
self.timeline.get_rel_size(rel, last_lsn)?
@@ -976,11 +967,11 @@ impl<'a, R: Repository> WalIngest<'a, R> {
if new_nblocks > old_nblocks {
//info!("extending {} {} to {}", rel, old_nblocks, new_nblocks);
writer.put_rel_extend(rel, new_nblocks)?;
modification.put_rel_extend(rel, new_nblocks)?;
// fill the gap with zeros
for gap_blknum in old_nblocks..blknum {
writer.put_rel_page_image(rel, gap_blknum, ZERO_PAGE.clone())?;
modification.put_rel_page_image(rel, gap_blknum, ZERO_PAGE.clone())?;
}
self.relsize_cache.insert(rel, new_nblocks);
}
@@ -989,20 +980,20 @@ impl<'a, R: Repository> WalIngest<'a, R> {
fn put_slru_page_image(
&mut self,
writer: &mut DatadirTimelineWriter<R>,
modification: &mut DatadirModification<R>,
kind: SlruKind,
segno: u32,
blknum: BlockNumber,
img: Bytes,
) -> Result<()> {
self.handle_slru_extend(writer, kind, segno, blknum)?;
writer.put_slru_page_image(kind, segno, blknum, img)?;
self.handle_slru_extend(modification, kind, segno, blknum)?;
modification.put_slru_page_image(kind, segno, blknum, img)?;
Ok(())
}
fn handle_slru_extend(
&mut self,
writer: &mut DatadirTimelineWriter<R>,
modification: &mut DatadirModification<R>,
kind: SlruKind,
segno: u32,
blknum: BlockNumber,
@@ -1021,7 +1012,7 @@ impl<'a, R: Repository> WalIngest<'a, R> {
.get_slru_segment_exists(kind, segno, last_lsn)?
{
// create it with 0 size initially, the logic below will extend it
writer.put_slru_segment_creation(kind, segno, 0)?;
modification.put_slru_segment_creation(kind, segno, 0)?;
0
} else {
self.timeline.get_slru_segment_size(kind, segno, last_lsn)?
@@ -1035,11 +1026,11 @@ impl<'a, R: Repository> WalIngest<'a, R> {
old_nblocks,
new_nblocks
);
writer.put_slru_extend(kind, segno, new_nblocks)?;
modification.put_slru_extend(kind, segno, new_nblocks)?;
// fill the gap with zeros
for gap_blknum in old_nblocks..blknum {
writer.put_slru_page_image(kind, segno, gap_blknum, ZERO_PAGE.clone())?;
modification.put_slru_page_image(kind, segno, gap_blknum, ZERO_PAGE.clone())?;
}
}
Ok(())
@@ -1072,10 +1063,10 @@ mod tests {
static ZERO_CHECKPOINT: Bytes = Bytes::from_static(&[0u8; SIZEOF_CHECKPOINT]);
fn init_walingest_test<R: Repository>(tline: &DatadirTimeline<R>) -> Result<WalIngest<R>> {
let mut writer = tline.begin_record(Lsn(0x10));
writer.put_checkpoint(ZERO_CHECKPOINT.clone())?;
writer.put_relmap_file(0, 111, Bytes::from(""))?; // dummy relmapper file
writer.finish()?;
let mut m = tline.begin_modification(Lsn(0x10));
m.put_checkpoint(ZERO_CHECKPOINT.clone())?;
m.put_relmap_file(0, 111, Bytes::from(""))?; // dummy relmapper file
m.commit()?;
let walingest = WalIngest::new(tline, Lsn(0x10))?;
Ok(walingest)
@@ -1087,27 +1078,25 @@ mod tests {
let tline = create_test_timeline(repo, TIMELINE_ID)?;
let mut walingest = init_walingest_test(&tline)?;
let mut writer = tline.begin_record(Lsn(0x20));
walingest.put_rel_creation(&mut writer, TESTREL_A)?;
walingest.put_rel_page_image(&mut writer, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"))?;
writer.finish()?;
let mut writer = tline.begin_record(Lsn(0x30));
walingest.put_rel_page_image(&mut writer, TESTREL_A, 0, TEST_IMG("foo blk 0 at 3"))?;
writer.finish()?;
let mut writer = tline.begin_record(Lsn(0x40));
walingest.put_rel_page_image(&mut writer, TESTREL_A, 1, TEST_IMG("foo blk 1 at 4"))?;
writer.finish()?;
let mut writer = tline.begin_record(Lsn(0x50));
walingest.put_rel_page_image(&mut writer, TESTREL_A, 2, TEST_IMG("foo blk 2 at 5"))?;
writer.finish()?;
let mut m = tline.begin_modification(Lsn(0x20));
walingest.put_rel_creation(&mut m, TESTREL_A)?;
walingest.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"))?;
m.commit()?;
let mut m = tline.begin_modification(Lsn(0x30));
walingest.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 3"))?;
m.commit()?;
let mut m = tline.begin_modification(Lsn(0x40));
walingest.put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1 at 4"))?;
m.commit()?;
let mut m = tline.begin_modification(Lsn(0x50));
walingest.put_rel_page_image(&mut m, TESTREL_A, 2, TEST_IMG("foo blk 2 at 5"))?;
m.commit()?;
assert_current_logical_size(&tline, Lsn(0x50));
// The relation was created at LSN 2, not visible at LSN 1 yet.
assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x10))?, false);
// FIXME: should error out?
//assert!(tline.get_rel_size(TESTREL_A, Lsn(0x10))?.is_none());
assert!(tline.get_rel_size(TESTREL_A, Lsn(0x10)).is_err());
assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x20))?, true);
assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x20))?, 1);
@@ -1147,9 +1136,9 @@ mod tests {
);
// Truncate last block
let mut writer = tline.begin_record(Lsn(0x60));
walingest.put_rel_truncation(&mut writer, TESTREL_A, 2)?;
writer.finish()?;
let mut m = tline.begin_modification(Lsn(0x60));
walingest.put_rel_truncation(&mut m, TESTREL_A, 2)?;
m.commit()?;
assert_current_logical_size(&tline, Lsn(0x60));
// Check reported size and contents after truncation
@@ -1171,15 +1160,15 @@ mod tests {
);
// Truncate to zero length
let mut writer = tline.begin_record(Lsn(0x68));
walingest.put_rel_truncation(&mut writer, TESTREL_A, 0)?;
writer.finish()?;
let mut m = tline.begin_modification(Lsn(0x68));
walingest.put_rel_truncation(&mut m, TESTREL_A, 0)?;
m.commit()?;
assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x68))?, 0);
// Extend from 0 to 2 blocks, leaving a gap
let mut writer = tline.begin_record(Lsn(0x70));
walingest.put_rel_page_image(&mut writer, TESTREL_A, 1, TEST_IMG("foo blk 1"))?;
writer.finish()?;
let mut m = tline.begin_modification(Lsn(0x70));
walingest.put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1"))?;
m.commit()?;
assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x70))?, 2);
assert_eq!(
tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x70))?,
@@ -1191,9 +1180,9 @@ mod tests {
);
// Extend a lot more, leaving a big gap that spans across segments
let mut writer = tline.begin_record(Lsn(0x80));
walingest.put_rel_page_image(&mut writer, TESTREL_A, 1500, TEST_IMG("foo blk 1500"))?;
writer.finish()?;
let mut m = tline.begin_modification(Lsn(0x80));
walingest.put_rel_page_image(&mut m, TESTREL_A, 1500, TEST_IMG("foo blk 1500"))?;
m.commit()?;
assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x80))?, 1501);
for blk in 2..1500 {
assert_eq!(
@@ -1217,18 +1206,18 @@ mod tests {
let tline = create_test_timeline(repo, TIMELINE_ID)?;
let mut walingest = init_walingest_test(&tline)?;
let mut writer = tline.begin_record(Lsn(0x20));
walingest.put_rel_page_image(&mut writer, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"))?;
writer.finish()?;
let mut m = tline.begin_modification(Lsn(0x20));
walingest.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"))?;
m.commit()?;
// Check that rel exists and size is correct
assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x20))?, true);
assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x20))?, 1);
// Drop rel
let mut writer = tline.begin_record(Lsn(0x30));
walingest.put_rel_drop(&mut writer, TESTREL_A)?;
writer.finish()?;
let mut m = tline.begin_modification(Lsn(0x30));
walingest.put_rel_drop(&mut m, TESTREL_A)?;
m.commit()?;
// Check that rel is not visible anymore
assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x30))?, false);
@@ -1237,9 +1226,9 @@ mod tests {
//assert!(tline.get_rel_size(TESTREL_A, Lsn(0x30))?.is_none());
// Re-create it
let mut writer = tline.begin_record(Lsn(0x40));
walingest.put_rel_page_image(&mut writer, TESTREL_A, 0, TEST_IMG("foo blk 0 at 4"))?;
writer.finish()?;
let mut m = tline.begin_modification(Lsn(0x40));
walingest.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 4"))?;
m.commit()?;
// Check that rel exists and size is correct
assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x40))?, true);
@@ -1257,23 +1246,18 @@ mod tests {
let tline = create_test_timeline(repo, TIMELINE_ID)?;
let mut walingest = init_walingest_test(&tline)?;
//from storage_layer.rs
const RELISH_SEG_SIZE: u32 = 10 * 1024 * 1024 / 8192;
let relsize = RELISH_SEG_SIZE * 2;
// Create relation with relsize blocks
let mut writer = tline.begin_record(Lsn(0x20));
// Create a 20 MB relation (the size is arbitrary)
let relsize = 20 * 1024 * 1024 / 8192;
let mut m = tline.begin_modification(Lsn(0x20));
for blkno in 0..relsize {
let data = format!("foo blk {} at {}", blkno, Lsn(0x20));
walingest.put_rel_page_image(&mut writer, TESTREL_A, blkno, TEST_IMG(&data))?;
walingest.put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data))?;
}
writer.finish()?;
m.commit()?;
// The relation was created at LSN 20, not visible at LSN 1 yet.
assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x10))?, false);
// FIXME: should fail
// assert!(tline.get_rel_size(TESTREL_A, Lsn(0x10))?.is_none());
assert!(tline.get_rel_size(TESTREL_A, Lsn(0x10)).is_err());
assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x20))?, true);
assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x20))?, relsize);
@@ -1290,9 +1274,9 @@ mod tests {
// Truncate relation so that second segment was dropped
// - only leave one page
let mut writer = tline.begin_record(Lsn(0x60));
walingest.put_rel_truncation(&mut writer, TESTREL_A, 1)?;
writer.finish()?;
let mut m = tline.begin_modification(Lsn(0x60));
walingest.put_rel_truncation(&mut m, TESTREL_A, 1)?;
m.commit()?;
// Check reported size and contents after truncation
assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x60))?, 1);
@@ -1320,12 +1304,12 @@ mod tests {
// Extend relation again.
// Add enough blocks to create second segment
let lsn = Lsn(0x80);
let mut writer = tline.begin_record(lsn);
let mut m = tline.begin_modification(lsn);
for blkno in 0..relsize {
let data = format!("foo blk {} at {}", blkno, lsn);
walingest.put_rel_page_image(&mut writer, TESTREL_A, blkno, TEST_IMG(&data))?;
walingest.put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data))?;
}
writer.finish()?;
m.commit()?;
assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x80))?, true);
assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x80))?, relsize);
@@ -1353,10 +1337,10 @@ mod tests {
let mut lsn = 0x10;
for blknum in 0..pg_constants::RELSEG_SIZE + 1 {
lsn += 0x10;
let mut writer = tline.begin_record(Lsn(lsn));
let mut m = tline.begin_modification(Lsn(lsn));
let img = TEST_IMG(&format!("foo blk {} at {}", blknum, Lsn(lsn)));
walingest.put_rel_page_image(&mut writer, TESTREL_A, blknum as BlockNumber, img)?;
writer.finish()?;
walingest.put_rel_page_image(&mut m, TESTREL_A, blknum as BlockNumber, img)?;
m.commit()?;
}
assert_current_logical_size(&tline, Lsn(lsn));
@@ -1368,9 +1352,9 @@ mod tests {
// Truncate one block
lsn += 0x10;
let mut writer = tline.begin_record(Lsn(lsn));
walingest.put_rel_truncation(&mut writer, TESTREL_A, pg_constants::RELSEG_SIZE)?;
writer.finish()?;
let mut m = tline.begin_modification(Lsn(lsn));
walingest.put_rel_truncation(&mut m, TESTREL_A, pg_constants::RELSEG_SIZE)?;
m.commit()?;
assert_eq!(
tline.get_rel_size(TESTREL_A, Lsn(lsn))?,
pg_constants::RELSEG_SIZE
@@ -1379,9 +1363,9 @@ mod tests {
// Truncate another block
lsn += 0x10;
let mut writer = tline.begin_record(Lsn(lsn));
walingest.put_rel_truncation(&mut writer, TESTREL_A, pg_constants::RELSEG_SIZE - 1)?;
writer.finish()?;
let mut m = tline.begin_modification(Lsn(lsn));
walingest.put_rel_truncation(&mut m, TESTREL_A, pg_constants::RELSEG_SIZE - 1)?;
m.commit()?;
assert_eq!(
tline.get_rel_size(TESTREL_A, Lsn(lsn))?,
pg_constants::RELSEG_SIZE - 1
@@ -1393,9 +1377,9 @@ mod tests {
let mut size: i32 = 3000;
while size >= 0 {
lsn += 0x10;
let mut writer = tline.begin_record(Lsn(lsn));
walingest.put_rel_truncation(&mut writer, TESTREL_A, size as BlockNumber)?;
writer.finish()?;
let mut m = tline.begin_modification(Lsn(lsn));
walingest.put_rel_truncation(&mut m, TESTREL_A, size as BlockNumber)?;
m.commit()?;
assert_eq!(
tline.get_rel_size(TESTREL_A, Lsn(lsn))?,
size as BlockNumber

View File

@@ -32,6 +32,7 @@ use tracing::*;
use zenith_utils::lsn::Lsn;
use zenith_utils::pq_proto::ZenithFeedback;
use zenith_utils::zid::ZTenantId;
use zenith_utils::zid::ZTenantTimelineId;
use zenith_utils::zid::ZTimelineId;
//
@@ -69,7 +70,7 @@ pub fn launch_wal_receiver(
match receivers.get_mut(&(tenantid, timelineid)) {
Some(receiver) => {
info!("wal receiver already running, updating connection string");
debug!("wal receiver already running, updating connection string");
receiver.wal_producer_connstr = wal_producer_connstr.into();
}
None => {
@@ -78,9 +79,11 @@ pub fn launch_wal_receiver(
Some(tenantid),
Some(timelineid),
"WAL receiver thread",
false,
move || {
IS_WAL_RECEIVER.with(|c| c.set(true));
thread_main(conf, tenantid, timelineid)
thread_main(conf, tenantid, timelineid);
Ok(())
},
)?;
@@ -110,20 +113,16 @@ fn get_wal_producer_connstr(tenantid: ZTenantId, timelineid: ZTimelineId) -> Str
//
// This is the entry point for the WAL receiver thread.
//
fn thread_main(
conf: &'static PageServerConf,
tenantid: ZTenantId,
timelineid: ZTimelineId,
) -> Result<()> {
let _enter = info_span!("WAL receiver", timeline = %timelineid, tenant = %tenantid).entered();
fn thread_main(conf: &'static PageServerConf, tenant_id: ZTenantId, timeline_id: ZTimelineId) {
let _enter = info_span!("WAL receiver", timeline = %timeline_id, tenant = %tenant_id).entered();
info!("WAL receiver thread started");
// Look up the current WAL producer address
let wal_producer_connstr = get_wal_producer_connstr(tenantid, timelineid);
let wal_producer_connstr = get_wal_producer_connstr(tenant_id, timeline_id);
// Make a connection to the WAL safekeeper, or directly to the primary PostgreSQL server,
// and start streaming WAL from it.
let res = walreceiver_main(conf, tenantid, timelineid, &wal_producer_connstr);
let res = walreceiver_main(conf, tenant_id, timeline_id, &wal_producer_connstr);
// TODO cleanup info messages
if let Err(e) = res {
@@ -131,22 +130,21 @@ fn thread_main(
} else {
info!(
"walreceiver disconnected tenant {}, timelineid {}",
tenantid, timelineid
tenant_id, timeline_id
);
}
// Drop it from the list of active WAL_RECEIVERS
// so that the next callmemaybe request launches a new thread
drop_wal_receiver(tenantid, timelineid);
Ok(())
drop_wal_receiver(tenant_id, timeline_id);
}
fn walreceiver_main(
_conf: &PageServerConf,
tenantid: ZTenantId,
timelineid: ZTimelineId,
tenant_id: ZTenantId,
timeline_id: ZTimelineId,
wal_producer_connstr: &str,
) -> Result<(), Error> {
) -> anyhow::Result<(), Error> {
// Connect to the database in replication mode.
info!("connecting to {:?}", wal_producer_connstr);
let connect_cfg = format!(
@@ -183,13 +181,16 @@ fn walreceiver_main(
let end_of_wal = Lsn::from(u64::from(identify.xlogpos));
let mut caught_up = false;
let repo = tenant_mgr::get_repository_for_tenant(tenant_id)
.with_context(|| format!("no repository found for tenant {}", tenant_id))?;
let timeline =
tenant_mgr::get_timeline_for_tenant(tenantid, timelineid).with_context(|| {
tenant_mgr::get_timeline_for_tenant_load(tenant_id, timeline_id).with_context(|| {
format!(
"Can not start the walrecever for a remote tenant {}, timeline {}",
tenantid, timelineid,
"local timeline {} not found for tenant {}",
timeline_id, tenant_id
)
})?;
let remote_index = repo.get_remote_index();
//
// Start streaming the WAL, from where we left off previously.
@@ -252,7 +253,7 @@ fn walreceiver_main(
// It is important to deal with aligned records, because the LSN in getPage@LSN is
// aligned and can be several bytes larger. Without this alignment we are
// at risk of hitting a deadlock.
assert!(lsn.is_aligned());
anyhow::ensure!(lsn.is_aligned());
walingest.ingest_record(&timeline, recdata, lsn)?;
@@ -294,11 +295,19 @@ fn walreceiver_main(
};
if let Some(last_lsn) = status_update {
let timeline_synced_disk_consistent_lsn =
tenant_mgr::get_repository_for_tenant(tenantid)?
.get_timeline_state(timelineid)
.and_then(|state| state.remote_disk_consistent_lsn())
.unwrap_or(Lsn(0));
let timeline_remote_consistent_lsn = runtime.block_on(async {
remote_index
.read()
.await
// here we either do not have this timeline in the remote index
// or there have been no checkpoints for it yet
.timeline_entry(&ZTenantTimelineId {
tenant_id,
timeline_id,
})
.and_then(|e| e.disk_consistent_lsn())
.unwrap_or(Lsn(0)) // no checkpoint was uploaded
});
// The last LSN we processed. It is not guaranteed to survive pageserver crash.
let write_lsn = u64::from(last_lsn);
@@ -306,7 +315,7 @@ fn walreceiver_main(
let flush_lsn = u64::from(timeline.tline.get_disk_consistent_lsn());
// The last LSN that is synced to remote storage and is guaranteed to survive pageserver crash
// Used by safekeepers to remove WAL preceding `remote_consistent_lsn`.
let apply_lsn = u64::from(timeline_synced_disk_consistent_lsn);
let apply_lsn = u64::from(timeline_remote_consistent_lsn);
let ts = SystemTime::now();
// Send zenith feedback message.

View File

@@ -43,7 +43,7 @@ use zenith_utils::zid::ZTenantId;
use crate::config::PageServerConf;
use crate::pgdatadir_mapping::{key_to_rel_block, key_to_slru_block};
use crate::relish::*;
use crate::reltag::{RelTag, SlruKind};
use crate::repository::Key;
use crate::walrecord::ZenithWalRecord;
use postgres_ffi::nonrelfile_utils::mx_offset_to_flags_bitshift;
@@ -344,13 +344,16 @@ impl PostgresRedoManager {
ZenithWalRecord::Postgres {
will_init: _,
rec: _,
} => panic!("tried to pass postgres wal record to zenith WAL redo"),
} => {
error!("tried to pass postgres wal record to zenith WAL redo");
return Err(WalRedoError::InvalidRequest);
}
ZenithWalRecord::ClearVisibilityMapFlags {
new_heap_blkno,
old_heap_blkno,
flags,
} => {
// sanity check that this is modifying the correct relish
// sanity check that this is modifying the correct relation
let (rel, blknum) = key_to_rel_block(key).or(Err(WalRedoError::InvalidRecord))?;
assert!(
rel.forknum == pg_constants::VISIBILITYMAP_FORKNUM,
@@ -563,20 +566,23 @@ impl PostgresRedoProcess {
}
info!("running initdb in {:?}", datadir.display());
let initdb = Command::new(conf.pg_bin_dir().join("initdb"))
.args(&["-D", datadir.to_str().unwrap()])
.args(&["-D", &datadir.to_string_lossy()])
.arg("-N")
.env_clear()
.env("LD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap())
.env("DYLD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap())
.env("LD_LIBRARY_PATH", conf.pg_lib_dir())
.env("DYLD_LIBRARY_PATH", conf.pg_lib_dir())
.output()
.expect("failed to execute initdb");
.map_err(|e| Error::new(e.kind(), format!("failed to execute initdb: {}", e)))?;
if !initdb.status.success() {
panic!(
"initdb failed: {}\nstderr:\n{}",
std::str::from_utf8(&initdb.stdout).unwrap(),
std::str::from_utf8(&initdb.stderr).unwrap()
);
return Err(Error::new(
ErrorKind::Other,
format!(
"initdb failed\nstdout: {}\nstderr:\n{}",
String::from_utf8_lossy(&initdb.stdout),
String::from_utf8_lossy(&initdb.stderr)
),
));
} else {
// Limit shared cache for wal-redo-postgres
let mut config = OpenOptions::new()
@@ -594,11 +600,16 @@ impl PostgresRedoProcess {
.stderr(Stdio::piped())
.stdout(Stdio::piped())
.env_clear()
.env("LD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap())
.env("DYLD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap())
.env("LD_LIBRARY_PATH", conf.pg_lib_dir())
.env("DYLD_LIBRARY_PATH", conf.pg_lib_dir())
.env("PGDATA", &datadir)
.spawn()
.expect("postgres --wal-redo command failed to start");
.map_err(|e| {
Error::new(
e.kind(),
format!("postgres --wal-redo command failed to start: {}", e),
)
})?;
info!(
"launched WAL redo postgres process on {:?}",
@@ -658,7 +669,10 @@ impl PostgresRedoProcess {
{
build_apply_record_msg(*lsn, postgres_rec, &mut writebuf);
} else {
panic!("tried to pass zenith wal record to postgres WAL redo");
return Err(Error::new(
ErrorKind::Other,
"tried to pass zenith wal record to postgres WAL redo",
));
}
}
build_get_page_msg(tag, &mut writebuf);

View File

@@ -17,8 +17,8 @@ log = "0.4.14"
memoffset = "0.6.2"
thiserror = "1.0"
serde = { version = "1.0", features = ["derive"] }
workspace_hack = { path = "../workspace_hack" }
zenith_utils = { path = "../zenith_utils" }
workspace_hack = { version = "0.1", path = "../workspace_hack" }
[build-dependencies]
bindgen = "0.59.1"

View File

@@ -495,7 +495,13 @@ mod tests {
.env("DYLD_LIBRARY_PATH", &lib_path)
.output()
.unwrap();
assert!(initdb_output.status.success());
assert!(
initdb_output.status.success(),
"initdb failed. Status: '{}', stdout: '{}', stderr: '{}'",
initdb_output.status,
String::from_utf8_lossy(&initdb_output.stdout),
String::from_utf8_lossy(&initdb_output.stderr),
);
// 2. Pick WAL generated by initdb
let wal_dir = data_dir.join("pg_wal");

View File

@@ -22,13 +22,15 @@ rustls = "0.19.1"
scopeguard = "1.1.0"
serde = "1"
serde_json = "1"
socket2 = "0.4.4"
thiserror = "1.0"
tokio = { version = "1.11", features = ["macros"] }
tokio = { version = "1.17", features = ["macros"] }
tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" }
tokio-rustls = "0.22.0"
zenith_utils = { path = "../zenith_utils" }
zenith_metrics = { path = "../zenith_metrics" }
workspace_hack = { version = "0.1", path = "../workspace_hack" }
[dev-dependencies]
tokio-postgres-rustls = "0.8.0"

View File

@@ -24,7 +24,7 @@ pub enum ConnectionError {
impl UserFacingError for ConnectionError {}
/// Compute node connection params.
#[derive(Serialize, Deserialize, Debug, Default)]
#[derive(Serialize, Deserialize, Default)]
pub struct DatabaseInfo {
pub host: String,
pub port: u16,
@@ -33,6 +33,16 @@ pub struct DatabaseInfo {
pub password: Option<String>,
}
// Manually implement debug to omit personal and sensitive info
impl std::fmt::Debug for DatabaseInfo {
fn fmt(&self, fmt: &mut std::fmt::Formatter) -> std::fmt::Result {
fmt.debug_struct("DatabaseInfo")
.field("host", &self.host)
.field("port", &self.port)
.finish()
}
}
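// Illustrative sketch (not part of this change): a hypothetical unit test showing
// that the manual Debug impl above keeps credentials out of log output. Only the
// fields visible in this diff are set explicitly; the rest use Default.
#[cfg(test)]
mod debug_redaction_sketch {
    use super::*;

    #[test]
    fn debug_omits_password() {
        let info = DatabaseInfo {
            host: "localhost".into(),
            port: 5432,
            password: Some("secret".into()),
            ..Default::default()
        };
        let printed = format!("{:?}", info);
        assert!(printed.contains("localhost"));
        assert!(!printed.contains("secret"));
    }
}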
/// PostgreSQL version as [`String`].
pub type Version = String;
@@ -41,6 +51,7 @@ impl DatabaseInfo {
let host_port = format!("{}:{}", self.host, self.port);
let socket = TcpStream::connect(host_port).await?;
let socket_addr = socket.peer_addr()?;
socket2::SockRef::from(&socket).set_keepalive(true)?;
Ok((socket_addr, socket))
}

View File

@@ -107,7 +107,7 @@ impl postgres_backend::Handler for MgmtHandler {
}
fn try_process_query(pgb: &mut PostgresBackend, query_string: &str) -> anyhow::Result<()> {
println!("Got mgmt query: '{}'", query_string);
println!("Got mgmt query [redacted]"); // Content contains password, don't print it
let resp: PsqlSessionResponse = serde_json::from_str(query_string)?;

View File

@@ -50,6 +50,10 @@ pub async fn thread_main(
println!("proxy has shut down");
}
// When set for the server socket, the keepalive setting
// will be inherited by all accepted client sockets.
socket2::SockRef::from(&listener).set_keepalive(true)?;
let cancel_map = Arc::new(CancelMap::default());
loop {
let (socket, peer_addr) = listener.accept().await?;
@@ -367,4 +371,24 @@ mod tests {
Ok(())
}
#[tokio::test]
async fn keepalive_is_inherited() -> anyhow::Result<()> {
use tokio::net::{TcpListener, TcpStream};
let listener = TcpListener::bind("127.0.0.1:0").await?;
let port = listener.local_addr()?.port();
socket2::SockRef::from(&listener).set_keepalive(true)?;
let t = tokio::spawn(async move {
let (client, _) = listener.accept().await?;
let keepalive = socket2::SockRef::from(&client).keepalive()?;
anyhow::Ok(keepalive)
});
let _ = TcpStream::connect(("127.0.0.1", port)).await?;
assert!(t.await??, "keepalive should be inherited");
Ok(())
}
}

View File

@@ -4,4 +4,10 @@
# It is intended to be the primary entry point for anyone who wants to
# just set up a test environment without going into the details of Python package management
poetry install --no-root # this installs dev dependencies by default
poetry config --list
if [ -z "${CI}" ]; then
poetry install --no-root --no-interaction --ansi
else
poetry install --no-root
fi

View File

@@ -10,6 +10,8 @@ Prerequisites:
below to run from other directories.
- The zenith git repo, including the postgres submodule
(for some tests, e.g. `pg_regress`)
- Some tests (involving storage node coordination) require etcd to be installed. Follow
[`the guide`](https://etcd.io/docs/v3.5/install/) to obtain it.
### Test Organization

View File

@@ -1,10 +1,7 @@
from contextlib import closing
import asyncio
import asyncpg
import random
from fixtures.zenith_fixtures import ZenithEnv, Postgres, Safekeeper
from fixtures.zenith_fixtures import ZenithEnv, Postgres
from fixtures.log_helper import log
# Test configuration
@@ -76,5 +73,5 @@ def test_gc_aggressive(zenith_simple_env: ZenithEnv):
asyncio.run(update_and_gc(env, pg, timeline))
row = cur.execute('SELECT COUNT(*), SUM(counter) FROM foo')
cur.execute('SELECT COUNT(*), SUM(counter) FROM foo')
assert cur.fetchone() == (num_rows, updates_to_perform)

View File

@@ -1,9 +1,6 @@
import pytest
import random
import time
from fixtures.zenith_fixtures import ZenithEnvBuilder
from fixtures.log_helper import log
# Test restarting page server, while safekeeper and compute node keep

View File

@@ -1,5 +1,3 @@
from contextlib import closing
from fixtures.zenith_fixtures import ZenithEnv
from fixtures.log_helper import log

View File

@@ -1,6 +1,6 @@
from uuid import uuid4, UUID
import pytest
from fixtures.zenith_fixtures import ZenithEnv, ZenithEnvBuilder, ZenithPageserverHttpClient, zenith_binpath
from fixtures.zenith_fixtures import ZenithEnv, ZenithEnvBuilder, ZenithPageserverHttpClient
# test that we cannot override node id
@@ -39,10 +39,14 @@ def check_client(client: ZenithPageserverHttpClient, initial_tenant: UUID):
timeline_id_str = str(timeline['timeline_id'])
timeline_details = client.timeline_detail(tenant_id=tenant_id,
timeline_id=UUID(timeline_id_str))
assert timeline_details['kind'] == 'Local'
assert timeline_details['tenant_id'] == tenant_id.hex
assert timeline_details['timeline_id'] == timeline_id_str
local_timeline_details = timeline_details.get('local')
assert local_timeline_details is not None
assert local_timeline_details['timeline_state'] == 'Loaded'
def test_pageserver_http_api_client(zenith_simple_env: ZenithEnv):
env = zenith_simple_env

View File

@@ -1,11 +1,4 @@
import pytest
import random
import time
from contextlib import closing
from multiprocessing import Process, Value
from fixtures.zenith_fixtures import ZenithEnvBuilder
from fixtures.log_helper import log
# Test safekeeper sync and pageserver catch up
@@ -17,7 +10,9 @@ def test_pageserver_catchup_while_compute_down(zenith_env_builder: ZenithEnvBuil
env = zenith_env_builder.init_start()
env.zenith_cli.create_branch('test_pageserver_catchup_while_compute_down')
pg = env.postgres.create_start('test_pageserver_catchup_while_compute_down')
# Make shared_buffers large to ensure we won't query pageserver while it is down.
pg = env.postgres.create_start('test_pageserver_catchup_while_compute_down',
config_lines=['shared_buffers=512MB'])
pg_conn = pg.connect()
cur = pg_conn.cursor()

View File

@@ -1,9 +1,3 @@
import pytest
import random
import time
from contextlib import closing
from multiprocessing import Process, Value
from fixtures.zenith_fixtures import ZenithEnvBuilder
from fixtures.log_helper import log

View File

@@ -0,0 +1,183 @@
from contextlib import closing
from fixtures.zenith_fixtures import ZenithEnv
from fixtures.log_helper import log
from psycopg2.errors import UndefinedTable
from psycopg2.errors import IoError
pytest_plugins = ("fixtures.zenith_fixtures")
extensions = ["pageinspect", "zenith_test_utils", "pg_buffercache"]
#
# Validation of reading different page versions
#
def test_read_validation(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
env.zenith_cli.create_branch("test_read_validation", "empty")
pg = env.postgres.create_start("test_read_validation")
log.info("postgres is running on 'test_read_validation' branch")
with closing(pg.connect()) as con:
with con.cursor() as c:
for e in extensions:
c.execute("create extension if not exists {};".format(e))
c.execute("create table foo (c int) with (autovacuum_enabled = false)")
c.execute("insert into foo values (1)")
c.execute("select lsn, lower, upper from page_header(get_raw_page('foo', 'main', 0));")
first = c.fetchone()
c.execute("select relfilenode from pg_class where relname = 'foo'")
relfilenode = c.fetchone()[0]
c.execute("insert into foo values (2);")
c.execute("select lsn, lower, upper from page_header(get_raw_page('foo', 'main', 0));")
second = c.fetchone()
assert first != second, "Failed to update page"
log.info("Test table is populated, validating buffer cache")
c.execute(
"select count(*) from pg_buffercache where relfilenode = {}".format(relfilenode))
assert c.fetchone()[0] > 0, "No buffers cached for the test relation"
c.execute(
"select reltablespace, reldatabase, relfilenode from pg_buffercache where relfilenode = {}"
.format(relfilenode))
reln = c.fetchone()
log.info("Clear buffer cache to ensure no stale pages are brought into the cache")
c.execute("select clear_buffer_cache()")
c.execute(
"select count(*) from pg_buffercache where relfilenode = {}".format(relfilenode))
assert c.fetchone()[0] == 0, "Failed to clear buffer cache"
log.info("Cache is clear, reading stale page version")
c.execute(
"select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, '{}'))"
.format(first[0]))
direct_first = c.fetchone()
assert first == direct_first, "Failed fetch page at historic lsn"
c.execute(
"select count(*) from pg_buffercache where relfilenode = {}".format(relfilenode))
assert c.fetchone()[0] == 0, "relation buffers detected after invalidation"
log.info("Cache is clear, reading latest page version without cache")
c.execute(
"select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, NULL))"
)
direct_latest = c.fetchone()
assert second == direct_latest, "Failed fetch page at latest lsn"
c.execute(
"select count(*) from pg_buffercache where relfilenode = {}".format(relfilenode))
assert c.fetchone()[0] == 0, "relation buffers detected after invalidation"
log.info(
"Cache is clear, reading stale page version without cache using relation identifiers"
)
c.execute(
"select lsn, lower, upper from page_header(get_raw_page_at_lsn( {}, {}, {}, 0, 0, '{}' ))"
.format(reln[0], reln[1], reln[2], first[0]))
direct_first = c.fetchone()
assert first == direct_first, "Failed fetch page at historic lsn using oid"
log.info(
"Cache is clear, reading latest page version without cache using relation identifiers"
)
c.execute(
"select lsn, lower, upper from page_header(get_raw_page_at_lsn( {}, {}, {}, 0, 0, NULL ))"
.format(reln[0], reln[1], reln[2]))
direct_latest = c.fetchone()
assert second == direct_latest, "Failed fetch page at latest lsn"
c.execute('drop table foo;')
log.info(
"Relation dropped, attempting reading stale page version without cache using relation identifiers"
)
c.execute(
"select lsn, lower, upper from page_header(get_raw_page_at_lsn( {}, {}, {}, 0, 0, '{}' ))"
.format(reln[0], reln[1], reln[2], first[0]))
direct_first = c.fetchone()
assert first == direct_first, "Failed fetch page at historic lsn using oid"
log.info("Validation page inspect won't allow reading pages of dropped relations")
try:
c.execute("select * from page_header(get_raw_page('foo', 'main', 0));")
assert False, "query should have failed"
except UndefinedTable as e:
log.info("Caught an expected failure: {}".format(e))
def test_read_validation_neg(zenith_simple_env: ZenithEnv):
env = zenith_simple_env
env.zenith_cli.create_branch("test_read_validation_neg", "empty")
pg = env.postgres.create_start("test_read_validation_neg")
log.info("postgres is running on 'test_read_validation_neg' branch")
with closing(pg.connect()) as con:
with con.cursor() as c:
for e in extensions:
c.execute("create extension if not exists {};".format(e))
log.info("read a page of a missing relation")
try:
c.execute(
"select lsn, lower, upper from page_header(get_raw_page_at_lsn('Unknown', 'main', 0, '0/0'))"
)
assert False, "query should have failed"
except UndefinedTable as e:
log.info("Caught an expected failure: {}".format(e))
c.execute("create table foo (c int) with (autovacuum_enabled = false)")
c.execute("insert into foo values (1)")
log.info("read a page at lsn 0")
try:
c.execute(
"select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, '0/0'))"
)
assert False, "query should have failed"
except IoError as e:
log.info("Caught an expected failure: {}".format(e))
log.info("Pass NULL as an input")
expected = (None, None, None)
c.execute(
"select lsn, lower, upper from page_header(get_raw_page_at_lsn(NULL, 'main', 0, '0/0'))"
)
assert c.fetchone() == expected, "Expected null output"
c.execute(
"select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', NULL, 0, '0/0'))"
)
assert c.fetchone() == expected, "Expected null output"
c.execute(
"select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', NULL, '0/0'))"
)
assert c.fetchone() == expected, "Expected null output"
# This check currently fails: reading beyond EOF returns a zero page
log.info("Read beyond EOF")
c.execute(
"select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 1, NULL))"
)

View File

@@ -1,12 +1,13 @@
# It's possible to run any regular test with the local fs remote storage via
# env ZENITH_PAGESERVER_OVERRIDES="remote_storage={local_path='/tmp/zenith_zzz/'}" poetry ......
import time, shutil, os
import shutil, os
from contextlib import closing
from pathlib import Path
from uuid import UUID
from fixtures.zenith_fixtures import ZenithEnvBuilder
from fixtures.zenith_fixtures import ZenithEnvBuilder, assert_local, wait_for, wait_for_last_record_lsn, wait_for_upload
from fixtures.log_helper import log
from fixtures.utils import lsn_from_hex
import pytest
@@ -26,7 +27,6 @@ import pytest
# * queries the specific data, ensuring that it matches the one stored before
#
# The tests are done for all types of remote storage pageserver supports.
@pytest.mark.skip(reason="will be fixed with https://github.com/zenithdb/zenith/issues/1193")
@pytest.mark.parametrize('storage_type', ['local_fs', 'mock_s3'])
def test_remote_storage_backup_and_restore(zenith_env_builder: ZenithEnvBuilder, storage_type: str):
zenith_env_builder.rust_log_override = 'debug'
@@ -45,6 +45,8 @@ def test_remote_storage_backup_and_restore(zenith_env_builder: ZenithEnvBuilder,
env = zenith_env_builder.init_start()
pg = env.postgres.create_start('main')
client = env.pageserver.http_client()
tenant_id = pg.safe_psql("show zenith.zenith_tenant")[0][0]
timeline_id = pg.safe_psql("show zenith.zenith_timeline")[0][0]
@@ -54,13 +56,21 @@ def test_remote_storage_backup_and_restore(zenith_env_builder: ZenithEnvBuilder,
CREATE TABLE t1(id int primary key, secret text);
INSERT INTO t1 VALUES ({data_id}, '{data_secret}');
''')
cur.execute("SELECT pg_current_wal_flush_lsn()")
current_lsn = lsn_from_hex(cur.fetchone()[0])
# wait until pageserver receives that data
wait_for_last_record_lsn(client, UUID(tenant_id), UUID(timeline_id), current_lsn)
# run checkpoint manually to be sure that data landed in remote storage
with closing(env.pageserver.connect()) as psconn:
with psconn.cursor() as pscur:
pscur.execute(f"do_gc {tenant_id} {timeline_id}")
log.info("waiting for upload") # TODO api to check if upload is done
time.sleep(2)
pscur.execute(f"checkpoint {tenant_id} {timeline_id}")
log.info("waiting for upload")
# wait until pageserver successfully uploaded a checkpoint to remote storage
wait_for_upload(client, UUID(tenant_id), UUID(timeline_id), current_lsn)
log.info("upload is done")
##### Stop the first pageserver instance, erase all its data
env.postgres.stop_all()
@@ -73,26 +83,12 @@ def test_remote_storage_backup_and_restore(zenith_env_builder: ZenithEnvBuilder,
##### Second start, restore the data and ensure it's the same
env.pageserver.start()
client = env.pageserver.http_client()
client.timeline_attach(UUID(tenant_id), UUID(timeline_id))
# FIXME cannot handle duplicate download requests (which might be caused by repeated timeline detail calls)
# subject to fix in https://github.com/zenithdb/zenith/issues/997
time.sleep(5)
log.info("waiting for timeline redownload")
attempts = 0
while True:
timeline_details = client.timeline_detail(UUID(tenant_id), UUID(timeline_id))
assert timeline_details['timeline_id'] == timeline_id
assert timeline_details['tenant_id'] == tenant_id
if timeline_details['kind'] == 'Local':
log.info("timeline downloaded, checking its data")
break
attempts += 1
if attempts > 10:
raise Exception("timeline redownload failed")
log.debug("still waiting")
time.sleep(1)
wait_for(number_of_iterations=10,
interval=1,
func=lambda: assert_local(client, UUID(tenant_id), UUID(timeline_id)))
pg = env.postgres.create_start('main')
with closing(pg.connect()) as conn:

View File

@@ -5,15 +5,15 @@ import subprocess
import threading
from uuid import UUID
from fixtures.log_helper import log
import time
import signal
import pytest
from fixtures.zenith_fixtures import PgProtocol, PortDistributor, Postgres, ZenithEnvBuilder, ZenithPageserverHttpClient, zenith_binpath, pg_distrib_dir
from fixtures.zenith_fixtures import PgProtocol, PortDistributor, Postgres, ZenithEnvBuilder, ZenithPageserverHttpClient, assert_local, wait_for, wait_for_last_record_lsn, wait_for_upload, zenith_binpath, pg_distrib_dir
from fixtures.utils import lsn_from_hex
def assert_abs_margin_ratio(a: float, b: float, margin_ratio: float):
assert abs(a - b) / a < margin_ratio, (a, b, margin_ratio)
assert abs(a - b) / a < margin_ratio, abs(a - b) / a
@contextmanager
@@ -34,6 +34,7 @@ def new_pageserver_helper(new_pageserver_dir: pathlib.Path,
f"-c listen_pg_addr='localhost:{pg_port}'",
f"-c listen_http_addr='localhost:{http_port}'",
f"-c pg_distrib_dir='{pg_distrib_dir}'",
f"-c id=2",
f"-c remote_storage={{local_path='{remote_storage_mock_path}'}}",
]
@@ -57,20 +58,6 @@ def new_pageserver_helper(new_pageserver_dir: pathlib.Path,
os.kill(pid, signal.SIGQUIT)
def wait_for(number_of_iterations: int, interval: int, func):
last_exception = None
for i in range(number_of_iterations):
try:
res = func()
except Exception as e:
log.info("waiting for %s iteration %s failed", func, i + 1)
last_exception = e
time.sleep(interval)
continue
return res
raise Exception("timed out while waiting for %s" % func) from last_exception
@contextmanager
def pg_cur(pg):
with closing(pg.connect()) as conn:
@@ -108,13 +95,6 @@ def load(pg: Postgres, stop_event: threading.Event, load_ok_event: threading.Eve
log.info('load thread stopped')
def assert_local(pageserver_http_client: ZenithPageserverHttpClient, tenant: UUID, timeline: str):
timeline_detail = pageserver_http_client.timeline_detail(tenant, UUID(timeline))
assert timeline_detail.get('type') == "Local", timeline_detail
return timeline_detail
@pytest.mark.skip(reason="will be fixed with https://github.com/zenithdb/zenith/issues/1193")
@pytest.mark.parametrize('with_load', ['with_load', 'without_load'])
def test_tenant_relocation(zenith_env_builder: ZenithEnvBuilder,
port_distributor: PortDistributor,
@@ -129,7 +109,7 @@ def test_tenant_relocation(zenith_env_builder: ZenithEnvBuilder,
tenant = env.zenith_cli.create_tenant(UUID("74ee8b079a0e437eb0afea7d26a07209"))
log.info("tenant to relocate %s", tenant)
env.zenith_cli.create_root_branch('main', tenant_id=tenant)
env.zenith_cli.create_branch('test_tenant_relocation', tenant_id=tenant)
tenant_pg = env.postgres.create_start(branch_name='main',
@@ -141,8 +121,8 @@ def test_tenant_relocation(zenith_env_builder: ZenithEnvBuilder,
with conn.cursor() as cur:
# save timeline for later gc call
cur.execute("SHOW zenith.zenith_timeline")
timeline = cur.fetchone()[0]
log.info("timeline to relocate %s", timeline)
timeline = UUID(cur.fetchone()[0])
log.info("timeline to relocate %s", timeline.hex)
# we rely upon autocommit after each statement
# as waiting for acceptors happens there
@@ -150,6 +130,15 @@ def test_tenant_relocation(zenith_env_builder: ZenithEnvBuilder,
cur.execute("INSERT INTO t SELECT generate_series(1,1000), 'some payload'")
cur.execute("SELECT sum(key) FROM t")
assert cur.fetchone() == (500500, )
cur.execute("SELECT pg_current_wal_flush_lsn()")
current_lsn = lsn_from_hex(cur.fetchone()[0])
pageserver_http = env.pageserver.http_client()
# wait until pageserver receives that data
wait_for_last_record_lsn(pageserver_http, tenant, timeline, current_lsn)
timeline_detail = assert_local(pageserver_http, tenant, timeline)
if with_load == 'with_load':
# create load table
@@ -165,12 +154,10 @@ def test_tenant_relocation(zenith_env_builder: ZenithEnvBuilder,
# run checkpoint manually to be sure that data landed in remote storage
with closing(env.pageserver.connect()) as psconn:
with psconn.cursor() as pscur:
pscur.execute(f"do_gc {tenant.hex} {timeline}")
pscur.execute(f"checkpoint {tenant.hex} {timeline.hex}")
# ensure upload is completed
pageserver_http_client = env.pageserver.http_client()
timeline_detail = pageserver_http_client.timeline_detail(tenant, UUID(timeline))
assert timeline_detail['disk_consistent_lsn'] == timeline_detail['timeline_state']['Ready']
# wait until pageserver successfully uploaded a checkpoint to remote storage
wait_for_upload(pageserver_http, tenant, timeline, current_lsn)
log.info("inititalizing new pageserver")
# bootstrap second pageserver
@@ -182,8 +169,7 @@ def test_tenant_relocation(zenith_env_builder: ZenithEnvBuilder,
log.info("new pageserver ports pg %s http %s", new_pageserver_pg_port, new_pageserver_http_port)
pageserver_bin = pathlib.Path(zenith_binpath) / 'pageserver'
new_pageserver_http_client = ZenithPageserverHttpClient(port=new_pageserver_http_port,
auth_token=None)
new_pageserver_http = ZenithPageserverHttpClient(port=new_pageserver_http_port, auth_token=None)
with new_pageserver_helper(new_pageserver_dir,
pageserver_bin,
@@ -192,25 +178,18 @@ def test_tenant_relocation(zenith_env_builder: ZenithEnvBuilder,
new_pageserver_http_port):
# call to attach timeline to new pageserver
new_pageserver_http_client.timeline_attach(tenant, UUID(timeline))
# FIXME cannot handle duplicate download requests, subject to fix in https://github.com/zenithdb/zenith/issues/997
time.sleep(5)
# new pageserver should in sync (modulo wal tail or vacuum activity) with the old one because there was no new writes since checkpoint
new_pageserver_http.timeline_attach(tenant, timeline)
# new pageserver should be in sync (modulo wal tail or vacuum activity) with the old one because there was no new writes since checkpoint
new_timeline_detail = wait_for(
number_of_iterations=5,
interval=1,
func=lambda: assert_local(new_pageserver_http_client, tenant, timeline))
assert new_timeline_detail['timeline_state'].get('Ready'), new_timeline_detail
func=lambda: assert_local(new_pageserver_http, tenant, timeline))
# when load is active these checks can break because lsns are not static
# so let's check with some margin
if with_load == 'without_load':
# TODO revisit this once https://github.com/zenithdb/zenith/issues/1049 is fixed
assert_abs_margin_ratio(new_timeline_detail['disk_consistent_lsn'],
timeline_detail['disk_consistent_lsn'],
0.01)
assert_abs_margin_ratio(new_timeline_detail['timeline_state']['Ready'],
timeline_detail['timeline_state']['Ready'],
0.01)
assert_abs_margin_ratio(lsn_from_hex(new_timeline_detail['local']['disk_consistent_lsn']),
lsn_from_hex(timeline_detail['local']['disk_consistent_lsn']),
0.03)
# callmemaybe to start replication from safekeeper to the new pageserver
# when there is no load there is a clean checkpoint and no wal delta
@@ -219,7 +198,9 @@ def test_tenant_relocation(zenith_env_builder: ZenithEnvBuilder,
with pg_cur(PgProtocol(host='localhost', port=new_pageserver_pg_port)) as cur:
# "callmemaybe {} {} host={} port={} options='-c ztimelineid={} ztenantid={}'"
safekeeper_connstring = f"host=localhost port={env.safekeepers[0].port.pg} options='-c ztimelineid={timeline} ztenantid={tenant} pageserver_connstr=postgresql://no_user:@localhost:{new_pageserver_pg_port}'"
cur.execute("callmemaybe {} {} {}".format(tenant, timeline, safekeeper_connstring))
cur.execute("callmemaybe {} {} {}".format(tenant.hex,
timeline.hex,
safekeeper_connstring))
tenant_pg.stop()
@@ -239,7 +220,7 @@ def test_tenant_relocation(zenith_env_builder: ZenithEnvBuilder,
# detach tenant from old pageserver before we check
# that all the data is there to be sure that old pageserver
# is no longer involved, and if it is, we will see the errors
pageserver_http_client.timeline_detach(tenant, UUID(timeline))
pageserver_http.timeline_detach(tenant, timeline)
with pg_cur(tenant_pg) as cur:
# check that data is still there
@@ -251,10 +232,10 @@ def test_tenant_relocation(zenith_env_builder: ZenithEnvBuilder,
assert cur.fetchone() == (2001000, )
if with_load == 'with_load':
assert load_ok_event.wait(1)
assert load_ok_event.wait(3)
log.info('stopping load thread')
load_stop_event.set()
load_thread.join()
load_thread.join(timeout=10)
log.info('load thread stopped')
# bring old pageserver back for clean shutdown via zenith cli

View File

@@ -1,8 +1,7 @@
from contextlib import closing
from uuid import UUID
import psycopg2.extras
import psycopg2.errors
from fixtures.zenith_fixtures import ZenithEnv, ZenithEnvBuilder, Postgres
from fixtures.zenith_fixtures import ZenithEnv, ZenithEnvBuilder, Postgres, assert_local
from fixtures.log_helper import log
import time
@@ -13,8 +12,9 @@ def test_timeline_size(zenith_simple_env: ZenithEnv):
new_timeline_id = env.zenith_cli.create_branch('test_timeline_size', 'empty')
client = env.pageserver.http_client()
res = client.timeline_detail(tenant_id=env.initial_tenant, timeline_id=new_timeline_id)
assert res["current_logical_size"] == res["current_logical_size_non_incremental"]
timeline_details = assert_local(client, env.initial_tenant, new_timeline_id)
assert timeline_details['local']['current_logical_size'] == timeline_details['local'][
'current_logical_size_non_incremental']
pgmain = env.postgres.create_start("test_timeline_size")
log.info("postgres is running on 'test_timeline_size' branch")
@@ -31,12 +31,16 @@ def test_timeline_size(zenith_simple_env: ZenithEnv):
FROM generate_series(1, 10) g
""")
res = client.timeline_detail(tenant_id=env.initial_tenant, timeline_id=new_timeline_id)
assert res["current_logical_size"] == res["current_logical_size_non_incremental"]
res = assert_local(client, env.initial_tenant, new_timeline_id)
local_details = res['local']
assert local_details["current_logical_size"] == local_details[
"current_logical_size_non_incremental"]
cur.execute("TRUNCATE foo")
res = client.timeline_detail(tenant_id=env.initial_tenant, timeline_id=new_timeline_id)
assert res["current_logical_size"] == res["current_logical_size_non_incremental"]
res = assert_local(client, env.initial_tenant, new_timeline_id)
local_details = res['local']
assert local_details["current_logical_size"] == local_details[
"current_logical_size_non_incremental"]
# wait until received_lsn_lag is 0
@@ -71,8 +75,9 @@ def test_timeline_size_quota(zenith_env_builder: ZenithEnvBuilder):
new_timeline_id = env.zenith_cli.create_branch('test_timeline_size_quota')
client = env.pageserver.http_client()
res = client.timeline_detail(tenant_id=env.initial_tenant, timeline_id=new_timeline_id)
assert res["current_logical_size"] == res["current_logical_size_non_incremental"]
res = assert_local(client, env.initial_tenant, new_timeline_id)
assert res['local']["current_logical_size"] == res['local'][
"current_logical_size_non_incremental"]
pgmain = env.postgres.create_start(
"test_timeline_size_quota",

View File

@@ -13,7 +13,7 @@ from dataclasses import dataclass, field
from multiprocessing import Process, Value
from pathlib import Path
from fixtures.zenith_fixtures import PgBin, Postgres, Safekeeper, ZenithEnv, ZenithEnvBuilder, PortDistributor, SafekeeperPort, zenith_binpath, PgProtocol
from fixtures.utils import lsn_to_hex, mkdir_if_needed, lsn_from_hex
from fixtures.utils import etcd_path, lsn_to_hex, mkdir_if_needed, lsn_from_hex
from fixtures.log_helper import log
from typing import List, Optional, Any
@@ -22,6 +22,7 @@ from typing import List, Optional, Any
# succeed and data is written
def test_normal_work(zenith_env_builder: ZenithEnvBuilder):
zenith_env_builder.num_safekeepers = 3
zenith_env_builder.broker = True
env = zenith_env_builder.init_start()
env.zenith_cli.create_branch('test_wal_acceptors_normal_work')
@@ -89,29 +90,33 @@ def test_many_timelines(zenith_env_builder: ZenithEnvBuilder):
sk_metrics = [sk.http_client().get_metrics() for sk in env.safekeepers]
timeline_metrics = []
with env.pageserver.http_client() as pageserver_http:
for timeline_detail in timeline_details:
timeline_id: str = timeline_detail["timeline_id"]
for timeline_detail in timeline_details:
timeline_id: str = timeline_detail["timeline_id"]
m = TimelineMetrics(
timeline_id=timeline_id,
last_record_lsn=lsn_from_hex(timeline_detail["last_record_lsn"]),
)
for sk_m in sk_metrics:
m.flush_lsns.append(sk_m.flush_lsn_inexact[(tenant_id.hex, timeline_id)])
m.commit_lsns.append(sk_m.commit_lsn_inexact[(tenant_id.hex, timeline_id)])
local_timeline_detail = timeline_detail.get('local')
if local_timeline_detail is None:
log.debug(f"Timeline {timeline_id} is not present locally, skipping")
continue
for flush_lsn, commit_lsn in zip(m.flush_lsns, m.commit_lsns):
# Invariant. May be < when transaction is in progress.
assert commit_lsn <= flush_lsn
# We only call collect_metrics() after a transaction is confirmed by
# the compute node, which only happens after a consensus of safekeepers
# has confirmed the transaction. We assume majority consensus here.
assert (2 * sum(m.last_record_lsn <= lsn
for lsn in m.flush_lsns) > zenith_env_builder.num_safekeepers)
assert (2 * sum(m.last_record_lsn <= lsn
for lsn in m.commit_lsns) > zenith_env_builder.num_safekeepers)
timeline_metrics.append(m)
m = TimelineMetrics(
timeline_id=timeline_id,
last_record_lsn=lsn_from_hex(local_timeline_detail['last_record_lsn']),
)
for sk_m in sk_metrics:
m.flush_lsns.append(sk_m.flush_lsn_inexact[(tenant_id.hex, timeline_id)])
m.commit_lsns.append(sk_m.commit_lsn_inexact[(tenant_id.hex, timeline_id)])
for flush_lsn, commit_lsn in zip(m.flush_lsns, m.commit_lsns):
# Invariant. May be < when transaction is in progress.
assert commit_lsn <= flush_lsn, f"timeline_id={timeline_id}, timeline_detail={timeline_detail}, sk_metrics={sk_metrics}"
# We only call collect_metrics() after a transaction is confirmed by
# the compute node, which only happens after a consensus of safekeepers
# has confirmed the transaction. We assume majority consensus here.
assert (2 * sum(m.last_record_lsn <= lsn
for lsn in m.flush_lsns) > zenith_env_builder.num_safekeepers), f"timeline_id={timeline_id}, timeline_detail={timeline_detail}, sk_metrics={sk_metrics}"
assert (2 * sum(m.last_record_lsn <= lsn
for lsn in m.commit_lsns) > zenith_env_builder.num_safekeepers), f"timeline_id={timeline_id}, timeline_detail={timeline_detail}, sk_metrics={sk_metrics}"
timeline_metrics.append(m)
log.info(f"{message}: {timeline_metrics}")
return timeline_metrics
@@ -322,6 +327,49 @@ def test_race_conditions(zenith_env_builder: ZenithEnvBuilder, stop_value):
proc.join()
# Test that safekeepers push their info to the broker and learn peer status from it
@pytest.mark.skipif(etcd_path() is None, reason="requires etcd which is not present in PATH")
def test_broker(zenith_env_builder: ZenithEnvBuilder):
zenith_env_builder.num_safekeepers = 3
zenith_env_builder.broker = True
zenith_env_builder.enable_local_fs_remote_storage()
env = zenith_env_builder.init_start()
env.zenith_cli.create_branch("test_broker", "main")
pg = env.postgres.create_start('test_broker')
pg.safe_psql("CREATE TABLE t(key int primary key, value text)")
# learn zenith timeline from compute
tenant_id = pg.safe_psql("show zenith.zenith_tenant")[0][0]
timeline_id = pg.safe_psql("show zenith.zenith_timeline")[0][0]
# wait until remote_consistent_lsn gets advanced on all safekeepers
clients = [sk.http_client() for sk in env.safekeepers]
stat_before = [cli.timeline_status(tenant_id, timeline_id) for cli in clients]
log.info(f"statuses is {stat_before}")
pg.safe_psql("INSERT INTO t SELECT generate_series(1,100), 'payload'")
# force checkpoint to advance remote_consistent_lsn
with closing(env.pageserver.connect()) as psconn:
with psconn.cursor() as pscur:
pscur.execute(f"checkpoint {tenant_id} {timeline_id}")
# and wait till remote_consistent_lsn propagates to all safekeepers
started_at = time.time()
while True:
stat_after = [cli.timeline_status(tenant_id, timeline_id) for cli in clients]
if all(
lsn_from_hex(s_after.remote_consistent_lsn) > lsn_from_hex(
s_before.remote_consistent_lsn) for s_after,
s_before in zip(stat_after, stat_before)):
break
elapsed = time.time() - started_at
if elapsed > 20:
raise RuntimeError(
f"timed out waiting {elapsed:.0f}s for remote_consistent_lsn propagation: status before {stat_before}, status current {stat_after}"
)
time.sleep(0.5)
class ProposerPostgres(PgProtocol):
"""Object for running postgres without ZenithEnv"""
def __init__(self,

View File

@@ -1,9 +1,10 @@
import asyncio
import uuid
import asyncpg
import random
import time
from fixtures.zenith_fixtures import ZenithEnvBuilder, Postgres, Safekeeper
from fixtures.zenith_fixtures import ZenithEnv, ZenithEnvBuilder, Postgres, Safekeeper
from fixtures.log_helper import getLogger
from fixtures.utils import lsn_from_hex, lsn_to_hex
from typing import List
@@ -30,10 +31,6 @@ class BankClient(object):
await self.conn.execute('DROP TABLE IF EXISTS bank_log')
await self.conn.execute('CREATE TABLE bank_log(from_uid int, to_uid int, amount int)')
# TODO: Remove when https://github.com/zenithdb/zenith/issues/644 is fixed
await self.conn.execute('ALTER TABLE bank_accs SET (autovacuum_enabled = false)')
await self.conn.execute('ALTER TABLE bank_log SET (autovacuum_enabled = false)')
async def check_invariant(self):
row = await self.conn.fetchrow('SELECT sum(amount) AS sum FROM bank_accs')
assert row['sum'] == self.n_accounts * self.init_amount
@@ -139,12 +136,15 @@ async def wait_for_lsn(safekeeper: Safekeeper,
# On each iteration 1 acceptor is stopped, and 2 others should allow
# background workers execute transactions. In the end, state should remain
# consistent.
async def run_restarts_under_load(pg: Postgres, acceptors: List[Safekeeper], n_workers=10):
async def run_restarts_under_load(env: ZenithEnv,
pg: Postgres,
acceptors: List[Safekeeper],
n_workers=10):
n_accounts = 100
init_amount = 100000
max_transfer = 100
period_time = 10
iterations = 6
period_time = 4
iterations = 10
# Set timeout for this test at 5 minutes. It should be enough for test to complete
# and less than CircleCI's no_output_timeout, taking into account that this timeout
@@ -176,6 +176,11 @@ async def run_restarts_under_load(pg: Postgres, acceptors: List[Safekeeper], n_w
flush_lsn = lsn_to_hex(flush_lsn)
log.info(f'Postgres flush_lsn {flush_lsn}')
pageserver_lsn = env.pageserver.http_client().timeline_detail(
uuid.UUID(tenant_id), uuid.UUID((timeline_id)))["local"]["last_record_lsn"]
sk_ps_lag = lsn_from_hex(flush_lsn) - lsn_from_hex(pageserver_lsn)
log.info(f'Pageserver last_record_lsn={pageserver_lsn} lag={sk_ps_lag / 1024}kb')
# Wait until alive safekeepers catch up with postgres
for idx, safekeeper in enumerate(acceptors):
if idx != victim_idx:
@@ -203,9 +208,8 @@ def test_restarts_under_load(zenith_env_builder: ZenithEnvBuilder):
env = zenith_env_builder.init_start()
env.zenith_cli.create_branch('test_wal_acceptors_restarts_under_load')
pg = env.postgres.create_start('test_wal_acceptors_restarts_under_load')
# Enable backpressure with 1MB maximal lag, because we don't want to block on `wait_for_lsn()` for too long
pg = env.postgres.create_start('test_wal_acceptors_restarts_under_load',
config_lines=['max_replication_write_lag=1MB'])
asyncio.run(run_restarts_under_load(pg, env.safekeepers))
# TODO: Remove when https://github.com/zenithdb/zenith/issues/644 is fixed
pg.stop()
asyncio.run(run_restarts_under_load(env, pg, env.safekeepers))

View File

@@ -0,0 +1,38 @@
import os
import subprocess
from fixtures.utils import mkdir_if_needed
from fixtures.zenith_fixtures import (ZenithEnvBuilder,
VanillaPostgres,
PortDistributor,
PgBin,
base_dir,
vanilla_pg,
pg_distrib_dir)
from fixtures.log_helper import log
def test_wal_restore(zenith_env_builder: ZenithEnvBuilder,
test_output_dir,
port_distributor: PortDistributor):
zenith_env_builder.num_safekeepers = 1
env = zenith_env_builder.init_start()
env.zenith_cli.create_branch("test_wal_restore")
pg = env.postgres.create_start('test_wal_restore')
pg.safe_psql("create table t as select generate_series(1,1000000)")
tenant_id = pg.safe_psql("show zenith.zenith_tenant")[0][0]
env.zenith_cli.pageserver_stop()
port = port_distributor.get_port()
data_dir = os.path.join(test_output_dir, 'pgsql.restored')
restored = VanillaPostgres(data_dir, PgBin(test_output_dir), port)
subprocess.call([
'bash',
os.path.join(base_dir, 'zenith_utils/scripts/restore_from_wal.sh'),
os.path.join(pg_distrib_dir, 'bin'),
os.path.join(test_output_dir, 'repo/safekeepers/sk1/{}/*'.format(tenant_id)),
data_dir,
str(port)
])
restored.start()
assert restored.safe_psql('select count(*) from t') == [(1000000, )]
restored.stop()

View File

@@ -1,8 +1,6 @@
import json
import uuid
import requests
from psycopg2.extensions import cursor as PgCursor
from fixtures.zenith_fixtures import ZenithEnv, ZenithEnvBuilder, ZenithPageserverHttpClient
from typing import cast

View File

@@ -1,4 +1,5 @@
import os
import shutil
import subprocess
from typing import Any, List
@@ -76,3 +77,8 @@ def print_gc_result(row):
log.info(
" total: {layers_total}, needed_by_cutoff {layers_needed_by_cutoff}, needed_by_branches: {layers_needed_by_branches}, not_updated: {layers_not_updated}, removed: {layers_removed}"
.format_map(row))
# Path to the etcd binary, or None if it is not found on PATH.
def etcd_path():
return shutil.which("etcd")

View File

@@ -33,7 +33,7 @@ from typing_extensions import Literal
import requests
import backoff # type: ignore
from .utils import (get_self_dir, mkdir_if_needed, subprocess_capture)
from .utils import (etcd_path, get_self_dir, mkdir_if_needed, subprocess_capture, lsn_from_hex)
from fixtures.log_helper import log
"""
This file contains pytest fixtures. A fixture is a test resource that can be
@@ -257,7 +257,8 @@ class PgProtocol:
dbname: Optional[str] = None,
schema: Optional[str] = None,
username: Optional[str] = None,
password: Optional[str] = None) -> str:
password: Optional[str] = None,
statement_timeout_ms: Optional[int] = None) -> str:
"""
Build a libpq connection string for the Postgres instance.
"""
@@ -277,16 +278,23 @@ class PgProtocol:
if schema:
res = f"{res} options='-c search_path={schema}'"
if statement_timeout_ms:
res = f"{res} options='-c statement_timeout={statement_timeout_ms}'"
return res
# autocommit=True here by default because that's what we need most of the time
def connect(self,
*,
autocommit=True,
dbname: Optional[str] = None,
schema: Optional[str] = None,
username: Optional[str] = None,
password: Optional[str] = None) -> PgConnection:
def connect(
self,
*,
autocommit=True,
dbname: Optional[str] = None,
schema: Optional[str] = None,
username: Optional[str] = None,
password: Optional[str] = None,
# individual statement timeout in seconds; 2 minutes should be enough for our tests
statement_timeout: Optional[int] = 120
) -> PgConnection:
"""
Connect to the node.
Returns psycopg2's connection object.
@@ -294,12 +302,12 @@ class PgProtocol:
"""
conn = psycopg2.connect(
self.connstr(
dbname=dbname,
schema=schema,
username=username,
password=password,
))
self.connstr(dbname=dbname,
schema=schema,
username=username,
password=password,
statement_timeout_ms=statement_timeout *
1000 if statement_timeout else None))
# WARNING: this setting affects *all* tests!
conn.autocommit = autocommit
return conn
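The new statement_timeout plumbing reuses the existing pattern of passing per-session GUCs through libpq's options keyword (the fixture converts the 120 s default to milliseconds). The pattern in isolation, as a sketch rather than the fixture's exact defaults:

def with_statement_timeout(connstr: str, timeout_ms: int) -> str:
    # libpq forwards '-c name=value' switches given in options to the backend at session start
    return f"{connstr} options='-c statement_timeout={timeout_ms}'"

print(with_statement_timeout("host=localhost port=5432 dbname=postgres", 120_000))
# host=localhost port=5432 dbname=postgres options='-c statement_timeout=120000'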
@@ -425,7 +433,8 @@ class ZenithEnvBuilder:
num_safekeepers: int = 0,
pageserver_auth_enabled: bool = False,
rust_log_override: Optional[str] = None,
default_branch_name=DEFAULT_BRANCH_NAME):
default_branch_name=DEFAULT_BRANCH_NAME,
broker: bool = False):
self.repo_dir = repo_dir
self.rust_log_override = rust_log_override
self.port_distributor = port_distributor
@@ -434,6 +443,7 @@ class ZenithEnvBuilder:
self.num_safekeepers = num_safekeepers
self.pageserver_auth_enabled = pageserver_auth_enabled
self.default_branch_name = default_branch_name
self.broker = broker
self.env: Optional[ZenithEnv] = None
self.s3_mock_server: Optional[MockS3Server] = None
@@ -509,6 +519,8 @@ class ZenithEnvBuilder:
self.env.pageserver.stop(immediate=True)
if self.s3_mock_server:
self.s3_mock_server.kill()
if self.env.broker is not None:
self.env.broker.stop()
class ZenithEnv:
@@ -561,6 +573,16 @@ class ZenithEnv:
default_tenant_id = '{self.initial_tenant.hex}'
""")
self.broker = None
if config.broker:
# keep etcd datadir inside 'repo'
self.broker = Etcd(datadir=os.path.join(self.repo_dir, "etcd"),
port=self.port_distributor.get_port(),
peer_port=self.port_distributor.get_port())
toml += textwrap.dedent(f"""
broker_endpoints = 'http://127.0.0.1:{self.broker.port}'
""")
# Create config for pageserver
pageserver_port = PageserverPort(
pg=self.port_distributor.get_port(),
@@ -603,12 +625,15 @@ class ZenithEnv:
self.zenith_cli.init(toml)
def start(self):
# Start up the page server and all the safekeepers
# Start up the page server, all the safekeepers and the broker
self.pageserver.start()
for safekeeper in self.safekeepers:
safekeeper.start()
if self.broker is not None:
self.broker.start()
def get_safekeeper_connstrs(self) -> str:
""" Get list of safekeeper endpoints suitable for wal_acceptors GUC """
return ','.join([f'localhost:{wa.port.pg}' for wa in self.safekeepers])
@@ -873,6 +898,30 @@ class ZenithCli:
return uuid.UUID(created_timeline_id)
def create_root_branch(self, branch_name: str, tenant_id: Optional[uuid.UUID] = None):
cmd = [
'timeline',
'create',
'--branch-name',
branch_name,
'--tenant-id',
(tenant_id or self.env.initial_tenant).hex,
]
res = self.raw_cli(cmd)
res.check_returncode()
matches = CREATE_TIMELINE_ID_EXTRACTOR.search(res.stdout)
created_timeline_id = None
if matches is not None:
created_timeline_id = matches.group('timeline_id')
if created_timeline_id is None:
raise Exception('could not find timeline id after `zenith timeline create` invocation')
else:
return uuid.UUID(created_timeline_id)
def create_branch(self,
new_branch_name: str = DEFAULT_BRANCH_NAME,
ancestor_branch_name: Optional[str] = None,
@@ -1649,6 +1698,7 @@ class Safekeeper:
class SafekeeperTimelineStatus:
acceptor_epoch: int
flush_lsn: str
remote_consistent_lsn: str
@dataclass
@@ -1672,7 +1722,8 @@ class SafekeeperHttpClient(requests.Session):
res.raise_for_status()
resj = res.json()
return SafekeeperTimelineStatus(acceptor_epoch=resj['acceptor_state']['epoch'],
flush_lsn=resj['flush_lsn'])
flush_lsn=resj['flush_lsn'],
remote_consistent_lsn=resj['remote_consistent_lsn'])
def get_metrics(self) -> SafekeeperMetrics:
request_result = self.get(f"http://localhost:{self.port}/metrics")
@@ -1693,6 +1744,54 @@ class SafekeeperHttpClient(requests.Session):
return metrics
@dataclass
class Etcd:
""" An object managing etcd instance """
datadir: str
port: int
peer_port: int
handle: Optional[subprocess.Popen[Any]] = None # handle of running daemon
def check_status(self):
s = requests.Session()
s.mount('http://', requests.adapters.HTTPAdapter(max_retries=1)) # do not retry
s.get(f"http://localhost:{self.port}/health").raise_for_status()
def start(self):
pathlib.Path(self.datadir).mkdir(exist_ok=True)
etcd_full_path = etcd_path()
if etcd_full_path is None:
raise Exception('etcd not found')
with open(os.path.join(self.datadir, "etcd.log"), "wb") as log_file:
args = [
etcd_full_path,
f"--data-dir={self.datadir}",
f"--listen-client-urls=http://localhost:{self.port}",
f"--advertise-client-urls=http://localhost:{self.port}",
f"--listen-peer-urls=http://localhost:{self.peer_port}"
]
self.handle = subprocess.Popen(args, stdout=log_file, stderr=log_file)
# wait for start
started_at = time.time()
while True:
try:
self.check_status()
except Exception as e:
elapsed = time.time() - started_at
if elapsed > 5:
raise RuntimeError(f"timed out waiting {elapsed:.0f}s for etcd start: {e}")
time.sleep(0.5)
else:
break # success
def stop(self):
if self.handle is not None:
self.handle.terminate()
self.handle.wait()
def get_test_output_dir(request: Any) -> str:
""" Compute the working directory for an individual test. """
test_name = request.node.name
@@ -1846,3 +1945,63 @@ def check_restored_datadir_content(test_output_dir: str, env: ZenithEnv, pg: Pos
subprocess.run([cmd], stdout=stdout_f, shell=True)
assert (mismatch, error) == ([], [])
def wait_for(number_of_iterations: int, interval: int, func):
last_exception = None
for i in range(number_of_iterations):
try:
res = func()
except Exception as e:
log.info("waiting for %s iteration %s failed", func, i + 1)
last_exception = e
time.sleep(interval)
continue
return res
raise Exception("timed out while waiting for %s" % func) from last_exception
def assert_local(pageserver_http_client: ZenithPageserverHttpClient,
tenant: uuid.UUID,
timeline: uuid.UUID):
timeline_detail = pageserver_http_client.timeline_detail(tenant, timeline)
assert timeline_detail.get('local', {}).get("disk_consistent_lsn"), timeline_detail
return timeline_detail
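For reference, the timeline_detail payload these helpers inspect is split into 'local' and 'remote' parts; roughly (keys collected from the assertions in this diff, values are placeholders, other fields may be present):

# {
#     "tenant_id": "...", "timeline_id": "...",
#     "local": {
#         "timeline_state": "Loaded",
#         "last_record_lsn": "0/...",
#         "disk_consistent_lsn": "0/...",
#         "current_logical_size": ...,
#         "current_logical_size_non_incremental": ...,
#     },
#     "remote": {"remote_consistent_lsn": "0/..."},
# }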
def remote_consistent_lsn(pageserver_http_client: ZenithPageserverHttpClient,
tenant: uuid.UUID,
timeline: uuid.UUID) -> int:
detail = pageserver_http_client.timeline_detail(tenant, timeline)
lsn_str = detail['remote']['remote_consistent_lsn']
assert isinstance(lsn_str, str)
return lsn_from_hex(lsn_str)
def wait_for_upload(pageserver_http_client: ZenithPageserverHttpClient,
tenant: uuid.UUID,
timeline: uuid.UUID,
lsn: int):
"""waits for local timeline upload up to specified lsn"""
wait_for(10, 1, lambda: remote_consistent_lsn(pageserver_http_client, tenant, timeline) >= lsn)
def last_record_lsn(pageserver_http_client: ZenithPageserverHttpClient,
tenant: uuid.UUID,
timeline: uuid.UUID) -> int:
detail = pageserver_http_client.timeline_detail(tenant, timeline)
lsn_str = detail['local']['last_record_lsn']
assert isinstance(lsn_str, str)
return lsn_from_hex(lsn_str)
def wait_for_last_record_lsn(pageserver_http_client: ZenithPageserverHttpClient,
tenant: uuid.UUID,
timeline: uuid.UUID,
lsn: int):
"""waits for pageserver to catch up to a certain lsn"""
wait_for(10, 1, lambda: last_record_lsn(pageserver_http_client, tenant, timeline) >= lsn)
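Taken together these helpers give a test a compact "data is durable on the pageserver and in remote storage" barrier; LSNs are compared as integers via lsn_from_hex. A sketch of how they compose (the wrapper name is illustrative):

from uuid import UUID

def wait_caught_up_and_uploaded(client: ZenithPageserverHttpClient,
                                tenant_id: UUID,
                                timeline_id: UUID,
                                flush_lsn_hex: str):
    # flush_lsn_hex comes from `SELECT pg_current_wal_flush_lsn()` on the compute node
    lsn = lsn_from_hex(flush_lsn_hex)
    # pageserver has ingested WAL up to that point...
    wait_for_last_record_lsn(client, tenant_id, timeline_id, lsn)
    # ...and, after a manual `checkpoint <tenant> <timeline>`, uploaded it to remote storage
    wait_for_upload(client, tenant_id, timeline_id, lsn)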

View File

@@ -49,7 +49,15 @@ def test_random_writes(zenith_with_baseline: PgCompare):
count integer default 0
);
""")
cur.execute(f"INSERT INTO Big (pk) values (generate_series(1,{n_rows}))")
# Insert n_rows in batches to avoid query timeouts
rows_inserted = 0
while rows_inserted < n_rows:
rows_to_insert = min(1000 * 1000, n_rows - rows_inserted)
low = rows_inserted + 1
high = rows_inserted + rows_to_insert
cur.execute(f"INSERT INTO Big (pk) values (generate_series({low},{high}))")
rows_inserted += rows_to_insert
# Get table size (can't be predicted because padding and alignment)
cur.execute("SELECT pg_relation_size('Big');")

View File

@@ -17,8 +17,8 @@ import pytest
# into memory in the page server.
pytest.param(100000, 100, 0),
# Also test with a larger table, with and without parallelism
pytest.param(10000000, 1, 0, marks=pytest.mark.slow),
pytest.param(10000000, 1, 4, marks=pytest.mark.slow)
pytest.param(10000000, 1, 0),
pytest.param(10000000, 1, 4)
])
def test_seqscans(zenith_with_baseline: PgCompare, rows: int, iters: int, workers: int):
env = zenith_with_baseline

Some files were not shown because too many files have changed in this diff.