Compare commits

..

16 Commits

Author SHA1 Message Date
Konstantin Knizhnik
600588034b Store page image with the same LSN as replaced WAL record 2021-07-09 12:06:46 +03:00
Konstantin Knizhnik
9f015bdc60 Include zenith.signal file in tarball 2021-06-21 16:01:10 +03:00
Konstantin Knizhnik
c564272142 [refer #258] Handle CHECKPOINT_ONLINE WAL record 2021-06-18 23:55:59 +03:00
Konstantin Knizhnik
14a0ae5456 Start compute node without generation of new WAL segment 2021-06-17 22:55:13 +03:00
Konstantin Knizhnik
0d9805a505 Handle non-relational data in PUSH command 2021-06-17 22:49:44 +03:00
Konstantin Knizhnik
87fce3fcd5 1. Always start replication from the beginning of WAL segment (to be able to skip missed segments)
2. Do not materialize always last version of objects in GC (only when needed)
3. Fix history test
4. Fix CPU consumption in wal_keeper when connection is broken
5. Fix handling of --recall parameter in walkeeper
2021-06-17 22:49:44 +03:00
Konstantin Knizhnik
04e1ee5ce3 Correctly handle Unlink message in GC for SLRU 2021-06-17 22:49:44 +03:00
anastasia
ec675bbdd6 use correct req_lsn when gathering basebackup tar 2021-06-17 22:49:44 +03:00
Konstantin Knizhnik
a3b70745a9 Support collecting nonrel object in GC 2021-06-17 22:49:44 +03:00
Konstantin Knizhnik
41c352be7f Batch nextXid updates in object storage checkpoints 2021-06-17 22:49:44 +03:00
Konstantin Knizhnik
8a5fcf52c0 Remove special handling of shared catalogs 2021-06-17 22:49:44 +03:00
Konstantin Knizhnik
abb114decd Fix review issues:
- Add comments
- Handle members multixact wraparound
- Extract WAL segment creation (pg_resetwal) to postgres_ffi
- Fix adding prepared 2PC files to basebackup
2021-06-17 22:49:44 +03:00
Konstantin Knizhnik
e7587ceb81 Make checkpoint and pg_control records in object storage also versioned 2021-06-17 22:49:44 +03:00
Konstantin Knizhnik
6d38b9ce6a Fix unit tests after merging 2021-06-17 22:49:44 +03:00
Konstantin Knizhnik
47ef9c7ef4 Fix various bugs caused by switch to new storage model 2021-06-17 22:49:44 +03:00
Konstantin Knizhnik
d73cb49f89 Support non-rel objects 2021-06-17 22:49:44 +03:00
43 changed files with 1505 additions and 1911 deletions

View File

@@ -37,7 +37,7 @@ jobs:
command: |
if [ ! -e tmp_install/bin/postgres ]; then
sudo apt update
sudo apt install build-essential libreadline-dev zlib1g-dev flex bison libseccomp-dev
sudo apt install build-essential libreadline-dev zlib1g-dev flex bison
fi
# Build postgres if the restore_cache didn't find a build.

View File

@@ -35,7 +35,7 @@ jobs:
- name: Install postgres dependencies
run: |
sudo apt update
sudo apt install build-essential libreadline-dev zlib1g-dev flex bison libseccomp-dev
sudo apt install build-essential libreadline-dev zlib1g-dev flex bison
- name: Set pg revision for caching
id: pg_ver

3
Cargo.lock generated
View File

@@ -1,7 +1,5 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 3
[[package]]
name = "ahash"
version = "0.4.7"
@@ -2473,7 +2471,6 @@ dependencies = [
"bytes",
"hex-literal",
"log",
"postgres",
"serde",
"thiserror",
"workspace_hack",

View File

@@ -8,8 +8,3 @@ members = [
"zenith_utils",
"workspace_hack",
]
[profile.release]
# This is useful for profiling and, to some extent, debug.
# Besides, debug info should not affect the performance.
debug = true

View File

@@ -89,6 +89,9 @@ RUN addgroup zenith && adduser -h /data -D -G zenith zenith
VOLUME ["/data"]
WORKDIR /data
USER zenith
ENV ZENITH_REPO_DIR /data/
ENV POSTGRES_DISTRIB_DIR /usr/local
EXPOSE 6400
ENTRYPOINT ["/docker-entrypoint.sh"]
CMD ["pageserver"]

View File

@@ -1,12 +1,3 @@
# Seccomp BPF is only available for Linux
UNAME_S := $(shell uname -s)
ifeq ($(UNAME_S),Linux)
SECCOMP = --with-libseccomp
SECCOMP =
else
SECCOMP =
endif
#
# Top level Makefile to build Zenith and PostgreSQL
#
@@ -30,12 +21,8 @@ tmp_install/build/config.status:
+@echo "Configuring postgres build"
mkdir -p tmp_install/build
(cd tmp_install/build && \
../../vendor/postgres/configure CFLAGS='-O0 -g3 $(CFLAGS)' \
--enable-cassert \
--enable-debug \
--enable-depend \
$(SECCOMP) \
--prefix=$(abspath tmp_install) > configure.log)
../../vendor/postgres/configure CFLAGS='-O0 $(CFLAGS)' --enable-debug --enable-cassert \
--enable-depend --prefix=$(abspath tmp_install) > configure.log)
# nicer alias for running 'configure'
postgres-configure: tmp_install/build/config.status

View File

@@ -8,7 +8,7 @@ Zenith substitutes PostgreSQL storage layer and redistributes data across a clus
On Ubuntu or Debian this set of packages should be sufficient to build the code:
```text
apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \
apt install build-essential libtool libreadline-dev zlib1g-dev flex bison \
libssl-dev clang
```

View File

@@ -14,7 +14,6 @@ use std::{
use anyhow::{Context, Result};
use lazy_static::lazy_static;
use regex::Regex;
use zenith_utils::connstring::connection_host_port;
use crate::local_env::LocalEnv;
use pageserver::ZTimelineId;
@@ -290,15 +289,14 @@ impl PostgresNode {
// Connect it to the page server.
// Configure that node to take pages from pageserver
let (host, port) = connection_host_port(&self.pageserver.connection_config());
self.append_conf(
"postgresql.conf",
&format!(
"shared_preload_libraries = zenith \n\
zenith.page_server_connstring = 'host={} port={}'\n\
zenith.zenith_timeline='{}'\n",
host,
port,
self.pageserver.address().ip(),
self.pageserver.address().port(),
self.timelineid
),
)?;

View File

@@ -1,5 +1,5 @@
use std::collections::HashMap;
use std::net::{TcpStream};
use std::net::{SocketAddr, TcpStream};
use std::path::PathBuf;
use std::process::Command;
use std::thread;
@@ -8,11 +8,10 @@ use std::time::Duration;
use anyhow::{anyhow, bail, Result};
use nix::sys::signal::{kill, Signal};
use nix::unistd::Pid;
use postgres::{Config, NoTls};
use postgres::{Client, NoTls};
use crate::local_env::LocalEnv;
use crate::read_pidfile;
use zenith_utils::connstring::connection_address;
use pageserver::branches::BranchInfo;
//
@@ -22,7 +21,7 @@ use pageserver::branches::BranchInfo;
//
pub struct PageServerNode {
pub kill_on_exit: bool,
pub connection_config: Option<Config>,
pub listen_address: Option<SocketAddr>,
pub env: LocalEnv,
}
@@ -30,19 +29,15 @@ impl PageServerNode {
pub fn from_env(env: &LocalEnv) -> PageServerNode {
PageServerNode {
kill_on_exit: false,
connection_config: None, // default
listen_address: None, // default
env: env.clone(),
}
}
fn default_config() -> Config {
"postgresql://no_user@localhost:64000/no_db".parse().unwrap()
}
pub fn connection_config(&self) -> Config {
match &self.connection_config {
Some(config) => config.clone(),
None => Self::default_config(),
pub fn address(&self) -> SocketAddr {
match self.listen_address {
Some(addr) => addr,
None => "127.0.0.1:64000".parse().unwrap(),
}
}
@@ -79,7 +74,7 @@ impl PageServerNode {
pub fn start(&self) -> Result<()> {
println!(
"Starting pageserver at '{}' in {}",
connection_address(&self.connection_config()),
self.address(),
self.repo_path().display()
);
@@ -121,29 +116,42 @@ impl PageServerNode {
}
// wait for pageserver stop
let address = connection_address(&self.connection_config());
for _ in 0..5 {
let stream = TcpStream::connect(&address);
let stream = TcpStream::connect(self.address());
thread::sleep(Duration::from_secs(1));
if let Err(_e) = stream {
println!("Pageserver stopped");
return Ok(());
}
println!("Stopping pageserver on {}", address);
println!("Stopping pageserver on {}", self.address());
}
bail!("Failed to stop pageserver with pid {}", pid);
}
pub fn page_server_psql(&self, sql: &str) -> Vec<postgres::SimpleQueryMessage> {
let mut client = self.connection_config().connect(NoTls).unwrap();
let connstring = format!(
"host={} port={} dbname={} user={}",
self.address().ip(),
self.address().port(),
"no_db",
"no_user",
);
let mut client = Client::connect(connstring.as_str(), NoTls).unwrap();
println!("Pageserver query: '{}'", sql);
client.simple_query(sql).unwrap()
}
pub fn page_server_psql_client(&self) -> Result<postgres::Client, postgres::Error> {
self.connection_config().connect(NoTls)
let connstring = format!(
"host={} port={} dbname={} user={}",
self.address().ip(),
self.address().port(),
"no_db",
"no_user",
);
Client::connect(connstring.as_str(), NoTls)
}
pub fn branches_list(&self) -> Result<Vec<BranchInfo>> {

View File

@@ -2,10 +2,10 @@
if [ "$1" = 'pageserver' ]; then
if [ ! -d "/data/timelines" ]; then
echo "Initializing pageserver data directory"
pageserver --init -D /data --postgres-distrib /usr/local
pageserver --init --workdir $ZENITH_REPO_DIR
fi
echo "Staring pageserver at 0.0.0.0:6400"
pageserver -l 0.0.0.0:6400 -D /data
pageserver -l 0.0.0.0:6400 --workdir $ZENITH_REPO_DIR
else
"$@"
fi

View File

@@ -4,8 +4,8 @@
//! TODO: this module has nothing to do with PostgreSQL pg_basebackup.
//! It could use a better name.
//!
//! Stateless Postgres compute node is launched by sending tarball which contains non-relational data (multixacts, clog, filenodemaps, twophase files)
//! and generate pg_control and dummy segment of WAL. This module is responsible for creation of such tarball from snapshot directory and
//! Stateless Postgres compute node is lauched by sending taball which contains on-relational data (multixacts, clog, filenodemaps, twophase files)
//! and generate pg_control and dummy segment of WAL. This module is responsible for creation of such tarball from snapshot directry and
//! data stored in object storage.
//!
use crate::ZTimelineId;
@@ -17,15 +17,14 @@ use std::time::SystemTime;
use tar::{Builder, Header};
use walkdir::WalkDir;
use crate::object_key::*;
use crate::repository::Timeline;
use crate::repository::{DatabaseTag, ObjectTag, Timeline};
use postgres_ffi::relfile_utils::*;
use postgres_ffi::xlog_utils::*;
use postgres_ffi::*;
use zenith_utils::lsn::Lsn;
/// This is short-living object only for the time of tarball creation,
/// created mostly to avoid passing a lot of parameters between various functions
/// This is shorliving object only for the time of tarball creation,
/// created mostly to avoid passing a lot of parameters between varyouds functions
/// used for constructing tarball.
pub struct Basebackup<'a> {
ar: Builder<&'a mut dyn Write>,
@@ -56,6 +55,7 @@ impl<'a> Basebackup<'a> {
}
}
#[rustfmt::skip] // otherwise "cargo fmt" produce very strange formatting for macch arms of self.timeline.list_nonrels
pub fn send_tarball(&mut self) -> anyhow::Result<()> {
debug!("sending tarball of snapshot in {}", self.snappath);
for entry in WalkDir::new(&self.snappath) {
@@ -85,8 +85,7 @@ impl<'a> Basebackup<'a> {
trace!("sending {}", relpath.display());
self.ar.append_path_with_name(fullpath, relpath)?;
}
} else {
// relation pages are loaded on demand and should not be included in tarball
} else { // relation pages are loaded on demand and should not be included in tarball
trace!("not sending {}", relpath.display());
}
} else {
@@ -95,25 +94,26 @@ impl<'a> Basebackup<'a> {
}
// Generate non-relational files.
// Iteration is sorted order: all objects of the same time are grouped and traversed
// in key ascending order. For example all pg_xact records precede pg_multixact records and are sorted by block number.
// It allows to easily construct SLRU segments (32 blocks).
// Iteration is sorted order: all objects of the same time are grouped and traversed
// in key ascending order. For example all pg_xact records precede pg_multixact records and are sorted by block number.
// It allows to easily construct SLRU segments (32 blocks).
for obj in self.timeline.list_nonrels(self.lsn)? {
match obj {
ObjectTag::Clog(slru) => self.add_slru_segment("pg_xact", &obj, slru.blknum)?,
ObjectTag::MultiXactMembers(slru) => {
self.add_slru_segment("pg_multixact/members", &obj, slru.blknum)?
}
ObjectTag::MultiXactOffsets(slru) => {
self.add_slru_segment("pg_multixact/offsets", &obj, slru.blknum)?
}
ObjectTag::FileNodeMap(db) => self.add_relmap_file(&obj, &db)?,
ObjectTag::TwoPhase(prepare) => self.add_twophase_file(&obj, prepare.xid)?,
ObjectTag::Clog(slru) =>
self.add_slru_segment("pg_xact", &obj, slru.blknum)?,
ObjectTag::MultiXactMembers(slru) =>
self.add_slru_segment("pg_multixact/members", &obj, slru.blknum)?,
ObjectTag::MultiXactOffsets(slru) =>
self.add_slru_segment("pg_multixact/offsets", &obj, slru.blknum)?,
ObjectTag::FileNodeMap(db) =>
self.add_relmap_file(&obj, &db)?,
ObjectTag::TwoPhase(prepare) =>
self.add_twophase_file(&obj, prepare.xid)?,
_ => {}
}
}
self.finish_slru_segment()?; // write last non-completed SLRU segment (if any)
self.add_pgcontrol_file()?;
self.add_pgcontrol_file()?;
self.ar.finish()?;
debug!("all tarred up!");
Ok(())
@@ -238,9 +238,8 @@ impl<'a> Basebackup<'a> {
info!("pg_control.state = {}", pg_control.state);
pg_control.state = pg_constants::DB_SHUTDOWNED;
// add zenith.signal file
self.ar
.append(&new_tar_header("zenith.signal", 0)?, &b""[..])?;
// add zenith.signal file
self.ar.append(&new_tar_header("zenith.signal", 0)?, &b""[..])?;
//send pg_control
let pg_control_bytes = pg_control.encode();

View File

@@ -8,7 +8,7 @@ use std::{
env,
fs::{File, OpenOptions},
io,
net::TcpListener,
net::{SocketAddr, TcpListener},
path::{Path, PathBuf},
process::exit,
thread,
@@ -65,10 +65,11 @@ impl CfgFileParams {
/// Create a PageServerConf from these string parameters
fn try_into_config(&self) -> Result<PageServerConf> {
let listen_addr = match self.listen_addr.as_ref() {
Some(addr) => addr.clone(),
None => DEFAULT_LISTEN_ADDR.to_owned(),
};
let listen_addr: SocketAddr = self
.listen_addr
.as_deref()
.unwrap_or(DEFAULT_LISTEN_ADDR)
.parse()?;
let gc_horizon: u64 = match self.gc_horizon.as_ref() {
Some(horizon_str) => horizon_str.parse()?,
@@ -271,7 +272,7 @@ fn start_pageserver(conf: &'static PageServerConf) -> Result<()> {
// Check that we can bind to address before further initialization
info!("Starting pageserver on {}", conf.listen_addr);
let pageserver_listener = TcpListener::bind(conf.listen_addr.clone())?;
let pageserver_listener = TcpListener::bind(conf.listen_addr)?;
// Initialize page cache, this will spawn walredo_thread
page_cache::init(conf);

View File

@@ -100,8 +100,7 @@ pub fn init_repo(conf: &'static PageServerConf, repo_dir: &Path) -> Result<()> {
// and we failed to run initdb again in the same directory. This has been solved for the
// rapid init+start case now, but the general race condition remains if you restart the
// server quickly.
//let storage = crate::rocksdb_storage::RocksObjectStore::create(conf)?;
let storage = crate::inmem_storage::InmemObjectStore::create(conf)?;
let storage = crate::rocksdb_storage::RocksObjectStore::create(conf)?;
let repo = crate::object_repository::ObjectRepository::new(
conf,

View File

@@ -1,349 +0,0 @@
//!
//! An implementation of the ObjectStore interface, backed by BTreeMap
//!
use crate::object_key::*;
use crate::object_store::ObjectStore;
use crate::repository::RelTag;
use crate::PageServerConf;
use crate::ZTimelineId;
use anyhow::{bail, Result};
use std::collections::{BTreeMap,HashSet};
use std::sync::RwLock;
use zenith_utils::lsn::Lsn;
use std::ops::Bound::*;
use serde::{Deserialize, Serialize};
use zenith_utils::bin_ser::BeSer;
use std::io::prelude::*;
use std::fs::File;
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Deserialize, Serialize)]
pub struct StorageKey {
obj_key: ObjectKey,
lsn: Lsn,
}
impl StorageKey {
/// The first key for a given timeline
fn timeline_start(timeline: ZTimelineId) -> Self {
Self {
obj_key: ObjectKey {
timeline,
tag: ObjectTag::FirstTag,
},
lsn: Lsn(0),
}
}
}
pub struct InmemObjectStore {
conf: &'static PageServerConf,
db: RwLock<BTreeMap<StorageKey, Vec<u8>>>,
}
impl ObjectStore for InmemObjectStore {
fn get(&self, key: &ObjectKey, lsn: Lsn) -> Result<Vec<u8>> {
let db = self.db.read().unwrap();
let val = db.get(&StorageKey {
obj_key: key.clone(),
lsn,
});
if let Some(val) = val {
Ok(val.clone())
} else {
bail!("could not find page {:?}", key);
}
}
fn get_next_key(&self, key: &ObjectKey) -> Result<Option<ObjectKey>> {
let search_key = StorageKey {
obj_key: key.clone(),
lsn: Lsn(0),
};
let db = self.db.read().unwrap();
for pair in db.range(&search_key..) {
let key = pair.0;
return Ok(Some(key.obj_key.clone()));
}
Ok(None)
}
fn put(&self, key: &ObjectKey, lsn: Lsn, value: &[u8]) -> Result<()> {
let mut db = self.db.write().unwrap();
db.insert(
StorageKey {
obj_key: key.clone(),
lsn,
},
value.to_vec(),
);
Ok(())
}
fn unlink(&self, key: &ObjectKey, lsn: Lsn) -> Result<()> {
let mut db = self.db.write().unwrap();
db.remove(&StorageKey {
obj_key: key.clone(),
lsn,
});
Ok(())
}
/// Iterate through page versions of given page, starting from the given LSN.
/// The versions are walked in descending LSN order.
fn object_versions<'a>(
&'a self,
key: &ObjectKey,
lsn: Lsn,
) -> Result<Box<dyn Iterator<Item = (Lsn, Vec<u8>)> + 'a>> {
let from = StorageKey {
obj_key: key.clone(),
lsn: Lsn(0),
};
let till = StorageKey {
obj_key: key.clone(),
lsn,
};
let db = self.db.read().unwrap();
let versions: Vec<(Lsn, Vec<u8>)> = db.range(from..=till).map(|pair|(pair.0.lsn, pair.1.clone())).collect();
Ok(Box::new(InmemObjectVersionIter::new(versions)))
}
/// Iterate through all timeline objects
fn list_objects<'a>(
&'a self,
timeline: ZTimelineId,
nonrel_only: bool,
lsn: Lsn,
) -> Result<Box<dyn Iterator<Item = ObjectTag> + 'a>> {
let curr_key = StorageKey::timeline_start(timeline);
Ok(Box::new(InmemObjectIter {
store: &self,
curr_key,
timeline,
nonrel_only,
lsn,
}))
}
/// Get a list of all distinct relations in given tablespace and database.
///
/// TODO: This implementation is very inefficient, it scans
/// through all entries in the given database. In practice, this
/// is used for CREATE DATABASE, and usually the template database is small.
/// But if it's not, this will be slow.
fn list_rels(
&self,
timelineid: ZTimelineId,
spcnode: u32,
dbnode: u32,
lsn: Lsn,
) -> Result<HashSet<RelTag>> {
// FIXME: This scans everything. Very slow
let mut rels: HashSet<RelTag> = HashSet::new();
let mut search_rel_tag = RelTag {
spcnode,
dbnode,
relnode: 0,
forknum: 0u8,
};
let db = self.db.read().unwrap();
'outer: loop {
let search_key = StorageKey {
obj_key: ObjectKey {
timeline: timelineid,
tag: ObjectTag::RelationMetadata(search_rel_tag),
},
lsn: Lsn(0),
};
for pair in db.range(&search_key..) {
let key = pair.0;
if let ObjectTag::RelationMetadata(rel_tag) = key.obj_key.tag {
if spcnode != 0 && rel_tag.spcnode != spcnode
|| dbnode != 0 && rel_tag.dbnode != dbnode
{
break 'outer;
}
if key.lsn <= lsn {
// visible in this snapshot
rels.insert(rel_tag);
}
search_rel_tag = rel_tag;
// skip to next relation
// FIXME: What if relnode is u32::MAX ?
search_rel_tag.relnode += 1;
continue 'outer;
} else {
// no more relation metadata entries
break 'outer;
}
}
}
Ok(rels)
}
/// Iterate through versions of all objects in a timeline.
///
/// Returns objects in increasing key-version order.
/// Returns all versions up to and including the specified LSN.
fn objects<'a>(
&'a self,
timeline: ZTimelineId,
lsn: Lsn,
) -> Result<Box<dyn Iterator<Item = Result<(ObjectTag, Lsn, Vec<u8>)>> + 'a>> {
let curr_key = StorageKey::timeline_start(timeline);
Ok(Box::new(InmemObjects {
store: &self,
curr_key,
timeline,
lsn,
}))
}
fn compact(&self) {
}
}
impl Drop for InmemObjectStore {
fn drop(&mut self) {
let path = self.conf.workdir.join("objstore.dmp");
let mut f = File::create(path).unwrap();
f.write(&self.db.ser().unwrap()).unwrap();
}
}
impl InmemObjectStore {
pub fn open(conf: &'static PageServerConf) -> Result<InmemObjectStore> {
let path = conf.workdir.join("objstore.dmp");
let mut f = File::open(path)?;
let mut buffer = Vec::new();
// read the whole file
f.read_to_end(&mut buffer)?;
let db = RwLock::new(BTreeMap::des(&buffer)?);
Ok(InmemObjectStore {
conf: conf,
db
})
}
pub fn create(conf: &'static PageServerConf) -> Result<InmemObjectStore> {
Ok(InmemObjectStore {
conf: conf,
db: RwLock::new(BTreeMap::new()),
})
}
}
///
/// Iterator for `object_versions`. Returns all page versions of a given block, in
/// reverse LSN order.
///
struct InmemObjectVersionIter {
versions: Vec<(Lsn, Vec<u8>)>,
curr: usize,
}
impl InmemObjectVersionIter {
fn new(versions: Vec<(Lsn, Vec<u8>)>) -> InmemObjectVersionIter {
let curr = versions.len();
InmemObjectVersionIter {
versions,
curr
}
}
}
impl Iterator for InmemObjectVersionIter {
type Item = (Lsn, Vec<u8>);
fn next(&mut self) -> std::option::Option<Self::Item> {
if self.curr == 0 {
None
} else {
self.curr -= 1;
Some(self.versions[self.curr].clone())
}
}
}
struct InmemObjects<'r> {
store: &'r InmemObjectStore,
curr_key: StorageKey,
timeline: ZTimelineId,
lsn: Lsn,
}
impl<'r> Iterator for InmemObjects<'r> {
// TODO consider returning Box<[u8]>
type Item = Result<(ObjectTag, Lsn, Vec<u8>)>;
fn next(&mut self) -> Option<Self::Item> {
self.next_result().transpose()
}
}
impl<'r> InmemObjects<'r> {
fn next_result(&mut self) -> Result<Option<(ObjectTag, Lsn, Vec<u8>)>> {
let db = self.store.db.read().unwrap();
for pair in db.range((Excluded(&self.curr_key),Unbounded)) {
let key = pair.0;
if key.obj_key.timeline != self.timeline {
return Ok(None);
}
if key.lsn > self.lsn {
// TODO can speed up by seeking iterator
continue;
}
self.curr_key = key.clone();
let value = pair.1.clone();
return Ok(Some((key.obj_key.tag, key.lsn, value)));
}
Ok(None)
}
}
///
/// Iterator for `list_objects`. Returns all objects preceeding specified LSN
///
struct InmemObjectIter<'a> {
store: &'a InmemObjectStore,
curr_key: StorageKey,
timeline: ZTimelineId,
nonrel_only: bool,
lsn: Lsn,
}
impl<'a> Iterator for InmemObjectIter<'a> {
type Item = ObjectTag;
fn next(&mut self) -> std::option::Option<Self::Item> {
let db = self.store.db.read().unwrap();
'outer: loop {
for pair in db.range((Excluded(&self.curr_key),Unbounded)) {
let key = pair.0;
if key.obj_key.timeline != self.timeline {
return None;
}
self.curr_key = key.clone();
self.curr_key.lsn = Lsn(u64::MAX); // next seek should skip all versions
if key.lsn <= self.lsn {
// visible in this snapshot
if self.nonrel_only {
match key.obj_key.tag {
ObjectTag::RelationMetadata(_) => return None,
ObjectTag::RelationBuffer(_) => return None,
_ => return Some(key.obj_key.tag),
}
} else {
return Some(key.obj_key.tag);
}
}
continue 'outer;
}
return None;
}
}
}

View File

@@ -1,13 +1,13 @@
use serde::{Deserialize, Serialize};
use std::fmt;
use std::net::SocketAddr;
use std::path::PathBuf;
use std::str::FromStr;
use std::time::Duration;
pub mod basebackup;
pub mod branches;
pub mod object_key;
pub mod object_repository;
pub mod object_store;
pub mod page_cache;
@@ -15,7 +15,6 @@ pub mod page_service;
pub mod repository;
pub mod restore_local_repo;
pub mod rocksdb_storage;
pub mod inmem_storage;
pub mod tui;
pub mod tui_event;
mod tui_logger;
@@ -27,7 +26,7 @@ pub mod walredo;
pub struct PageServerConf {
pub daemonize: bool,
pub interactive: bool,
pub listen_addr: String,
pub listen_addr: SocketAddr,
pub gc_horizon: u64,
pub gc_period: Duration,
@@ -104,7 +103,7 @@ impl PageServerConf {
/// is separate from PostgreSQL timelines, and doesn't have those
/// limitations. A zenith timeline is identified by a 128-bit ID, which
/// is usually printed out as a hex string.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct ZTimelineId([u8; 16]);
impl FromStr for ZTimelineId {

View File

@@ -1,84 +0,0 @@
use crate::repository::{BufferTag, RelTag};
use crate::waldecoder::TransactionId;
use crate::ZTimelineId;
use serde::{Deserialize, Serialize};
///
/// ObjectKey is the key type used to identify objects stored in an object
/// repository. It is shared between object_repository.rs and object_store.rs.
/// It is mostly opaque to ObjectStore, it just stores and retrieves objects
/// using the key given by the caller.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
pub struct ObjectKey {
pub timeline: ZTimelineId,
pub tag: ObjectTag,
}
///
/// Non-relation transaction status files (clog (a.k.a. pg_xact) and pg_multixact)
/// in Postgres are handled by SLRU (Simple LRU) buffer, hence the name.
///
/// These files are global for a postgres instance.
///
/// These files are divided into segments, which are divided into pages
/// of the same BLCKSZ as used for relation files.
///
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
pub struct SlruBufferTag {
pub blknum: u32,
}
///
/// Special type of Postgres files: pg_filenode.map is needed to map
/// catalog table OIDs to filenode numbers, which define filename.
///
/// Each database has a map file for its local mapped catalogs,
/// and there is a separate map file for shared catalogs.
///
/// These files have untypical size of 512 bytes.
///
/// See PostgreSQL relmapper.c for details.
///
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
pub struct DatabaseTag {
pub spcnode: u32,
pub dbnode: u32,
}
///
/// Non-relation files that keep state for prepared transactions.
/// Unlike other files these are not divided into pages.
///
/// See PostgreSQL twophase.c for details.
///
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
pub struct PrepareTag {
pub xid: TransactionId,
}
/// ObjectTag is a part of ObjectKey that is specific to the type of
/// the stored object.
///
/// NB: the order of the enum values is significant! In particular,
/// rocksdb_storage.rs assumes that TimelineMetadataTag is first
///
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
pub enum ObjectTag {
// dummy tag preceeding all other keys
FirstTag,
TimelineMetadataTag,
// Special entry that represents PostgreSQL checkpoint.
// We use it to track fields needed to restore controlfile checkpoint.
Checkpoint,
// Various types of non-relation files.
// We need them to bootstrap compute node.
ControlFile,
Clog(SlruBufferTag),
MultiXactMembers(SlruBufferTag),
MultiXactOffsets(SlruBufferTag),
FileNodeMap(DatabaseTag),
TwoPhase(PrepareTag),
// put relations at the end of enum to allow efficient iterations through non-rel objects
RelationMetadata(RelTag),
RelationBuffer(BufferTag),
}

View File

@@ -13,8 +13,7 @@
//! until we find the page we're looking for, making a separate lookup into the
//! key-value store for each timeline.
use crate::object_key::*;
use crate::object_store::ObjectStore;
use crate::object_store::{ObjectKey, ObjectStore};
use crate::repository::*;
use crate::restore_local_repo::import_timeline_wal;
use crate::walredo::WalRedoManager;
@@ -106,7 +105,7 @@ impl Repository for ObjectRepository {
) -> Result<Arc<dyn Timeline>> {
let mut timelines = self.timelines.lock().unwrap();
// Write initial metadata key.
// Write metadata key
let metadata = MetadataEntry {
last_valid_lsn: start_lsn,
last_record_lsn: start_lsn,
@@ -162,7 +161,7 @@ impl Repository for ObjectRepository {
ObjectTag::TimelineMetadataTag => {} // skip it
_ => {
let img = src_timeline.get_page_at_lsn_nowait(tag, at_lsn)?;
let val = ObjectValue::Page(PageEntry::Page(img));
let val = ObjectValue::Page(img);
let key = ObjectKey { timeline: dst, tag };
self.obj_store.put(&key, at_lsn, &ObjectValue::ser(&val)?)?;
}
@@ -236,19 +235,22 @@ impl ObjectTimeline {
let v = obj_store
.get(&timeline_metadata_key(timelineid), Lsn(0))
.with_context(|| "timeline not found in repository")?;
let metadata = ObjectValue::des_timeline_metadata(&v)?;
let timeline = ObjectTimeline {
timelineid,
obj_store,
walredo_mgr,
last_valid_lsn: SeqWait::new(metadata.last_valid_lsn),
last_record_lsn: AtomicLsn::new(metadata.last_record_lsn.0),
ancestor_timeline: metadata.ancestor_timeline,
ancestor_lsn: metadata.ancestor_lsn,
rel_meta: RwLock::new(BTreeMap::new()),
};
Ok(timeline)
if let ObjectValue::TimelineMetadata(metadata) = ObjectValue::des(&v)? {
let timeline = ObjectTimeline {
timelineid,
obj_store,
walredo_mgr,
last_valid_lsn: SeqWait::new(metadata.last_valid_lsn),
last_record_lsn: AtomicLsn::new(metadata.last_record_lsn.0),
ancestor_timeline: metadata.ancestor_timeline,
ancestor_lsn: metadata.ancestor_lsn,
rel_meta: RwLock::new(BTreeMap::new()),
};
Ok(timeline)
} else {
bail!("Invalid timeline metadata");
}
}
}
@@ -278,22 +280,17 @@ impl Timeline for ObjectTimeline {
let page_img: Bytes;
match ObjectValue::des(&value)? {
ObjectValue::Page(PageEntry::Page(img)) => {
ObjectValue::Page(img) => {
page_img = img;
}
ObjectValue::Page(PageEntry::WALRecord(_rec)) => {
ObjectValue::WALRecord(_rec) => {
// Request the WAL redo manager to apply the WAL records for us.
let (base_img, records) = self.collect_records_for_apply(tag, lsn)?;
page_img = self.walredo_mgr.request_redo(tag, lsn, base_img, records)?;
// Garbage collection assumes that we remember the materialized page
// version. Otherwise we could opt to not do it, with the downside that
// the next GetPage@LSN call of the same page version would have to
// redo the WAL again.
self.put_page_image(tag, lsn, page_img.clone(), false)?;
self.put_page_image(tag, lsn, page_img.clone())?;
}
ObjectValue::SLRUTruncate => page_img = Bytes::from_static(&ZERO_PAGE),
_ => bail!("Invalid object kind, expected a page entry or SRLU truncate"),
x => bail!("Unexpected object value: {:?}", x),
}
// FIXME: assumes little-endian. Only used for the debugging log though
let page_lsn_hi = u32::from_le_bytes(page_img.get(0..4).unwrap().try_into().unwrap());
@@ -373,10 +370,12 @@ impl Timeline for ObjectTimeline {
.obj_store
.get(&timeline_metadata_key(timeline), Lsn(0))
.with_context(|| "timeline not found in repository")?;
let metadata = ObjectValue::des_timeline_metadata(&v)?;
prev_timeline = metadata.ancestor_timeline;
lsn = metadata.ancestor_lsn;
if let ObjectValue::TimelineMetadata(metadata) = ObjectValue::des(&v)? {
prev_timeline = metadata.ancestor_timeline;
lsn = metadata.ancestor_lsn;
} else {
bail!("Invalid timeline metadata");
}
}
Ok(all_rels)
@@ -394,8 +393,13 @@ impl Timeline for ObjectTimeline {
/// current end-of-file.
fn put_wal_record(&self, tag: ObjectTag, rec: WALRecord) -> Result<()> {
let lsn = rec.lsn;
let key = ObjectKey {
timeline: self.timelineid,
tag,
};
let val = ObjectValue::WALRecord(rec);
self.put_page_entry(&tag, lsn, PageEntry::WALRecord(rec))?;
self.obj_store.put(&key, lsn, &ObjectValue::ser(&val)?)?;
debug!("put_wal_record {:?} at {}", tag, lsn);
if let ObjectTag::RelationBuffer(tag) = tag {
@@ -404,6 +408,8 @@ impl Timeline for ObjectTimeline {
if tag.blknum >= old_nblocks {
let new_nblocks = tag.blknum + 1;
let key = relation_size_key(self.timelineid, tag.rel);
let val = ObjectValue::RelationSize(new_nblocks);
trace!(
"Extended relation {} from {} to {} blocks at {}",
@@ -413,7 +419,7 @@ impl Timeline for ObjectTimeline {
lsn
);
self.put_relsize_entry(&tag.rel, lsn, RelationSizeEntry::Size(new_nblocks))?;
self.obj_store.put(&key, lsn, &ObjectValue::ser(&val)?)?;
let mut rel_meta = self.rel_meta.write().unwrap();
rel_meta.insert(
tag.rel,
@@ -427,36 +433,18 @@ impl Timeline for ObjectTimeline {
Ok(())
}
/// Unlink relation. This method is used for marking dropped relations.
fn put_unlink(&self, rel_tag: RelTag, lsn: Lsn) -> Result<()> {
self.put_relsize_entry(&rel_tag, lsn, RelationSizeEntry::Unlink)?;
Ok(())
}
/// Truncate SRLU segment
fn put_slru_truncate(&self, tag: ObjectTag, lsn: Lsn) -> Result<()> {
/// Unlink object. This method is used for marking dropped relations
/// and removed segments of SLRUs.
fn put_unlink(&self, tag: ObjectTag, lsn: Lsn) -> Result<()> {
let key = ObjectKey {
timeline: self.timelineid,
tag,
};
let val = ObjectValue::SLRUTruncate;
let val = ObjectValue::Unlink;
self.obj_store.put(&key, lsn, &ObjectValue::ser(&val)?)?;
Ok(())
}
fn get_next_tag(&self, tag: ObjectTag) -> Result<Option<ObjectTag>> {
let key = ObjectKey {
timeline: self.timelineid,
tag,
};
if let Some(key) = self.obj_store.get_next_key(&key)? {
Ok(Some(key.tag))
} else {
Ok(None)
}
}
fn put_raw_data(&self, tag: ObjectTag, lsn: Lsn, data: &[u8]) -> Result<()> {
let key = ObjectKey {
timeline: self.timelineid,
@@ -469,13 +457,16 @@ impl Timeline for ObjectTimeline {
///
/// Memorize a full image of a page version
///
fn put_page_image(&self, tag: ObjectTag, lsn: Lsn, img: Bytes, update_meta: bool) -> Result<()> {
self.put_page_entry(&tag, lsn, PageEntry::Page(img))?;
fn put_page_image(&self, tag: ObjectTag, lsn: Lsn, img: Bytes) -> Result<()> {
let key = ObjectKey {
timeline: self.timelineid,
tag,
};
let val = ObjectValue::Page(img);
if !update_meta {
return Ok(());
}
debug!("put_page_image rel {:?} at {}", tag, lsn);
self.obj_store.put(&key, lsn, &ObjectValue::ser(&val)?)?;
debug!("put_page_image {:?} at {}", tag, lsn);
if let ObjectTag::RelationBuffer(tag) = tag {
// Also check if this created or extended the file
@@ -483,7 +474,8 @@ impl Timeline for ObjectTimeline {
if tag.blknum >= old_nblocks {
let new_nblocks = tag.blknum + 1;
let key = relation_size_key(self.timelineid, tag.rel);
let val = ObjectValue::RelationSize(new_nblocks);
trace!(
"Extended relation {} from {} to {} blocks at {}",
tag.rel,
@@ -492,7 +484,7 @@ impl Timeline for ObjectTimeline {
lsn
);
self.put_relsize_entry(&tag.rel, lsn, RelationSizeEntry::Size(new_nblocks))?;
self.obj_store.put(&key, lsn, &ObjectValue::ser(&val)?)?;
let mut rel_meta = self.rel_meta.write().unwrap();
rel_meta.insert(
tag.rel,
@@ -511,9 +503,10 @@ impl Timeline for ObjectTimeline {
/// associating it with all pages started with specified block number
///
fn put_truncation(&self, rel: RelTag, lsn: Lsn, nblocks: u32) -> Result<()> {
info!("Truncate relation {} to {} blocks at {}", rel, nblocks, lsn);
let key = relation_size_key(self.timelineid, rel);
let val = ObjectValue::RelationSize(nblocks);
self.put_relsize_entry(&rel, lsn, RelationSizeEntry::Size(nblocks))?;
self.obj_store.put(&key, lsn, &ObjectValue::ser(&val)?)?;
let mut rel_meta = self.rel_meta.write().unwrap();
rel_meta.insert(
rel,
@@ -598,7 +591,12 @@ impl Timeline for ObjectTimeline {
};
trace!("checkpoint at {}", metadata.last_valid_lsn);
self.put_timeline_metadata_entry(metadata)?;
let val = ObjectValue::TimelineMetadata(metadata);
self.obj_store.put(
&timeline_metadata_key(self.timelineid),
Lsn(0),
&ObjectValue::ser(&val)?,
)?;
Ok(())
}
@@ -606,171 +604,7 @@ impl Timeline for ObjectTimeline {
fn history<'a>(&'a self) -> Result<Box<dyn History + 'a>> {
let lsn = self.last_valid_lsn.load();
let iter = self.obj_store.objects(self.timelineid, lsn)?;
Ok(Box::new(ObjectHistory {
lsn,
iter,
last_relation_size: None,
}))
}
fn gc_iteration(&self, horizon: u64) -> Result<GcResult> {
let last_lsn = self.get_last_valid_lsn();
let mut result: GcResult = Default::default();
// checked_sub() returns None on overflow.
if let Some(horizon) = last_lsn.checked_sub(horizon) {
// WAL is large enough to perform GC
let now = Instant::now();
// Iterate through all objects in timeline
for obj in self
.obj_store
.list_objects(self.timelineid, false, last_lsn)?
{
result.inspected += 1;
match obj {
// Prepared transactions
ObjectTag::TwoPhase(prepare) => {
let key = ObjectKey {
timeline: self.timelineid,
tag: obj,
};
for vers in self.obj_store.object_versions(&key, horizon)? {
if self.get_tx_status(prepare.xid, horizon)?
!= pg_constants::TRANSACTION_STATUS_IN_PROGRESS
{
let lsn = vers.0;
self.obj_store.unlink(&key, lsn)?;
result.prep_deleted += 1;
}
}
}
ObjectTag::RelationMetadata(_) => {
// Do not need to reconstruct page images,
// just delete all old versions over horizon
let mut last_version = true;
let key = ObjectKey {
timeline: self.timelineid,
tag: obj,
};
for vers in self.obj_store.object_versions(&key, horizon)? {
let lsn = vers.0;
if last_version {
let content = vers.1;
match ObjectValue::des(&content[..])? {
ObjectValue::RelationSize(RelationSizeEntry::Unlink) => {
self.obj_store.unlink(&key, lsn)?;
result.deleted += 1;
result.dropped += 1;
}
_ => (), // preserve last version
}
last_version = false;
result.truncated += 1;
result.n_relations += 1;
} else {
self.obj_store.unlink(&key, lsn)?;
result.deleted += 1;
}
}
}
ObjectTag::RelationBuffer(tag) => {
// Reconstruct page at horizon unless relation was dropped
// and delete all older versions over horizon
let mut last_version = true;
let key = ObjectKey {
timeline: self.timelineid,
tag: obj,
};
for vers in self.obj_store.object_versions(&key, horizon)? {
let lsn = vers.0;
if last_version {
result.truncated += 1;
last_version = false;
if let Some(rel_size) = self.relsize_get_nowait(tag.rel, last_lsn)? {
if rel_size > tag.blknum {
// preserve and materialize last version before deleting all preceeding
self.get_page_at_lsn_nowait(obj, lsn)?;
continue;
}
debug!("Drop last block {} of relation {:?} at {} because it is beyond relation size {}", tag.blknum, tag.rel, lsn, rel_size);
} else {
if let Some(rel_size) =
self.relsize_get_nowait(tag.rel, last_lsn)?
{
debug!("Preserve block {} of relation {:?} at {} because relation has size {} at {}", tag.rel, tag, lsn, rel_size, last_lsn);
continue;
}
debug!("Relation {:?} was dropped at {}", tag.rel, lsn);
}
// relation was dropped or truncated so this block can be removed
}
self.obj_store.unlink(&key, lsn)?;
result.deleted += 1;
}
}
// SLRU-s
ObjectTag::Clog(_)
| ObjectTag::MultiXactOffsets(_)
| ObjectTag::MultiXactMembers(_) => {
// Remove old versions over horizon
let mut last_version = true;
let key = ObjectKey {
timeline: self.timelineid,
tag: obj,
};
for vers in self.obj_store.object_versions(&key, horizon)? {
let lsn = vers.0;
if last_version {
let content = vers.1;
match ObjectValue::des(&content[..])? {
ObjectValue::SLRUTruncate => {
self.obj_store.unlink(&key, lsn)?;
result.slru_deleted += 1;
}
ObjectValue::Page(PageEntry::WALRecord(_)) => {
// preserve and materialize last version before deleting all preceeding
self.get_page_at_lsn_nowait(obj, lsn)?;
}
_ => {} // do nothing if already materialized
}
last_version = false;
} else {
// delete deteriorated version
self.obj_store.unlink(&key, lsn)?;
result.slru_deleted += 1;
}
}
}
// versioned always materialized objects: no need to reconstruct pages
ObjectTag::Checkpoint | ObjectTag::ControlFile => {
// Remove old versions over horizon
let mut last_version = true;
let key = ObjectKey {
timeline: self.timelineid,
tag: obj,
};
for vers in self.obj_store.object_versions(&key, horizon)? {
let lsn = vers.0;
if last_version {
// preserve last version
last_version = false;
} else {
// delete deteriorated version
self.obj_store.unlink(&key, lsn)?;
result.chkp_deleted += 1;
}
}
}
_ => (), // do nothing
}
}
result.elapsed = now.elapsed();
info!("Garbage collection completed in {:?}: {} relations inspected, {} object inspected, {} version histories truncated, {} versions deleted, {} relations dropped",
result.elapsed, result.n_relations, result.inspected, result.truncated, result.deleted, result.dropped);
self.obj_store.compact();
}
Ok(result)
Ok(Box::new(ObjectHistory { lsn, iter }))
}
}
@@ -793,8 +627,9 @@ impl ObjectTimeline {
let mut iter = self.object_versions(&*self.obj_store, &key, lsn)?;
if let Some((version_lsn, value)) = iter.next().transpose()? {
match ObjectValue::des_relsize(&value)? {
RelationSizeEntry::Size(nblocks) => {
let value = ObjectValue::des(&value)?;
match value {
ObjectValue::RelationSize(nblocks) => {
trace!(
"relation {} has size {} at {} (request {})",
rel,
@@ -804,7 +639,7 @@ impl ObjectTimeline {
);
Ok(Some(nblocks))
}
RelationSizeEntry::Unlink => {
ObjectValue::Unlink => {
trace!(
"relation {} not found; it was dropped at lsn {}",
rel,
@@ -812,9 +647,15 @@ impl ObjectTimeline {
);
Ok(None)
}
_ => bail!(
"Unexpect relation {} size value {:?} at {}",
rel,
value,
lsn
),
}
} else {
info!("relation {} not found at {}", rel, lsn);
debug!("relation {} not found at {}", rel, lsn);
Ok(None)
}
}
@@ -842,20 +683,21 @@ impl ObjectTimeline {
};
let mut iter = self.object_versions(&*self.obj_store, &searchkey, lsn)?;
while let Some((_key, value)) = iter.next().transpose()? {
match ObjectValue::des_page(&value)? {
PageEntry::Page(img) => {
match ObjectValue::des(&value)? {
ObjectValue::Page(img) => {
// We have a base image. No need to dig deeper into the list of
// records
base_img = Some(img);
break;
}
PageEntry::WALRecord(rec) => {
ObjectValue::WALRecord(rec) => {
records.push(rec.clone());
// If this WAL record initializes the page, no need to dig deeper.
if rec.will_init {
break;
}
}
x => bail!("Unexpected object value {:?}", x),
}
}
records.reverse();
@@ -867,15 +709,170 @@ impl ObjectTimeline {
.name("Garbage collection thread".into())
.spawn(move || {
// FIXME
timeline_rc.gc_loop(conf).expect("GC thread died");
timeline_rc.do_gc(conf).expect("GC thread died");
})
.unwrap();
}
fn gc_loop(&self, conf: &'static PageServerConf) -> Result<()> {
fn do_gc(&self, conf: &'static PageServerConf) -> Result<()> {
loop {
thread::sleep(conf.gc_period);
self.gc_iteration(conf.gc_horizon)?;
let last_lsn = self.get_last_valid_lsn();
// checked_sub() returns None on overflow.
if let Some(horizon) = last_lsn.checked_sub(conf.gc_horizon) {
// WAL is large enough to perform GC
let now = Instant::now();
let mut truncated = 0u64;
let mut inspected = 0u64;
let mut deleted = 0u64;
// Iterate through all objects in timeline
for obj in self
.obj_store
.list_objects(self.timelineid, false, last_lsn)?
{
inspected += 1;
match obj {
// Prepared transactions
ObjectTag::TwoPhase(prepare) => {
let key = ObjectKey {
timeline: self.timelineid,
tag: obj,
};
for vers in self.obj_store.object_versions(&key, horizon)? {
if self.get_tx_status(prepare.xid, horizon)?
!= pg_constants::TRANSACTION_STATUS_IN_PROGRESS
{
let lsn = vers.0;
self.obj_store.unlink(&key, lsn)?;
deleted += 1;
}
}
}
ObjectTag::RelationMetadata(_) => {
// Do not need to reconstruct page images,
// just delete all old versions over horizon
let mut last_version = true;
let key = ObjectKey {
timeline: self.timelineid,
tag: obj,
};
for vers in self.obj_store.object_versions(&key, horizon)? {
let lsn = vers.0;
if last_version {
let content = vers.1;
match ObjectValue::des(&content[..])? {
ObjectValue::Unlink => {
self.obj_store.unlink(&key, lsn)?;
deleted += 1;
}
_ => (), // preserve last version
}
last_version = false;
truncated += 1;
} else {
self.obj_store.unlink(&key, lsn)?;
deleted += 1;
}
}
}
ObjectTag::RelationBuffer(tag) => {
// Reconstruct page at horizon unless relation was dropped
// and delete all older versions over horizon
let mut last_version = true;
let key = ObjectKey {
timeline: self.timelineid,
tag: obj,
};
for vers in self.obj_store.object_versions(&key, horizon)? {
let lsn = vers.0;
if last_version {
truncated += 1;
last_version = false;
if let Some(rel_size) = self.relsize_get_nowait(tag.rel, lsn)? {
if rel_size > tag.blknum {
// preserve and materialize last version before deleting all preceeding
self.get_page_at_lsn_nowait(obj, lsn)?;
continue;
}
debug!("Drop last block {} of relation {:?} at {} because it is beyond relation size {}", tag.blknum, tag.rel, lsn, rel_size);
} else {
if let Some(rel_size) =
self.relsize_get_nowait(tag.rel, last_lsn)?
{
debug!("Preserve block {} of relation {:?} at {} because relation has size {} at {}", tag.rel, tag, lsn, rel_size, last_lsn);
continue;
}
debug!("Relation {:?} was dropped at {}", tag.rel, lsn);
}
// relation was dropped or truncated so this block can be removed
}
self.obj_store.unlink(&key, lsn)?;
deleted += 1;
}
}
// SLRU-s
ObjectTag::Clog(_)
| ObjectTag::MultiXactOffsets(_)
| ObjectTag::MultiXactMembers(_) => {
// Remove old versions over horizon
let mut last_version = true;
let key = ObjectKey {
timeline: self.timelineid,
tag: obj,
};
for vers in self.obj_store.object_versions(&key, horizon)? {
let lsn = vers.0;
if last_version {
let content = vers.1;
match ObjectValue::des(&content[..])? {
ObjectValue::Unlink => {
self.obj_store.unlink(&key, lsn)?;
deleted += 1;
}
ObjectValue::WALRecord(_) => {
// preserve and materialize last version before deleting all preceeding
self.get_page_at_lsn_nowait(obj, lsn)?;
}
_ => {} // do nothing if already materialized
}
last_version = false;
truncated += 1;
} else {
// delete deteriorated version
self.obj_store.unlink(&key, lsn)?;
deleted += 1;
}
}
}
// versioned alwaysmaterialized objects: no need to reconstruct pages
ObjectTag::Checkpoint | ObjectTag::ControlFile => {
// Remove old versions over horizon
let mut last_version = true;
let key = ObjectKey {
timeline: self.timelineid,
tag: obj,
};
for vers in self.obj_store.object_versions(&key, horizon)? {
let lsn = vers.0;
if last_version {
// presrve last version
last_version = false;
truncated += 1;
} else {
// delete deteriorated version
self.obj_store.unlink(&key, lsn)?;
deleted += 1;
}
}
}
_ => (), // do nothing
}
}
info!("Garbage collection completed in {:?}:\n{} version chains inspected, {} version histories truncated, {} versions deleted",
now.elapsed(), inspected, truncated, deleted);
}
}
}
@@ -930,52 +927,26 @@ impl ObjectTimeline {
Ok(ObjectVersionIter {
obj_store,
object_tag: key.tag,
tag: key.tag,
current_iter,
ancestor_timeline: self.ancestor_timeline,
ancestor_lsn: self.ancestor_lsn,
})
}
//
// Helper functions to store different kinds of objects to the underlying ObjectStore
//
fn put_page_entry(&self, tag: &ObjectTag, lsn: Lsn, val: PageEntry) -> Result<()> {
let key = ObjectKey {
timeline: self.timelineid,
tag: *tag,
};
let val = ObjectValue::Page(val);
self.obj_store.put(&key, lsn, &ObjectValue::ser(&val)?)
}
fn put_relsize_entry(&self, tag: &RelTag, lsn: Lsn, val: RelationSizeEntry) -> Result<()> {
let key = relation_size_key(self.timelineid, *tag);
let val = ObjectValue::RelationSize(val);
self.obj_store.put(&key, lsn, &ObjectValue::ser(&val)?)
}
fn put_timeline_metadata_entry(&self, val: MetadataEntry) -> Result<()> {
let key = timeline_metadata_key(self.timelineid);
let val = ObjectValue::TimelineMetadata(val);
self.obj_store.put(&key, Lsn(0), &ObjectValue::ser(&val)?)
}
}
struct ObjectHistory<'a> {
iter: Box<dyn Iterator<Item = Result<(ObjectTag, Lsn, Vec<u8>)>> + 'a>,
lsn: Lsn,
last_relation_size: Option<(RelTag, u32)>,
}
impl<'a> Iterator for ObjectHistory<'a> {
type Item = Result<RelationUpdate>;
type Item = Result<Modification>;
fn next(&mut self) -> Option<Self::Item> {
self.next_result().transpose()
self.iter
.next()
.map(|result| result.map(|t| Modification::new(t)))
}
}
@@ -985,117 +956,29 @@ impl<'a> History for ObjectHistory<'a> {
}
}
impl<'a> ObjectHistory<'a> {
fn handle_relation_size(
&mut self,
rel_tag: RelTag,
entry: RelationSizeEntry,
) -> Option<Update> {
match entry {
RelationSizeEntry::Size(size) => {
// we only want to output truncations, expansions are filtered out
let last_relation_size = self.last_relation_size.replace((rel_tag, size));
match last_relation_size {
Some((last_buf, last_size)) if last_buf != rel_tag || size < last_size => {
Some(Update::Truncate { n_blocks: size })
}
_ => None,
}
}
RelationSizeEntry::Unlink => Some(Update::Unlink),
}
}
fn handle_page(&mut self, buf_tag: BufferTag, entry: PageEntry) -> Update {
match entry {
PageEntry::Page(img) => Update::Page {
blknum: buf_tag.blknum,
img,
},
PageEntry::WALRecord(rec) => Update::WALRecord {
blknum: buf_tag.blknum,
rec,
},
}
}
fn next_result(&mut self) -> Result<Option<RelationUpdate>> {
while let Some((object_tag, lsn, value)) = self.iter.next().transpose()? {
let (rel_tag, update) = match object_tag {
ObjectTag::RelationMetadata(rel_tag) => {
let entry = ObjectValue::des_relsize(&value)?;
match self.handle_relation_size(rel_tag, entry) {
Some(relation_update) => (rel_tag, relation_update),
None => continue,
}
}
ObjectTag::RelationBuffer(buf_tag) => {
let entry = ObjectValue::des_page(&value)?;
let update = self.handle_page(buf_tag, entry);
(buf_tag.rel, update)
}
_ => continue,
};
return Ok(Some(RelationUpdate {
rel: rel_tag,
lsn,
update,
}));
}
Ok(None)
}
}
///
/// We store several kinds of objects in the repository.
/// We have per-page, per-relation and per-timeline entries.
/// We have per-page, per-relation(or non-rel file) and per-timeline entries.
///
#[derive(Debug, Clone, Serialize, Deserialize)]
enum ObjectValue {
Page(PageEntry),
RelationSize(RelationSizeEntry),
TimelineMetadata(MetadataEntry),
SLRUTruncate,
}
///
/// This is what we store for each page in the object store. Use
/// ObjectTag::RelationBuffer as key.
///
#[derive(Debug, Clone, Serialize, Deserialize)]
enum PageEntry {
/// Ready-made image of the block
pub enum ObjectValue {
/// Ready-made images of the block
Page(Bytes),
/// WAL record, to be applied on top of the "previous" entry
/// WAL records, to be applied on top of the "previous" entry
///
/// Some WAL records will initialize the page from scratch. For such records,
/// the 'will_init' flag is set. They don't need the previous page image before
/// applying. The 'will_init' flag is set for records containing a full-page image,
/// and for records with the BKPBLOCK_WILL_INIT flag. These differ from Page images
/// stored directly in the repository in that you still need to run the WAL redo
/// and for records with the BKPBLOCK_WILL_INIT flag. These differ from PageImages
/// stored directly in the cache entry in that you still need to run the WAL redo
/// routine to generate the page image.
WALRecord(WALRecord),
}
///
/// In addition to page versions, we store relation size as a separate, versioned,
/// object. That way we can answer nblocks requests faster, and we also use it to
/// support relation truncation without having to add a tombstone page version for
/// each block that is truncated away.
///
/// Use ObjectTag::RelationMetadata as the key.
///
#[derive(Debug, Clone, Serialize, Deserialize)]
enum RelationSizeEntry {
Size(u32),
/// RelationSize. We store it separately not only to ansver nblocks requests faster.
/// We also need it to support relation truncation.
RelationSize(u32),
/// Tombstone for a dropped relation.
Unlink,
TimelineMetadata(MetadataEntry),
}
const fn relation_size_key(timelineid: ZTimelineId, rel: RelTag) -> ObjectKey {
@@ -1106,9 +989,8 @@ const fn relation_size_key(timelineid: ZTimelineId, rel: RelTag) -> ObjectKey {
}
///
/// In addition to the per-page and per-relation entries, we also store
/// a little metadata blob for each timeline. This is not versioned, use
/// ObjectTag::TimelineMetadataTag with constant Lsn(0) as the key.
/// In addition to those per-page and per-relation entries, we also
/// store a little metadata blob for each timeline.
///
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MetadataEntry {
@@ -1125,44 +1007,6 @@ const fn timeline_metadata_key(timelineid: ZTimelineId) -> ObjectKey {
}
}
///
/// Helper functions to deserialize ObjectValue, when the caller knows what kind of
/// a value it should be.
///
/// There are no matching helper functions for serializing. Instead, there are
/// `put_page_entry`, `put_relsize_entry`, and `put_timeline_metadata_entry` helper
/// functions in ObjectTimeline that both construct the right kind of key and
/// serialize the value in the same call.
///
impl ObjectValue {
fn des_page(v: &[u8]) -> Result<PageEntry> {
match ObjectValue::des(&v)? {
ObjectValue::Page(p) => Ok(p),
_ => {
bail!("Invalid object kind, expected a page entry");
}
}
}
fn des_relsize(v: &[u8]) -> Result<RelationSizeEntry> {
match ObjectValue::des(&v)? {
ObjectValue::RelationSize(rs) => Ok(rs),
_ => {
bail!("Invalid object kind, expected a relation size entry");
}
}
}
fn des_timeline_metadata(v: &[u8]) -> Result<MetadataEntry> {
match ObjectValue::des(&v)? {
ObjectValue::TimelineMetadata(t) => Ok(t),
_ => {
bail!("Invalid object kind, expected a timeline metadata entry");
}
}
}
}
///
/// Iterator for `object_versions`. Returns all page versions of a given block, in
/// reverse LSN order. This implements the traversal of ancestor timelines. If
@@ -1172,7 +1016,7 @@ impl ObjectValue {
struct ObjectVersionIter<'a> {
obj_store: &'a dyn ObjectStore,
object_tag: ObjectTag,
tag: ObjectTag,
/// Iterator on the current timeline.
current_iter: Box<dyn Iterator<Item = (Lsn, Vec<u8>)> + 'a>,
@@ -1209,7 +1053,7 @@ impl<'a> ObjectVersionIter<'a> {
if let Some(ancestor_timeline) = self.ancestor_timeline {
let searchkey = ObjectKey {
timeline: ancestor_timeline,
tag: self.object_tag,
tag: self.tag,
};
let ancestor_iter = self
.obj_store
@@ -1222,10 +1066,13 @@ impl<'a> ObjectVersionIter<'a> {
.obj_store
.get(&timeline_metadata_key(ancestor_timeline), Lsn(0))
.with_context(|| "timeline not found in repository")?;
let ancestor_metadata = ObjectValue::des_timeline_metadata(&v)?;
self.ancestor_timeline = ancestor_metadata.ancestor_timeline;
self.ancestor_lsn = ancestor_metadata.ancestor_lsn;
self.current_iter = ancestor_iter;
if let ObjectValue::TimelineMetadata(ancestor_metadata) = ObjectValue::des(&v)? {
self.ancestor_timeline = ancestor_metadata.ancestor_timeline;
self.ancestor_lsn = ancestor_metadata.ancestor_lsn;
self.current_iter = ancestor_iter;
} else {
bail!("Invalid timeline metadata");
}
} else {
return Ok(None);
}

View File

@@ -1,13 +1,19 @@
//! Low-level key-value storage abstraction.
//!
use crate::object_key::*;
use crate::repository::RelTag;
use crate::repository::{ObjectTag, RelTag};
use crate::ZTimelineId;
use anyhow::Result;
use serde::{Deserialize, Serialize};
use std::collections::HashSet;
use std::iter::Iterator;
use zenith_utils::lsn::Lsn;
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ObjectKey {
pub timeline: ZTimelineId,
pub tag: ObjectTag,
}
///
/// Low-level storage abstraction.
///
@@ -34,9 +40,6 @@ pub trait ObjectStore: Send + Sync {
/// correspond to any real relation.
fn get(&self, key: &ObjectKey, lsn: Lsn) -> Result<Vec<u8>>;
/// Read key greater or equal than specified
fn get_next_key(&self, key: &ObjectKey) -> Result<Option<ObjectKey>>;
/// Iterate through all page versions of one object.
///
/// Returns all page versions in descending LSN order, along with the LSN
@@ -82,7 +85,4 @@ pub trait ObjectStore: Send + Sync {
/// Unlink object (used by GC). This mehod may actually delete object or just mark it for deletion.
fn unlink(&self, key: &ObjectKey, lsn: Lsn) -> Result<()>;
// Compact storage and remove versions marged for deletion
fn compact(&self);
}

View File

@@ -5,8 +5,7 @@
use crate::object_repository::ObjectRepository;
use crate::repository::Repository;
//use crate::rocksdb_storage::RocksObjectStore;
use crate::inmem_storage::InmemObjectStore;
use crate::rocksdb_storage::RocksObjectStore;
use crate::walredo::PostgresRedoManager;
use crate::PageServerConf;
use lazy_static::lazy_static;
@@ -19,8 +18,7 @@ lazy_static! {
pub fn init(conf: &'static PageServerConf) {
let mut m = REPOSITORY.lock().unwrap();
//let obj_store = RocksObjectStore::open(conf).unwrap();
let obj_store = InmemObjectStore::open(conf).unwrap();
let obj_store = RocksObjectStore::open(conf).unwrap();
// Set up a WAL redo manager, for applying WAL records.
let walredo_mgr = PostgresRedoManager::new(conf);

View File

@@ -21,16 +21,13 @@ use std::thread;
use std::{io, net::TcpStream};
use zenith_utils::postgres_backend;
use zenith_utils::postgres_backend::PostgresBackend;
use zenith_utils::pq_proto::{
BeMessage, FeMessage, RowDescriptor, HELLO_WORLD_ROW, SINGLE_COL_ROWDESC,
};
use zenith_utils::pq_proto::{BeMessage, FeMessage, HELLO_WORLD_ROW, SINGLE_COL_ROWDESC};
use zenith_utils::{bin_ser::BeSer, lsn::Lsn};
use crate::basebackup;
use crate::branches;
use crate::object_key::ObjectTag;
use crate::page_cache;
use crate::repository::{BufferTag, RelTag, RelationUpdate, Update};
use crate::repository::{BufferTag, Modification, ObjectTag, RelTag};
use crate::restore_local_repo;
use crate::walreceiver;
use crate::PageServerConf;
@@ -413,38 +410,14 @@ impl postgres_backend::Handler for PageServerHandler {
while let Some(msg) = pgb.read_message()? {
match msg {
FeMessage::CopyData(bytes) => {
let relation_update = RelationUpdate::des(&bytes)?;
let modification = Modification::des(&bytes)?;
last_lsn = relation_update.lsn;
match relation_update.update {
Update::Page { blknum, img } => {
let tag = ObjectTag::RelationBuffer(BufferTag {
rel: relation_update.rel,
blknum,
});
timeline.put_page_image(tag, relation_update.lsn, img, true)?;
}
Update::WALRecord { blknum, rec } => {
let tag = ObjectTag::RelationBuffer(BufferTag {
rel: relation_update.rel,
blknum,
});
timeline.put_wal_record(tag, rec)?;
}
Update::Truncate { n_blocks } => {
timeline.put_truncation(
relation_update.rel,
relation_update.lsn,
n_blocks,
)?;
}
Update::Unlink => {
todo!()
}
}
last_lsn = modification.lsn;
timeline.put_raw_data(
modification.tag,
last_lsn,
&modification.data[..],
)?;
}
FeMessage::CopyDone => {
timeline.advance_last_valid_lsn(last_lsn);
@@ -509,97 +482,11 @@ impl postgres_backend::Handler for PageServerHandler {
pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?;
pgb.write_message_noflush(&BeMessage::DataRow(&[Some(system_id.as_bytes())]))?;
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
} else if query_string.starts_with(b"do_gc ") {
// Run GC immediately on given timeline.
// FIXME: This is just for tests. See test_runner/batch_others/test_gc.py.
// This probably should require special authentication or a global flag to
// enable, I don't think we want to or need to allow regular clients to invoke
// GC.
let query_str = std::str::from_utf8(&query_string)?;
let mut it = query_str.split(' ');
it.next().unwrap();
let timeline_id: ZTimelineId = it
.next()
.ok_or_else(|| anyhow!("missing timeline id"))?
.parse()?;
let timeline = page_cache::get_repository().get_timeline(timeline_id)?;
let horizon: u64 = it
.next()
.unwrap_or(&self.conf.gc_horizon.to_string())
.parse()?;
let result = timeline.gc_iteration(horizon)?;
pgb.write_message_noflush(&BeMessage::RowDescription(&[
RowDescriptor {
name: b"n_relations",
typoid: 20,
typlen: 8,
..Default::default()
},
RowDescriptor {
name: b"truncated",
typoid: 20,
typlen: 8,
..Default::default()
},
RowDescriptor {
name: b"deleted",
typoid: 20,
typlen: 8,
..Default::default()
},
RowDescriptor {
name: b"prep_deleted",
typoid: 20,
typlen: 8,
..Default::default()
},
RowDescriptor {
name: b"slru_deleted",
typoid: 20,
typlen: 8,
..Default::default()
},
RowDescriptor {
name: b"chkp_deleted",
typoid: 20,
typlen: 8,
..Default::default()
},
RowDescriptor {
name: b"dropped",
typoid: 20,
typlen: 8,
..Default::default()
},
RowDescriptor {
name: b"elapsed",
typoid: 20,
typlen: 8,
..Default::default()
},
]))?
.write_message_noflush(&BeMessage::DataRow(&[
Some(&result.n_relations.to_string().as_bytes()),
Some(&result.truncated.to_string().as_bytes()),
Some(&result.deleted.to_string().as_bytes()),
Some(&result.prep_deleted.to_string().as_bytes()),
Some(&result.slru_deleted.to_string().as_bytes()),
Some(&result.chkp_deleted.to_string().as_bytes()),
Some(&result.dropped.to_string().as_bytes()),
Some(&result.elapsed.as_millis().to_string().as_bytes()),
]))?
.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
} else {
bail!("unknown command");
}
pgb.flush()?;
Ok(())
}
}

View File

@@ -1,4 +1,3 @@
use crate::object_key::*;
use crate::waldecoder::TransactionId;
use crate::ZTimelineId;
use anyhow::Result;
@@ -11,7 +10,6 @@ use std::collections::HashSet;
use std::fmt;
use std::iter::Iterator;
use std::sync::Arc;
use std::time::Duration;
use zenith_utils::lsn::Lsn;
///
@@ -34,22 +32,6 @@ pub trait Repository: Send + Sync {
//fn get_stats(&self) -> RepositoryStats;
}
///
/// Result of performing GC
///
#[derive(Default)]
pub struct GcResult {
pub n_relations: u64,
pub inspected: u64,
pub truncated: u64,
pub deleted: u64,
pub prep_deleted: u64, // 2PC prepare
pub slru_deleted: u64, // SLRU (clog, multixact)
pub chkp_deleted: u64, // Checkpoints
pub dropped: u64,
pub elapsed: Duration,
}
pub trait Timeline: Send + Sync {
//------------------------------------------------------------------------------
// Public GET functions
@@ -67,7 +49,7 @@ pub trait Timeline: Send + Sync {
/// Does relation exist?
fn get_rel_exists(&self, tag: RelTag, lsn: Lsn) -> Result<bool>;
/// Get a list of all distinct relations in given tablespace and database.
/// Get a list of all relations in given tablespace and database.
fn list_rels(&self, spcnode: u32, dbnode: u32, lsn: Lsn) -> Result<HashSet<RelTag>>;
/// Get a list of non-relational objects
@@ -79,29 +61,24 @@ pub trait Timeline: Send + Sync {
// These are called by the WAL receiver to digest WAL records.
//------------------------------------------------------------------------------
/// Put raw data
fn put_raw_data(&self, tag: ObjectTag, lsn: Lsn, data: &[u8]) -> Result<()>;
/// Put a new page version that can be constructed from a WAL record
///
/// This will implicitly extend the relation, if the page is beyond the
/// current end-of-file.
fn put_wal_record(&self, tag: ObjectTag, rec: WALRecord) -> Result<()>;
/// Put raw data
fn put_raw_data(&self, tag: ObjectTag, lsn: Lsn, data: &[u8]) -> Result<()>;
/// Like put_wal_record, but with ready-made image of the page.
fn put_page_image(&self, tag: ObjectTag, lsn: Lsn, img: Bytes, update_meta: bool) -> Result<()>;
fn put_page_image(&self, tag: ObjectTag, lsn: Lsn, img: Bytes) -> Result<()>;
/// Truncate relation
fn put_truncation(&self, rel: RelTag, lsn: Lsn, nblocks: u32) -> Result<()>;
/// Unlink relation. This method is used for marking dropped relations.
fn put_unlink(&self, tag: RelTag, lsn: Lsn) -> Result<()>;
/// Truncate SRLU segment
fn put_slru_truncate(&self, tag: ObjectTag, lsn: Lsn) -> Result<()>;
// Get object tag greater or equal than specified
fn get_next_tag(&self, tag: ObjectTag) -> Result<Option<ObjectTag>>;
/// Unlink object. This method is used for marking dropped relations
/// and removed segments of SLRUs.
fn put_unlink(&self, tag: ObjectTag, lsn: Lsn) -> Result<()>;
/// Remember the all WAL before the given LSN has been processed.
///
@@ -135,13 +112,6 @@ pub trait Timeline: Send + Sync {
// TODO ordering guarantee?
fn history<'a>(&'a self) -> Result<Box<dyn History + 'a>>;
/// Perform one garbage collection iteration.
/// Garbage collection is periodically performed by GC thread,
/// but it can be explicitly requested through page server API.
///
/// `horizon` specifies delta from last LSN to preserve all object versions (PITR interval).
fn gc_iteration(&self, horizon: u64) -> Result<GcResult>;
// Check transaction status
fn get_tx_status(&self, xid: TransactionId, lsn: Lsn) -> anyhow::Result<u8> {
let blknum = xid / pg_constants::CLOG_XACTS_PER_PAGE;
@@ -152,24 +122,26 @@ pub trait Timeline: Send + Sync {
}
}
pub trait History: Iterator<Item = Result<RelationUpdate>> {
pub trait History: Iterator<Item = Result<Modification>> {
/// The last_valid_lsn at the time of history() call.
fn lsn(&self) -> Lsn;
}
#[derive(Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct RelationUpdate {
pub rel: RelTag,
pub struct Modification {
pub tag: ObjectTag,
pub lsn: Lsn,
pub update: Update,
pub data: Vec<u8>,
}
#[derive(Debug, PartialEq, Eq, Serialize, Deserialize)]
pub enum Update {
Page { blknum: u32, img: Bytes },
WALRecord { blknum: u32, rec: WALRecord },
Truncate { n_blocks: u32 },
Unlink,
impl Modification {
pub fn new(entry: (ObjectTag, Lsn, Vec<u8>)) -> Modification {
Modification {
tag: entry.0,
lsn: entry.1,
data: entry.2,
}
}
}
#[derive(Clone)]
@@ -195,6 +167,10 @@ pub struct RepositoryStats {
/// are used for the same purpose.
/// [See more related comments here](https:///github.com/postgres/postgres/blob/99c5852e20a0987eca1c38ba0c09329d4076b6a0/src/include/storage/relfilenode.h#L57).
///
/// We use additional fork numbers to logically separate relational and
/// non-relational data inside pageserver key-value storage.
/// See, e.g., `ROCKSDB_SPECIAL_FORKNUM`.
///
#[derive(Debug, PartialEq, Eq, PartialOrd, Hash, Ord, Clone, Copy, Serialize, Deserialize)]
pub struct RelTag {
pub forknum: u8,
@@ -237,6 +213,8 @@ impl fmt::Display for RelTag {
/// In Postgres `BufferTag` structure is used for exactly the same purpose.
/// [See more related comments here](https://github.com/postgres/postgres/blob/99c5852e20a0987eca1c38ba0c09329d4076b6a0/src/include/storage/buf_internals.h#L91).
///
/// NOTE: In this context we use buffer, block and page interchangeably when speaking about relation files.
///
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Serialize, Deserialize)]
pub struct BufferTag {
pub rel: RelTag,
@@ -250,6 +228,71 @@ impl BufferTag {
};
}
///
/// Non-relation transaction status files (clog (a.k.a. pg_xact) and pg_multixact)
/// in Postgres are handled by SLRU (Simple LRU) buffer, hence the name.
///
/// These files are global for a postgres instance.
///
/// These files are divided into segments, which are divided into pages
/// of the same BLCKSZ as used for relation files.
///
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
pub struct SlruBufferTag {
    // Absolute page number within the SLRU (e.g. xid / CLOG_XACTS_PER_PAGE),
    // not relative to a segment.
    pub blknum: u32,
}
///
/// Special type of Postgres files: pg_filenode.map is needed to map
/// catalog table OIDs to filenode numbers, which define filename.
///
/// Each database has a map file for its local mapped catalogs,
/// and there is a separate map file for shared catalogs.
///
/// These files have untypical size of 512 bytes.
///
/// See PostgreSQL relmapper.c for details.
///
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
pub struct DatabaseTag {
    // Tablespace OID the map file belongs to.
    pub spcnode: u32,
    // Database OID; 0 is used for the shared-catalog map file.
    pub dbnode: u32,
}
///
/// Non-relation files that keep state for prepared transactions.
/// Unlike other files these are not divided into pages.
///
/// See PostgreSQL twophase.c for details.
///
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
pub struct PrepareTag {
    // XID of the prepared (two-phase) transaction this state file belongs to.
    pub xid: TransactionId,
}
/// ObjectTag is a part of ObjectKey that is specific
/// to the type of the stored object.
///
/// The variant order is significant: keys are ordered by tag (note the
/// `Ord` derive), and relation entries are deliberately placed last so
/// that all non-relation objects can be iterated without scanning
/// relation data.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
pub enum ObjectTag {
    // Dummy tag preceding all other keys.
    FirstTag,
    // Tag for per-timeline metadata.
    TimelineMetadataTag,
    // Special entry that represents PostgreSQL checkpoint.
    // We use it to track fields needed to restore controlfile checkpoint.
    Checkpoint,
    // Various types of non-relation files.
    // We need them to bootstrap compute node.
    ControlFile,
    Clog(SlruBufferTag),
    MultiXactMembers(SlruBufferTag),
    MultiXactOffsets(SlruBufferTag),
    FileNodeMap(DatabaseTag),
    TwoPhase(PrepareTag),
    // Put relations at the end of enum to allow efficient iterations through non-rel objects.
    RelationMetadata(RelTag),
    RelationBuffer(BufferTag),
}
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct WALRecord {
pub lsn: Lsn, // LSN at the *end* of the record
@@ -291,6 +334,7 @@ impl WALRecord {
mod tests {
use super::*;
use crate::object_repository::ObjectRepository;
use crate::object_repository::ObjectValue;
use crate::rocksdb_storage::RocksObjectStore;
use crate::walredo::{WalRedoError, WalRedoManager};
use crate::PageServerConf;
@@ -299,6 +343,7 @@ mod tests {
use std::path::PathBuf;
use std::str::FromStr;
use std::time::Duration;
use zenith_utils::bin_ser::BeSer;
/// Arbitrary relation tag, for testing.
const TESTREL_A: RelTag = RelTag {
@@ -338,7 +383,7 @@ mod tests {
interactive: false,
gc_horizon: 64 * 1024 * 1024,
gc_period: Duration::from_secs(10),
listen_addr: "127.0.0.1:5430".to_string(),
listen_addr: "127.0.0.1:5430".parse().unwrap(),
workdir: repo_dir,
pg_distrib_dir: "".into(),
};
@@ -367,11 +412,11 @@ mod tests {
let tline = repo.create_empty_timeline(timelineid, Lsn(0))?;
tline.init_valid_lsn(Lsn(1));
tline.put_page_image(TEST_BUF(0), Lsn(2), TEST_IMG("foo blk 0 at 2"), true)?;
tline.put_page_image(TEST_BUF(0), Lsn(2), TEST_IMG("foo blk 0 at 2"), true)?;
tline.put_page_image(TEST_BUF(0), Lsn(3), TEST_IMG("foo blk 0 at 3"), true)?;
tline.put_page_image(TEST_BUF(1), Lsn(4), TEST_IMG("foo blk 1 at 4"), true)?;
tline.put_page_image(TEST_BUF(2), Lsn(5), TEST_IMG("foo blk 2 at 5"), true)?;
tline.put_page_image(TEST_BUF(0), Lsn(2), TEST_IMG("foo blk 0 at 2"))?;
tline.put_page_image(TEST_BUF(0), Lsn(2), TEST_IMG("foo blk 0 at 2"))?;
tline.put_page_image(TEST_BUF(0), Lsn(3), TEST_IMG("foo blk 0 at 3"))?;
tline.put_page_image(TEST_BUF(1), Lsn(4), TEST_IMG("foo blk 1 at 4"))?;
tline.put_page_image(TEST_BUF(2), Lsn(5), TEST_IMG("foo blk 2 at 5"))?;
tline.advance_last_valid_lsn(Lsn(5));
@@ -458,7 +503,7 @@ mod tests {
for i in 0..pg_constants::RELSEG_SIZE + 1 {
let img = TEST_IMG(&format!("foo blk {} at {}", i, Lsn(lsn)));
lsn += 1;
tline.put_page_image(TEST_BUF(i as u32), Lsn(lsn), img, true)?;
tline.put_page_image(TEST_BUF(i as u32), Lsn(lsn), img)?;
}
tline.advance_last_valid_lsn(Lsn(lsn));
@@ -495,48 +540,82 @@ mod tests {
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
let tline = repo.create_empty_timeline(timelineid, Lsn(0))?;
let mut snapshot = tline.history()?;
let snapshot = tline.history()?;
assert_eq!(snapshot.lsn(), Lsn(0));
let mut snapshot = snapshot.skip_while(|r| match r {
Ok(m) => match m.tag {
ObjectTag::RelationBuffer(_) => false,
_ => true,
},
_ => true,
});
assert_eq!(None, snapshot.next().transpose()?);
// add a page and advance the last valid LSN
let rel = TESTREL_A;
let tag = TEST_BUF(1);
tline.put_page_image(tag, Lsn(1), TEST_IMG("blk 1 @ lsn 1"), true)?;
tline.put_page_image(tag, Lsn(1), TEST_IMG("blk 1 @ lsn 1"))?;
tline.advance_last_valid_lsn(Lsn(1));
let mut snapshot = tline.history()?;
let snapshot = tline.history()?;
assert_eq!(snapshot.lsn(), Lsn(1));
let expected_page = RelationUpdate {
rel: rel,
lsn: Lsn(1),
update: Update::Page {
blknum: 1,
img: TEST_IMG("blk 1 @ lsn 1"),
let mut snapshot = snapshot.skip_while(|r| match r {
Ok(m) => match m.tag {
ObjectTag::RelationBuffer(_) => false,
_ => true,
},
_ => true,
});
let expected_page = Modification {
tag,
lsn: Lsn(1),
data: ObjectValue::ser(&ObjectValue::Page(TEST_IMG("blk 1 @ lsn 1")))?,
};
assert_eq!(Some(&expected_page), snapshot.next().transpose()?.as_ref());
assert_eq!(None, snapshot.next().transpose()?);
// truncate to zero, but don't advance the last valid LSN
tline.put_truncation(rel, Lsn(2), 0)?;
let mut snapshot = tline.history()?;
let snapshot = tline.history()?;
assert_eq!(snapshot.lsn(), Lsn(1));
let mut snapshot = snapshot.skip_while(|r| match r {
Ok(m) => match m.tag {
ObjectTag::RelationBuffer(_) => false,
_ => true,
},
_ => true,
});
assert_eq!(Some(&expected_page), snapshot.next().transpose()?.as_ref());
assert_eq!(None, snapshot.next().transpose()?);
// advance the last valid LSN and the truncation should be observable
tline.advance_last_valid_lsn(Lsn(2));
let mut snapshot = tline.history()?;
let snapshot = tline.history()?;
assert_eq!(snapshot.lsn(), Lsn(2));
// TODO ordering not guaranteed by API. But currently it returns the
// truncation entry before the block data.
let expected_truncate = RelationUpdate {
rel: rel,
lsn: Lsn(2),
update: Update::Truncate { n_blocks: 0 },
let mut snapshot = snapshot.skip_while(|r| match r {
Ok(m) => match m.tag {
ObjectTag::RelationMetadata(_) => false,
_ => true,
},
_ => true,
});
let expected_truncate = Modification {
tag: ObjectTag::RelationMetadata(rel),
lsn: Lsn(1),
data: ObjectValue::ser(&ObjectValue::RelationSize(2))?,
};
assert_eq!(Some(expected_truncate), snapshot.next().transpose()?);
assert_eq!(
Some(&expected_truncate),
snapshot.next().transpose()?.as_ref()
); // TODO ordering not guaranteed by API
let expected_truncate = Modification {
tag: ObjectTag::RelationMetadata(rel),
lsn: Lsn(2),
data: ObjectValue::ser(&ObjectValue::RelationSize(0))?,
};
assert_eq!(
Some(&expected_truncate),
snapshot.next().transpose()?.as_ref()
); // TODO ordering not guaranteed by API
assert_eq!(Some(&expected_page), snapshot.next().transpose()?.as_ref());
assert_eq!(None, snapshot.next().transpose()?);

View File

@@ -12,21 +12,21 @@ use std::io::SeekFrom;
use std::path::{Path, PathBuf};
use anyhow::Result;
use bytes::{Buf, Bytes};
use bytes::Bytes;
use crate::object_key::*;
use crate::repository::*;
use crate::waldecoder::*;
use crate::repository::{
BufferTag, DatabaseTag, ObjectTag, PrepareTag, RelTag, SlruBufferTag, Timeline, WALRecord,
};
use crate::waldecoder::{decode_wal_record, DecodedWALRecord, Oid, WalStreamDecoder};
use crate::waldecoder::{XlCreateDatabase, XlSmgrTruncate};
use crate::PageServerConf;
use crate::ZTimelineId;
use postgres_ffi::pg_constants;
use postgres_ffi::relfile_utils::*;
use postgres_ffi::xlog_utils::*;
use postgres_ffi::{pg_constants, CheckPoint, ControlFileData};
use postgres_ffi::*;
use zenith_utils::lsn::Lsn;
const MAX_MBR_BLKNO: u32 =
pg_constants::MAX_MULTIXACT_ID / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
///
/// Find latest snapshot in a timeline's 'snapshots' directory
///
@@ -203,7 +203,7 @@ fn import_relfile(
},
blknum,
});
timeline.put_page_image(tag, lsn, Bytes::copy_from_slice(&buf), true)?;
timeline.put_page_image(tag, lsn, Bytes::copy_from_slice(&buf))?;
}
// TODO: UnexpectedEof is expected
@@ -236,7 +236,7 @@ fn import_nonrel_file(
// read the whole file
file.read_to_end(&mut buffer)?;
timeline.put_page_image(tag, lsn, Bytes::copy_from_slice(&buffer[..]), false)?;
timeline.put_page_image(tag, lsn, Bytes::copy_from_slice(&buffer[..]))?;
Ok(())
}
@@ -256,7 +256,7 @@ fn import_slru_file(
let r = file.read_exact(&mut buf);
match r {
Ok(_) => {
timeline.put_page_image(gen_tag(blknum), lsn, Bytes::copy_from_slice(&buf), false)?;
timeline.put_page_image(gen_tag(blknum), lsn, Bytes::copy_from_slice(&buf))?;
}
// TODO: UnexpectedEof is expected
@@ -289,8 +289,6 @@ pub fn import_timeline_wal(walpath: &Path, timeline: &dyn Timeline, startpoint:
let checkpoint_bytes = timeline.get_page_at_lsn_nowait(ObjectTag::Checkpoint, startpoint)?;
let mut checkpoint = CheckPoint::decode(&checkpoint_bytes)?;
// get_page_at_lsn_nowait returns pages with zeros when object is not found in the storage.
// nextXid can not be zero, so this check is used to detect situation when checkpoint record needs to be initialized.
if checkpoint.nextXid.value == 0 {
let pg_control_bytes =
timeline.get_page_at_lsn_nowait(ObjectTag::ControlFile, startpoint)?;
@@ -339,8 +337,8 @@ pub fn import_timeline_wal(walpath: &Path, timeline: &dyn Timeline, startpoint:
break;
}
if let Some((lsn, recdata)) = rec.unwrap() {
let decoded = decode_wal_record(recdata.clone());
save_decoded_record(&mut checkpoint, timeline, &decoded, recdata, lsn)?;
let decoded = decode_wal_record(&mut checkpoint, recdata.clone());
save_decoded_record(timeline, &decoded, recdata, lsn)?;
last_lsn = lsn;
} else {
break;
@@ -360,7 +358,7 @@ pub fn import_timeline_wal(walpath: &Path, timeline: &dyn Timeline, startpoint:
}
info!("reached end of WAL at {}", last_lsn);
let checkpoint_bytes = checkpoint.encode();
timeline.put_page_image(ObjectTag::Checkpoint, last_lsn, checkpoint_bytes, false)?;
timeline.put_page_image(ObjectTag::Checkpoint, last_lsn, checkpoint_bytes)?;
Ok(())
}
@@ -369,163 +367,40 @@ pub fn import_timeline_wal(walpath: &Path, timeline: &dyn Timeline, startpoint:
/// relations/pages that the record affects.
///
pub fn save_decoded_record(
checkpoint: &mut CheckPoint,
timeline: &dyn Timeline,
decoded: &DecodedWALRecord,
recdata: Bytes,
lsn: Lsn,
) -> Result<()> {
checkpoint.update_next_xid(decoded.xl_xid);
// Iterate through all the blocks that the record modifies, and
// "put" a separate copy of the record for each block.
for blk in decoded.blocks.iter() {
let tag = ObjectTag::RelationBuffer(BufferTag {
rel: RelTag {
spcnode: blk.rnode_spcnode,
dbnode: blk.rnode_dbnode,
relnode: blk.rnode_relnode,
forknum: blk.forknum as u8,
},
blknum: blk.blkno,
});
let rec = WALRecord {
lsn,
will_init: blk.will_init || blk.apply_image,
rec: recdata.clone(),
main_data_offset: decoded.main_data_offset as u32,
};
timeline.put_wal_record(tag, rec)?;
if blk.will_drop {
timeline.put_unlink(blk.tag, lsn)?;
} else {
let rec = WALRecord {
lsn,
will_init: blk.will_init || blk.apply_image,
rec: recdata.clone(),
main_data_offset: decoded.main_data_offset as u32,
};
timeline.put_wal_record(blk.tag, rec)?;
}
}
let mut buf = decoded.record.clone();
buf.advance(decoded.main_data_offset);
// Handle a few special record types
if decoded.xl_rmid == pg_constants::RM_SMGR_ID
&& (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) == pg_constants::XLOG_SMGR_TRUNCATE
{
let truncate = XlSmgrTruncate::decode(&mut buf);
let truncate = XlSmgrTruncate::decode(&decoded);
save_xlog_smgr_truncate(timeline, lsn, &truncate)?;
} else if decoded.xl_rmid == pg_constants::RM_DBASE_ID {
if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) == pg_constants::XLOG_DBASE_CREATE {
let createdb = XlCreateDatabase::decode(&mut buf);
save_xlog_dbase_create(timeline, lsn, &createdb)?;
} else {
// TODO
trace!("XLOG_DBASE_DROP is not handled yet");
}
} else if decoded.xl_rmid == pg_constants::RM_TBLSPC_ID {
trace!("XLOG_TBLSPC_CREATE/DROP is not handled yet");
} else if decoded.xl_rmid == pg_constants::RM_CLOG_ID {
let blknum = buf.get_u32_le();
let info = decoded.xl_info & !pg_constants::XLR_INFO_MASK;
let tag = ObjectTag::Clog(SlruBufferTag { blknum });
if info == pg_constants::CLOG_ZEROPAGE {
let rec = WALRecord {
lsn,
will_init: true,
rec: recdata.clone(),
main_data_offset: decoded.main_data_offset as u32,
};
timeline.put_wal_record(tag, rec)?;
} else {
assert!(info == pg_constants::CLOG_TRUNCATE);
checkpoint.oldestXid = buf.get_u32_le();
checkpoint.oldestXidDB = buf.get_u32_le();
trace!(
"RM_CLOG_ID truncate blkno {} oldestXid {} oldestXidDB {}",
blknum,
checkpoint.oldestXid,
checkpoint.oldestXidDB
);
if let Some(ObjectTag::Clog(first_slru_tag)) =
timeline.get_next_tag(ObjectTag::Clog(SlruBufferTag { blknum: 0 }))?
{
for trunc_blknum in first_slru_tag.blknum..=blknum {
let tag = ObjectTag::Clog(SlruBufferTag {
blknum: trunc_blknum,
});
timeline.put_slru_truncate(tag, lsn)?;
}
}
}
} else if decoded.xl_rmid == pg_constants::RM_XACT_ID {
let info = decoded.xl_info & pg_constants::XLOG_XACT_OPMASK;
if info == pg_constants::XLOG_XACT_COMMIT
|| info == pg_constants::XLOG_XACT_COMMIT_PREPARED
|| info == pg_constants::XLOG_XACT_ABORT
|| info == pg_constants::XLOG_XACT_ABORT_PREPARED
{
let parsed_xact = XlXactParsedRecord::decode(&mut buf, decoded.xl_xid, decoded.xl_info);
save_xact_record(timeline, lsn, &parsed_xact, decoded)?;
} else if info == pg_constants::XLOG_XACT_PREPARE {
let rec = WALRecord {
lsn,
will_init: true,
rec: recdata.clone(),
main_data_offset: decoded.main_data_offset as u32,
};
timeline.put_wal_record(
ObjectTag::TwoPhase(PrepareTag {
xid: decoded.xl_xid,
}),
rec,
)?;
}
} else if decoded.xl_rmid == pg_constants::RM_MULTIXACT_ID {
let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
if info == pg_constants::XLOG_MULTIXACT_ZERO_OFF_PAGE
|| info == pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE
{
let blknum = buf.get_u32_le();
let rec = WALRecord {
lsn,
will_init: true,
rec: recdata.clone(),
main_data_offset: decoded.main_data_offset as u32,
};
let tag = if info == pg_constants::XLOG_MULTIXACT_ZERO_OFF_PAGE {
ObjectTag::MultiXactOffsets(SlruBufferTag { blknum })
} else {
ObjectTag::MultiXactMembers(SlruBufferTag { blknum })
};
timeline.put_wal_record(tag, rec)?;
} else if info == pg_constants::XLOG_MULTIXACT_CREATE_ID {
let xlrec = XlMultiXactCreate::decode(&mut buf);
save_multixact_create_record(checkpoint, timeline, lsn, &xlrec, decoded)?;
} else if info == pg_constants::XLOG_MULTIXACT_TRUNCATE_ID {
let xlrec = XlMultiXactTruncate::decode(&mut buf);
save_multixact_truncate_record(checkpoint, timeline, lsn, &xlrec)?;
}
} else if decoded.xl_rmid == pg_constants::RM_RELMAP_ID {
let xlrec = XlRelmapUpdate::decode(&mut buf);
save_relmap_record(timeline, lsn, &xlrec, decoded)?;
} else if decoded.xl_rmid == pg_constants::RM_XLOG_ID {
let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
if info == pg_constants::XLOG_NEXTOID {
let next_oid = buf.get_u32_le();
checkpoint.nextOid = next_oid;
} else if info == pg_constants::XLOG_CHECKPOINT_ONLINE
|| info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN
{
let mut checkpoint_bytes = [0u8; SIZEOF_CHECKPOINT];
let mut buf = decoded.record.clone();
buf.advance(decoded.main_data_offset);
buf.copy_to_slice(&mut checkpoint_bytes);
let xlog_checkpoint = CheckPoint::decode(&checkpoint_bytes).unwrap();
trace!(
"xlog_checkpoint.oldestXid={}, checkpoint.oldestXid={}",
xlog_checkpoint.oldestXid,
checkpoint.oldestXid
);
if (checkpoint.oldestXid.wrapping_sub(xlog_checkpoint.oldestXid) as i32) < 0 {
checkpoint.oldestXid = xlog_checkpoint.oldestXid;
}
}
} else if decoded.xl_rmid == pg_constants::RM_DBASE_ID
&& (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) == pg_constants::XLOG_DBASE_CREATE
{
let createdb = XlCreateDatabase::decode(&decoded);
save_xlog_dbase_create(timeline, lsn, &createdb)?;
}
// Now that this record has been handled, let the repository know that
// it is up-to-date to this LSN
timeline.advance_last_record_lsn(lsn);
@@ -579,7 +454,7 @@ fn save_xlog_dbase_create(timeline: &dyn Timeline, lsn: Lsn, rec: &XlCreateDatab
debug!("copying block {:?} to {:?}", src_key, dst_key);
timeline.put_page_image(dst_key, lsn, content, true)?;
timeline.put_page_image(dst_key, lsn, content)?;
num_blocks_copied += 1;
}
@@ -600,7 +475,7 @@ fn save_xlog_dbase_create(timeline: &dyn Timeline, lsn: Lsn, rec: &XlCreateDatab
spcnode: tablespace_id,
dbnode: db_id,
});
timeline.put_page_image(new_tag, lsn, img, false)?;
timeline.put_page_image(new_tag, lsn, img)?;
break;
}
}
@@ -675,158 +550,3 @@ fn save_xlog_smgr_truncate(timeline: &dyn Timeline, lsn: Lsn, rec: &XlSmgrTrunca
}
Ok(())
}
/// Subroutine of save_decoded_record(), to handle an XLOG_XACT_* records.
///
/// We are currently only interested in the dropped relations.
///
/// The record is attached to every CLOG page that the transaction (and its
/// subtransactions) touches, and each relation dropped by the transaction
/// is unlinked.
fn save_xact_record(
    timeline: &dyn Timeline,
    lsn: Lsn,
    parsed: &XlXactParsedRecord,
    decoded: &DecodedWALRecord,
) -> Result<()> {
    // Record update of CLOG page holding the status of the main xid.
    let mut blknum = parsed.xid / pg_constants::CLOG_XACTS_PER_PAGE;
    let tag = ObjectTag::Clog(SlruBufferTag { blknum });
    let rec = WALRecord {
        lsn,
        // CLOG pages are updated in place, so the previous page version is needed.
        will_init: false,
        rec: decoded.record.clone(),
        main_data_offset: decoded.main_data_offset as u32,
    };
    timeline.put_wal_record(tag, rec.clone())?;

    // Subtransaction statuses may live on other CLOG pages; emit the record
    // once per page change.
    // NOTE(review): this only compares against the previously seen page, which
    // assumes subxacts are grouped by page -- confirm the ordering guarantee.
    for subxact in &parsed.subxacts {
        let subxact_blknum = subxact / pg_constants::CLOG_XACTS_PER_PAGE;
        if subxact_blknum != blknum {
            blknum = subxact_blknum;
            let tag = ObjectTag::Clog(SlruBufferTag { blknum });
            timeline.put_wal_record(tag, rec.clone())?;
        }
    }

    // Unlink every fork of each relation dropped by this transaction.
    for xnode in &parsed.xnodes {
        for forknum in pg_constants::MAIN_FORKNUM..=pg_constants::VISIBILITYMAP_FORKNUM {
            let rel_tag = RelTag {
                forknum,
                spcnode: xnode.spcnode,
                dbnode: xnode.dbnode,
                relnode: xnode.relnode,
            };
            timeline.put_unlink(rel_tag, lsn)?;
        }
    }
    Ok(())
}
/// Subroutine of save_decoded_record(): handle XLOG_MULTIXACT_CREATE_ID.
///
/// Attaches the WAL record to every SLRU page it touches (one offsets page
/// and one or more members pages) and advances the multixact counters kept
/// in the in-memory checkpoint.
fn save_multixact_create_record(
    checkpoint: &mut CheckPoint,
    timeline: &dyn Timeline,
    lsn: Lsn,
    xlrec: &XlMultiXactCreate,
    decoded: &DecodedWALRecord,
) -> Result<()> {
    let rec = WALRecord {
        lsn,
        will_init: false,
        rec: decoded.record.clone(),
        main_data_offset: decoded.main_data_offset as u32,
    };

    // Offsets page holding this multixact id.
    let blknum = xlrec.mid / pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32;
    let tag = ObjectTag::MultiXactOffsets(SlruBufferTag { blknum });
    timeline.put_wal_record(tag, rec.clone())?;

    // Range of members pages this record spans.
    let first_mbr_blkno = xlrec.moff / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
    let last_mbr_blkno =
        (xlrec.moff + xlrec.nmembers - 1) / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;

    // The members SLRU can, in contrast to the offsets one, be filled to almost
    // the full range at once. So we need to handle wraparound.
    let mut blknum = first_mbr_blkno;
    loop {
        // Update members page
        let tag = ObjectTag::MultiXactMembers(SlruBufferTag { blknum });
        timeline.put_wal_record(tag, rec.clone())?;
        if blknum == last_mbr_blkno {
            // last block inclusive
            break;
        }
        // handle wraparound
        if blknum == MAX_MBR_BLKNO {
            blknum = 0;
        } else {
            blknum += 1;
        }
    }

    // Advance nextMulti / nextMultiOffset past what this record creates.
    if xlrec.mid >= checkpoint.nextMulti {
        checkpoint.nextMulti = xlrec.mid + 1;
    }
    if xlrec.moff + xlrec.nmembers > checkpoint.nextMultiOffset {
        checkpoint.nextMultiOffset = xlrec.moff + xlrec.nmembers;
    }

    // Find the largest member xid using a wraparound-aware comparison,
    // and make sure nextXid is past it.
    let max_mbr_xid = xlrec.members.iter().fold(0u32, |acc, mbr| {
        if mbr.xid.wrapping_sub(acc) as i32 > 0 {
            mbr.xid
        } else {
            acc
        }
    });
    checkpoint.update_next_xid(max_mbr_xid);

    Ok(())
}
/// Subroutine of save_decoded_record(): handle XLOG_MULTIXACT_TRUNCATE_ID.
///
/// Records the new oldest multixact in the in-memory checkpoint and drops
/// the truncated-away offsets and members SLRU pages.
fn save_multixact_truncate_record(
    checkpoint: &mut CheckPoint,
    timeline: &dyn Timeline,
    lsn: Lsn,
    xlrec: &XlMultiXactTruncate,
) -> Result<()> {
    checkpoint.oldestMulti = xlrec.end_trunc_off;
    checkpoint.oldestMultiDB = xlrec.oldest_multi_db;

    let first_off_blkno = xlrec.start_trunc_off / pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32;
    let last_off_blkno = xlrec.end_trunc_off / pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32;

    // Delete all the segments except the last one. The last segment can still
    // contain, possibly partially, valid data.
    for blknum in first_off_blkno..last_off_blkno {
        let tag = ObjectTag::MultiXactOffsets(SlruBufferTag { blknum });
        timeline.put_slru_truncate(tag, lsn)?;
    }

    let first_mbr_blkno = xlrec.start_trunc_memb / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
    let last_mbr_blkno = xlrec.end_trunc_memb / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;

    // The members SLRU can, in contrast to the offsets one, be filled to almost
    // the full range at once. So we need to handle wraparound.
    let mut blknum = first_mbr_blkno;
    // Delete all the segments but the last one. The last segment can still
    // contain, possibly partially, valid data.
    while blknum != last_mbr_blkno {
        let tag = ObjectTag::MultiXactMembers(SlruBufferTag { blknum });
        timeline.put_slru_truncate(tag, lsn)?;
        // handle wraparound
        if blknum == MAX_MBR_BLKNO {
            blknum = 0;
        } else {
            blknum += 1;
        }
    }

    Ok(())
}
/// Subroutine of save_decoded_record(): handle an RM_RELMAP_ID record.
///
/// Stores the WAL record under the FileNodeMap tag of the affected
/// (tablespace, database) pair.
fn save_relmap_record(
    timeline: &dyn Timeline,
    lsn: Lsn,
    xlrec: &XlRelmapUpdate,
    decoded: &DecodedWALRecord,
) -> Result<()> {
    let tag = ObjectTag::FileNodeMap(DatabaseTag {
        spcnode: xlrec.tsid,
        dbnode: xlrec.dbid,
    });

    // A relmap update replaces the whole pg_filenode.map content, so the
    // record can be applied without any previous page version (will_init).
    timeline.put_wal_record(
        tag,
        WALRecord {
            lsn,
            will_init: true,
            rec: decoded.record.clone(),
            main_data_offset: decoded.main_data_offset as u32,
        },
    )?;

    Ok(())
}

View File

@@ -0,0 +1,274 @@
//
// Restore chunks from S3
//
// This runs once at Page Server startup. It loads all the "base images" from
// S3 into the in-memory page cache. It also initializes the "last valid LSN"
// in the page cache to the LSN of the base image, so that when the WAL receiver
// is started, it starts streaming from that LSN.
//
use bytes::{Buf, BytesMut};
use log::*;
use regex::Regex;
use std::env;
use std::fmt;
use s3::bucket::Bucket;
use s3::creds::Credentials;
use s3::region::Region;
use s3::S3Error;
use tokio::runtime;
use futures::future;
use crate::{page_cache, PageServerConf};
use postgres_ffi::pg_constants;
use postgres_ffi::relfile_utils::*;
// Connection parameters for the S3-compatible object store holding the
// base images.
struct Storage {
    region: Region,
    credentials: Credentials,
    // Bucket name; currently hard-coded to "zenith-testbucket" by the caller.
    bucket: String,
}
/// Entry point of the restore phase: spin up a temporary tokio runtime,
/// run `restore_chunk` to completion, and log (but do not propagate) any
/// S3 error it returns.
pub fn restore_main(conf: &PageServerConf) {
    // Create a new thread pool
    let runtime = runtime::Runtime::new().unwrap();

    runtime.block_on(async {
        if let Err(err) = restore_chunk(conf).await {
            error!("S3 error: {}", err);
        }
    });
}
//
// Restores one chunk from S3.
//
// 1. Fetch the last base image >= given LSN
// 2. Fetch all WAL
//
// Load it all into the page cache.
//
async fn restore_chunk(conf: &PageServerConf) -> Result<(), S3Error> {
    // Read the S3 endpoint and credentials from the environment.
    // NOTE(review): unwrap() panics if any of these env vars is unset.
    let backend = Storage {
        region: Region::Custom {
            region: env::var("S3_REGION").unwrap(),
            endpoint: env::var("S3_ENDPOINT").unwrap(),
        },
        credentials: Credentials::new(
            Some(&env::var("S3_ACCESSKEY").unwrap()),
            Some(&env::var("S3_SECRET").unwrap()),
            None,
            None,
            None,
        )
        .unwrap(),
        bucket: "zenith-testbucket".to_string(),
    };

    info!("Restoring from S3...");

    // Create Bucket in REGION for BUCKET
    let bucket = Bucket::new_with_path_style(&backend.bucket, backend.region, backend.credentials)?;

    // List out contents of directory
    let results: Vec<s3::serde_types::ListBucketResult> = bucket
        .list("relationdata/".to_string(), Some("".to_string()))
        .await?;

    // TODO: get that from backup
    let sys_id: u64 = 42;

    // Track the oldest base-image LSN seen; it becomes the initial
    // "last valid LSN" of the page cache (0 means "none found yet").
    let mut oldest_lsn = 0;
    let mut slurp_futures: Vec<_> = Vec::new();

    for result in results {
        for object in result.contents {
            // Download every relation file, slurping them into memory
            let key = object.key;
            let relpath = key.strip_prefix("relationdata/").unwrap();

            let parsed = parse_rel_file_path(&relpath);
            match parsed {
                Ok(p) => {
                    if oldest_lsn == 0 || p.lsn < oldest_lsn {
                        oldest_lsn = p.lsn;
                    }
                    // Build the download future; all of them are awaited
                    // together below.
                    let b = bucket.clone();
                    let f = slurp_base_file(conf, sys_id, b, key.to_string(), p);
                    slurp_futures.push(f);
                }
                Err(e) => {
                    // Not a recognizable relation data file; skip it.
                    warn!("unrecognized file: {} ({})", relpath, e);
                }
            };
        }
    }

    if oldest_lsn == 0 {
        panic!("no base backup found");
    }

    let pcache = page_cache::get_pagecache(conf, sys_id);
    pcache.init_valid_lsn(oldest_lsn);

    info!("{} files to restore...", slurp_futures.len());

    // Run all the downloads concurrently.
    future::join_all(slurp_futures).await;

    info!("restored!");
    Ok(())
}
// Components extracted from a base-image file path in S3.
#[derive(Debug)]
struct ParsedBaseImageFileName {
    pub spcnode: u32,
    pub dbnode: u32,
    pub relnode: u32,
    pub forknum: u8,
    // Relation segment number (0 if the filename has no ".<segno>" suffix).
    pub segno: u32,
    // LSN of the base image, parsed from the filename suffix.
    pub lsn: u64,
}
// Parse a relation data file name followed by a 16-hex-digit LSN suffix.
// Accepted formats (each with a trailing "_<lsn-hi 8 hex><lsn-lo 8 hex>"):
//   <oid>
//   <oid>_<fork name>
//   <oid>.<segment number>
//   <oid>_<fork name>.<segment number>
//
// Returns (relnode, forknum, segno, lsn) or FilePathError if the name
// does not match.
fn parse_filename(fname: &str) -> Result<(u32, u8, u32, u64), FilePathError> {
    // NOTE: the regex is compiled on every call; hoist it if this ever
    // shows up in a profile.
    let re = Regex::new(r"^(?P<relnode>\d+)(_(?P<forkname>[a-z]+))?(\.(?P<segno>\d+))?_(?P<lsnhi>[[:xdigit:]]{8})(?P<lsnlo>[[:xdigit:]]{8})$").unwrap();

    let caps = re
        .captures(fname)
        .ok_or_else(|| FilePathError::new("invalid relation data file name"))?;

    let relnode_str = caps.name("relnode").unwrap().as_str();
    let relnode: u32 = relnode_str.parse()?;

    let forkname = caps.name("forkname").map(|f| f.as_str());
    let forknum = forkname_to_forknum(forkname)?;

    // Segment number is decimal; a missing suffix means segment 0.
    let segno = match caps.name("segno") {
        Some(m) => m.as_str().parse::<u32>()?,
        None => 0,
    };

    // BUGFIX: the LSN halves are hexadecimal (the regex matches
    // [[:xdigit:]]{8}), so they must be parsed with radix 16. The previous
    // decimal `parse()` failed on any 'a'-'f' digit and misinterpreted
    // all-decimal hex strings.
    let lsn_hi = u64::from_str_radix(caps.name("lsnhi").unwrap().as_str(), 16)?;
    let lsn_lo = u64::from_str_radix(caps.name("lsnlo").unwrap().as_str(), 16)?;
    let lsn = lsn_hi << 32 | lsn_lo;

    Ok((relnode, forknum, segno, lsn))
}
// Parse a relation data file path into its components.
//
// Relation data files live in one of three places:
//
//   global/                            -- shared relations
//   base/<db oid>/                     -- regular relations, default tablespace
//   pg_tblspc/<tblspc oid>/<version>/  -- non-default tablespace (not supported)
//
// The file name itself looks like <oid>.<segment number> (see
// parse_filename for the full format).
fn parse_rel_file_path(path: &str) -> Result<ParsedBaseImageFileName, FilePathError> {
    if let Some(fname) = path.strip_prefix("global/") {
        // Shared relation in the global tablespace.
        let (relnode, forknum, segno, lsn) = parse_filename(fname)?;

        Ok(ParsedBaseImageFileName {
            spcnode: pg_constants::GLOBALTABLESPACE_OID,
            dbnode: 0,
            relnode,
            forknum,
            segno,
            lsn,
        })
    } else if let Some(dbpath) = path.strip_prefix("base/") {
        // Expect exactly two components: "<dbnode>/<fname>".
        let mut parts = dbpath.split('/');
        let dbnode_str = parts
            .next()
            .ok_or_else(|| FilePathError::new("invalid relation data file name"))?;
        let dbnode: u32 = dbnode_str.parse()?;
        let fname = parts
            .next()
            .ok_or_else(|| FilePathError::new("invalid relation data file name"))?;
        if parts.next().is_some() {
            return Err(FilePathError::new("invalid relation data file name"));
        }

        let (relnode, forknum, segno, lsn) = parse_filename(fname)?;

        Ok(ParsedBaseImageFileName {
            spcnode: pg_constants::DEFAULTTABLESPACE_OID,
            dbnode,
            relnode,
            forknum,
            segno,
            lsn,
        })
    } else if path.starts_with("pg_tblspc/") {
        // TODO
        Err(FilePathError::new("tablespaces not supported"))
    } else {
        Err(FilePathError::new("invalid relation data file name"))
    }
}
//
// Load a base file from S3, and insert it into the page cache
//
async fn slurp_base_file(
    conf: &PageServerConf,
    sys_id: u64,
    bucket: Bucket,
    s3path: String,
    parsed: ParsedBaseImageFileName,
) {
    // FIXME: rust-s3 opens a new connection for each request. Should reuse
    // the reqwest::Client object. But that requires changes to rust-s3 itself.
    let (data, code) = bucket.get_object(s3path.clone()).await.unwrap();

    trace!("got response: {} on {}", code, &s3path);
    // Anything other than HTTP 200 is treated as a fatal error.
    assert_eq!(200, code);

    let mut bytes = BytesMut::from(data.as_slice()).freeze();

    // First block number of this segment: each 1 GB segment holds
    // (1 GB / BLCKSZ) pages.
    let mut blknum: u32 = parsed.segno * (1024 * 1024 * 1024 / pg_constants::BLCKSZ as u32);

    let pcache = page_cache::get_pagecache(conf, sys_id);

    // Split the file into fixed-size pages and store each one at the
    // base image's LSN.
    // NOTE(review): the page size is hard-coded as 8192 here but derived
    // from pg_constants::BLCKSZ above -- confirm they always agree.
    while bytes.remaining() >= 8192 {
        let tag = page_cache::BufferTag {
            rel: page_cache::RelTag {
                spcnode: parsed.spcnode,
                dbnode: parsed.dbnode,
                relnode: parsed.relnode,
                forknum: parsed.forknum,
            },
            blknum,
        };

        pcache.put_page_image(tag, parsed.lsn, bytes.copy_to_bytes(8192));

        blknum += 1;
    }
}

View File

@@ -1,9 +1,8 @@
//!
//! An implementation of the ObjectStore interface, backed by RocksDB
//!
use crate::object_key::*;
use crate::object_store::ObjectStore;
use crate::repository::RelTag;
use crate::object_store::{ObjectKey, ObjectStore};
use crate::repository::{BufferTag, ObjectTag, RelTag};
use crate::PageServerConf;
use crate::ZTimelineId;
use anyhow::{bail, Result};
@@ -25,7 +24,7 @@ impl StorageKey {
Self {
obj_key: ObjectKey {
timeline,
tag: ObjectTag::TimelineMetadataTag,
tag: ObjectTag::FirstTag,
},
lsn: Lsn(0),
}
@@ -94,21 +93,6 @@ impl ObjectStore for RocksObjectStore {
}
}
fn get_next_key(&self, key: &ObjectKey) -> Result<Option<ObjectKey>> {
let mut iter = self.db.raw_iterator();
let search_key = StorageKey {
obj_key: key.clone(),
lsn: Lsn(0),
};
iter.seek(search_key.ser()?);
if !iter.valid() {
Ok(None)
} else {
let key = StorageKey::des(iter.key().unwrap())?;
Ok(Some(key.obj_key.clone()))
}
}
fn put(&self, key: &ObjectKey, lsn: Lsn, value: &[u8]) -> Result<()> {
self.db.put(
StorageKey::ser(&StorageKey {
@@ -167,43 +151,41 @@ impl ObjectStore for RocksObjectStore {
let mut rels: HashSet<RelTag> = HashSet::new();
let mut search_rel_tag = RelTag {
spcnode,
dbnode,
relnode: 0,
forknum: 0u8,
let mut search_key = StorageKey {
obj_key: ObjectKey {
timeline: timelineid,
tag: ObjectTag::RelationBuffer(BufferTag {
rel: RelTag {
spcnode,
dbnode,
relnode: 0,
forknum: 0u8,
},
blknum: 0,
}),
},
lsn: Lsn(0),
};
let mut iter = self.db.raw_iterator();
loop {
let search_key = StorageKey {
obj_key: ObjectKey {
timeline: timelineid,
tag: ObjectTag::RelationMetadata(search_rel_tag),
},
lsn: Lsn(0),
};
iter.seek(search_key.ser()?);
if !iter.valid() {
break;
}
let key = StorageKey::des(iter.key().unwrap())?;
if let ObjectTag::RelationMetadata(rel_tag) = key.obj_key.tag {
if spcnode != 0 && rel_tag.spcnode != spcnode
|| dbnode != 0 && rel_tag.dbnode != dbnode
if let ObjectTag::RelationBuffer(buf_tag) = key.obj_key.tag {
if (spcnode != 0 && buf_tag.rel.spcnode != spcnode)
|| (dbnode != 0 && buf_tag.rel.dbnode != dbnode)
{
break;
}
if key.lsn <= lsn {
// visible in this snapshot
rels.insert(rel_tag);
if key.lsn < lsn {
rels.insert(buf_tag.rel);
}
search_rel_tag = rel_tag;
// skip to next relation
// FIXME: What if relnode is u32::MAX ?
search_rel_tag.relnode += 1;
let mut next_tag = buf_tag.clone();
next_tag.rel.relnode += 1; // skip to next relation
search_key.obj_key.tag = ObjectTag::RelationBuffer(next_tag);
} else {
// no more relation metadata entries
break;
}
}
@@ -233,10 +215,6 @@ impl ObjectStore for RocksObjectStore {
lsn,
}))
}
fn compact(&self) {
self.db.compact_range::<&[u8], &[u8]>(None, None);
}
}
impl RocksObjectStore {

View File

@@ -2,14 +2,17 @@
//! WAL decoder. For each WAL record, it decodes the record to figure out which data blocks
//! the record affects, to add the records to the page cache.
//!
use crate::repository::*;
use bytes::{Buf, BufMut, Bytes, BytesMut};
use log::*;
use postgres_ffi::pg_constants;
use postgres_ffi::xlog_utils::*;
use postgres_ffi::CheckPoint;
use postgres_ffi::XLogLongPageHeaderData;
use postgres_ffi::XLogPageHeaderData;
use postgres_ffi::XLogRecord;
use std::cmp::min;
use std::str;
use thiserror::Error;
use zenith_utils::lsn::Lsn;
@@ -21,6 +24,9 @@ pub type MultiXactId = TransactionId;
pub type MultiXactOffset = u32;
pub type MultiXactStatus = u32;
const MAX_MBR_BLKNO: u32 =
pg_constants::MAX_MULTIXACT_ID / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
#[allow(dead_code)]
pub struct WalStreamDecoder {
lsn: Lsn,
@@ -196,12 +202,7 @@ pub struct DecodedBkpBlock {
//in_use: bool,
/* Identify the block this refers to */
pub rnode_spcnode: u32,
pub rnode_dbnode: u32,
pub rnode_relnode: u32,
// Note that we have a few special forknum values for non-rel files.
pub forknum: u8,
pub blkno: u32,
pub tag: ObjectTag,
/* copy of the fork_flags field from the XLogRecordBlockHeader */
flags: u8,
@@ -210,6 +211,7 @@ pub struct DecodedBkpBlock {
has_image: bool, /* has image, even for consistency checking */
pub apply_image: bool, /* has image that should be restored */
pub will_init: bool, /* record doesn't need previous page version to apply */
pub will_drop: bool, /* record drops relation */
//char *bkp_image;
hole_offset: u16,
hole_length: u16,
@@ -224,16 +226,13 @@ pub struct DecodedBkpBlock {
impl DecodedBkpBlock {
pub fn new() -> DecodedBkpBlock {
DecodedBkpBlock {
rnode_spcnode: 0,
rnode_dbnode: 0,
rnode_relnode: 0,
forknum: 0,
blkno: 0,
tag: ObjectTag::FirstTag,
flags: 0,
has_image: false,
apply_image: false,
will_init: false,
will_drop: false,
hole_offset: 0,
hole_length: 0,
bimg_len: 0,
@@ -246,7 +245,6 @@ impl DecodedBkpBlock {
}
pub struct DecodedWALRecord {
pub xl_xid: TransactionId,
pub xl_info: u8,
pub xl_rmid: u8,
pub record: Bytes, // raw XLogRecord
@@ -290,7 +288,9 @@ pub struct XlSmgrTruncate {
}
impl XlSmgrTruncate {
pub fn decode(buf: &mut Bytes) -> XlSmgrTruncate {
pub fn decode(decoded: &DecodedWALRecord) -> XlSmgrTruncate {
let mut buf = decoded.record.clone();
buf.advance((XLOG_SIZE_OF_XLOG_RECORD + 2) as usize);
XlSmgrTruncate {
blkno: buf.get_u32_le(),
rnode: RelFileNode {
@@ -313,7 +313,9 @@ pub struct XlCreateDatabase {
}
impl XlCreateDatabase {
pub fn decode(buf: &mut Bytes) -> XlCreateDatabase {
pub fn decode(decoded: &DecodedWALRecord) -> XlCreateDatabase {
let mut buf = decoded.record.clone();
buf.advance((XLOG_SIZE_OF_XLOG_RECORD + 2) as usize);
XlCreateDatabase {
db_id: buf.get_u32_le(),
tablespace_id: buf.get_u32_le(),
@@ -399,103 +401,6 @@ impl XlHeapUpdate {
}
}
///
/// Note: Parsing some fields is missing, because they're not needed.
///
/// This is similar to the xl_xact_parsed_commit and
/// xl_xact_parsed_abort structs in PostgreSQL, but we use the same
/// struct for commits and aborts.
///
#[derive(Debug)]
pub struct XlXactParsedRecord {
pub xid: TransactionId,
pub info: u8,
pub xact_time: TimestampTz,
pub xinfo: u32,
pub db_id: Oid, /* MyDatabaseId */
pub ts_id: Oid, /* MyDatabaseTableSpace */
pub subxacts: Vec<TransactionId>,
pub xnodes: Vec<RelFileNode>,
}
impl XlXactParsedRecord {
/// Decode a XLOG_XACT_COMMIT/ABORT/COMMIT_PREPARED/ABORT_PREPARED
/// record. This should agree with the ParseCommitRecord and ParseAbortRecord
/// functions in PostgreSQL (in src/backend/access/rmgr/xactdesc.c)
pub fn decode(buf: &mut Bytes, mut xid: TransactionId, xl_info: u8) -> XlXactParsedRecord {
let info = xl_info & pg_constants::XLOG_XACT_OPMASK;
// The record starts with time of commit/abort
let xact_time = buf.get_u64_le();
let xinfo;
if xl_info & pg_constants::XLOG_XACT_HAS_INFO != 0 {
xinfo = buf.get_u32_le();
} else {
xinfo = 0;
}
let db_id;
let ts_id;
if xinfo & pg_constants::XACT_XINFO_HAS_DBINFO != 0 {
db_id = buf.get_u32_le();
ts_id = buf.get_u32_le();
} else {
db_id = 0;
ts_id = 0;
}
let mut subxacts = Vec::<TransactionId>::new();
if xinfo & pg_constants::XACT_XINFO_HAS_SUBXACTS != 0 {
let nsubxacts = buf.get_i32_le();
for _i in 0..nsubxacts {
let subxact = buf.get_u32_le();
subxacts.push(subxact);
}
}
let mut xnodes = Vec::<RelFileNode>::new();
if xinfo & pg_constants::XACT_XINFO_HAS_RELFILENODES != 0 {
let nrels = buf.get_i32_le();
for _i in 0..nrels {
let spcnode = buf.get_u32_le();
let dbnode = buf.get_u32_le();
let relnode = buf.get_u32_le();
trace!(
"XLOG_XACT_COMMIT relfilenode {}/{}/{}",
spcnode,
dbnode,
relnode
);
xnodes.push(RelFileNode {
spcnode,
dbnode,
relnode,
});
}
}
if xinfo & pg_constants::XACT_XINFO_HAS_INVALS != 0 {
let nmsgs = buf.get_i32_le();
for _i in 0..nmsgs {
let sizeof_shared_invalidation_message = 0;
buf.advance(sizeof_shared_invalidation_message);
}
}
if xinfo & pg_constants::XACT_XINFO_HAS_TWOPHASE != 0 {
xid = buf.get_u32_le();
trace!("XLOG_XACT_COMMIT-XACT_XINFO_HAS_TWOPHASE");
}
XlXactParsedRecord {
xid,
info,
xact_time,
xinfo,
db_id,
ts_id,
subxacts,
xnodes,
}
}
}
#[repr(C)]
#[derive(Debug)]
pub struct MultiXactMember {
@@ -542,14 +447,14 @@ impl XlMultiXactCreate {
#[repr(C)]
#[derive(Debug)]
pub struct XlMultiXactTruncate {
pub oldest_multi_db: Oid,
oldest_multi_db: Oid,
/* to-be-truncated range of multixact offsets */
pub start_trunc_off: MultiXactId, /* just for completeness' sake */
pub end_trunc_off: MultiXactId,
start_trunc_off: MultiXactId, /* just for completeness' sake */
end_trunc_off: MultiXactId,
/* to-be-truncated range of multixact members */
pub start_trunc_memb: MultiXactOffset,
pub end_trunc_memb: MultiXactOffset,
start_trunc_memb: MultiXactOffset,
end_trunc_memb: MultiXactOffset,
}
impl XlMultiXactTruncate {
@@ -582,10 +487,11 @@ impl XlMultiXactTruncate {
// block data
// ...
// main data
pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
let mut rnode_spcnode: u32 = 0;
let mut rnode_dbnode: u32 = 0;
let mut rnode_relnode: u32 = 0;
pub fn decode_wal_record(checkpoint: &mut CheckPoint, record: Bytes) -> DecodedWALRecord {
let mut spcnode: u32 = 0;
let mut dbnode: u32 = 0;
let mut relnode: u32 = 0;
let mut forknum: u8;
let mut got_rnode = false;
let mut buf = record.clone();
@@ -601,6 +507,7 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
xlogrec.xl_info
);
checkpoint.update_next_xid(xlogrec.xl_xid);
let remaining: usize = xlogrec.xl_tot_len as usize - XLOG_SIZE_OF_XLOG_RECORD;
if buf.remaining() != remaining {
@@ -659,7 +566,7 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
max_block_id = block_id;
fork_flags = buf.get_u8();
blk.forknum = fork_flags & pg_constants::BKPBLOCK_FORK_MASK;
forknum = fork_flags & pg_constants::BKPBLOCK_FORK_MASK;
blk.flags = fork_flags;
blk.has_image = (fork_flags & pg_constants::BKPBLOCK_HAS_IMAGE) != 0;
blk.has_data = (fork_flags & pg_constants::BKPBLOCK_HAS_DATA) != 0;
@@ -765,9 +672,9 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
}
}
if fork_flags & pg_constants::BKPBLOCK_SAME_REL == 0 {
rnode_spcnode = buf.get_u32_le();
rnode_dbnode = buf.get_u32_le();
rnode_relnode = buf.get_u32_le();
spcnode = buf.get_u32_le();
dbnode = buf.get_u32_le();
relnode = buf.get_u32_le();
got_rnode = true;
} else if !got_rnode {
// TODO
@@ -778,18 +685,16 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
goto err; */
}
blk.rnode_spcnode = rnode_spcnode;
blk.rnode_dbnode = rnode_dbnode;
blk.rnode_relnode = rnode_relnode;
blk.blkno = buf.get_u32_le();
trace!(
"this record affects {}/{}/{} blk {}",
rnode_spcnode,
rnode_dbnode,
rnode_relnode,
blk.blkno
);
blk.tag = ObjectTag::RelationBuffer(BufferTag {
rel: RelTag {
forknum,
spcnode,
dbnode,
relnode,
},
blknum: buf.get_u32_le(),
});
trace!("this record affects {:?}", blk.tag);
blocks.push(blk);
}
@@ -811,11 +716,226 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
assert_eq!(buf.remaining(), main_data_len as usize);
}
// 5. Handle a few special record types that modify blocks without registering
// them with the standard mechanism.
if xlogrec.xl_rmid == pg_constants::RM_HEAP_ID {
let info = xlogrec.xl_info & pg_constants::XLOG_HEAP_OPMASK;
let blkno = blocks[0].blkno / pg_constants::HEAPBLOCKS_PER_PAGE as u32;
//5. Handle special CLOG and XACT records
if xlogrec.xl_rmid == pg_constants::RM_CLOG_ID {
let mut blk = DecodedBkpBlock::new();
let blknum = buf.get_i32_le() as u32;
blk.tag = ObjectTag::Clog(SlruBufferTag { blknum });
let info = xlogrec.xl_info & !pg_constants::XLR_INFO_MASK;
if info == pg_constants::CLOG_ZEROPAGE {
blk.will_init = true;
} else {
assert!(info == pg_constants::CLOG_TRUNCATE);
blk.will_drop = true;
checkpoint.oldestXid = buf.get_u32_le();
checkpoint.oldestXidDB = buf.get_u32_le();
trace!(
"RM_CLOG_ID truncate blkno {} oldestXid {} oldestXidDB {}",
blknum,
checkpoint.oldestXid,
checkpoint.oldestXidDB
);
}
trace!("RM_CLOG_ID updates block {}", blknum);
blocks.push(blk);
} else if xlogrec.xl_rmid == pg_constants::RM_XACT_ID {
let info = xlogrec.xl_info & pg_constants::XLOG_XACT_OPMASK;
if info == pg_constants::XLOG_XACT_COMMIT || info == pg_constants::XLOG_XACT_COMMIT_PREPARED
{
if info == pg_constants::XLOG_XACT_COMMIT {
let mut blk = DecodedBkpBlock::new();
let blknum = xlogrec.xl_xid / pg_constants::CLOG_XACTS_PER_PAGE;
blk.tag = ObjectTag::Clog(SlruBufferTag { blknum });
trace!(
"XLOG_XACT_COMMIT xl_info {} xl_prev {:X}/{:X} xid {} updates block {} main_data_len {}",
xlogrec.xl_info, (xlogrec.xl_prev >> 32),
xlogrec.xl_prev & 0xffffffff,
xlogrec.xl_xid,
blknum,
main_data_len
);
blocks.push(blk);
}
//parse commit record to extract subtrans entries
// xl_xact_commit starts with time of commit
let _xact_time = buf.get_i64_le();
let mut xinfo = 0;
if xlogrec.xl_info & pg_constants::XLOG_XACT_HAS_INFO != 0 {
xinfo = buf.get_u32_le();
}
if xinfo & pg_constants::XACT_XINFO_HAS_DBINFO != 0 {
let _dbid = buf.get_u32_le();
let _tsid = buf.get_u32_le();
}
if xinfo & pg_constants::XACT_XINFO_HAS_SUBXACTS != 0 {
let nsubxacts = buf.get_i32_le();
let mut prev_blknum = u32::MAX;
for _i in 0..nsubxacts {
let subxact = buf.get_u32_le();
let blknum = subxact / pg_constants::CLOG_XACTS_PER_PAGE;
if prev_blknum != blknum {
prev_blknum = blknum;
let mut blk = DecodedBkpBlock::new();
blk.tag = ObjectTag::Clog(SlruBufferTag { blknum });
blocks.push(blk);
}
}
}
if xinfo & pg_constants::XACT_XINFO_HAS_RELFILENODES != 0 {
let nrels = buf.get_i32_le();
for _i in 0..nrels {
let spcnode = buf.get_u32_le();
let dbnode = buf.get_u32_le();
let relnode = buf.get_u32_le();
let mut blk = DecodedBkpBlock::new();
blk.tag = ObjectTag::RelationMetadata(RelTag {
forknum: pg_constants::MAIN_FORKNUM,
spcnode,
dbnode,
relnode,
});
blk.will_drop = true;
blocks.push(blk);
trace!(
"XLOG_XACT_COMMIT relfilenode {}/{}/{}",
spcnode,
dbnode,
relnode
);
}
}
if xinfo & pg_constants::XACT_XINFO_HAS_INVALS != 0 {
let nmsgs = buf.get_i32_le();
for _i in 0..nmsgs {
let sizeof_shared_invalidation_message = 0;
buf.advance(sizeof_shared_invalidation_message);
}
}
if xinfo & pg_constants::XACT_XINFO_HAS_TWOPHASE != 0 {
let xid = buf.get_u32_le();
let mut blk = DecodedBkpBlock::new();
let blknum = xid / pg_constants::CLOG_XACTS_PER_PAGE;
blk.tag = ObjectTag::Clog(SlruBufferTag { blknum });
blocks.push(blk);
trace!("XLOG_XACT_COMMIT-XACT_XINFO_HAS_TWOPHASE");
//TODO handle this to be able to restore pg_twophase on node start
}
} else if info == pg_constants::XLOG_XACT_ABORT
|| info == pg_constants::XLOG_XACT_ABORT_PREPARED
{
if info == pg_constants::XLOG_XACT_ABORT {
let mut blk = DecodedBkpBlock::new();
let blknum = xlogrec.xl_xid / pg_constants::CLOG_XACTS_PER_PAGE;
blk.tag = ObjectTag::Clog(SlruBufferTag { blknum });
trace!(
"XLOG_XACT_ABORT xl_info {} xl_prev {:X}/{:X} xid {} updates block {} main_data_len {}",
xlogrec.xl_info, (xlogrec.xl_prev >> 32),
xlogrec.xl_prev & 0xffffffff,
xlogrec.xl_xid,
blknum,
main_data_len
);
blocks.push(blk);
}
//parse abort record to extract subtrans entries
// xl_xact_abort starts with time of commit
let _xact_time = buf.get_i64_le();
let mut xinfo = 0;
if xlogrec.xl_info & pg_constants::XLOG_XACT_HAS_INFO != 0 {
xinfo = buf.get_u32_le();
}
if xinfo & pg_constants::XACT_XINFO_HAS_DBINFO != 0 {
let _dbid = buf.get_u32_le();
let _tsid = buf.get_u32_le();
}
if xinfo & pg_constants::XACT_XINFO_HAS_SUBXACTS != 0 {
let nsubxacts = buf.get_i32_le();
let mut prev_blknum = u32::MAX;
for _i in 0..nsubxacts {
let subxact = buf.get_u32_le();
let blknum = subxact / pg_constants::CLOG_XACTS_PER_PAGE;
if prev_blknum != blknum {
prev_blknum = blknum;
let mut blk = DecodedBkpBlock::new();
blk.tag = ObjectTag::Clog(SlruBufferTag { blknum });
blocks.push(blk);
}
}
}
if xinfo & pg_constants::XACT_XINFO_HAS_RELFILENODES != 0 {
let nrels = buf.get_i32_le();
for _i in 0..nrels {
let spcnode = buf.get_u32_le();
let dbnode = buf.get_u32_le();
let relnode = buf.get_u32_le();
let mut blk = DecodedBkpBlock::new();
blk.tag = ObjectTag::RelationMetadata(RelTag {
forknum: pg_constants::MAIN_FORKNUM,
spcnode,
dbnode,
relnode,
});
blk.will_drop = true;
blocks.push(blk);
trace!(
"XLOG_XACT_ABORT relfilenode {}/{}/{}",
spcnode,
dbnode,
relnode
);
}
}
if xinfo & pg_constants::XACT_XINFO_HAS_TWOPHASE != 0 {
let xid = buf.get_u32_le();
let mut blk = DecodedBkpBlock::new();
let blknum = xid / pg_constants::CLOG_XACTS_PER_PAGE;
blk.tag = ObjectTag::Clog(SlruBufferTag { blknum });
blocks.push(blk);
trace!("XLOG_XACT_ABORT-XACT_XINFO_HAS_TWOPHASE");
}
} else if info == pg_constants::XLOG_XACT_PREPARE {
let mut blk = DecodedBkpBlock::new();
blk.tag = ObjectTag::TwoPhase(PrepareTag {
xid: xlogrec.xl_xid,
});
blk.will_init = true;
blocks.push(blk);
debug!("Prepare transaction {}", xlogrec.xl_xid);
}
} else if xlogrec.xl_rmid == pg_constants::RM_DBASE_ID {
let info = xlogrec.xl_info & !pg_constants::XLR_INFO_MASK;
if info == pg_constants::XLOG_DBASE_CREATE {
//buf points to main_data
let db_id = buf.get_u32_le();
let tablespace_id = buf.get_u32_le();
let src_db_id = buf.get_u32_le();
let src_tablespace_id = buf.get_u32_le();
trace!(
"XLOG_DBASE_CREATE tablespace_id/db_id {}/{} src_db_id {}/{}",
tablespace_id,
db_id,
src_tablespace_id,
src_db_id
);
// in postgres it is implemented as copydir
// we need to copy all pages in page_cache
} else {
trace!("XLOG_DBASE_DROP is not handled yet");
}
} else if xlogrec.xl_rmid == pg_constants::RM_TBLSPC_ID {
let info = xlogrec.xl_info & !pg_constants::XLR_INFO_MASK;
if info == pg_constants::XLOG_TBLSPC_CREATE {
//buf points to main_data
let ts_id = buf.get_u32_le();
let ts_path = str::from_utf8(&buf).unwrap();
trace!("XLOG_TBLSPC_CREATE ts_id {} ts_path {}", ts_id, ts_path);
} else {
trace!("XLOG_TBLSPC_DROP is not handled yet");
}
} else if xlogrec.xl_rmid == pg_constants::RM_HEAP_ID {
let info = xlogrec.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
if info == pg_constants::XLOG_HEAP_INSERT {
let xlrec = XlHeapInsert::decode(&mut buf);
if (xlrec.flags
@@ -823,52 +943,96 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
| pg_constants::XLH_INSERT_ALL_FROZEN_SET))
!= 0
{
let mut blk = DecodedBkpBlock::new();
blk.forknum = pg_constants::VISIBILITYMAP_FORKNUM;
blk.blkno = blkno;
blk.rnode_spcnode = blocks[0].rnode_spcnode;
blk.rnode_dbnode = blocks[0].rnode_dbnode;
blk.rnode_relnode = blocks[0].rnode_relnode;
blocks.push(blk);
if let ObjectTag::RelationBuffer(tag0) = blocks[0].tag {
let mut blk = DecodedBkpBlock::new();
blk.tag = ObjectTag::RelationBuffer(BufferTag {
rel: RelTag {
forknum: pg_constants::VISIBILITYMAP_FORKNUM,
spcnode: tag0.rel.spcnode,
dbnode: tag0.rel.dbnode,
relnode: tag0.rel.relnode,
},
blknum: tag0.blknum / pg_constants::HEAPBLOCKS_PER_PAGE as u32,
});
blocks.push(blk);
} else {
panic!(
"Block 0 is expected to be relation buffer tag but it is {:?}",
blocks[0].tag
);
}
}
} else if info == pg_constants::XLOG_HEAP_DELETE {
let xlrec = XlHeapDelete::decode(&mut buf);
if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 {
let mut blk = DecodedBkpBlock::new();
blk.forknum = pg_constants::VISIBILITYMAP_FORKNUM;
blk.blkno = blkno;
blk.rnode_spcnode = blocks[0].rnode_spcnode;
blk.rnode_dbnode = blocks[0].rnode_dbnode;
blk.rnode_relnode = blocks[0].rnode_relnode;
blocks.push(blk);
if let ObjectTag::RelationBuffer(tag0) = blocks[0].tag {
let mut blk = DecodedBkpBlock::new();
blk.tag = ObjectTag::RelationBuffer(BufferTag {
rel: RelTag {
forknum: pg_constants::VISIBILITYMAP_FORKNUM,
spcnode: tag0.rel.spcnode,
dbnode: tag0.rel.dbnode,
relnode: tag0.rel.relnode,
},
blknum: tag0.blknum / pg_constants::HEAPBLOCKS_PER_PAGE as u32,
});
blocks.push(blk);
} else {
panic!(
"Block 0 is expected to be relation buffer tag but it is {:?}",
blocks[0].tag
);
}
}
} else if info == pg_constants::XLOG_HEAP_UPDATE
|| info == pg_constants::XLOG_HEAP_HOT_UPDATE
{
let xlrec = XlHeapUpdate::decode(&mut buf);
if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 {
let mut blk = DecodedBkpBlock::new();
blk.forknum = pg_constants::VISIBILITYMAP_FORKNUM;
blk.blkno = blkno;
blk.rnode_spcnode = blocks[0].rnode_spcnode;
blk.rnode_dbnode = blocks[0].rnode_dbnode;
blk.rnode_relnode = blocks[0].rnode_relnode;
blocks.push(blk);
if let ObjectTag::RelationBuffer(tag0) = blocks[0].tag {
let mut blk = DecodedBkpBlock::new();
blk.tag = ObjectTag::RelationBuffer(BufferTag {
rel: RelTag {
forknum: pg_constants::VISIBILITYMAP_FORKNUM,
spcnode: tag0.rel.spcnode,
dbnode: tag0.rel.dbnode,
relnode: tag0.rel.relnode,
},
blknum: tag0.blknum / pg_constants::HEAPBLOCKS_PER_PAGE as u32,
});
blocks.push(blk);
} else {
panic!(
"Block 0 is expected to be relation buffer tag but it is {:?}",
blocks[0].tag
);
}
}
if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0
&& blocks.len() > 1
{
let mut blk = DecodedBkpBlock::new();
blk.forknum = pg_constants::VISIBILITYMAP_FORKNUM;
blk.blkno = blocks[1].blkno / pg_constants::HEAPBLOCKS_PER_PAGE as u32;
blk.rnode_spcnode = blocks[1].rnode_spcnode;
blk.rnode_dbnode = blocks[1].rnode_dbnode;
blk.rnode_relnode = blocks[1].rnode_relnode;
blocks.push(blk);
if let ObjectTag::RelationBuffer(tag1) = blocks[1].tag {
let mut blk = DecodedBkpBlock::new();
blk.tag = ObjectTag::RelationBuffer(BufferTag {
rel: RelTag {
forknum: pg_constants::VISIBILITYMAP_FORKNUM,
spcnode: tag1.rel.spcnode,
dbnode: tag1.rel.dbnode,
relnode: tag1.rel.relnode,
},
blknum: tag1.blknum / pg_constants::HEAPBLOCKS_PER_PAGE as u32,
});
blocks.push(blk);
} else {
panic!(
"Block 1 is expected to be relation buffer tag but it is {:?}",
blocks[1].tag
);
}
}
}
} else if xlogrec.xl_rmid == pg_constants::RM_HEAP2_ID {
let info = xlogrec.xl_info & pg_constants::XLOG_HEAP_OPMASK;
let info = xlogrec.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
if info == pg_constants::XLOG_HEAP2_MULTI_INSERT {
let xlrec = XlHeapMultiInsert::decode(&mut buf);
if (xlrec.flags
@@ -876,20 +1040,164 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
| pg_constants::XLH_INSERT_ALL_FROZEN_SET))
!= 0
{
if let ObjectTag::RelationBuffer(tag0) = blocks[0].tag {
let mut blk = DecodedBkpBlock::new();
blk.tag = ObjectTag::RelationBuffer(BufferTag {
rel: RelTag {
forknum: pg_constants::VISIBILITYMAP_FORKNUM,
spcnode: tag0.rel.spcnode,
dbnode: tag0.rel.dbnode,
relnode: tag0.rel.relnode,
},
blknum: tag0.blknum / pg_constants::HEAPBLOCKS_PER_PAGE as u32,
});
blocks.push(blk);
} else {
panic!(
"Block 0 is expected to be relation buffer tag but it is {:?}",
blocks[0].tag
);
}
}
}
} else if xlogrec.xl_rmid == pg_constants::RM_MULTIXACT_ID {
let info = xlogrec.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
if info == pg_constants::XLOG_MULTIXACT_ZERO_OFF_PAGE {
let mut blk = DecodedBkpBlock::new();
blk.tag = ObjectTag::MultiXactOffsets(SlruBufferTag {
blknum: buf.get_u32_le(),
});
blk.will_init = true;
blocks.push(blk);
} else if info == pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE {
let mut blk = DecodedBkpBlock::new();
blk.tag = ObjectTag::MultiXactMembers(SlruBufferTag {
blknum: buf.get_u32_le(),
});
blk.will_init = true;
blocks.push(blk);
} else if info == pg_constants::XLOG_MULTIXACT_CREATE_ID {
let xlrec = XlMultiXactCreate::decode(&mut buf);
// Update offset page
let mut blk = DecodedBkpBlock::new();
let blknum = xlrec.mid / pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32;
blk.tag = ObjectTag::MultiXactOffsets(SlruBufferTag { blknum });
blocks.push(blk);
let first_mbr_blkno = xlrec.moff / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
let last_mbr_blkno =
(xlrec.moff + xlrec.nmembers - 1) / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
// The members SLRU can, in contrast to the offsets one, be filled to almost
// the full range at once. So we need to handle wraparound.
let mut blknum = first_mbr_blkno;
loop {
// Update members page
let mut blk = DecodedBkpBlock::new();
let blkno = blocks[0].blkno / pg_constants::HEAPBLOCKS_PER_PAGE as u32;
blk.forknum = pg_constants::VISIBILITYMAP_FORKNUM;
blk.blkno = blkno;
blk.rnode_spcnode = blocks[0].rnode_spcnode;
blk.rnode_dbnode = blocks[0].rnode_dbnode;
blk.rnode_relnode = blocks[0].rnode_relnode;
blk.tag = ObjectTag::MultiXactMembers(SlruBufferTag { blknum });
blocks.push(blk);
if blknum == last_mbr_blkno {
// last block inclusive
break;
}
// handle wraparound
if blknum == MAX_MBR_BLKNO {
blknum = 0;
} else {
blknum += 1;
}
}
if xlrec.mid >= checkpoint.nextMulti {
checkpoint.nextMulti = xlrec.mid + 1;
}
if xlrec.moff + xlrec.nmembers > checkpoint.nextMultiOffset {
checkpoint.nextMultiOffset = xlrec.moff + xlrec.nmembers;
}
let max_mbr_xid =
xlrec.members.iter().fold(
0u32,
|acc, mbr| {
if mbr.xid > acc {
mbr.xid
} else {
acc
}
},
);
checkpoint.update_next_xid(max_mbr_xid);
} else if info == pg_constants::XLOG_MULTIXACT_TRUNCATE_ID {
let xlrec = XlMultiXactTruncate::decode(&mut buf);
checkpoint.oldestMulti = xlrec.end_trunc_off;
checkpoint.oldestMultiDB = xlrec.oldest_multi_db;
let first_off_blkno =
xlrec.start_trunc_off / pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32;
let last_off_blkno =
xlrec.end_trunc_off / pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32;
// Delete all the segments but the last one. The last segment can still
// contain, possibly partially, valid data.
for blknum in first_off_blkno..last_off_blkno {
let mut blk = DecodedBkpBlock::new();
blk.tag = ObjectTag::MultiXactOffsets(SlruBufferTag { blknum });
blk.will_drop = true;
blocks.push(blk);
}
let first_mbr_blkno =
xlrec.start_trunc_memb / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
let last_mbr_blkno =
xlrec.end_trunc_memb / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
// The members SLRU can, in contrast to the offsets one, be filled to almost
// the full range at once. So we need to handle wraparound.
let mut blknum = first_mbr_blkno;
// Delete all the segments but the last one. The last segment can still
// contain, possibly partially, valid data.
while blknum != last_mbr_blkno {
let mut blk = DecodedBkpBlock::new();
blk.tag = ObjectTag::MultiXactMembers(SlruBufferTag { blknum });
blk.will_drop = true;
blocks.push(blk);
// handle wraparound
if blknum == MAX_MBR_BLKNO {
blknum = 0;
} else {
blknum += 1;
}
}
} else {
panic!()
}
} else if xlogrec.xl_rmid == pg_constants::RM_RELMAP_ID {
let xlrec = XlRelmapUpdate::decode(&mut buf);
let mut blk = DecodedBkpBlock::new();
blk.tag = ObjectTag::FileNodeMap(DatabaseTag {
spcnode: xlrec.tsid,
dbnode: xlrec.dbid,
});
blk.will_init = true;
blocks.push(blk);
} else if xlogrec.xl_rmid == pg_constants::RM_XLOG_ID {
let info = xlogrec.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
if info == pg_constants::XLOG_NEXTOID {
let next_oid = buf.get_u32_le();
if next_oid > checkpoint.nextOid {
checkpoint.nextOid = next_oid;
}
} else if info == pg_constants::XLOG_CHECKPOINT_ONLINE
|| info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN
{
let mut checkpoint_bytes = [0u8; SIZEOF_CHECKPOINT];
buf.copy_to_slice(&mut checkpoint_bytes);
let xlog_checkpoint = CheckPoint::decode(&checkpoint_bytes).unwrap();
trace!(
"xlog_checkpoint.oldestXid={}, checkpoint.oldestXid={}",
xlog_checkpoint.oldestXid, checkpoint.oldestXid
);
if (checkpoint.oldestXid.wrapping_sub(xlog_checkpoint.oldestXid) as i32) < 0 {
checkpoint.oldestXid = xlog_checkpoint.oldestXid;
}
}
}
DecodedWALRecord {
xl_xid: xlogrec.xl_xid,
xl_info: xlogrec.xl_info,
xl_rmid: xlogrec.xl_rmid,
record,

View File

@@ -4,8 +4,8 @@
//!
//! We keep one WAL receiver active per timeline.
use crate::object_key::*;
use crate::page_cache;
use crate::repository::*;
use crate::restore_local_repo;
use crate::waldecoder::*;
use crate::PageServerConf;
@@ -190,26 +190,17 @@ fn walreceiver_main(
waldecoder.feed_bytes(data);
while let Some((lsn, recdata)) = waldecoder.poll_decode()? {
// Save old checkpoint value to compare with it after decoding WAL record
let old_checkpoint_bytes = checkpoint.encode();
let decoded = decode_wal_record(recdata.clone());
restore_local_repo::save_decoded_record(
&mut checkpoint,
&*timeline,
&decoded,
recdata,
lsn,
)?;
let decoded = decode_wal_record(&mut checkpoint, recdata.clone());
restore_local_repo::save_decoded_record(&*timeline, &decoded, recdata, lsn)?;
last_rec_lsn = lsn;
let new_checkpoint_bytes = checkpoint.encode();
// Check if checkpoint data was updated by save_decoded_record
if new_checkpoint_bytes != old_checkpoint_bytes {
timeline.put_page_image(
ObjectTag::Checkpoint,
lsn,
new_checkpoint_bytes,
false,
)?;
}
}

View File

@@ -18,6 +18,7 @@
use byteorder::{ByteOrder, LittleEndian};
use bytes::{Buf, BufMut, Bytes, BytesMut};
use log::*;
use std::assert;
use std::cell::RefCell;
use std::fs;
use std::fs::OpenOptions;
@@ -36,10 +37,7 @@ use tokio::time::timeout;
use zenith_utils::bin_ser::BeSer;
use zenith_utils::lsn::Lsn;
use crate::object_key::*;
use crate::repository::BufferTag;
use crate::repository::WALRecord;
use crate::waldecoder::XlXactParsedRecord;
use crate::repository::{BufferTag, ObjectTag, WALRecord};
use crate::waldecoder::{MultiXactId, XlMultiXactCreate};
use crate::PageServerConf;
use postgres_ffi::nonrelfile_utils::transaction_id_set_status;
@@ -173,7 +171,6 @@ impl WalRedoManager for PostgresRedoManager {
) -> Result<Bytes, WalRedoError> {
// Create a channel where to receive the response
let (tx, rx) = mpsc::channel::<Result<Bytes, WalRedoError>>();
let request = WalRedoRequest {
tag,
lsn,
@@ -181,7 +178,6 @@ impl WalRedoManager for PostgresRedoManager {
records,
response_channel: tx,
};
self.request_tx
.lock()
.unwrap()
@@ -313,57 +309,150 @@ impl PostgresRedoManagerInternal {
} else if xlogrec.xl_rmid == pg_constants::RM_XACT_ID {
// Transaction manager stuff
let info = xlogrec.xl_info & pg_constants::XLOG_XACT_OPMASK;
let mut status = 0;
let tag_blknum = match tag {
ObjectTag::Clog(slru) => slru.blknum,
ObjectTag::TwoPhase(_) => {
assert!(info == pg_constants::XLOG_XACT_PREPARE);
trace!("Apply prepare {} record", xlogrec.xl_xid);
page.clear();
page.extend_from_slice(&buf[..]);
continue;
0 // not used by XLOG_XACT_PREPARE
}
_ => panic!("Not valid XACT object tag {:?}", tag),
};
let parsed_xact =
XlXactParsedRecord::decode(&mut buf, xlogrec.xl_xid, xlogrec.xl_info);
if parsed_xact.info == pg_constants::XLOG_XACT_COMMIT
|| parsed_xact.info == pg_constants::XLOG_XACT_COMMIT_PREPARED
if info == pg_constants::XLOG_XACT_COMMIT
|| info == pg_constants::XLOG_XACT_COMMIT_PREPARED
{
transaction_id_set_status(
parsed_xact.xid,
pg_constants::TRANSACTION_STATUS_COMMITTED,
&mut page,
);
for subxact in &parsed_xact.subxacts {
let blkno = *subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
// only update xids on the requested page
if tag_blknum == blkno {
transaction_id_set_status(
*subxact,
pg_constants::TRANSACTION_STATUS_SUB_COMMITTED,
&mut page,
);
status = pg_constants::TRANSACTION_STATUS_COMMITTED;
if info == pg_constants::XLOG_XACT_COMMIT {
// status of 2PC transaction will be set later
transaction_id_set_status(xlogrec.xl_xid, status, &mut page);
}
let _xact_time = buf.get_i64_le();
// decode xinfo
let mut xinfo = 0;
if xlogrec.xl_info & pg_constants::XLOG_XACT_HAS_INFO != 0 {
xinfo = buf.get_u32_le();
if xinfo & pg_constants::XACT_XINFO_HAS_DBINFO != 0 {
let _dbid = buf.get_u32_le();
let _tsid = buf.get_u32_le();
}
}
} else if parsed_xact.info == pg_constants::XLOG_XACT_ABORT
|| parsed_xact.info == pg_constants::XLOG_XACT_ABORT_PREPARED
{
transaction_id_set_status(
parsed_xact.xid,
pg_constants::TRANSACTION_STATUS_ABORTED,
&mut page,
);
for subxact in &parsed_xact.subxacts {
let blkno = *subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
// only update xids on the requested page
if tag_blknum == blkno {
transaction_id_set_status(
*subxact,
pg_constants::TRANSACTION_STATUS_ABORTED,
&mut page,
);
// handle subtrans
if xinfo & pg_constants::XACT_XINFO_HAS_SUBXACTS != 0 {
let nsubxacts = buf.get_i32_le();
for _i in 0..nsubxacts {
let subxact = buf.get_u32_le();
let blkno = subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
// only update xids on the requested page
if tag_blknum == blkno {
status = pg_constants::TRANSACTION_STATUS_SUB_COMMITTED;
transaction_id_set_status(subxact, status, &mut page);
}
}
}
if info == pg_constants::XLOG_XACT_COMMIT_PREPARED {
// Do not need to handle dropped relations here, just need to skip them
if xinfo & pg_constants::XACT_XINFO_HAS_RELFILENODES != 0 {
let nrels = buf.get_i32_le();
for _i in 0..nrels {
let spcnode = buf.get_u32_le();
let dbnode = buf.get_u32_le();
let relnode = buf.get_u32_le();
//TODO handle this too?
trace!(
"XLOG_XACT_COMMIT relfilenode {}/{}/{}",
spcnode,
dbnode,
relnode
);
}
}
// Skip invalidations
if xinfo & pg_constants::XACT_XINFO_HAS_INVALS != 0 {
let nmsgs = buf.get_i32_le();
for _i in 0..nmsgs {
let sizeof_shared_invalidation_message = 0;
buf.advance(sizeof_shared_invalidation_message);
}
}
// Set status of 2PC transaction
assert!((xinfo & pg_constants::XACT_XINFO_HAS_TWOPHASE) != 0);
let xid = buf.get_u32_le();
transaction_id_set_status(xid, status, &mut page);
}
} else if info == pg_constants::XLOG_XACT_ABORT
|| info == pg_constants::XLOG_XACT_ABORT_PREPARED
{
status = pg_constants::TRANSACTION_STATUS_ABORTED;
if info == pg_constants::XLOG_XACT_ABORT {
// status of 2PC transaction will be set later
transaction_id_set_status(xlogrec.xl_xid, status, &mut page);
}
//handle subtrans
let _xact_time = buf.get_i64_le();
// decode xinfo
let mut xinfo = 0;
if xlogrec.xl_info & pg_constants::XLOG_XACT_HAS_INFO != 0 {
xinfo = buf.get_u32_le();
if xinfo & pg_constants::XACT_XINFO_HAS_DBINFO != 0 {
let _dbid = buf.get_u32_le();
let _tsid = buf.get_u32_le();
}
}
// handle subtrans
if xinfo & pg_constants::XACT_XINFO_HAS_SUBXACTS != 0 {
let nsubxacts = buf.get_i32_le();
for _i in 0..nsubxacts {
let subxact = buf.get_u32_le();
let blkno = subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
// only update xids on the requested page
if tag_blknum == blkno {
status = pg_constants::TRANSACTION_STATUS_ABORTED;
transaction_id_set_status(subxact, status, &mut page);
}
}
}
if info == pg_constants::XLOG_XACT_ABORT_PREPARED {
// Do not need to handle dropped relations here, just need to skip them
if xinfo & pg_constants::XACT_XINFO_HAS_RELFILENODES != 0 {
let nrels = buf.get_i32_le();
for _i in 0..nrels {
let spcnode = buf.get_u32_le();
let dbnode = buf.get_u32_le();
let relnode = buf.get_u32_le();
//TODO handle this too?
trace!(
"XLOG_XACT_COMMIT relfilenode {}/{}/{}",
spcnode,
dbnode,
relnode
);
}
}
// Skip invalidations
if xinfo & pg_constants::XACT_XINFO_HAS_INVALS != 0 {
let nmsgs = buf.get_i32_le();
for _i in 0..nmsgs {
let sizeof_shared_invalidation_message = 0;
buf.advance(sizeof_shared_invalidation_message);
}
}
// Set status of 2PC transaction
assert!((xinfo & pg_constants::XACT_XINFO_HAS_TWOPHASE) != 0);
let xid = buf.get_u32_le();
transaction_id_set_status(xid, status, &mut page);
}
} else if info == pg_constants::XLOG_XACT_PREPARE {
trace!("Apply prepare {} record", xlogrec.xl_xid);
page.clear();
page.extend_from_slice(&buf[..]);
} else {
error!("handle_apply_request for RM_XACT_ID-{} NOT SUPPORTED YET. RETURN. lsn {} main_data_offset {}, rec.len {}",
status,
record.lsn,
record.main_data_offset, record.rec.len());
}
} else if xlogrec.xl_rmid == pg_constants::RM_MULTIXACT_ID {
// Multiexact operations
@@ -371,7 +460,7 @@ impl PostgresRedoManagerInternal {
if info == pg_constants::XLOG_MULTIXACT_ZERO_OFF_PAGE
|| info == pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE
{
// Just need to zero page
// Just need to ero page
page.copy_from_slice(&ZERO_PAGE);
} else if info == pg_constants::XLOG_MULTIXACT_CREATE_ID {
let xlrec = XlMultiXactCreate::decode(&mut buf);

View File

@@ -15,6 +15,7 @@ fn main() {
// All the needed PostgreSQL headers are included from 'pg_control_ffi.h'
//
.header("pg_control_ffi.h")
.header("xlog_ffi.h")
//
// Tell cargo to invalidate the built crate whenever any of the
// included header files changed.

View File

@@ -67,7 +67,6 @@ pub const SLRU_SEG_SIZE: usize = BLCKSZ as usize * SLRU_PAGES_PER_SEGMENT as usi
/* mask for filtering opcodes out of xl_info */
pub const XLOG_XACT_OPMASK: u8 = 0x70;
pub const XLOG_HEAP_OPMASK: u8 = 0x70;
/* does this record have a 'xinfo' field or not */
pub const XLOG_XACT_HAS_INFO: u8 = 0x80;

View File

@@ -390,13 +390,6 @@ impl CheckPoint {
}
}
//
// Generate new WAL segment with single XLOG_CHECKPOINT_SHUTDOWN record.
// We need this segment to start compute node.
// In order to minimize changes in Postgres core, we prefer to
// provide a WAL segment from which it can extract the checkpoint record in the standard way,
// rather than implement some alternative mechanism.
//
pub fn generate_wal_segment(pg_control: &ControlFileData) -> Bytes {
let mut seg_buf = BytesMut::with_capacity(pg_constants::WAL_SEGMENT_SIZE as usize);

3
postgres_ffi/xlog_ffi.h Normal file
View File

@@ -0,0 +1,3 @@
#include "c.h"
#include "access/xlog_internal.h"
#include "access/xlogrecord.h"

View File

@@ -1,97 +0,0 @@
from contextlib import closing
import psycopg2.extras
pytest_plugins = ("fixtures.zenith_fixtures")
#
# Test Garbage Collection of old page versions.
#
# This test is pretty tightly coupled with the current implementation of page version storage
# and garbage collection in object_repository.rs.
#
def test_gc(zenith_cli, pageserver, postgres, pg_bin):
zenith_cli.run(["branch", "test_gc", "empty"])
pg = postgres.create_start('test_gc')
with closing(pg.connect()) as conn:
with conn.cursor() as cur:
with closing(pageserver.connect()) as psconn:
with psconn.cursor(cursor_factory = psycopg2.extras.DictCursor) as pscur:
# Get the timeline ID of our branch. We need it for the 'do_gc' command
cur.execute("SHOW zenith.zenith_timeline")
timeline = cur.fetchone()[0]
# Create a test table
cur.execute("CREATE TABLE foo(x integer)")
# Run GC, to clear out any old page versions left behind in the catalogs by
# the CREATE TABLE command. We want to have a clean slate with no garbage
# before running the actual tests below, otherwise the counts won't match
# what we expect.
print("Running GC before test")
pscur.execute(f"do_gc {timeline} 0")
row = pscur.fetchone()
print("GC duration {elapsed} ms, relations: {n_relations}, dropped {dropped}, truncated: {truncated}, deleted: {deleted}".format_map(row))
# remember the number of relations
n_relations = row['n_relations']
assert n_relations > 0
# Insert a row. The first insert will also create a metadata entry for the
# relation, with size == 1 block. Hence, bump up the expected relation count.
n_relations += 1;
print("Inserting one row and running GC")
cur.execute("INSERT INTO foo VALUES (1)")
pscur.execute(f"do_gc {timeline} 0")
row = pscur.fetchone()
print("GC duration {elapsed} ms, relations: {n_relations}, dropped {dropped}, truncated: {truncated}, deleted: {deleted}".format_map(row))
assert row['n_relations'] == n_relations
assert row['dropped'] == 0
assert row['truncated'] == 30
assert row['deleted'] == 3
# Insert two more rows and run GC.
print("Inserting two more rows and running GC")
cur.execute("INSERT INTO foo VALUES (2)")
cur.execute("INSERT INTO foo VALUES (3)")
pscur.execute(f"do_gc {timeline} 0")
row = pscur.fetchone()
print("GC duration {elapsed} ms, relations: {n_relations}, dropped {dropped}, truncated: {truncated}, deleted: {deleted}".format_map(row))
assert row['n_relations'] == n_relations
assert row['dropped'] == 0
assert row['truncated'] == 30
assert row['deleted'] == 2
# Insert one more row. It creates one more page version, but doesn't affect the
# relation size.
print("Inserting one more row")
cur.execute("INSERT INTO foo VALUES (3)")
pscur.execute(f"do_gc {timeline} 0")
row = pscur.fetchone()
print("GC duration {elapsed} ms, relations: {n_relations}, dropped {dropped}, truncated: {truncated}, deleted: {deleted}".format_map(row))
assert row['n_relations'] == n_relations
assert row['dropped'] == 0
assert row['truncated'] == 30
assert row['deleted'] == 1
# Run GC again, with no changes in the database. Should not remove anything.
pscur.execute(f"do_gc {timeline} 0")
row = pscur.fetchone()
print("GC duration {elapsed} ms, relations: {n_relations}, dropped {dropped}, truncated: {truncated}, deleted: {deleted}".format_map(row))
assert row['n_relations'] == n_relations
assert row['dropped'] == 0
assert row['truncated'] == 30
assert row['deleted'] == 0
#
# Test DROP TABLE checks that relation data and metadata was deleted by GC from object storage
#
cur.execute("DROP TABLE foo")
pscur.execute(f"do_gc {timeline} 0")
row = pscur.fetchone()
print("GC duration {elapsed} ms, relations: {n_relations}, dropped {dropped}, truncated: {truncated}, deleted: {deleted}".format_map(row))
# Each relation fork is counted separately, hence 3.
assert row['dropped'] == 3

View File

@@ -472,11 +472,11 @@ class WalAcceptor:
cmd = [self.wa_binpath]
cmd.extend(["-D", self.data_dir])
cmd.extend(["-l", "localhost:{}".format(self.port)])
cmd.extend(["-l", "127.0.0.1:{}".format(self.port)])
cmd.append("--daemonize")
cmd.append("--no-sync")
# Tell page server it can receive WAL from this WAL safekeeper
cmd.extend(["--pageserver", "localhost:{}".format(DEFAULT_PAGESERVER_PORT)])
cmd.extend(["--pageserver", "127.0.0.1:{}".format(DEFAULT_PAGESERVER_PORT)])
cmd.extend(["--recall", "1 second"])
print('Running command "{}"'.format(' '.join(cmd)))
subprocess.run(cmd, check=True)
@@ -543,7 +543,7 @@ class WalAcceptorFactory:
def get_connstrs(self) -> str:
""" Get list of wal acceptor endpoints suitable for wal_acceptors GUC """
return ','.join(["localhost:{}".format(wa.port) for wa in self.instances])
return ','.join(["127.0.0.1:{}".format(wa.port) for wa in self.instances])
@zenfixture

View File

@@ -8,6 +8,7 @@ use log::*;
use parse_duration::parse;
use slog::Drain;
use std::io;
use std::net::ToSocketAddrs;
use std::path::{Path, PathBuf};
use std::thread;
use std::time::Duration;
@@ -73,7 +74,7 @@ fn main() -> Result<()> {
daemonize: false,
no_sync: false,
pageserver_addr: None,
listen_addr: "localhost:5454".to_string(),
listen_addr: "127.0.0.1:5454".parse()?,
ttl: None,
recall_period: None,
};
@@ -94,11 +95,17 @@ fn main() -> Result<()> {
}
if let Some(addr) = arg_matches.value_of("listen") {
conf.listen_addr = addr.to_owned();
// TODO: keep addr vector in config and listen them all
// XXX: with our callmemaybe approach we need to set 'advertised address'
// as it is not always possible to listen it. Another reason to ditch callmemaybe.
let addrs: Vec<_> = addr.to_socket_addrs().unwrap().collect();
conf.listen_addr = addrs[0];
}
if let Some(addr) = arg_matches.value_of("pageserver") {
conf.pageserver_addr = Some(addr.to_owned());
// TODO: keep addr vector in config and check them all while connecting
let addrs: Vec<_> = addr.to_socket_addrs().unwrap().collect();
conf.pageserver_addr = Some(addrs[0]);
}
if let Some(ttl) = arg_matches.value_of("ttl") {
@@ -188,19 +195,12 @@ fn init_logging(
if conf.daemonize {
let decorator = slog_term::PlainSyncDecorator::new(log_file);
let drain = slog_term::CompactFormat::new(decorator).build();
let drain = slog::Filter::new(drain, |record: &slog::Record| {
record.level().is_at_least(slog::Level::Info)
});
let drain = std::sync::Mutex::new(drain).fuse();
let logger = slog::Logger::root(drain, slog::o!());
Ok(slog_scope::set_global_logger(logger))
} else {
let decorator = slog_term::TermDecorator::new().build();
let drain = slog_term::FullFormat::new(decorator).build().fuse();
let drain = slog::Filter::new(drain, |record: &slog::Record| {
record.level().is_at_least(slog::Level::Info)
})
.fuse();
let drain = slog_async::Async::new(drain).chan_size(1000).build().fuse();
let logger = slog::Logger::root(drain, slog::o!());
Ok(slog_scope::set_global_logger(logger))

View File

@@ -1,4 +1,5 @@
//
use std::net::SocketAddr;
use std::path::PathBuf;
use std::time::Duration;
@@ -14,8 +15,8 @@ pub struct WalAcceptorConf {
pub data_dir: PathBuf,
pub daemonize: bool,
pub no_sync: bool,
pub listen_addr: String,
pub pageserver_addr: Option<String>,
pub listen_addr: SocketAddr,
pub pageserver_addr: Option<SocketAddr>,
pub ttl: Option<Duration>,
pub recall_period: Option<Duration>,
}

View File

@@ -4,9 +4,8 @@
use anyhow::{bail, Result};
use log::*;
use postgres::{Client, Config, NoTls};
use postgres::{Client, NoTls};
use serde::{Deserialize, Serialize};
use zenith_utils::connstring::connection_host_port;
use std::cmp::{max, min};
use std::fs::{self, File, OpenOptions};
use std::io::{BufReader, Read, Seek, SeekFrom, Write};
@@ -150,18 +149,19 @@ pub struct ReceiveWalConn {
/// If pageserver already has replication channel, it will just ignore this request
///
fn request_callback(conf: WalAcceptorConf, timelineid: ZTimelineId) {
let ps_addr = conf.pageserver_addr.unwrap();
let ps_connstr = format!("postgresql://no_user@{}/no_db", ps_addr);
// use Config parsing because SockAddr parsing doesnt allow to use host names instead of ip addresses
let me_connstr = format!("postgresql://no_user@{}/no_db", conf.listen_addr);
let me_conf: Config = me_connstr.parse().unwrap();
let (host, port) = connection_host_port(&me_conf);
let addr = conf.pageserver_addr.unwrap();
let ps_connstr = format!(
"host={} port={} dbname={} user={}",
addr.ip(),
addr.port(),
"no_db",
"no_user",
);
let callme = format!(
"callmemaybe {} host={} port={} options='-c ztimelineid={}'",
timelineid,
host,
port,
conf.listen_addr.ip(),
conf.listen_addr.port(),
timelineid
);
loop {
@@ -241,6 +241,9 @@ impl ReceiveWalConn {
my_info.server = server_info.clone();
my_info.server.node_id = node_id;
/* Need to save incomplete my_info in timeline to provide wal_seg_size for find_end_of_wal */
self.timeline.get().set_info(&my_info);
/* Calculate WAL end based on local data */
let (flush_lsn, timeline) = self.timeline.find_end_of_wal(&self.conf.data_dir, true);
my_info.flush_lsn = flush_lsn;

View File

@@ -63,11 +63,15 @@ impl ReplicationConn {
let feedback = HotStandbyFeedback::des(&m)?;
timeline.add_hs_feedback(feedback)
}
msg => {
None => {
break;
}
Some(msg) => {
info!("unexpected message {:?}", msg);
}
}
}
Err(anyhow!("Connection closed"))
}
/// Helper function that parses a pair of LSNs.

View File

@@ -232,6 +232,7 @@ impl TimelineTools for Option<Arc<Timeline>> {
/// Find last WAL record. If "precise" is false then just locate last partial segment
fn find_end_of_wal(&self, data_dir: &Path, precise: bool) -> (Lsn, TimeLineID) {
let seg_size = self.get().get_info().server.wal_seg_size as usize;
assert!(seg_size > 0);
let (lsn, timeline) = find_end_of_wal(data_dir, seg_size, precise);
(Lsn(lsn), timeline)
}

View File

@@ -16,7 +16,7 @@ use zenith_utils::postgres_backend::PostgresBackend;
/// Accept incoming TCP connections and spawn them into a background thread.
pub fn thread_main(conf: WalAcceptorConf) -> Result<()> {
info!("Starting wal acceptor on {}", conf.listen_addr);
let listener = TcpListener::bind(conf.listen_addr.clone()).map_err(|e| {
let listener = TcpListener::bind(conf.listen_addr).map_err(|e| {
error!("failed to bind to address {}: {}", conf.listen_addr, e);
e
})?;

View File

@@ -12,7 +12,6 @@ log = "0.4.14"
serde = { version = "1.0", features = ["derive"] }
bincode = "1.3"
thiserror = "1.0"
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
workspace_hack = { path = "../workspace_hack" }
[dev-dependencies]

View File

@@ -1,34 +0,0 @@
use postgres::Config;
pub fn connection_host_port(config: &Config) -> (String, u16) {
assert_eq!(config.get_hosts().len(), 1, "only one pair of host and port is supported in connection string");
assert_eq!(config.get_ports().len(), 1, "only one pair of host and port is supported in connection string");
let host = match &config.get_hosts()[0] {
postgres::config::Host::Tcp(host) => host.as_ref(),
postgres::config::Host::Unix(host) => host.to_str().unwrap(),
};
(host.to_owned(), config.get_ports()[0])
}
pub fn connection_address(config: &Config) -> String {
let (host, port) = connection_host_port(config);
format!("{}:{}", host, port)
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_connection_host_port() {
let config: Config = "postgresql://no_user@localhost:64000/no_db".parse().unwrap();
assert_eq!(connection_host_port(&config), ("localhost".to_owned(), 64000));
}
#[test]
#[should_panic(expected = "only one pair of host and port is supported in connection string")]
fn test_connection_host_port_multiple_ports() {
let config: Config = "postgresql://no_user@localhost:64000,localhost:64001/no_db".parse().unwrap();
assert_eq!(connection_host_port(&config), ("localhost".to_owned(), 64000));
}
}

View File

@@ -13,6 +13,3 @@ pub mod bin_ser;
pub mod postgres_backend;
pub mod pq_proto;
// dealing with connstring parsing and handy access to it's parts
pub mod connstring;