pageserver - walredo nits

pageserver - walredo request serialization module
pageserver - reduce # of copies in walredo request
2026-08-03 03:10:38 +00:00 · 2021-10-05 23:47:37 -07:00 · 2021-10-05 23:45:20 -07:00 · 2021-10-05 23:36:02 -07:00 · 2021-10-05 23:33:47 -07:00 · 2021-10-05 23:26:01 -07:00
34 changed files with 671 additions and 1194 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -24,12 +24,6 @@ jobs:
  # A job to build postgres
  build-postgres:
    executor: zenith-build-executor
-    parameters:
-      build_type:
-        type: enum
-        enum: ["debug", "release"]
-    environment:
-      BUILD_TYPE: << parameters.build_type >>
    steps:
        # Checkout the git repo (circleci doesn't have a flag to enable submodules here)
      - checkout
@@ -45,7 +39,7 @@ jobs:
          name: Restore postgres cache
          keys:
            # Restore ONLY if the rev key matches exactly
-            - v04-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }}
+            - v03-postgres-cache-{{ checksum "/tmp/cache-key-postgres" }}

        # FIXME We could cache our own docker container, instead of installing packages every time.
      - run:
@@ -65,12 +59,12 @@ jobs:
            if [ ! -e tmp_install/bin/postgres ]; then
              # "depth 1" saves some time by not cloning the whole repo
              git submodule update --init --depth 1
-              make postgres -j8
+              make postgres
            fi

      - save_cache:
          name: Save postgres cache
-          key: v04-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }}
+          key: v03-postgres-cache-{{ checksum "/tmp/cache-key-postgres" }}
          paths:
            - tmp_install

@@ -102,7 +96,7 @@ jobs:
          name: Restore postgres cache
          keys:
            # Restore ONLY if the rev key matches exactly
-            - v04-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }}
+            - v03-postgres-cache-{{ checksum "/tmp/cache-key-postgres" }}

      - restore_cache:
          name: Restore rust cache
@@ -334,18 +328,14 @@ workflows:
  build_and_test:
    jobs:
      - check-codestyle
-      - build-postgres:
-          name: build-postgres-<< matrix.build_type >>
-          matrix:
-            parameters:
-              build_type: ["debug", "release"]
+      - build-postgres
      - build-zenith:
          name: build-zenith-<< matrix.build_type >>
          matrix:
            parameters:
              build_type: ["debug", "release"]
          requires:
-            - build-postgres-<< matrix.build_type >>
+            - build-postgres
      - run-pytest:
          name: pg_regress-tests-<< matrix.build_type >>
          matrix:
--- a/.dockerignore
+++ b/.dockerignore
@@ -11,8 +11,4 @@ test_output
 .vscode
 .zenith
 integration_tests/.zenith
-.mypy_cache
-
-Dockerfile
-.dockerignore
-
+.mypy_cache
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2339,7 +2339,6 @@ dependencies = [
 "byteorder",
 "bytes",
 "clap",
- "const_format",
 "crc32c",
 "daemonize",
 "fs2",
@@ -2359,7 +2358,6 @@ dependencies = [
 "tokio-stream",
 "walkdir",
 "workspace_hack",
- "zenith_metrics",
 "zenith_utils",
 ]

--- a/1
+++ b/1
@@ -10,7 +10,6 @@ FROM zenithdb/build:buster AS pg-build
 WORKDIR /zenith
 COPY ./vendor/postgres vendor/postgres
 COPY ./Makefile Makefile
-ENV BUILD_TYPE release
 RUN make -j $(getconf _NPROCESSORS_ONLN) -s postgres
 RUN rm -rf postgres_install/build

--- a/18
+++ b/18
@@ -6,18 +6,6 @@ else
 	SECCOMP =
 endif

-#
-# We differentiate between release / debug build types using the BUILD_TYPE
-# environment variable.
-#
-ifeq ($(BUILD_TYPE),release)
-	PG_CONFIGURE_OPTS = --enable-debug
-	PG_CFLAGS = -O2 -g3 ${CFLAGS}
-else
-	PG_CONFIGURE_OPTS = --enable-debug --enable-cassert --enable-depend
-	PG_CFLAGS = -O0 -g3 ${CFLAGS}
-endif
-
 #
 # Top level Makefile to build Zenith and PostgreSQL
 #
@@ -42,8 +30,10 @@ tmp_install/build/config.status:
 	+@echo "Configuring postgres build"
 	mkdir -p tmp_install/build
 	(cd tmp_install/build && \
-	../../vendor/postgres/configure CFLAGS='$(PG_CFLAGS)' \
-		$(PG_CONFIGURE_OPTS) \
+	../../vendor/postgres/configure CFLAGS='-O0 -g3 $(CFLAGS)' \
+		--enable-cassert \
+		--enable-debug \
+		--enable-depend \
 		$(SECCOMP) \
 		--prefix=$(abspath tmp_install) > configure.log)

--- a/pageserver/src/branches.rs
+++ b/pageserver/src/branches.rs
@@ -419,6 +419,7 @@ fn create_timeline(
    let timelinedir = conf.timeline_path(&timelineid, tenantid);

    fs::create_dir(&timelinedir)?;
+    fs::create_dir(&timelinedir.join("wal"))?;

    if let Some(ancestor) = ancestor {
        let data = format!("{}@{}", ancestor.timelineid, ancestor.lsn);
--- a/pageserver/src/layered_repository.rs
+++ b/pageserver/src/layered_repository.rs
@@ -23,7 +23,6 @@ use std::collections::hash_map::Entry;
 use std::collections::HashMap;
 use std::collections::{BTreeSet, HashSet};
 use std::convert::TryInto;
-use std::fs;
 use std::fs::{File, OpenOptions};
 use std::io::Write;
 use std::ops::Bound::Included;
@@ -31,6 +30,7 @@ use std::path::{Path, PathBuf};
 use std::sync::atomic::{AtomicUsize, Ordering};
 use std::sync::{Arc, Mutex, MutexGuard};
 use std::time::{Duration, Instant};
+use std::{fs, thread};

 use crate::layered_repository::inmemory_layer::FreezeLayers;
 use crate::relish::*;
@@ -57,7 +57,6 @@ mod image_layer;
 mod inmemory_layer;
 mod interval_tree;
 mod layer_map;
-mod page_versions;
 mod storage_layer;

 use delta_layer::DeltaLayer;
@@ -258,6 +257,19 @@ impl LayeredRepository {
                timeline.init_current_logical_size()?;

                let timeline = Arc::new(timeline);
+
+                // Load any new WAL after the last checkpoint into memory.
+                info!(
+                    "Loading WAL for timeline {} starting at {}",
+                    timelineid,
+                    timeline.get_last_record_lsn()
+                );
+
+                if cfg!(debug_assertions) {
+                    // check again after wal loading
+                    Self::assert_size_calculation_matches_offloaded(Arc::clone(&timeline));
+                }
+
                timelines.insert(timelineid, timeline.clone());
                Ok(timeline)
            }
@@ -523,6 +535,24 @@ impl LayeredRepository {
        totals.elapsed = now.elapsed();
        Ok(totals)
    }
+
+    fn assert_size_calculation_matches(incremental: usize, timeline: &LayeredTimeline) {
+        match timeline.get_current_logical_size_non_incremental(timeline.get_last_record_lsn()) {
+            Ok(non_incremental) => {
+                if incremental != non_incremental {
+                    error!("timeline size calculation diverged, incremental doesn't match non incremental. incremental={} non_incremental={}", incremental, non_incremental);
+                }
+            }
+            Err(e) => error!("failed to calculate non incremental timeline size: {:#}", e),
+        }
+    }
+
+    fn assert_size_calculation_matches_offloaded(timeline: Arc<LayeredTimeline>) {
+        let incremental = timeline.get_current_logical_size();
+        thread::spawn(move || {
+            Self::assert_size_calculation_matches(incremental, &timeline);
+        });
+    }
 }

 /// Metadata stored on disk for each timeline
@@ -1031,7 +1061,7 @@ impl LayeredTimeline {
            self.timelineid
        );
        let mut layers = self.layers.lock().unwrap();
-        let (imgfilenames, deltafilenames) =
+        let (imgfilenames, mut deltafilenames) =
            filename::list_files(self.conf, self.timelineid, self.tenantid)?;

        let timeline_path = self.conf.timeline_path(&self.timelineid, &self.tenantid);
@@ -1059,7 +1089,11 @@ impl LayeredTimeline {
            layers.insert_historic(Arc::new(layer));
        }

-        // Then for the Delta files.
+        // Then for the Delta files. The delta files are created in order starting
+        // from the oldest file, because each DeltaLayer needs a reference to its
+        // predecessor.
+        deltafilenames.sort();
+
        for filename in deltafilenames.iter() {
            ensure!(filename.start_lsn < filename.end_lsn);
            if filename.end_lsn > disk_consistent_lsn {
@@ -1072,12 +1106,27 @@ impl LayeredTimeline {
                continue;
            }

-            let layer = DeltaLayer::new(self.conf, self.timelineid, self.tenantid, filename);
+            let predecessor = layers.get(&filename.seg, filename.start_lsn);
+
+            let predecessor_str: String = if let Some(prec) = &predecessor {
+                prec.filename().display().to_string()
+            } else {
+                "none".to_string()
+            };
+
+            let layer = DeltaLayer::new(
+                self.conf,
+                self.timelineid,
+                self.tenantid,
+                filename,
+                predecessor,
+            );

            info!(
-                "found layer {} on timeline {}",
+                "found layer {} on timeline {}, predecessor: {}",
                layer.filename().display(),
                self.timelineid,
+                predecessor_str,
            );
            layers.insert_historic(Arc::new(layer));
        }
@@ -1252,7 +1301,7 @@ impl LayeredTimeline {
            let start_lsn;
            if prev_layer.get_timeline_id() != self.timelineid {
                // First modification on this timeline
-                start_lsn = self.ancestor_lsn + 1;
+                start_lsn = self.ancestor_lsn;
                trace!(
                    "creating layer for write for {} at branch point {}/{}",
                    seg,
@@ -1404,6 +1453,19 @@ impl LayeredTimeline {
            // Finally, replace the frozen in-memory layer with the new on-disk layers
            layers.remove_historic(frozen.clone());

+            // If we created a successor InMemoryLayer, its predecessor is
+            // currently the frozen layer. We need to update the predecessor
+            // to be the latest on-disk layer.
+            if let Some(last_historic) = new_historics.last() {
+                if let Some(new_open) = &maybe_new_open {
+                    let maybe_old_predecessor =
+                        new_open.update_predecessor(Arc::clone(last_historic));
+                    let old_predecessor = maybe_old_predecessor
+                        .expect("new_open should always be a successor to frozen");
+                    assert!(layer_ptr_eq(frozen.as_ref(), old_predecessor.as_ref()));
+                }
+            }
+
            // Add the historics to the LayerMap
            for n in new_historics {
                layers.insert_historic(n);
@@ -1710,20 +1772,12 @@ impl LayeredTimeline {
        loop {
            match layer_ref.get_page_reconstruct_data(blknum, curr_lsn, &mut data)? {
                PageReconstructResult::Complete => break,
-                PageReconstructResult::Continue(cont_lsn) => {
+                PageReconstructResult::Continue(cont_lsn, cont_layer) => {
                    // Fetch base image / more WAL from the returned predecessor layer
-                    if let Some((cont_layer, cont_lsn)) = self.get_layer_for_read(seg, cont_lsn)? {
-                        layer_arc = cont_layer;
-                        layer_ref = &*layer_arc;
-                        curr_lsn = cont_lsn;
-                        continue;
-                    } else {
-                        bail!(
-                            "could not find predecessor layer of segment {} at {}",
-                            seg.rel,
-                            cont_lsn
-                        );
-                    }
+                    layer_arc = cont_layer;
+                    layer_ref = &*layer_arc;
+                    curr_lsn = cont_lsn;
+                    continue;
                }
                PageReconstructResult::Missing(lsn) => {
                    // Oops, we could not reconstruct the page.
@@ -1896,6 +1950,17 @@ pub fn dump_layerfile_from_path(path: &Path) -> Result<()> {
    Ok(())
 }

+/// Check for equality of Layer memory addresses
+fn layer_ptr_eq(l1: &dyn Layer, l2: &dyn Layer) -> bool {
+    let l1_ptr = l1 as *const dyn Layer;
+    let l2_ptr = l2 as *const dyn Layer;
+    // comparing *const dyn Layer will not only check for data address equality,
+    // but also for vtable address equality.
+    // to avoid this, we compare *const ().
+    // see here for more https://github.com/rust-lang/rust/issues/46139
+    std::ptr::eq(l1_ptr as *const (), l2_ptr as *const ())
+}
+
 /// Add a suffix to a layer file's name: .{num}.old
 /// Uses the first available num (starts at 0)
 fn rename_to_backup(path: PathBuf) -> anyhow::Result<()> {
--- a/pageserver/src/layered_repository/delta_layer.rs
+++ b/pageserver/src/layered_repository/delta_layer.rs
@@ -45,10 +45,10 @@ use crate::layered_repository::storage_layer::{
 use crate::waldecoder;
 use crate::PageServerConf;
 use crate::{ZTenantId, ZTimelineId};
-use anyhow::{bail, ensure, Result};
+use anyhow::{bail, Result};
 use log::*;
 use serde::{Deserialize, Serialize};
-use zenith_utils::vec_map::VecMap;
+use std::collections::BTreeMap;
 // avoid binding to Write (conflicts with std::io::Write)
 // while being able to use std::fmt::Write's methods
 use std::fmt::Write as _;
@@ -57,7 +57,7 @@ use std::fs::File;
 use std::io::{BufWriter, Write};
 use std::ops::Bound::Included;
 use std::path::{Path, PathBuf};
-use std::sync::{Mutex, MutexGuard};
+use std::sync::{Arc, Mutex, MutexGuard};

 use bookfile::{Book, BookWriter};

@@ -65,7 +65,6 @@ use zenith_utils::bin_ser::BeSer;
 use zenith_utils::lsn::Lsn;

 use super::blob::{read_blob, BlobRange};
-use super::page_versions::OrderedBlockIter;

 // Magic constant to identify a Zenith delta file
 pub const DELTA_FILE_MAGIC: u32 = 0x5A616E01;
@@ -132,6 +131,9 @@ pub struct DeltaLayer {

    dropped: bool,

+    /// Predecessor layer
+    predecessor: Option<Arc<dyn Layer>>,
+
    inner: Mutex<DeltaLayerInner>,
 }

@@ -142,10 +144,10 @@ pub struct DeltaLayerInner {

    /// All versions of all pages in the file are are kept here.
    /// Indexed by block number and LSN.
-    page_version_metas: VecMap<(u32, Lsn), BlobRange>,
+    page_version_metas: BTreeMap<(u32, Lsn), BlobRange>,

    /// `relsizes` tracks the size of the relation at different points in time.
-    relsizes: VecMap<Lsn, u32>,
+    relsizes: BTreeMap<Lsn, u32>,
 }

 impl Layer for DeltaLayer {
@@ -216,12 +218,10 @@ impl Layer for DeltaLayer {
            // Scan the metadata BTreeMap backwards, starting from the given entry.
            let minkey = (blknum, Lsn(0));
            let maxkey = (blknum, lsn);
-            let iter = inner
+            let mut iter = inner
                .page_version_metas
-                .slice_range((Included(&minkey), Included(&maxkey)))
-                .iter()
-                .rev();
-            for ((_blknum, _lsn), blob_range) in iter {
+                .range((Included(&minkey), Included(&maxkey)));
+            while let Some(((_blknum, _entry_lsn), blob_range)) = iter.next_back() {
                let pv = PageVersion::des(&read_blob(&page_version_reader, blob_range)?)?;

                if let Some(img) = pv.page_image {
@@ -247,9 +247,16 @@ impl Layer for DeltaLayer {
        }

        // If an older page image is needed to reconstruct the page, let the
-        // caller know.
+        // caller know about the predecessor layer.
        if need_image {
-            Ok(PageReconstructResult::Continue(self.start_lsn))
+            if let Some(cont_layer) = &self.predecessor {
+                Ok(PageReconstructResult::Continue(
+                    self.start_lsn,
+                    Arc::clone(cont_layer),
+                ))
+            } else {
+                Ok(PageReconstructResult::Missing(self.start_lsn))
+            }
        } else {
            Ok(PageReconstructResult::Complete)
        }
@@ -258,22 +265,21 @@ impl Layer for DeltaLayer {
    /// Get size of the relation at given LSN
    fn get_seg_size(&self, lsn: Lsn) -> Result<u32> {
        assert!(lsn >= self.start_lsn);
-        ensure!(
-            self.seg.rel.is_blocky(),
-            "get_seg_size() called on a non-blocky rel"
-        );

        // Scan the BTreeMap backwards, starting from the given entry.
        let inner = self.load()?;
-        let slice = inner
-            .relsizes
-            .slice_range((Included(&Lsn(0)), Included(&lsn)));
+        let mut iter = inner.relsizes.range((Included(&Lsn(0)), Included(&lsn)));

-        if let Some((_entry_lsn, entry)) = slice.last() {
-            Ok(*entry)
+        let result;
+        if let Some((_entry_lsn, entry)) = iter.next_back() {
+            result = *entry;
+        // Use the base image if needed
+        } else if let Some(predecessor) = &self.predecessor {
+            result = predecessor.get_seg_size(lsn)?;
        } else {
-            Err(anyhow::anyhow!("could not find seg size in delta layer"))
+            result = 0;
        }
+        Ok(result)
    }

    /// Does this segment exist at given LSN?
@@ -293,8 +299,8 @@ impl Layer for DeltaLayer {
    ///
    fn unload(&self) -> Result<()> {
        let mut inner = self.inner.lock().unwrap();
-        inner.page_version_metas = VecMap::default();
-        inner.relsizes = VecMap::default();
+        inner.page_version_metas = BTreeMap::new();
+        inner.relsizes = BTreeMap::new();
        inner.loaded = false;
        Ok(())
    }
@@ -320,13 +326,13 @@ impl Layer for DeltaLayer {

        println!("--- relsizes ---");
        let inner = self.load()?;
-        for (k, v) in inner.relsizes.as_slice() {
+        for (k, v) in inner.relsizes.iter() {
            println!("  {}: {}", k, v);
        }
        println!("--- page versions ---");
        let (_path, book) = self.open_book()?;
        let chapter = book.chapter_reader(PAGE_VERSIONS_CHAPTER)?;
-        for ((blk, lsn), blob_range) in inner.page_version_metas.as_slice() {
+        for ((blk, lsn), blob_range) in inner.page_version_metas.iter() {
            let mut desc = String::new();

            let buf = read_blob(&chapter, blob_range)?;
@@ -375,7 +381,7 @@ impl DeltaLayer {
    /// data structure with two btreemaps as we do, so passing the btreemaps is currently
    /// expedient.
    #[allow(clippy::too_many_arguments)]
-    pub fn create(
+    pub fn create<'a>(
        conf: &'static PageServerConf,
        timelineid: ZTimelineId,
        tenantid: ZTenantId,
@@ -383,13 +389,10 @@ impl DeltaLayer {
        start_lsn: Lsn,
        end_lsn: Lsn,
        dropped: bool,
-        page_versions: OrderedBlockIter,
-        relsizes: VecMap<Lsn, u32>,
+        predecessor: Option<Arc<dyn Layer>>,
+        page_versions: impl Iterator<Item = (&'a (u32, Lsn), &'a PageVersion)>,
+        relsizes: BTreeMap<Lsn, u32>,
    ) -> Result<DeltaLayer> {
-        if seg.rel.is_blocky() {
-            assert!(!relsizes.is_empty());
-        }
-
        let delta_layer = DeltaLayer {
            path_or_conf: PathOrConf::Conf(conf),
            timelineid,
@@ -400,9 +403,10 @@ impl DeltaLayer {
            dropped,
            inner: Mutex::new(DeltaLayerInner {
                loaded: true,
-                page_version_metas: VecMap::default(),
+                page_version_metas: BTreeMap::new(),
                relsizes,
            }),
+            predecessor,
        };
        let mut inner = delta_layer.inner.lock().unwrap();

@@ -419,33 +423,26 @@ impl DeltaLayer {

        let mut page_version_writer = BlobWriter::new(book, PAGE_VERSIONS_CHAPTER);

-        for (blknum, history) in page_versions {
-            for (lsn, page_version) in history.as_slice() {
-                if lsn >= &end_lsn {
-                    continue;
-                }
+        for (key, page_version) in page_versions {
+            let buf = PageVersion::ser(page_version)?;
+            let blob_range = page_version_writer.write_blob(&buf)?;

-                let buf = PageVersion::ser(page_version)?;
-                let blob_range = page_version_writer.write_blob(&buf)?;
+            let old = inner.page_version_metas.insert(*key, blob_range);

-                inner
-                    .page_version_metas
-                    .append((blknum, *lsn), blob_range)
-                    .unwrap();
-            }
+            assert!(old.is_none());
        }

        let book = page_version_writer.close()?;

        // Write out page versions
        let mut chapter = book.new_chapter(PAGE_VERSION_METAS_CHAPTER);
-        let buf = VecMap::ser(&inner.page_version_metas)?;
+        let buf = BTreeMap::ser(&inner.page_version_metas)?;
        chapter.write_all(&buf)?;
        let book = chapter.close()?;

        // and relsizes to separate chapter
        let mut chapter = book.new_chapter(REL_SIZES_CHAPTER);
-        let buf = VecMap::ser(&inner.relsizes)?;
+        let buf = BTreeMap::ser(&inner.relsizes)?;
        chapter.write_all(&buf)?;
        let book = chapter.close()?;

@@ -532,10 +529,10 @@ impl DeltaLayer {
        }

        let chapter = book.read_chapter(PAGE_VERSION_METAS_CHAPTER)?;
-        let page_version_metas = VecMap::des(&chapter)?;
+        let page_version_metas = BTreeMap::des(&chapter)?;

        let chapter = book.read_chapter(REL_SIZES_CHAPTER)?;
-        let relsizes = VecMap::des(&chapter)?;
+        let relsizes = BTreeMap::des(&chapter)?;

        debug!("loaded from {}", &path.display());

@@ -554,6 +551,7 @@ impl DeltaLayer {
        timelineid: ZTimelineId,
        tenantid: ZTenantId,
        filename: &DeltaFileName,
+        predecessor: Option<Arc<dyn Layer>>,
    ) -> DeltaLayer {
        DeltaLayer {
            path_or_conf: PathOrConf::Conf(conf),
@@ -565,9 +563,10 @@ impl DeltaLayer {
            dropped: filename.dropped,
            inner: Mutex::new(DeltaLayerInner {
                loaded: false,
-                page_version_metas: VecMap::default(),
-                relsizes: VecMap::default(),
+                page_version_metas: BTreeMap::new(),
+                relsizes: BTreeMap::new(),
            }),
+            predecessor,
        }
    }

@@ -588,9 +587,10 @@ impl DeltaLayer {
            dropped: summary.dropped,
            inner: Mutex::new(DeltaLayerInner {
                loaded: false,
-                page_version_metas: VecMap::default(),
-                relsizes: VecMap::default(),
+                page_version_metas: BTreeMap::new(),
+                relsizes: BTreeMap::new(),
            }),
+            predecessor: None,
        })
    }
 }
--- a/pageserver/src/layered_repository/filename.rs
+++ b/pageserver/src/layered_repository/filename.rs
@@ -290,7 +290,11 @@ pub fn list_files(
            deltafiles.push(deltafilename);
        } else if let Some(imgfilename) = ImageFileName::from_str(fname) {
            imgfiles.push(imgfilename);
-        } else if fname == "metadata" || fname == "ancestor" || fname.ends_with(".old") {
+        } else if fname == "wal"
+            || fname == "metadata"
+            || fname == "ancestor"
+            || fname.ends_with(".old")
+        {
            // ignore these
        } else {
            warn!("unrecognized filename in timeline dir: {}", fname);
--- a/pageserver/src/layered_repository/inmemory_layer.rs
+++ b/pageserver/src/layered_repository/inmemory_layer.rs
@@ -12,19 +12,18 @@ use crate::layered_repository::{DeltaLayer, ImageLayer};
 use crate::repository::WALRecord;
 use crate::PageServerConf;
 use crate::{ZTenantId, ZTimelineId};
-use anyhow::{bail, ensure, Result};
+use anyhow::{bail, Result};
 use bytes::Bytes;
 use log::*;
 use std::cmp::min;
+use std::collections::BTreeMap;
+use std::ops::Bound::Included;
 use std::path::PathBuf;
 use std::sync::{Arc, RwLock};
-use zenith_utils::vec_map::VecMap;

 use zenith_utils::accum::Accum;
 use zenith_utils::lsn::Lsn;

-use super::page_versions::PageVersions;
-
 pub struct InMemoryLayer {
    conf: &'static PageServerConf,
    tenantid: ZTenantId,
@@ -46,9 +45,6 @@ pub struct InMemoryLayer {
    /// The above fields never change. The parts that do change are in 'inner',
    /// and protected by mutex.
    inner: RwLock<InMemoryLayerInner>,
-
-    /// Predecessor layer might be needed?
-    incremental: bool,
 }

 pub struct InMemoryLayerInner {
@@ -59,20 +55,19 @@ pub struct InMemoryLayerInner {
    /// All versions of all pages in the layer are are kept here.
    /// Indexed by block number and LSN.
    ///
-    page_versions: PageVersions,
+    page_versions: BTreeMap<(u32, Lsn), PageVersion>,

    ///
    /// `segsizes` tracks the size of the segment at different points in time.
    ///
-    /// For a blocky rel, there is always one entry, at the layer's start_lsn,
-    /// so that determining the size never depends on the predecessor layer. For
-    /// a non-blocky rel, 'segsizes' is not used and is always empty.
-    ///
-    segsizes: VecMap<Lsn, u32>,
+    segsizes: BTreeMap<Lsn, u32>,

    /// Writes are only allowed when true.
    /// Set to false when this layer is in the process of being replaced.
    writeable: bool,
+
+    /// Predecessor layer
+    predecessor: Option<Arc<dyn Layer>>,
 }

 impl InMemoryLayerInner {
@@ -86,13 +81,12 @@ impl InMemoryLayerInner {

    fn get_seg_size(&self, lsn: Lsn) -> u32 {
        // Scan the BTreeMap backwards, starting from the given entry.
-        let slice = self.segsizes.slice_range(..=lsn);
+        let mut iter = self.segsizes.range((Included(&Lsn(0)), Included(&lsn)));

-        // We make sure there is always at least one entry
-        if let Some((_entry_lsn, entry)) = slice.last() {
+        if let Some((_entry_lsn, entry)) = iter.next_back() {
            *entry
        } else {
-            panic!("could not find seg size in in-memory layer");
+            0
        }
    }
 }
@@ -170,16 +164,18 @@ impl Layer for InMemoryLayer {

        assert!(self.seg.blknum_in_seg(blknum));

+        let predecessor: Option<Arc<dyn Layer>>;
+
        {
            let inner = self.inner.read().unwrap();

-            // Scan the page versions backwards, starting from `lsn`.
-            let iter = inner
+            // Scan the BTreeMap backwards, starting from reconstruct_data.lsn.
+            let minkey = (blknum, Lsn(0));
+            let maxkey = (blknum, lsn);
+            let mut iter = inner
                .page_versions
-                .get_block_lsn_range(blknum, ..=lsn)
-                .iter()
-                .rev();
-            for (_entry_lsn, entry) in iter {
+                .range((Included(&minkey), Included(&maxkey)));
+            while let Some(((_blknum, _entry_lsn), entry)) = iter.next_back() {
                if let Some(img) = &entry.page_image {
                    reconstruct_data.page_img = Some(img.clone());
                    need_image = false;
@@ -196,14 +192,16 @@ impl Layer for InMemoryLayer {
                    bail!("no page image or WAL record for requested page");
                }
            }
+
+            predecessor = inner.predecessor.clone();
            // release lock on 'inner'
        }

        // If an older page image is needed to reconstruct the page, let the
-        // caller know
+        // caller know about the predecessor layer.
        if need_image {
-            if self.incremental {
-                Ok(PageReconstructResult::Continue(Lsn(self.start_lsn.0 - 1)))
+            if let Some(cont_layer) = predecessor {
+                Ok(PageReconstructResult::Continue(self.start_lsn, cont_layer))
            } else {
                Ok(PageReconstructResult::Missing(self.start_lsn))
            }
@@ -215,10 +213,6 @@ impl Layer for InMemoryLayer {
    /// Get size of the relation at given LSN
    fn get_seg_size(&self, lsn: Lsn) -> Result<u32> {
        assert!(lsn >= self.start_lsn);
-        ensure!(
-            self.seg.rel.is_blocky(),
-            "get_seg_size() called on a non-blocky rel"
-        );

        let inner = self.inner.read().unwrap();
        Ok(inner.get_seg_size(lsn))
@@ -258,7 +252,8 @@ impl Layer for InMemoryLayer {
    }

    fn is_incremental(&self) -> bool {
-        self.incremental
+        let inner = self.inner.read().unwrap();
+        inner.predecessor.is_some()
    }

    /// debugging function to print out the contents of the layer
@@ -276,20 +271,18 @@ impl Layer for InMemoryLayer {
            self.timelineid, self.seg, self.start_lsn, end_str
        );

-        for (k, v) in inner.segsizes.as_slice() {
+        for (k, v) in inner.segsizes.iter() {
            println!("segsizes {}: {}", k, v);
        }

-        for (blknum, history) in inner.page_versions.ordered_block_iter() {
-            for (lsn, pv) in history.as_slice() {
-                println!(
-                    "blk {} at {}: {}/{}\n",
-                    blknum,
-                    lsn,
-                    pv.page_image.is_some(),
-                    pv.record.is_some()
-                );
-            }
+        for (k, v) in inner.page_versions.iter() {
+            println!(
+                "blk {} at {}: {}/{}\n",
+                k.0,
+                k.1,
+                v.page_image.is_some(),
+                v.record.is_some()
+            );
        }

        Ok(())
@@ -339,12 +332,6 @@ impl InMemoryLayer {
            start_lsn
        );

-        // The segment is initially empty, so initialize 'segsizes' with 0.
-        let mut segsizes = VecMap::default();
-        if seg.rel.is_blocky() {
-            segsizes.append(start_lsn, 0).unwrap();
-        }
-
        Ok(InMemoryLayer {
            conf,
            timelineid,
@@ -353,12 +340,12 @@ impl InMemoryLayer {
            start_lsn,
            end_lsn: None,
            oldest_pending_lsn,
-            incremental: false,
            inner: RwLock::new(InMemoryLayerInner {
                drop_lsn: None,
-                page_versions: PageVersions::default(),
-                segsizes,
+                page_versions: BTreeMap::new(),
+                segsizes: BTreeMap::new(),
                writeable: true,
+                predecessor: None,
            }),
        })
    }
@@ -406,7 +393,7 @@ impl InMemoryLayer {

        inner.check_writeable()?;

-        let old = inner.page_versions.append_or_update_last(blknum, lsn, pv);
+        let old = inner.page_versions.insert((blknum, lsn), pv);

        if old.is_some() {
            // We already had an entry for this LSN. That's odd..
@@ -451,9 +438,7 @@ impl InMemoryLayer {
                        gapblknum,
                        blknum
                    );
-                    let old = inner
-                        .page_versions
-                        .append_or_update_last(gapblknum, lsn, zeropv);
+                    let old = inner.page_versions.insert((gapblknum, lsn), zeropv);
                    // We already had an entry for this LSN. That's odd..

                    if old.is_some() {
@@ -464,7 +449,7 @@ impl InMemoryLayer {
                    }
                }

-                inner.segsizes.append_or_update_last(lsn, newsize).unwrap();
+                inner.segsizes.insert(lsn, newsize);
                return Ok(newsize - oldsize);
            }
        }
@@ -473,10 +458,6 @@ impl InMemoryLayer {

    /// Remember that the relation was truncated at given LSN
    pub fn put_truncation(&self, lsn: Lsn, segsize: u32) -> WriteResult<()> {
-        assert!(
-            self.seg.rel.is_blocky(),
-            "put_truncation() called on a non-blocky rel"
-        );
        self.assert_not_frozen();

        let mut inner = self.inner.write().unwrap();
@@ -486,7 +467,7 @@ impl InMemoryLayer {
        let oldsize = inner.get_seg_size(lsn);
        assert!(segsize < oldsize);

-        let old = inner.segsizes.append_or_update_last(lsn, segsize).unwrap();
+        let old = inner.segsizes.insert(lsn, segsize);

        if old.is_some() {
            // We already had an entry for this LSN. That's odd..
@@ -537,11 +518,11 @@ impl InMemoryLayer {
            start_lsn,
        );

-        // Copy the segment size at the start LSN from the predecessor layer.
-        let mut segsizes = VecMap::default();
+        // For convenience, copy the segment size from the predecessor layer
+        let mut segsizes = BTreeMap::new();
        if seg.rel.is_blocky() {
            let size = src.get_seg_size(start_lsn)?;
-            segsizes.append(start_lsn, size).unwrap();
+            segsizes.insert(start_lsn, size);
        }

        Ok(InMemoryLayer {
@@ -552,12 +533,12 @@ impl InMemoryLayer {
            start_lsn,
            end_lsn: None,
            oldest_pending_lsn,
-            incremental: true,
            inner: RwLock::new(InMemoryLayerInner {
                drop_lsn: None,
-                page_versions: PageVersions::default(),
+                page_versions: BTreeMap::new(),
                segsizes,
                writeable: true,
+                predecessor: Some(src),
            }),
        })
    }
@@ -608,18 +589,28 @@ impl InMemoryLayer {

        // Divide all the page versions into old and new
        // at the 'cutoff_lsn' point.
+        let mut before_segsizes = BTreeMap::new();
+        let mut after_segsizes = BTreeMap::new();
        let mut after_oldest_lsn: Accum<Lsn> = Accum(None);
-
-        let cutoff_lsn_exclusive = Lsn(cutoff_lsn.0 + 1);
-
-        let (before_segsizes, mut after_segsizes) = inner.segsizes.split_at(&cutoff_lsn_exclusive);
-        if let Some((lsn, _size)) = after_segsizes.as_slice().first() {
-            after_oldest_lsn.accum(min, *lsn);
+        for (lsn, size) in inner.segsizes.iter() {
+            if *lsn > cutoff_lsn {
+                after_segsizes.insert(*lsn, *size);
+                after_oldest_lsn.accum(min, *lsn);
+            } else {
+                before_segsizes.insert(*lsn, *size);
+            }
        }

-        let (before_page_versions, after_page_versions) = inner
-            .page_versions
-            .split_at(cutoff_lsn_exclusive, &mut after_oldest_lsn);
+        let mut before_page_versions = BTreeMap::new();
+        let mut after_page_versions = BTreeMap::new();
+        for ((blknum, lsn), pv) in inner.page_versions.iter() {
+            if *lsn > cutoff_lsn {
+                after_page_versions.insert((*blknum, *lsn), pv.clone());
+                after_oldest_lsn.accum(min, *lsn);
+            } else {
+                before_page_versions.insert((*blknum, *lsn), pv.clone());
+            }
+        }

        let frozen = Arc::new(InMemoryLayer {
            conf: self.conf,
@@ -629,12 +620,12 @@ impl InMemoryLayer {
            start_lsn: self.start_lsn,
            end_lsn: Some(cutoff_lsn),
            oldest_pending_lsn: self.start_lsn,
-            incremental: self.incremental,
            inner: RwLock::new(InMemoryLayerInner {
                drop_lsn: inner.drop_lsn,
                page_versions: before_page_versions,
                segsizes: before_segsizes,
                writeable: false,
+                predecessor: inner.predecessor.clone(),
            }),
        });

@@ -649,11 +640,8 @@ impl InMemoryLayer {
            )?;

            let new_inner = new_open.inner.get_mut().unwrap();
-            // Ensure page_versions doesn't contain anything
-            // so we can just replace it
-            assert!(new_inner.page_versions.is_empty());
-            new_inner.page_versions = after_page_versions;
-            new_inner.segsizes.extend(&mut after_segsizes).unwrap();
+            new_inner.page_versions.append(&mut after_page_versions);
+            new_inner.segsizes.append(&mut after_segsizes);

            Some(Arc::new(new_open))
        } else {
@@ -691,6 +679,8 @@ impl InMemoryLayer {
        let inner = self.inner.read().unwrap();
        assert!(!inner.writeable);

+        let predecessor = inner.predecessor.clone();
+
        if let Some(drop_lsn) = inner.drop_lsn {
            let delta_layer = DeltaLayer::create(
                self.conf,
@@ -700,7 +690,8 @@ impl InMemoryLayer {
                self.start_lsn,
                drop_lsn,
                true,
-                inner.page_versions.ordered_block_iter(),
+                predecessor,
+                inner.page_versions.iter(),
                inner.segsizes.clone(),
            )?;
            trace!(
@@ -714,11 +705,21 @@ impl InMemoryLayer {

        let end_lsn = self.end_lsn.unwrap();

+        let mut before_segsizes = BTreeMap::new();
+        for (lsn, size) in inner.segsizes.iter() {
+            if *lsn <= end_lsn {
+                before_segsizes.insert(*lsn, *size);
+            }
+        }
+        let mut before_page_versions = inner.page_versions.iter().filter(|tup| {
+            let ((_blknum, lsn), _pv) = tup;
+
+            *lsn < end_lsn
+        });
+
        let mut frozen_layers: Vec<Arc<dyn Layer>> = Vec::new();

        if self.start_lsn != end_lsn {
-            let (before_segsizes, _after_segsizes) = inner.segsizes.split_at(&Lsn(end_lsn.0 + 1));
-
            // Write the page versions before the cutoff to disk.
            let delta_layer = DeltaLayer::create(
                self.conf,
@@ -728,7 +729,8 @@ impl InMemoryLayer {
                self.start_lsn,
                end_lsn,
                false,
-                inner.page_versions.ordered_block_iter(),
+                predecessor,
+                before_page_versions,
                before_segsizes,
            )?;
            frozen_layers.push(Arc::new(delta_layer));
@@ -739,10 +741,7 @@ impl InMemoryLayer {
                end_lsn
            );
        } else {
-            for (_blknum, history) in inner.page_versions.ordered_block_iter() {
-                let (lsn, _pv) = history.as_slice().first().unwrap();
-                assert!(lsn >= &end_lsn);
-            }
+            assert!(before_page_versions.next().is_none());
        }

        drop(inner);
@@ -754,4 +753,9 @@ impl InMemoryLayer {

        Ok(frozen_layers)
    }
+
+    pub fn update_predecessor(&self, predecessor: Arc<dyn Layer>) -> Option<Arc<dyn Layer>> {
+        let mut inner = self.inner.write().unwrap();
+        inner.predecessor.replace(predecessor)
+    }
 }
--- a/pageserver/src/layered_repository/page_versions.rs
+++ b/pageserver/src/layered_repository/page_versions.rs
@@ -1,132 +0,0 @@
-use std::{collections::HashMap, ops::RangeBounds};
-
-use zenith_utils::{accum::Accum, lsn::Lsn, vec_map::VecMap};
-
-use super::storage_layer::PageVersion;
-
-const EMPTY_SLICE: &[(Lsn, PageVersion)] = &[];
-
-#[derive(Debug, Default)]
-pub struct PageVersions(HashMap<u32, VecMap<Lsn, PageVersion>>);
-
-impl PageVersions {
-    pub fn is_empty(&self) -> bool {
-        self.0.is_empty()
-    }
-
-    pub fn append_or_update_last(
-        &mut self,
-        blknum: u32,
-        lsn: Lsn,
-        page_version: PageVersion,
-    ) -> Option<PageVersion> {
-        let map = self.0.entry(blknum).or_insert_with(VecMap::default);
-        map.append_or_update_last(lsn, page_version).unwrap()
-    }
-
-    /// Get a range of [`PageVersions`] in a block
-    pub fn get_block_lsn_range<R: RangeBounds<Lsn>>(
-        &self,
-        blknum: u32,
-        range: R,
-    ) -> &[(Lsn, PageVersion)] {
-        self.0
-            .get(&blknum)
-            .map(|vec_map| vec_map.slice_range(range))
-            .unwrap_or(EMPTY_SLICE)
-    }
-
-    /// Split the page version map into two.
-    ///
-    /// Left contains everything up to and not including [`cutoff_lsn`].
-    /// Right contains [`cutoff_lsn`] and everything after.
-    pub fn split_at(&self, cutoff_lsn: Lsn, after_oldest_lsn: &mut Accum<Lsn>) -> (Self, Self) {
-        let mut before_blocks = HashMap::new();
-        let mut after_blocks = HashMap::new();
-
-        for (blknum, vec_map) in self.0.iter() {
-            let (before_versions, after_versions) = vec_map.split_at(&cutoff_lsn);
-
-            if !before_versions.is_empty() {
-                let old = before_blocks.insert(*blknum, before_versions);
-                assert!(old.is_none());
-            }
-
-            if !after_versions.is_empty() {
-                let (first_lsn, _first_pv) = &after_versions.as_slice()[0];
-                after_oldest_lsn.accum(std::cmp::min, *first_lsn);
-
-                let old = after_blocks.insert(*blknum, after_versions);
-                assert!(old.is_none());
-            }
-        }
-
-        (Self(before_blocks), Self(after_blocks))
-    }
-
-    /// Iterate through block-history pairs in block order.
-    pub fn ordered_block_iter(&self) -> OrderedBlockIter<'_> {
-        let mut ordered_blocks: Vec<u32> = self.0.keys().cloned().collect();
-        ordered_blocks.sort_unstable();
-
-        OrderedBlockIter {
-            page_versions: self,
-            ordered_blocks,
-            cur_block_idx: 0,
-        }
-    }
-}
-
-pub struct OrderedBlockIter<'a> {
-    page_versions: &'a PageVersions,
-
-    ordered_blocks: Vec<u32>,
-    cur_block_idx: usize,
-}
-
-impl<'a> Iterator for OrderedBlockIter<'a> {
-    type Item = (u32, &'a VecMap<Lsn, PageVersion>);
-
-    fn next(&mut self) -> Option<Self::Item> {
-        let blknum: u32 = *self.ordered_blocks.get(self.cur_block_idx)?;
-        self.cur_block_idx += 1;
-        Some((blknum, self.page_versions.0.get(&blknum).unwrap()))
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    const EMPTY_PAGE_VERSION: PageVersion = PageVersion {
-        page_image: None,
-        record: None,
-    };
-
-    #[test]
-    fn test_ordered_iter() {
-        let mut page_versions = PageVersions::default();
-        const BLOCKS: u32 = 1000;
-        const LSNS: u64 = 50;
-
-        for blknum in 0..BLOCKS {
-            for lsn in 0..LSNS {
-                let old = page_versions.append_or_update_last(blknum, Lsn(lsn), EMPTY_PAGE_VERSION);
-                assert!(old.is_none());
-            }
-        }
-
-        let mut iter = page_versions.ordered_block_iter();
-        for blknum in 0..BLOCKS {
-            let (actual_blknum, vec_map) = iter.next().unwrap();
-            let slice = vec_map.as_slice();
-            assert_eq!(actual_blknum, blknum);
-            assert_eq!(slice.len(), LSNS as usize);
-            for lsn in 0..LSNS {
-                assert_eq!(Lsn(lsn), slice[lsn as usize].0);
-            }
-        }
-        assert!(iter.next().is_none());
-        assert!(iter.next().is_none()); // should be robust against excessive next() calls
-    }
-}
--- a/pageserver/src/layered_repository/storage_layer.rs
+++ b/pageserver/src/layered_repository/storage_layer.rs
@@ -10,6 +10,7 @@ use bytes::Bytes;
 use serde::{Deserialize, Serialize};
 use std::fmt;
 use std::path::PathBuf;
+use std::sync::Arc;

 use zenith_utils::lsn::Lsn;

@@ -86,9 +87,9 @@ pub struct PageReconstructData {
 pub enum PageReconstructResult {
    /// Got all the data needed to reconstruct the requested page
    Complete,
-    /// This layer didn't contain all the required data, the caller should look up
-    /// the predecessor layer at the returned LSN and collect more data from there.
-    Continue(Lsn),
+    /// This layer didn't contain all the required data, the caller should collect
+    /// more data from the returned predecessor layer at the returned LSN.
+    Continue(Lsn, Arc<dyn Layer>),
    /// This layer didn't contain data needed to reconstruct the page version at
    /// the returned LSN. This is usually considered an error, but might be OK
    /// in some circumstances.
@@ -144,8 +145,8 @@ pub trait Layer: Send + Sync {
    ///
    /// See PageReconstructResult for possible return values. The collected data
    /// is appended to reconstruct_data; the caller should pass an empty struct
-    /// on first call. If this returns PageReconstructResult::Continue, look up
-    /// the predecessor layer and call again with the same 'reconstruct_data'
+    /// on first call. If this returns PageReconstructResult::Continue, call
+    /// again on the returned predecessor layer with the same 'reconstruct_data'
    /// to collect more data.
    fn get_page_reconstruct_data(
        &self,
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -126,6 +126,10 @@ impl PageServerConf {
        self.timeline_path(timelineid, tenantid).join("ancestor")
    }

+    fn wal_dir_path(&self, timelineid: &ZTimelineId, tenantid: &ZTenantId) -> PathBuf {
+        self.timeline_path(timelineid, tenantid).join("wal")
+    }
+
    //
    // Postgres distribution paths
    //
--- a/pageserver/src/tenant_mgr.rs
+++ b/pageserver/src/tenant_mgr.rs
@@ -100,6 +100,10 @@ pub fn create_repository_for_tenant(
    Ok(())
 }

+pub fn insert_repository_for_tenant(tenantid: ZTenantId, repo: Arc<dyn Repository>) {
+    access_repository().insert(tenantid, repo);
+}
+
 pub fn get_repository_for_tenant(tenantid: ZTenantId) -> Result<Arc<dyn Repository>> {
    access_repository()
        .get(&tenantid)
--- a/pageserver/src/walreceiver.rs
+++ b/pageserver/src/walreceiver.rs
@@ -16,11 +16,14 @@ use log::*;
 use postgres::fallible_iterator::FallibleIterator;
 use postgres::replication::ReplicationIter;
 use postgres::{Client, NoTls, SimpleQueryMessage, SimpleQueryRow};
+use postgres_ffi::xlog_utils::*;
 use postgres_ffi::*;
 use postgres_protocol::message::backend::ReplicationMessage;
 use postgres_types::PgLsn;
 use std::cell::Cell;
+use std::cmp::{max, min};
 use std::collections::HashMap;
+use std::fs;
 use std::str::FromStr;
 use std::sync::Mutex;
 use std::thread;
@@ -122,7 +125,7 @@ fn thread_main(conf: &'static PageServerConf, timelineid: ZTimelineId, tenantid:
 }

 fn walreceiver_main(
-    _conf: &PageServerConf,
+    conf: &PageServerConf,
    timelineid: ZTimelineId,
    wal_producer_connstr: &str,
    tenantid: ZTenantId,
@@ -192,6 +195,7 @@ fn walreceiver_main(
                let data = xlog_data.data();
                let startlsn = Lsn::from(xlog_data.wal_start());
                let endlsn = startlsn + data.len() as u64;
+                let prev_last_rec_lsn = last_rec_lsn;

                trace!("received XLogData between {} and {}", startlsn, endlsn);

@@ -232,6 +236,34 @@ fn walreceiver_main(
                    last_rec_lsn = lsn;
                }

+                // Somewhat arbitrarily, if we have at least 10 complete wal segments (16 MB each),
+                // "checkpoint" the repository to flush all the changes from WAL we've processed
+                // so far to disk. After this, we don't need the original WAL anymore, and it
+                // can be removed. This is probably too aggressive for production, but it's useful
+                // to expose bugs now.
+                //
+                // TODO: We don't actually dare to remove the WAL. It's useful for debugging,
+                // and we might it for logical decoding other things in the future. Although
+                // we should also be able to fetch it back from the WAL safekeepers or S3 if
+                // needed.
+                if prev_last_rec_lsn.segment_number(pg_constants::WAL_SEGMENT_SIZE)
+                    != last_rec_lsn.segment_number(pg_constants::WAL_SEGMENT_SIZE)
+                {
+                    info!("switched segment {} to {}", prev_last_rec_lsn, last_rec_lsn);
+                    let (oldest_segno, newest_segno) = find_wal_file_range(
+                        conf,
+                        &timelineid,
+                        pg_constants::WAL_SEGMENT_SIZE,
+                        last_rec_lsn,
+                        &tenantid,
+                    )?;
+
+                    if newest_segno - oldest_segno >= 10 {
+                        // TODO: This is where we could remove WAL older than last_rec_lsn.
+                        //remove_wal_files(timelineid, pg_constants::WAL_SEGMENT_SIZE, last_rec_lsn)?;
+                    }
+                }
+
                if !caught_up && endlsn >= end_of_wal {
                    info!("caught up at LSN {}", endlsn);
                    caught_up = true;
@@ -253,7 +285,7 @@ fn walreceiver_main(
                );

                if reply_requested {
-                    Some(last_rec_lsn)
+                    Some(timeline.get_last_record_lsn())
                } else {
                    None
                }
@@ -277,6 +309,47 @@ fn walreceiver_main(
    Ok(())
 }

+fn find_wal_file_range(
+    conf: &PageServerConf,
+    timeline: &ZTimelineId,
+    wal_seg_size: usize,
+    written_upto: Lsn,
+    tenant: &ZTenantId,
+) -> Result<(u64, u64)> {
+    let written_upto_segno = written_upto.segment_number(wal_seg_size);
+
+    let mut oldest_segno = written_upto_segno;
+    let mut newest_segno = written_upto_segno;
+    // Scan the wal directory, and count how many WAL filed we could remove
+    let wal_dir = conf.wal_dir_path(timeline, tenant);
+    for entry in fs::read_dir(wal_dir)? {
+        let entry = entry?;
+        let path = entry.path();
+
+        if path.is_dir() {
+            continue;
+        }
+
+        let filename = path.file_name().unwrap().to_str().unwrap();
+
+        if IsXLogFileName(filename) {
+            let (segno, _tli) = XLogFromFileName(filename, wal_seg_size);
+
+            if segno > written_upto_segno {
+                // that's strange.
+                warn!("there is a WAL file from future at {}", path.display());
+                continue;
+            }
+
+            oldest_segno = min(oldest_segno, segno);
+            newest_segno = max(newest_segno, segno);
+        }
+    }
+    // FIXME: would be good to assert that there are no gaps in the WAL files
+
+    Ok((oldest_segno, newest_segno))
+}
+
 /// Data returned from the postgres `IDENTIFY_SYSTEM` command
 ///
 /// See the [postgres docs] for more details.
--- a/pageserver/src/walredo/mod.rs
+++ b/pageserver/src/walredo/mod.rs
@@ -17,9 +17,11 @@
 //! records. It achieves it by dropping privileges before replaying
 //! any WAL records, so that even if an attacker hijacks the Postgres
 //! process, he cannot escape out of it.
-//!
-use byteorder::{ByteOrder, LittleEndian};
-use bytes::{Buf, BufMut, Bytes, BytesMut};
+
+mod nonrel;
+mod request;
+
+use bytes::Bytes;
 use lazy_static::lazy_static;
 use log::*;
 use serde::Serialize;
@@ -37,21 +39,12 @@ use tokio::io::{AsyncReadExt, AsyncWriteExt};
 use tokio::process::{ChildStdin, ChildStdout, Command};
 use tokio::time::timeout;
 use zenith_metrics::{register_histogram, register_int_counter, Histogram, IntCounter};
-use zenith_utils::bin_ser::BeSer;
 use zenith_utils::lsn::Lsn;
 use zenith_utils::zid::ZTenantId;

 use crate::relish::*;
 use crate::repository::WALRecord;
-use crate::waldecoder::XlMultiXactCreate;
-use crate::waldecoder::XlXactParsedRecord;
 use crate::PageServerConf;
-use postgres_ffi::nonrelfile_utils::mx_offset_to_flags_bitshift;
-use postgres_ffi::nonrelfile_utils::mx_offset_to_flags_offset;
-use postgres_ffi::nonrelfile_utils::mx_offset_to_member_offset;
-use postgres_ffi::nonrelfile_utils::transaction_id_set_status;
-use postgres_ffi::pg_constants;
-use postgres_ffi::XLogRecord;

 ///
 /// `RelTag` + block number (`blknum`) gives us a unique id of the page in the cluster.
@@ -92,7 +85,7 @@ pub trait WalRedoManager: Send + Sync {
 /// a Repository object without launching the real WAL redo process.
 ///
 pub struct DummyRedoManager {}
-impl crate::walredo::WalRedoManager for DummyRedoManager {
+impl WalRedoManager for DummyRedoManager {
    fn request_redo(
        &self,
        _rel: RelishTag,
@@ -105,7 +98,7 @@ impl crate::walredo::WalRedoManager for DummyRedoManager {
    }
 }

-static TIMEOUT: Duration = Duration::from_secs(20);
+const TIMEOUT: Duration = Duration::from_secs(20);

 // Metrics collected on WAL redo operations
 //
@@ -249,166 +242,20 @@ impl PostgresRedoManager {
        process: &mut PostgresRedoProcess,
        request: &WalRedoRequest,
    ) -> Result<Bytes, WalRedoError> {
-        let rel = request.rel;
-        let blknum = request.blknum;
-        let lsn = request.lsn;
-        let base_img = request.base_img.clone();
-        let records = &request.records;
-
-        let nrecords = records.len();
-
        let start = Instant::now();

-        let apply_result: Result<Bytes, Error>;
-        if let RelishTag::Relation(rel) = rel {
+        let apply_result = if let RelishTag::Relation(rel) = request.rel {
            // Relational WAL records are applied using wal-redo-postgres
-            let buf_tag = BufferTag { rel, blknum };
-            apply_result = process.apply_wal_records(buf_tag, base_img, records).await;
+            let buf_tag = BufferTag {
+                rel,
+                blknum: request.blknum,
+            };
+            process
+                .apply_wal_records(buf_tag, &request.base_img, &request.records)
+                .await
        } else {
-            // Non-relational WAL records are handled here, with custom code that has the
-            // same effects as the corresponding Postgres WAL redo function.
-            const ZERO_PAGE: [u8; 8192] = [0u8; 8192];
-            let mut page = BytesMut::new();
-            if let Some(fpi) = base_img {
-                // If full-page image is provided, then use it...
-                page.extend_from_slice(&fpi[..]);
-            } else {
-                // otherwise initialize page with zeros
-                page.extend_from_slice(&ZERO_PAGE);
-            }
-            // Apply all collected WAL records
-            for record in records {
-                let mut buf = record.rec.clone();
-
-                WAL_REDO_RECORD_COUNTER.inc();
-
-                // 1. Parse XLogRecord struct
-                // FIXME: refactor to avoid code duplication.
-                let xlogrec = XLogRecord::from_bytes(&mut buf);
-
-                //move to main data
-                // TODO probably, we should store some records in our special format
-                // to avoid this weird parsing on replay
-                let skip = (record.main_data_offset - pg_constants::SIZEOF_XLOGRECORD) as usize;
-                if buf.remaining() > skip {
-                    buf.advance(skip);
-                }
-
-                if xlogrec.xl_rmid == pg_constants::RM_XACT_ID {
-                    // Transaction manager stuff
-                    let rec_segno = match rel {
-                        RelishTag::Slru { slru, segno } => {
-                            assert!(
-                                slru == SlruKind::Clog,
-                                "Not valid XACT relish tag {:?}",
-                                rel
-                            );
-                            segno
-                        }
-                        _ => panic!("Not valid XACT relish tag {:?}", rel),
-                    };
-                    let parsed_xact =
-                        XlXactParsedRecord::decode(&mut buf, xlogrec.xl_xid, xlogrec.xl_info);
-                    if parsed_xact.info == pg_constants::XLOG_XACT_COMMIT
-                        || parsed_xact.info == pg_constants::XLOG_XACT_COMMIT_PREPARED
-                    {
-                        transaction_id_set_status(
-                            parsed_xact.xid,
-                            pg_constants::TRANSACTION_STATUS_COMMITTED,
-                            &mut page,
-                        );
-                        for subxact in &parsed_xact.subxacts {
-                            let pageno = *subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
-                            let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
-                            let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
-                            // only update xids on the requested page
-                            if rec_segno == segno && blknum == rpageno {
-                                transaction_id_set_status(
-                                    *subxact,
-                                    pg_constants::TRANSACTION_STATUS_COMMITTED,
-                                    &mut page,
-                                );
-                            }
-                        }
-                    } else if parsed_xact.info == pg_constants::XLOG_XACT_ABORT
-                        || parsed_xact.info == pg_constants::XLOG_XACT_ABORT_PREPARED
-                    {
-                        transaction_id_set_status(
-                            parsed_xact.xid,
-                            pg_constants::TRANSACTION_STATUS_ABORTED,
-                            &mut page,
-                        );
-                        for subxact in &parsed_xact.subxacts {
-                            let pageno = *subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
-                            let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
-                            let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
-                            // only update xids on the requested page
-                            if rec_segno == segno && blknum == rpageno {
-                                transaction_id_set_status(
-                                    *subxact,
-                                    pg_constants::TRANSACTION_STATUS_ABORTED,
-                                    &mut page,
-                                );
-                            }
-                        }
-                    }
-                } else if xlogrec.xl_rmid == pg_constants::RM_MULTIXACT_ID {
-                    // Multixact operations
-                    let info = xlogrec.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
-                    if info == pg_constants::XLOG_MULTIXACT_CREATE_ID {
-                        let xlrec = XlMultiXactCreate::decode(&mut buf);
-                        if let RelishTag::Slru {
-                            slru,
-                            segno: rec_segno,
-                        } = rel
-                        {
-                            if slru == SlruKind::MultiXactMembers {
-                                for i in 0..xlrec.nmembers {
-                                    let pageno =
-                                        i / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
-                                    let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
-                                    let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
-                                    if segno == rec_segno && rpageno == blknum {
-                                        // update only target block
-                                        let offset = xlrec.moff + i;
-                                        let memberoff = mx_offset_to_member_offset(offset);
-                                        let flagsoff = mx_offset_to_flags_offset(offset);
-                                        let bshift = mx_offset_to_flags_bitshift(offset);
-                                        let mut flagsval =
-                                            LittleEndian::read_u32(&page[flagsoff..flagsoff + 4]);
-                                        flagsval &= !(((1
-                                            << pg_constants::MXACT_MEMBER_BITS_PER_XACT)
-                                            - 1)
-                                            << bshift);
-                                        flagsval |= xlrec.members[i as usize].status << bshift;
-                                        LittleEndian::write_u32(
-                                            &mut page[flagsoff..flagsoff + 4],
-                                            flagsval,
-                                        );
-                                        LittleEndian::write_u32(
-                                            &mut page[memberoff..memberoff + 4],
-                                            xlrec.members[i as usize].xid,
-                                        );
-                                    }
-                                }
-                            } else {
-                                // Multixact offsets SLRU
-                                let offs = (xlrec.mid
-                                    % pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32
-                                    * 4) as usize;
-                                LittleEndian::write_u32(&mut page[offs..offs + 4], xlrec.moff);
-                            }
-                        } else {
-                            panic!();
-                        }
-                    } else {
-                        panic!();
-                    }
-                }
-            }
-
-            apply_result = Ok::<Bytes, Error>(page.freeze());
-        }
+            Ok(nonrel::apply_nonrel(request))
+        };

        let duration = start.elapsed();

@@ -416,9 +263,9 @@ impl PostgresRedoManager {

        debug!(
            "applied {} WAL records in {} ms to reconstruct page image at LSN {}",
-            nrecords,
+            request.records.len(),
            duration.as_millis(),
-            lsn
+            request.lsn
        );

        if let Err(e) = apply_result {
@@ -543,12 +390,11 @@ impl PostgresRedoProcess {
    async fn apply_wal_records(
        &mut self,
        tag: BufferTag,
-        base_img: Option<Bytes>,
+        base_img: &Option<Bytes>,
        records: &[WALRecord],
    ) -> Result<Bytes, std::io::Error> {
+        let stdin = &mut self.stdin;
        let stdout = &mut self.stdout;
-        // Buffer the writes to avoid a lot of small syscalls.
-        let mut stdin = tokio::io::BufWriter::new(&mut self.stdin);

        // We do three things simultaneously: send the old base image and WAL records to
        // the child process's stdin, read the result from child's stdout, and forward any logging
@@ -558,140 +404,24 @@ impl PostgresRedoProcess {
        // 'f_stdout' below reads the result back. And 'f_stderr', which was spawned into the
        // tokio runtime in the 'launch' function already, forwards the logging.
        let f_stdin = async {
-            // Send base image, if any. (If the record initializes the page, previous page
-            // version is not needed.)
-            timeout(
-                TIMEOUT,
-                stdin.write_all(&build_begin_redo_for_block_msg(tag)),
-            )
-            .await??;
-            if base_img.is_some() {
-                timeout(
-                    TIMEOUT,
-                    stdin.write_all(&build_push_page_msg(tag, base_img.unwrap())),
-                )
-                .await??;
-            }
+            let buf = request::serialize_request(tag, base_img, records);

-            // Send WAL records.
-            for rec in records.iter() {
-                let r = rec.clone();
-
-                WAL_REDO_RECORD_COUNTER.inc();
-
-                stdin
-                    .write_all(&build_apply_record_msg(r.lsn, r.rec))
-                    .await?;
-
-                //debug!("sent WAL record to wal redo postgres process ({:X}/{:X}",
-                //       r.lsn >> 32, r.lsn & 0xffff_ffff);
-            }
-            //debug!("sent {} WAL records to wal redo postgres process ({:X}/{:X}",
-            //       records.len(), lsn >> 32, lsn & 0xffff_ffff);
-
-            // Send GetPage command to get the result back
-            timeout(TIMEOUT, stdin.write_all(&build_get_page_msg(tag))).await??;
+            timeout(TIMEOUT, stdin.write_all(&buf)).await??;
            timeout(TIMEOUT, stdin.flush()).await??;
-            //debug!("sent GetPage for {}", tag.blknum);
+
            Ok::<(), Error>(())
        };

        // Read back new page image
        let f_stdout = async {
-            let mut buf = [0u8; 8192];
+            let mut buf = vec![0u8; 8192];

            timeout(TIMEOUT, stdout.read_exact(&mut buf)).await??;
            //debug!("got response for {}", tag.blknum);
-            Ok::<[u8; 8192], Error>(buf)
+            Ok::<Vec<u8>, Error>(buf)
        };

-        let res = tokio::try_join!(f_stdout, f_stdin)?;
-
-        let buf = res.0;
-
-        Ok::<Bytes, Error>(Bytes::from(std::vec::Vec::from(buf)))
+        let (buf, _) = tokio::try_join!(f_stdout, f_stdin)?;
+        Ok::<Bytes, Error>(Bytes::from(buf))
    }
 }
-
-// Functions for constructing messages to send to the postgres WAL redo
-// process. See vendor/postgres/src/backend/tcop/zenith_wal_redo.c for
-// explanation of the protocol.
-
-fn build_begin_redo_for_block_msg(tag: BufferTag) -> Bytes {
-    let len = 4 + 1 + 4 * 4;
-    let mut buf = BytesMut::with_capacity(1 + len);
-
-    buf.put_u8(b'B');
-    buf.put_u32(len as u32);
-
-    // FIXME: this is a temporary hack that should go away when we refactor
-    // the postgres protocol serialization + handlers.
-    //
-    // BytesMut is a dynamic growable buffer, used a lot in tokio code but
-    // not in the std library. To write to a BytesMut from a serde serializer,
-    // we need to either:
-    // - pre-allocate the required buffer space. This is annoying because we
-    //   shouldn't care what the exact serialized size is-- that's the
-    //   serializer's job.
-    // - Or, we need to create a temporary "writer" (which implements the
-    //   `Write` trait). It's a bit awkward, because the writer consumes the
-    //   underlying BytesMut, and we need to extract it later with
-    //   `into_inner`.
-    let mut writer = buf.writer();
-    tag.ser_into(&mut writer)
-        .expect("serialize BufferTag should always succeed");
-    let buf = writer.into_inner();
-
-    debug_assert!(buf.len() == 1 + len);
-
-    buf.freeze()
-}
-
-fn build_push_page_msg(tag: BufferTag, base_img: Bytes) -> Bytes {
-    assert!(base_img.len() == 8192);
-
-    let len = 4 + 1 + 4 * 4 + base_img.len();
-    let mut buf = BytesMut::with_capacity(1 + len);
-
-    buf.put_u8(b'P');
-    buf.put_u32(len as u32);
-    let mut writer = buf.writer();
-    tag.ser_into(&mut writer)
-        .expect("serialize BufferTag should always succeed");
-    let mut buf = writer.into_inner();
-    buf.put(base_img);
-
-    debug_assert!(buf.len() == 1 + len);
-
-    buf.freeze()
-}
-
-fn build_apply_record_msg(endlsn: Lsn, rec: Bytes) -> Bytes {
-    let len = 4 + 8 + rec.len();
-    let mut buf = BytesMut::with_capacity(1 + len);
-
-    buf.put_u8(b'A');
-    buf.put_u32(len as u32);
-    buf.put_u64(endlsn.0);
-    buf.put(rec);
-
-    debug_assert!(buf.len() == 1 + len);
-
-    buf.freeze()
-}
-
-fn build_get_page_msg(tag: BufferTag) -> Bytes {
-    let len = 4 + 1 + 4 * 4;
-    let mut buf = BytesMut::with_capacity(1 + len);
-
-    buf.put_u8(b'G');
-    buf.put_u32(len as u32);
-    let mut writer = buf.writer();
-    tag.ser_into(&mut writer)
-        .expect("serialize BufferTag should always succeed");
-    let buf = writer.into_inner();
-
-    debug_assert!(buf.len() == 1 + len);
-
-    buf.freeze()
-}
--- a/pageserver/src/walredo/nonrel.rs
+++ b/pageserver/src/walredo/nonrel.rs
@@ -0,0 +1,161 @@
+use byteorder::{ByteOrder, LittleEndian};
+use bytes::{Buf, Bytes, BytesMut};
+use postgres_ffi::{
+    nonrelfile_utils::{
+        mx_offset_to_flags_bitshift, mx_offset_to_flags_offset, mx_offset_to_member_offset,
+        transaction_id_set_status,
+    },
+    pg_constants, XLogRecord,
+};
+
+use crate::{
+    relish::{RelishTag, SlruKind},
+    waldecoder::{XlMultiXactCreate, XlXactParsedRecord},
+};
+
+use super::WalRedoRequest;
+
+pub(super) fn apply_nonrel(request: &WalRedoRequest) -> Bytes {
+    let rel = request.rel;
+    let blknum = request.blknum;
+
+    // Non-relational WAL records are handled here, with custom code that has the
+    // same effects as the corresponding Postgres WAL redo function.
+    const ZERO_PAGE: [u8; 8192] = [0u8; 8192];
+    let mut page = BytesMut::new();
+    if let Some(fpi) = &request.base_img {
+        // If full-page image is provided, then use it...
+        page.extend_from_slice(&fpi[..]);
+    } else {
+        // otherwise initialize page with zeros
+        page.extend_from_slice(&ZERO_PAGE);
+    }
+    // Apply all collected WAL records
+    for record in &request.records {
+        let mut buf = record.rec.clone();
+
+        super::WAL_REDO_RECORD_COUNTER.inc();
+
+        // 1. Parse XLogRecord struct
+        // FIXME: refactor to avoid code duplication.
+        let xlogrec = XLogRecord::from_bytes(&mut buf);
+
+        //move to main data
+        // TODO probably, we should store some records in our special format
+        // to avoid this weird parsing on replay
+        let skip = (record.main_data_offset - pg_constants::SIZEOF_XLOGRECORD) as usize;
+        if buf.remaining() > skip {
+            buf.advance(skip);
+        }
+
+        if xlogrec.xl_rmid == pg_constants::RM_XACT_ID {
+            // Transaction manager stuff
+            let rec_segno = match rel {
+                RelishTag::Slru { slru, segno } => {
+                    assert!(
+                        slru == SlruKind::Clog,
+                        "Not valid XACT relish tag {:?}",
+                        rel
+                    );
+                    segno
+                }
+                _ => panic!("Not valid XACT relish tag {:?}", rel),
+            };
+            let parsed_xact = XlXactParsedRecord::decode(&mut buf, xlogrec.xl_xid, xlogrec.xl_info);
+            if parsed_xact.info == pg_constants::XLOG_XACT_COMMIT
+                || parsed_xact.info == pg_constants::XLOG_XACT_COMMIT_PREPARED
+            {
+                transaction_id_set_status(
+                    parsed_xact.xid,
+                    pg_constants::TRANSACTION_STATUS_COMMITTED,
+                    &mut page,
+                );
+                for subxact in &parsed_xact.subxacts {
+                    let pageno = *subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
+                    let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
+                    let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
+                    // only update xids on the requested page
+                    if rec_segno == segno && blknum == rpageno {
+                        transaction_id_set_status(
+                            *subxact,
+                            pg_constants::TRANSACTION_STATUS_COMMITTED,
+                            &mut page,
+                        );
+                    }
+                }
+            } else if parsed_xact.info == pg_constants::XLOG_XACT_ABORT
+                || parsed_xact.info == pg_constants::XLOG_XACT_ABORT_PREPARED
+            {
+                transaction_id_set_status(
+                    parsed_xact.xid,
+                    pg_constants::TRANSACTION_STATUS_ABORTED,
+                    &mut page,
+                );
+                for subxact in &parsed_xact.subxacts {
+                    let pageno = *subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
+                    let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
+                    let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
+                    // only update xids on the requested page
+                    if rec_segno == segno && blknum == rpageno {
+                        transaction_id_set_status(
+                            *subxact,
+                            pg_constants::TRANSACTION_STATUS_ABORTED,
+                            &mut page,
+                        );
+                    }
+                }
+            }
+        } else if xlogrec.xl_rmid == pg_constants::RM_MULTIXACT_ID {
+            // Multixact operations
+            let info = xlogrec.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
+            if info == pg_constants::XLOG_MULTIXACT_CREATE_ID {
+                let xlrec = XlMultiXactCreate::decode(&mut buf);
+                if let RelishTag::Slru {
+                    slru,
+                    segno: rec_segno,
+                } = rel
+                {
+                    if slru == SlruKind::MultiXactMembers {
+                        for i in 0..xlrec.nmembers {
+                            let pageno = i / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
+                            let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
+                            let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
+                            if segno == rec_segno && rpageno == blknum {
+                                // update only target block
+                                let offset = xlrec.moff + i;
+                                let memberoff = mx_offset_to_member_offset(offset);
+                                let flagsoff = mx_offset_to_flags_offset(offset);
+                                let bshift = mx_offset_to_flags_bitshift(offset);
+                                let mut flagsval =
+                                    LittleEndian::read_u32(&page[flagsoff..flagsoff + 4]);
+                                flagsval &= !(((1 << pg_constants::MXACT_MEMBER_BITS_PER_XACT)
+                                    - 1)
+                                    << bshift);
+                                flagsval |= xlrec.members[i as usize].status << bshift;
+                                LittleEndian::write_u32(
+                                    &mut page[flagsoff..flagsoff + 4],
+                                    flagsval,
+                                );
+                                LittleEndian::write_u32(
+                                    &mut page[memberoff..memberoff + 4],
+                                    xlrec.members[i as usize].xid,
+                                );
+                            }
+                        }
+                    } else {
+                        // Multixact offsets SLRU
+                        let offs = (xlrec.mid % pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32 * 4)
+                            as usize;
+                        LittleEndian::write_u32(&mut page[offs..offs + 4], xlrec.moff);
+                    }
+                } else {
+                    panic!();
+                }
+            } else {
+                panic!();
+            }
+        }
+    }
+
+    page.freeze()
+}
--- a/pageserver/src/walredo/request.rs
+++ b/pageserver/src/walredo/request.rs
@@ -0,0 +1,86 @@
+///! Functions for constructing messages to send to the postgres WAL redo
+///! process. See vendor/postgres/src/backend/tcop/zenith_wal_redo.c for
+///! explanation of the protocol.
+use bytes::{BufMut, Bytes, BytesMut};
+use zenith_utils::{bin_ser::BeSer, lsn::Lsn};
+
+use crate::repository::WALRecord;
+
+use super::BufferTag;
+
+pub fn serialize_request(
+    tag: BufferTag,
+    base_img: &Option<Bytes>,
+    records: &[WALRecord],
+) -> BytesMut {
+    let mut capacity = 1 + BEGIN_REDO_MSG_LEN;
+    if base_img.is_some() {
+        capacity += 1 + PUSH_PAGE_MSG_LEN;
+    }
+    capacity += (1 + APPLY_MSG_HEADER_LEN) * records.len();
+    capacity += records.iter().map(|rec| rec.rec.len()).sum::<usize>();
+    capacity += 1 + GET_PAGE_MSG_LEN;
+
+    let mut buf = BytesMut::with_capacity(capacity);
+
+    build_begin_redo_for_block_msg(&mut buf, tag);
+
+    if let Some(base_img) = base_img.as_ref() {
+        build_push_page_msg(&mut buf, tag, base_img);
+    }
+
+    for record in records {
+        build_apply_record_msg(&mut buf, record.lsn, &record.rec);
+    }
+
+    build_get_page_msg(&mut buf, tag);
+
+    debug_assert_eq!(capacity, buf.len());
+
+    buf
+}
+
+const TAG_LEN: usize = 4 * 4;
+const PAGE_SIZE: usize = 8192;
+const BEGIN_REDO_MSG_LEN: usize = 4 + 1 + TAG_LEN;
+const PUSH_PAGE_MSG_LEN: usize = 4 + 1 + TAG_LEN + PAGE_SIZE;
+const APPLY_MSG_HEADER_LEN: usize = 4 + 8;
+const GET_PAGE_MSG_LEN: usize = 4 + 1 + TAG_LEN;
+
+fn build_begin_redo_for_block_msg(buf: &mut BytesMut, tag: BufferTag) {
+    buf.put_u8(b'B');
+    buf.put_u32(BEGIN_REDO_MSG_LEN as u32);
+
+    // TODO tag is serialized multiple times
+    // let's try to serialize it just once
+    // or make the protocol less repetitive
+    tag.ser_into(&mut buf.writer())
+        .expect("serialize BufferTag should always succeed");
+}
+
+fn build_push_page_msg(buf: &mut BytesMut, tag: BufferTag, base_img: &Bytes) {
+    debug_assert_eq!(base_img.len(), PAGE_SIZE);
+
+    buf.put_u8(b'P');
+    buf.put_u32(PUSH_PAGE_MSG_LEN as u32);
+    tag.ser_into(&mut buf.writer())
+        .expect("serialize BufferTag should always succeed");
+    buf.extend(base_img);
+}
+
+fn build_apply_record_msg(buf: &mut BytesMut, endlsn: Lsn, rec: &Bytes) {
+    buf.put_u8(b'A');
+
+    let len = APPLY_MSG_HEADER_LEN + rec.len();
+    buf.put_u32(len as u32);
+
+    buf.put_u64(endlsn.0);
+    buf.extend(rec);
+}
+
+fn build_get_page_msg(buf: &mut BytesMut, tag: BufferTag) {
+    buf.put_u8(b'G');
+    buf.put_u32(GET_PAGE_MSG_LEN as u32);
+    tag.ser_into(&mut buf.writer())
+        .expect("serialize BufferTag should always succeed");
+}
--- a/test_runner/batch_others/test_branch_behind.py
+++ b/test_runner/batch_others/test_branch_behind.py
@@ -33,11 +33,11 @@ def test_branch_behind(zenith_cli, pageserver: ZenithPageserver, postgres: Postg
    main_cur.execute('''
        INSERT INTO foo
            SELECT 'long string to consume some space' || g
-            FROM generate_series(1, 200000) g
+            FROM generate_series(1, 100000) g
    ''')
    main_cur.execute('SELECT pg_current_wal_insert_lsn()')
    lsn_b = main_cur.fetchone()[0]
-    print('LSN after 200100 rows: ' + lsn_b)
+    print('LSN after 100100 rows: ' + lsn_b)

    # Branch at the point where only 100 rows were inserted
    zenith_cli.run(["branch", "test_branch_behind_hundred", "test_branch_behind@" + lsn_a])
@@ -46,15 +46,15 @@ def test_branch_behind(zenith_cli, pageserver: ZenithPageserver, postgres: Postg
    main_cur.execute('''
        INSERT INTO foo
            SELECT 'long string to consume some space' || g
-            FROM generate_series(1, 200000) g
+            FROM generate_series(1, 100000) g
    ''')
    main_cur.execute('SELECT pg_current_wal_insert_lsn()')

    main_cur.execute('SELECT pg_current_wal_insert_lsn()')
    lsn_c = main_cur.fetchone()[0]
-    print('LSN after 400100 rows: ' + lsn_c)
+    print('LSN after 200100 rows: ' + lsn_c)

-    # Branch at the point where only 200100 rows were inserted
+    # Branch at the point where only 200 rows were inserted
    zenith_cli.run(["branch", "test_branch_behind_more", "test_branch_behind@" + lsn_b])

    pg_hundred = postgres.create_start("test_branch_behind_hundred")
@@ -70,11 +70,11 @@ def test_branch_behind(zenith_cli, pageserver: ZenithPageserver, postgres: Postg
    more_pg_conn = pg_more.connect()
    more_cur = more_pg_conn.cursor()
    more_cur.execute('SELECT count(*) FROM foo')
-    assert more_cur.fetchone() == (200100, )
+    assert more_cur.fetchone() == (100100, )

    # All the rows are visible on the main branch
    main_cur.execute('SELECT count(*) FROM foo')
-    assert main_cur.fetchone() == (400100, )
+    assert main_cur.fetchone() == (200100, )

    # Check bad lsn's for branching

--- a/test_runner/fixtures/benchmark_fixture.py
+++ b/test_runner/fixtures/benchmark_fixture.py
@@ -140,30 +140,6 @@ class ZenithBenchmarker:
                            re.MULTILINE)
        return int(round(float(matches.group(1))))

-    def get_peak_mem(self, pageserver) -> int:
-        """
-        Fetch the "maxrss" metric from the pageserver
-        """
-        # Fetch all the exposed prometheus metrics from page server
-        all_metrics = pageserver.http_client().get_metrics()
-        # See comment in get_io_writes()
-        matches = re.search(r'^pageserver_maxrss_kb (\S+)$', all_metrics,
-                            re.MULTILINE)
-        return int(round(float(matches.group(1))))
-
-    def get_timeline_size(self, repo_dir: str, tenantid: str, timelineid: str):
-        """
-        Calculate the on-disk size of a timeline
-        """
-        path = "{}/tenants/{}/timelines/{}".format(repo_dir, tenantid, timelineid)
-
-        totalbytes = 0
-        for root, dirs, files in os.walk(path):
-            for name in files:
-                totalbytes += os.path.getsize(os.path.join(root, name))
-
-        return totalbytes
-
    @contextmanager
    def record_pageserver_writes(self, pageserver, metric_name):
        """
--- a/test_runner/fixtures/zenith_fixtures.py
+++ b/test_runner/fixtures/zenith_fixtures.py
@@ -794,17 +794,12 @@ def read_pid(path: Path):
    return int(path.read_text())


-@dataclass
-class WalAcceptorPort:
-    pg: int
-    http: int
-
@dataclass
 class WalAcceptor:
    """ An object representing a running wal acceptor daemon. """
    wa_bin_path: Path
    data_dir: Path
-    port: WalAcceptorPort
+    port: int
    num: int # identifier for logging
    pageserver_port: int
    auth_token: Optional[str] = None
@@ -816,8 +811,7 @@ class WalAcceptor:

        cmd = [str(self.wa_bin_path)]
        cmd.extend(["-D", str(self.data_dir)])
-        cmd.extend(["--listen-pg", f"localhost:{self.port.pg}"])
-        cmd.extend(["--listen-http", f"localhost:{self.port.http}"])
+        cmd.extend(["-l", f"localhost:{self.port}"])
        cmd.append("--daemonize")
        cmd.append("--no-sync")
        # Tell page server it can receive WAL from this WAL safekeeper
@@ -874,14 +868,14 @@ class WalAcceptor:

        # "replication=0" hacks psycopg not to send additional queries
        # on startup, see https://github.com/psycopg/psycopg2/pull/482
-        connstr = f"host=localhost port={self.port.pg} replication=0 options='-c ztimelineid={timeline_id} ztenantid={tenant_id}'"
+        connstr = f"host=localhost port={self.port} replication=0 options='-c ztimelineid={timeline_id} ztenantid={tenant_id}'"

        with closing(psycopg2.connect(connstr)) as conn:
            # server doesn't support transactions
            conn.autocommit = True
            with conn.cursor() as cur:
                request_json = json.dumps(request)
-                print(f"JSON_CTRL request on port {self.port.pg}: {request_json}")
+                print(f"JSON_CTRL request on port {self.port}: {request_json}")
                cur.execute("JSON_CTRL " + request_json)
                all = cur.fetchall()
                print(f"JSON_CTRL response: {all[0][0]}")
@@ -904,10 +898,7 @@ class WalAcceptorFactory:
        wa = WalAcceptor(
            wa_bin_path=self.wa_bin_path,
            data_dir=self.data_dir / "wal_acceptor_{}".format(wa_num),
-            port=WalAcceptorPort(
-                pg=self.port_distributor.get_port(),
-                http=self.port_distributor.get_port(),
-            ),
+            port=self.port_distributor.get_port(),
            num=wa_num,
            pageserver_port=self.pageserver_port,
            auth_token=auth_token,
@@ -931,7 +922,7 @@ class WalAcceptorFactory:

    def get_connstrs(self) -> str:
        """ Get list of wal acceptor endpoints suitable for wal_acceptors GUC  """
-        return ','.join(["localhost:{}".format(wa.port.pg) for wa in self.instances])
+        return ','.join(["localhost:{}".format(wa.port) for wa in self.instances])


@zenfixture
--- a/test_runner/performance/test_bulk_insert.py
+++ b/test_runner/performance/test_bulk_insert.py
@@ -4,6 +4,19 @@ from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver

 pytest_plugins = ("fixtures.zenith_fixtures", "fixtures.benchmark_fixture")

+def get_timeline_size(repo_dir: str, tenantid: str, timelineid: str):
+    path = "{}/tenants/{}/timelines/{}".format(repo_dir, tenantid, timelineid)
+
+    totalbytes = 0
+    for root, dirs, files in os.walk(path):
+        for name in files:
+            totalbytes += os.path.getsize(os.path.join(root, name))
+
+        if 'wal' in dirs:
+            dirs.remove('wal')  # don't visit 'wal' subdirectory
+
+    return totalbytes
+
 #
 # Run bulk INSERT test.
 #
@@ -12,7 +25,6 @@ pytest_plugins = ("fixtures.zenith_fixtures", "fixtures.benchmark_fixture")
 # 1. Time to INSERT 5 million rows
 # 2. Disk writes
 # 3. Disk space used
-# 4. Peak memory usage
 #
 def test_bulk_insert(postgres: PostgresFactory, pageserver: ZenithPageserver, pg_bin, zenith_cli, zenbenchmark, repo_dir: str):
    # Create a branch for us
@@ -43,9 +55,6 @@ def test_bulk_insert(postgres: PostgresFactory, pageserver: ZenithPageserver, pg
                    # time and I/O
                    pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")

-            # Record peak memory usage
-            zenbenchmark.record("peak_mem", zenbenchmark.get_peak_mem(pageserver) / 1024, 'MB')
-
            # Report disk space used by the repository
-            timeline_size = zenbenchmark.get_timeline_size(repo_dir, pageserver.initial_tenant, timeline)
+            timeline_size = get_timeline_size(repo_dir, pageserver.initial_tenant, timeline)
            zenbenchmark.record('size', timeline_size / (1024*1024), 'MB')
--- a/test_runner/performance/test_perf_pgbench.py
+++ b/test_runner/performance/test_perf_pgbench.py
@@ -4,6 +4,19 @@ from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver

 pytest_plugins = ("fixtures.zenith_fixtures", "fixtures.benchmark_fixture")

+def get_timeline_size(repo_dir: str, tenantid: str, timelineid: str):
+    path = "{}/tenants/{}/timelines/{}".format(repo_dir, tenantid, timelineid)
+
+    totalbytes = 0
+    for root, dirs, files in os.walk(path):
+        for name in files:
+            totalbytes += os.path.getsize(os.path.join(root, name))
+
+        if 'wal' in dirs:
+            dirs.remove('wal')  # don't visit 'wal' subdirectory
+
+    return totalbytes
+
 #
 # Run a very short pgbench test.
 #
@@ -51,5 +64,5 @@ def test_pgbench(postgres: PostgresFactory, pageserver: ZenithPageserver, pg_bin
    pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")

    # Report disk space used by the repository
-    timeline_size = zenbenchmark.get_timeline_size(repo_dir, pageserver.initial_tenant, timeline)
+    timeline_size = get_timeline_size(repo_dir, pageserver.initial_tenant, timeline)
    zenbenchmark.record('size', timeline_size / (1024*1024), 'MB')
--- a/test_runner/performance/test_write_amplification.py
+++ b/test_runner/performance/test_write_amplification.py
@@ -1,74 +0,0 @@
-# Demonstrate Write Amplification with naive oldest-first layer checkpointing
-# algorithm.
-#
-# In each iteration of the test, we create a new table that's slightly under 10
-# MB in size (10 MB is the current "segment size" used by the page server). Then
-# we make a tiny update to all the tables already created. This creates a WAL
-# pattern where you have a lot of updates on one segment (the newly created
-# one), alternating with a small updates on all relations. This is the worst
-# case scenario for the naive checkpointing policy where we write out the layers
-# in LSN order, writing the oldest layer first. That creates a new 10 MB image
-# layer to be created for each of those small updates.  This is the Write
-# Amplification problem at its finest.
-import os
-from contextlib import closing
-from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
-
-pytest_plugins = ("fixtures.zenith_fixtures", "fixtures.benchmark_fixture")
-
-def test_write_amplification(postgres: PostgresFactory, pageserver: ZenithPageserver, pg_bin, zenith_cli, zenbenchmark, repo_dir: str):
-    # Create a branch for us
-    zenith_cli.run(["branch", "test_write_amplification", "empty"])
-
-    pg = postgres.create_start('test_write_amplification')
-    print("postgres is running on 'test_write_amplification' branch")
-
-    # Open a connection directly to the page server that we'll use to force
-    # flushing the layers to disk
-    psconn = pageserver.connect();
-    pscur = psconn.cursor()
-
-    with closing(pg.connect()) as conn:
-        with conn.cursor() as cur:
-            # Get the timeline ID of our branch. We need it for the 'do_gc' command
-            cur.execute("SHOW zenith.zenith_timeline")
-            timeline = cur.fetchone()[0]
-
-            with zenbenchmark.record_pageserver_writes(pageserver, 'pageserver_writes'):
-                with zenbenchmark.record_duration('run'):
-
-                    # NOTE: Because each iteration updates every table already created,
-                    # the runtime and write amplification is O(n^2), where n is the
-                    # number of iterations.
-                    for i in range(25):
-                        cur.execute(f'''
-                        CREATE TABLE tbl{i} AS
-                            SELECT g as i, 'long string to consume some space' || g as t
-                            FROM generate_series(1, 100000) g
-                        ''')
-                        cur.execute(f"create index on tbl{i} (i);")
-                        for j in range(1, i):
-                            cur.execute(f"delete from tbl{j} where i = {i}")
-
-                        # Force checkpointing. As of this writing, we don't have
-                        # a back-pressure mechanism, and the page server cannot
-                        # keep up digesting and checkpointing the WAL at the
-                        # rate that it is generated. If we don't force a
-                        # checkpoint, the WAL will just accumulate in memory
-                        # until you hit OOM error. So in effect, we use much
-                        # more memory to hold the incoming WAL, and write them
-                        # out in larger batches than we'd really want. Using
-                        # more memory hides the write amplification problem this
-                        # test tries to demonstrate.
-                        #
-                        # The write amplification problem is real, and using
-                        # more memory isn't the right solution. We could
-                        # demonstrate the effect also by generating the WAL
-                        # slower, adding some delays in this loop.  But forcing
-                        # the the checkpointing and GC makes the test go faster,
-                        # with the same total I/O effect.
-                        pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
-
-            # Report disk space used by the repository
-            timeline_size = zenbenchmark.get_timeline_size(repo_dir, pageserver.initial_tenant, timeline)
-            zenbenchmark.record('size', timeline_size / (1024*1024), 'MB')
--- a/vendor/postgres
+++ b/vendor/postgres
--- a/walkeeper/Cargo.toml
+++ b/walkeeper/Cargo.toml
@@ -28,11 +28,9 @@ humantime = "2.1.0"
 walkdir = "2"
 serde = { version = "1.0", features = ["derive"] }
 hex = "0.4.3"
-const_format = "0.2.21"

 # FIXME: 'pageserver' is needed for ZTimelineId. Refactor
 pageserver = { path = "../pageserver" }
 postgres_ffi = { path = "../postgres_ffi" }
 workspace_hack = { path = "../workspace_hack" }
-zenith_metrics = { path = "../zenith_metrics" }
 zenith_utils = { path = "../zenith_utils" }
--- a/walkeeper/src/bin/wal_acceptor.rs
+++ b/walkeeper/src/bin/wal_acceptor.rs
@@ -3,23 +3,18 @@
 //
 use anyhow::Result;
 use clap::{App, Arg};
-use const_format::formatcp;
 use daemonize::Daemonize;
 use log::*;
 use std::env;
-use std::net::TcpListener;
 use std::path::{Path, PathBuf};
 use std::thread;
-use zenith_utils::http::endpoint;
 use zenith_utils::logging;

-use walkeeper::defaults::{DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_PG_LISTEN_ADDR};
 use walkeeper::s3_offload;
 use walkeeper::wal_service;
 use walkeeper::WalAcceptorConf;

 fn main() -> Result<()> {
-    zenith_metrics::set_common_metrics_prefix("safekeeper");
    let arg_matches = App::new("Zenith wal_acceptor")
        .about("Store WAL stream to local file system and push it to WAL receivers")
        .arg(
@@ -30,18 +25,11 @@ fn main() -> Result<()> {
                .help("Path to the WAL acceptor data directory"),
        )
        .arg(
-            Arg::with_name("listen-pg")
+            Arg::with_name("listen")
                .short("l")
-                .long("listen-pg")
-                .alias("listen") // for compatibility
+                .long("listen")
                .takes_value(true)
-                .help(formatcp!("listen for incoming WAL data connections on ip:port (default: {DEFAULT_PG_LISTEN_ADDR})")),
-        )
-        .arg(
-            Arg::with_name("listen-http")
-                .long("listen-http")
-                .takes_value(true)
-                .help(formatcp!("http endpoint address for metrics on ip:port (default: {DEFAULT_HTTP_LISTEN_ADDR})")),
+                .help("listen for incoming connections on ip:port (default: 127.0.0.1:5454)"),
        )
        .arg(
            Arg::with_name("pageserver")
@@ -82,8 +70,7 @@ fn main() -> Result<()> {
        daemonize: false,
        no_sync: false,
        pageserver_addr: None,
-        listen_pg_addr: DEFAULT_PG_LISTEN_ADDR.to_string(),
-        listen_http_addr: DEFAULT_HTTP_LISTEN_ADDR.to_string(),
+        listen_addr: "localhost:5454".to_string(),
        ttl: None,
        recall_period: None,
        pageserver_auth_token: env::var("PAGESERVER_AUTH_TOKEN").ok(),
@@ -104,12 +91,8 @@ fn main() -> Result<()> {
        conf.daemonize = true;
    }

-    if let Some(addr) = arg_matches.value_of("listen-pg") {
-        conf.listen_pg_addr = addr.to_owned();
-    }
-
-    if let Some(addr) = arg_matches.value_of("listen-http") {
-        conf.listen_http_addr = addr.to_owned();
+    if let Some(addr) = arg_matches.value_of("listen") {
+        conf.listen_addr = addr.to_owned();
    }

    if let Some(addr) = arg_matches.value_of("pageserver") {
@@ -131,11 +114,6 @@ fn start_wal_acceptor(conf: WalAcceptorConf) -> Result<()> {
    let log_filename = conf.data_dir.join("wal_acceptor.log");
    let (_scope_guard, log_file) = logging::init(log_filename, conf.daemonize)?;

-    let http_listener = TcpListener::bind(conf.listen_http_addr.clone()).map_err(|e| {
-        error!("failed to bind to address {}: {}", conf.listen_http_addr, e);
-        e
-    })?;
-
    if conf.daemonize {
        info!("daemonizing...");

@@ -158,16 +136,6 @@ fn start_wal_acceptor(conf: WalAcceptorConf) -> Result<()> {

    let mut threads = Vec::new();

-    let http_endpoint_thread = thread::Builder::new()
-        .name("http_endpoint_thread".into())
-        .spawn(move || {
-            // No authentication at all: read-only metrics only, early stage.
-            let router = endpoint::make_router();
-            endpoint::serve_thread_main(router, http_listener).unwrap();
-        })
-        .unwrap();
-    threads.push(http_endpoint_thread);
-
    if conf.ttl.is_some() {
        let s3_conf = conf.clone();
        let s3_offload_thread = thread::Builder::new()
--- a/walkeeper/src/lib.rs
+++ b/walkeeper/src/lib.rs
@@ -11,23 +11,12 @@ pub mod send_wal;
 pub mod timeline;
 pub mod wal_service;

-pub mod defaults {
-    use const_format::formatcp;
-
-    pub const DEFAULT_PG_LISTEN_PORT: u16 = 5454;
-    pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");
-
-    pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 7676;
-    pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}");
-}
-
 #[derive(Debug, Clone)]
 pub struct WalAcceptorConf {
    pub data_dir: PathBuf,
    pub daemonize: bool,
    pub no_sync: bool,
-    pub listen_pg_addr: String,
-    pub listen_http_addr: String,
+    pub listen_addr: String,
    pub pageserver_addr: Option<String>,
    // TODO (create issue) this is temporary, until protocol between PG<->SK<->PS rework
    pub pageserver_auth_token: Option<String>,
--- a/walkeeper/src/receive_wal.rs
+++ b/walkeeper/src/receive_wal.rs
@@ -42,7 +42,7 @@ fn request_callback(conf: WalAcceptorConf, timelineid: ZTimelineId, tenantid: ZT
    );

    // use Config parsing because SockAddr parsing doesnt allow to use host names instead of ip addresses
-    let me_connstr = format!("postgresql://no_user@{}/no_db", conf.listen_pg_addr);
+    let me_connstr = format!("postgresql://no_user@{}/no_db", conf.listen_addr);
    let me_conf: Config = me_connstr.parse().unwrap();
    let (host, port) = connection_host_port(&me_conf);
    let callme = format!(
--- a/walkeeper/src/safekeeper.rs
+++ b/walkeeper/src/safekeeper.rs
@@ -15,11 +15,8 @@ use std::cmp::min;
 use std::io;
 use std::io::Read;

-use lazy_static::lazy_static;
-
 use crate::replication::HotStandbyFeedback;
 use postgres_ffi::xlog_utils::MAX_SEND_SIZE;
-use zenith_metrics::{register_gauge_vec, Gauge, GaugeVec};
 use zenith_utils::bin_ser::LeSer;
 use zenith_utils::lsn::Lsn;
 use zenith_utils::pq_proto::SystemId;
@@ -282,38 +279,6 @@ pub trait Storage {
    fn write_wal(&mut self, server: &ServerInfo, startpos: Lsn, buf: &[u8]) -> Result<()>;
 }

-lazy_static! {
-    // The prometheus crate does not support u64 yet, i64 only (see `IntGauge`).
-    // i64 is faster than f64, so update to u64 when available.
-    static ref FLUSH_LSN_GAUGE: GaugeVec = register_gauge_vec!(
-        "safekeeper_flush_lsn",
-        "Current flush_lsn, grouped by timeline",
-        &["ztli"]
-    )
-    .expect("Failed to register safekeeper_flush_lsn int gauge vec");
-    static ref COMMIT_LSN_GAUGE: GaugeVec = register_gauge_vec!(
-        "safekeeper_commit_lsn",
-        "Current commit_lsn (not necessarily persisted to disk), grouped by timeline",
-        &["ztli"]
-    )
-    .expect("Failed to register safekeeper_commit_lsn int gauge vec");
-}
-
-struct SafeKeeperMetrics {
-    flush_lsn: Gauge,
-    commit_lsn: Gauge,
-}
-
-impl SafeKeeperMetrics {
-    fn new(ztli: ZTimelineId) -> SafeKeeperMetrics {
-        let ztli_str = format!("{}", ztli);
-        SafeKeeperMetrics {
-            flush_lsn: FLUSH_LSN_GAUGE.with_label_values(&[&ztli_str]),
-            commit_lsn: COMMIT_LSN_GAUGE.with_label_values(&[&ztli_str]),
-        }
-    }
-}
-
 /// SafeKeeper which consumes events (messages from compute) and provides
 /// replies.
 pub struct SafeKeeper<ST: Storage> {
@@ -321,8 +286,6 @@ pub struct SafeKeeper<ST: Storage> {
    /// Established by reading wal.
    pub flush_lsn: Lsn,
    pub tli: u32,
-    // Cached metrics so we don't have to recompute labels on each update.
-    metrics: Option<SafeKeeperMetrics>,
    /// not-yet-flushed pairs of same named fields in s.*
    pub commit_lsn: Lsn,
    pub truncate_lsn: Lsn,
@@ -341,7 +304,6 @@ where
        SafeKeeper {
            flush_lsn,
            tli,
-            metrics: None,
            commit_lsn: state.commit_lsn,
            truncate_lsn: state.truncate_lsn,
            storage,
@@ -393,8 +355,6 @@ where
        self.s.server.wal_seg_size = msg.wal_seg_size;
        self.storage.persist(&self.s, true)?;

-        self.metrics = Some(SafeKeeperMetrics::new(self.s.server.ztli));
-
        info!(
            "processed greeting from proposer {:?}, sending term {:?}",
            msg.proposer_id, self.s.acceptor_state.term
@@ -518,11 +478,6 @@ where
        }
        if last_rec_lsn > self.flush_lsn {
            self.flush_lsn = last_rec_lsn;
-            self.metrics
-                .as_ref()
-                .unwrap()
-                .flush_lsn
-                .set(u64::from(self.flush_lsn) as f64);
        }

        // Advance commit_lsn taking into account what we have locally. xxx this
@@ -540,11 +495,6 @@ where
            sync_control_file |=
                commit_lsn >= msg.h.epoch_start_lsn && self.s.commit_lsn < msg.h.epoch_start_lsn;
            self.commit_lsn = commit_lsn;
-            self.metrics
-                .as_ref()
-                .unwrap()
-                .commit_lsn
-                .set(u64::from(self.commit_lsn) as f64);
        }

        self.truncate_lsn = msg.h.truncate_lsn;
--- a/walkeeper/src/wal_service.rs
+++ b/walkeeper/src/wal_service.rs
@@ -13,9 +13,9 @@ use zenith_utils::postgres_backend::{AuthType, PostgresBackend};

 /// Accept incoming TCP connections and spawn them into a background thread.
 pub fn thread_main(conf: WalAcceptorConf) -> Result<()> {
-    info!("Starting wal acceptor on {}", conf.listen_pg_addr);
-    let listener = TcpListener::bind(conf.listen_pg_addr.clone()).map_err(|e| {
-        error!("failed to bind to address {}: {}", conf.listen_pg_addr, e);
+    info!("Starting wal acceptor on {}", conf.listen_addr);
+    let listener = TcpListener::bind(conf.listen_addr.clone()).map_err(|e| {
+        error!("failed to bind to address {}: {}", conf.listen_addr, e);
        e
    })?;

--- a/zenith_metrics/src/lib.rs
+++ b/zenith_metrics/src/lib.rs
@@ -5,8 +5,6 @@
 use lazy_static::lazy_static;
 use once_cell::race::OnceBox;
 pub use prometheus::{exponential_buckets, linear_buckets};
-pub use prometheus::{register_gauge, Gauge};
-pub use prometheus::{register_gauge_vec, GaugeVec};
 pub use prometheus::{register_histogram, Histogram};
 pub use prometheus::{register_histogram_vec, HistogramVec};
 pub use prometheus::{register_int_counter, IntCounter};
@@ -23,7 +21,7 @@ pub use wrappers::{CountedReader, CountedWriter};
 /// Metrics gathering is a relatively simple and standalone operation, so
 /// it might be fine to do it this way to keep things simple.
 pub fn gather() -> Vec<prometheus::proto::MetricFamily> {
-    update_rusage_metrics();
+    update_io_metrics();
    prometheus::gather()
 }

@@ -33,29 +31,16 @@ static COMMON_METRICS_PREFIX: OnceBox<&str> = OnceBox::new();
 /// name like 'pageserver'. Should be executed exactly once in the beginning of
 /// any executable which uses common metrics.
 pub fn set_common_metrics_prefix(prefix: &'static str) {
-    // Not unwrap() because metrics may be initialized after multiple threads have been started.
-    COMMON_METRICS_PREFIX
-        .set(prefix.into())
-        .unwrap_or_else(|_| {
-            eprintln!(
-                "set_common_metrics_prefix() was called second time with '{}', exiting",
-                prefix
-            );
-            std::process::exit(1);
-        });
+    COMMON_METRICS_PREFIX.set(prefix.into()).unwrap();
 }

 /// Prepends a prefix to a common metric name so they are distinguished between
 /// different services, see https://github.com/zenithdb/zenith/pull/681
 /// A call to set_common_metrics_prefix() is necessary prior to calling this.
 pub fn new_common_metric_name(unprefixed_metric_name: &str) -> String {
-    // Not unwrap() because metrics may be initialized after multiple threads have been started.
    format!(
        "{}_{}",
-        COMMON_METRICS_PREFIX.get().unwrap_or_else(|| {
-            eprintln!("set_common_metrics_prefix() was not called, but metrics are used, exiting");
-            std::process::exit(1);
-        }),
+        COMMON_METRICS_PREFIX.get().unwrap(),
        unprefixed_metric_name
    )
 }
@@ -67,11 +52,6 @@ lazy_static! {
        &["io_operation"]
    )
    .expect("Failed to register disk i/o bytes int gauge vec");
-    static ref MAXRSS_KB: IntGauge = register_int_gauge!(
-        new_common_metric_name("maxrss_kb"),
-        "Memory usage (Maximum Resident Set Size)"
-    )
-    .expect("Failed to register maxrss_kb int gauge");
 }

 // Records I/O stats in a "cross-platform" way.
@@ -83,7 +63,7 @@ lazy_static! {
 // We know the size of the block, so we can determine the I/O bytes out of it.
 // The value might be not 100% exact, but should be fine for Prometheus metrics in this case.
 #[allow(clippy::unnecessary_cast)]
-fn update_rusage_metrics() {
+fn update_io_metrics() {
    let rusage_stats = get_rusage_stats();

    const BYTES_IN_BLOCK: i64 = 512;
@@ -93,7 +73,6 @@ fn update_rusage_metrics() {
    DISK_IO_BYTES
        .with_label_values(&["write"])
        .set(rusage_stats.ru_oublock * BYTES_IN_BLOCK);
-    MAXRSS_KB.set(rusage_stats.ru_maxrss);
 }

 fn get_rusage_stats() -> libc::rusage {
--- a/zenith_utils/src/lib.rs
+++ b/zenith_utils/src/lib.rs
@@ -8,9 +8,6 @@ pub mod lsn;
 /// SeqWait allows waiting for a future sequence number to arrive
 pub mod seqwait;

-/// append only ordered map implemented with a Vec
-pub mod vec_map;
-
 // Async version of SeqWait. Currently unused.
 // pub mod seqwait_async;

--- a/zenith_utils/src/vec_map.rs
+++ b/zenith_utils/src/vec_map.rs
@@ -1,293 +0,0 @@
-use std::{cmp::Ordering, ops::RangeBounds};
-
-use serde::{Deserialize, Serialize};
-
-/// Ordered map datastructure implemented in a Vec.
-/// Append only - can only add keys that are larger than the
-/// current max key.
-#[derive(Clone, Debug, Serialize, Deserialize)]
-pub struct VecMap<K, V>(Vec<(K, V)>);
-
-impl<K, V> Default for VecMap<K, V> {
-    fn default() -> Self {
-        VecMap(Default::default())
-    }
-}
-
-#[derive(Debug)]
-pub struct InvalidKey;
-
-impl<K: Ord, V> VecMap<K, V> {
-    pub fn is_empty(&self) -> bool {
-        self.0.is_empty()
-    }
-
-    pub fn as_slice(&self) -> &[(K, V)] {
-        self.0.as_slice()
-    }
-
-    /// This function may panic if given a range where the lower bound is
-    /// greater than the upper bound.
-    pub fn slice_range<R: RangeBounds<K>>(&self, range: R) -> &[(K, V)] {
-        use std::ops::Bound::*;
-
-        let binary_search = |k: &K| self.0.binary_search_by_key(&k, extract_key);
-
-        let start_idx = match range.start_bound() {
-            Unbounded => 0,
-            Included(k) => binary_search(k).unwrap_or_else(std::convert::identity),
-            Excluded(k) => match binary_search(k) {
-                Ok(idx) => idx + 1,
-                Err(idx) => idx,
-            },
-        };
-
-        let end_idx = match range.end_bound() {
-            Unbounded => self.0.len(),
-            Included(k) => match binary_search(k) {
-                Ok(idx) => idx + 1,
-                Err(idx) => idx,
-            },
-            Excluded(k) => binary_search(k).unwrap_or_else(std::convert::identity),
-        };
-
-        &self.0[start_idx..end_idx]
-    }
-
-    /// Add a key value pair to the map.
-    /// If [`key`] is less than or equal to the current maximum key
-    /// the pair will not be added and InvalidKey error will be returned.
-    pub fn append(&mut self, key: K, value: V) -> Result<(), InvalidKey> {
-        if let Some((last_key, _last_value)) = self.0.last() {
-            if &key <= last_key {
-                return Err(InvalidKey);
-            }
-        }
-
-        self.0.push((key, value));
-        Ok(())
-    }
-
-    /// Update the maximum key value pair or add a new key value pair to the map.
-    /// If [`key`] is less than the current maximum key no updates or additions
-    /// will occur and InvalidKey error will be returned.
-    pub fn append_or_update_last(&mut self, key: K, mut value: V) -> Result<Option<V>, InvalidKey> {
-        if let Some((last_key, last_value)) = self.0.last_mut() {
-            match key.cmp(last_key) {
-                Ordering::Less => return Err(InvalidKey),
-                Ordering::Equal => {
-                    std::mem::swap(last_value, &mut value);
-                    return Ok(Some(value));
-                }
-                Ordering::Greater => {}
-            }
-        }
-
-        self.0.push((key, value));
-        Ok(None)
-    }
-
-    /// Split the map into two.
-    ///
-    /// The left map contains everything before [`cutoff`] (exclusive).
-    /// Right map contains [`cutoff`] and everything after (inclusive).
-    pub fn split_at(&self, cutoff: &K) -> (Self, Self)
-    where
-        K: Clone,
-        V: Clone,
-    {
-        let split_idx = self
-            .0
-            .binary_search_by_key(&cutoff, extract_key)
-            .unwrap_or_else(std::convert::identity);
-
-        (
-            VecMap(self.0[..split_idx].to_vec()),
-            VecMap(self.0[split_idx..].to_vec()),
-        )
-    }
-
-    /// Move items from [`other`] to the end of [`self`], leaving [`other`] empty.
-    /// If any keys in [`other`] is less than or equal to any key in [`self`],
-    /// [`InvalidKey`] error will be returned and no mutation will occur.
-    pub fn extend(&mut self, other: &mut Self) -> Result<(), InvalidKey> {
-        let self_last_opt = self.0.last().map(extract_key);
-        let other_first_opt = other.0.last().map(extract_key);
-
-        if let (Some(self_last), Some(other_first)) = (self_last_opt, other_first_opt) {
-            if self_last >= other_first {
-                return Err(InvalidKey);
-            }
-        }
-
-        self.0.append(&mut other.0);
-
-        Ok(())
-    }
-}
-
-fn extract_key<K, V>(entry: &(K, V)) -> &K {
-    &entry.0
-}
-
-#[cfg(test)]
-mod tests {
-    use std::{collections::BTreeMap, ops::Bound};
-
-    use super::VecMap;
-
-    #[test]
-    fn unbounded_range() {
-        let mut vec = VecMap::default();
-        vec.append(0, ()).unwrap();
-
-        assert_eq!(vec.slice_range(0..0), &[]);
-    }
-
-    #[test]
-    #[should_panic]
-    fn invalid_ordering_range() {
-        let mut vec = VecMap::default();
-        vec.append(0, ()).unwrap();
-
-        #[allow(clippy::reversed_empty_ranges)]
-        vec.slice_range(1..0);
-    }
-
-    #[test]
-    fn range_tests() {
-        let mut vec = VecMap::default();
-        vec.append(0, ()).unwrap();
-        vec.append(2, ()).unwrap();
-        vec.append(4, ()).unwrap();
-
-        assert_eq!(vec.slice_range(0..0), &[]);
-        assert_eq!(vec.slice_range(0..1), &[(0, ())]);
-        assert_eq!(vec.slice_range(0..2), &[(0, ())]);
-        assert_eq!(vec.slice_range(0..3), &[(0, ()), (2, ())]);
-
-        assert_eq!(vec.slice_range(..0), &[]);
-        assert_eq!(vec.slice_range(..1), &[(0, ())]);
-
-        assert_eq!(vec.slice_range(..3), &[(0, ()), (2, ())]);
-        assert_eq!(vec.slice_range(..3), &[(0, ()), (2, ())]);
-
-        assert_eq!(vec.slice_range(0..=0), &[(0, ())]);
-        assert_eq!(vec.slice_range(0..=1), &[(0, ())]);
-        assert_eq!(vec.slice_range(0..=2), &[(0, ()), (2, ())]);
-        assert_eq!(vec.slice_range(0..=3), &[(0, ()), (2, ())]);
-
-        assert_eq!(vec.slice_range(..=0), &[(0, ())]);
-        assert_eq!(vec.slice_range(..=1), &[(0, ())]);
-        assert_eq!(vec.slice_range(..=2), &[(0, ()), (2, ())]);
-        assert_eq!(vec.slice_range(..=3), &[(0, ()), (2, ())]);
-    }
-
-    struct BoundIter {
-        min: i32,
-        max: i32,
-
-        next: Option<Bound<i32>>,
-    }
-
-    impl BoundIter {
-        fn new(min: i32, max: i32) -> Self {
-            Self {
-                min,
-                max,
-
-                next: Some(Bound::Unbounded),
-            }
-        }
-    }
-
-    impl Iterator for BoundIter {
-        type Item = Bound<i32>;
-
-        fn next(&mut self) -> Option<Self::Item> {
-            let cur = self.next?;
-
-            self.next = match &cur {
-                Bound::Unbounded => Some(Bound::Included(self.min)),
-                Bound::Included(x) => {
-                    if *x >= self.max {
-                        Some(Bound::Excluded(self.min))
-                    } else {
-                        Some(Bound::Included(x + 1))
-                    }
-                }
-                Bound::Excluded(x) => {
-                    if *x >= self.max {
-                        None
-                    } else {
-                        Some(Bound::Excluded(x + 1))
-                    }
-                }
-            };
-
-            Some(cur)
-        }
-    }
-
-    #[test]
-    fn range_exhaustive() {
-        let map: BTreeMap<i32, ()> = (1..=7).step_by(2).map(|x| (x, ())).collect();
-        let mut vec = VecMap::default();
-        for &key in map.keys() {
-            vec.append(key, ()).unwrap();
-        }
-
-        const RANGE_MIN: i32 = 0;
-        const RANGE_MAX: i32 = 8;
-        for lower_bound in BoundIter::new(RANGE_MIN, RANGE_MAX) {
-            let ub_min = match lower_bound {
-                Bound::Unbounded => RANGE_MIN,
-                Bound::Included(x) => x,
-                Bound::Excluded(x) => x + 1,
-            };
-            for upper_bound in BoundIter::new(ub_min, RANGE_MAX) {
-                let map_range: Vec<(i32, ())> = map
-                    .range((lower_bound, upper_bound))
-                    .map(|(&x, _)| (x, ()))
-                    .collect();
-                let vec_slice = vec.slice_range((lower_bound, upper_bound));
-
-                assert_eq!(map_range, vec_slice);
-            }
-        }
-    }
-
-    #[test]
-    fn extend() {
-        let mut left = VecMap::default();
-        left.append(0, ()).unwrap();
-        assert_eq!(left.as_slice(), &[(0, ())]);
-
-        let mut empty = VecMap::default();
-        left.extend(&mut empty).unwrap();
-        assert_eq!(left.as_slice(), &[(0, ())]);
-        assert_eq!(empty.as_slice(), &[]);
-
-        let mut right = VecMap::default();
-        right.append(1, ()).unwrap();
-
-        left.extend(&mut right).unwrap();
-
-        assert_eq!(left.as_slice(), &[(0, ()), (1, ())]);
-        assert_eq!(right.as_slice(), &[]);
-
-        let mut zero_map = VecMap::default();
-        zero_map.append(0, ()).unwrap();
-
-        left.extend(&mut zero_map).unwrap_err();
-        assert_eq!(left.as_slice(), &[(0, ()), (1, ())]);
-        assert_eq!(zero_map.as_slice(), &[(0, ())]);
-
-        let mut one_map = VecMap::default();
-        one_map.append(1, ()).unwrap();
-
-        left.extend(&mut one_map).unwrap_err();
-        assert_eq!(left.as_slice(), &[(0, ()), (1, ())]);
-        assert_eq!(one_map.as_slice(), &[(1, ())]);
-    }
-}
Author	SHA1	Message	Date
Patrick Insinger	17ca2d51d2	pageserver - walredo nits	2021-10-05 23:47:37 -07:00
Patrick Insinger	e72e14e6cf	pageserver - walredo request serialization module	2021-10-05 23:45:20 -07:00
Patrick Insinger	f0f6daad23	pageserver - reduce # of copies in walredo request	2021-10-05 23:36:02 -07:00
Patrick Insinger	8f71bfe8f9	walredo - move nonrel redo logic into function	2021-10-05 23:33:47 -07:00
Patrick Insinger	f644009b5c	pageserver - move walredo nonrel logic to module	2021-10-05 23:26:01 -07:00
Patrick Insinger	669a939fff	pageserver - move walredo to folder	2021-10-05 23:14:39 -07:00