cargo fmt

Bump vendor/postgres
Merge with main
2026-02-07 04:30:36 +00:00 · 2021-10-20 14:01:00 +03:00 · 2021-10-20 13:59:42 +03:00 · 2021-10-20 13:19:50 +03:00 · 2021-10-19 17:42:29 +03:00 · 2021-10-19 12:01:24 +03:00
31 changed files with 2702 additions and 3238 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -180,9 +180,9 @@ dependencies = [

 [[package]]
 name = "bitflags"
-version = "1.3.2"
+version = "1.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
+checksum = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693"

 [[package]]
 name = "bitvec"
@@ -338,7 +338,7 @@ dependencies = [
 "bytes",
 "hex",
 "lazy_static",
- "nix 0.20.0",
+ "nix",
 "pageserver",
 "postgres",
 "postgres_ffi",
@@ -886,9 +886,9 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"

 [[package]]
 name = "libc"
-version = "0.2.109"
+version = "0.2.101"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f98a04dce437184842841303488f70d0188c5f51437d2a834dc097eafa909a01"
+checksum = "3cb00336871be5ed2c8ed44b60ae9959dc5b9f08539422ed43f09e34ecaeba21"

 [[package]]
 name = "libloading"
@@ -902,9 +902,9 @@ dependencies = [

 [[package]]
 name = "lock_api"
-version = "0.4.5"
+version = "0.4.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "712a4d093c9976e24e7dbca41db895dabcbac38eb5f4045393d17a95bdfb1109"
+checksum = "0382880606dff6d15c9476c416d18690b72742aa7b605bb6dd6ec9030fbf07eb"
 dependencies = [
 "scopeguard",
 ]
@@ -918,15 +918,6 @@ dependencies = [
 "cfg-if 1.0.0",
 ]

-[[package]]
-name = "lz4_flex"
-version = "0.9.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "177c079243f6867429aca5af5053747f57e329d44f0c58bebca078cd14873ec2"
-dependencies = [
- "twox-hash",
-]
-
 [[package]]
 name = "matchers"
 version = "0.0.1"
@@ -1052,19 +1043,6 @@ dependencies = [
 "libc",
 ]

-[[package]]
-name = "nix"
-version = "0.23.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f305c2c2e4c39a82f7bf0bf65fb557f9070ce06781d4f2454295cc34b1c43188"
-dependencies = [
- "bitflags",
- "cc",
- "cfg-if 1.0.0",
- "libc",
- "memoffset",
-]
-
 [[package]]
 name = "nom"
 version = "6.1.2"
@@ -1202,9 +1180,6 @@ dependencies = [
 "hyper",
 "lazy_static",
 "log",
- "lz4_flex",
- "nix 0.23.0",
- "parking_lot",
 "postgres",
 "postgres-protocol",
 "postgres-types",
@@ -1223,16 +1198,15 @@ dependencies = [
 "toml",
 "tracing",
 "workspace_hack",
- "yakv",
 "zenith_metrics",
 "zenith_utils",
 ]

 [[package]]
 name = "parking_lot"
-version = "0.11.2"
+version = "0.11.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7d17b78036a60663b797adeaee46f5c9dfebb86948d1255007a1d6be0271ff99"
+checksum = "6d7744ac029df22dca6284efe4e898991d28e3085c706c972bcd7da4a27a15eb"
 dependencies = [
 "instant",
 "lock_api",
@@ -1241,9 +1215,9 @@ dependencies = [

 [[package]]
 name = "parking_lot_core"
-version = "0.8.5"
+version = "0.8.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d76e8e1493bcac0d2766c42737f34458f1c8c50c0d23bcb24ea953affb273216"
+checksum = "fa7a782938e745763fe6907fc6ba86946d72f49fe7e21de074e08128a99fb018"
 dependencies = [
 "cfg-if 1.0.0",
 "instant",
@@ -1934,12 +1908,6 @@ version = "0.5.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d"

-[[package]]
-name = "static_assertions"
-version = "1.1.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f"
-
 [[package]]
 name = "stringprep"
 version = "0.1.2"
@@ -2260,16 +2228,6 @@ version = "0.2.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "59547bce71d9c38b83d9c0e92b6066c4253371f15005def0c30d9657f50c7642"

-[[package]]
-name = "twox-hash"
-version = "1.6.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1f559b464de2e2bdabcac6a210d12e9b5a5973c251e102c44c585c71d51bd78e"
-dependencies = [
- "cfg-if 1.0.0",
- "static_assertions",
-]
-
 [[package]]
 name = "typenum"
 version = "1.13.0"
@@ -2584,17 +2542,6 @@ version = "0.8.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b07db065a5cf61a7e4ba64f29e67db906fb1787316516c4e6e5ff0fea1efcd8a"

-[[package]]
-name = "yakv"
-version = "0.2.7"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "17eba1abb31dda774cd901a9692a47aac716050975a30993c79826a08de47a34"
-dependencies = [
- "anyhow",
- "fs2",
- "parking_lot",
-]
-
 [[package]]
 name = "zenith"
 version = "0.1.0"
@@ -2634,7 +2581,6 @@ dependencies = [
 "jsonwebtoken",
 "lazy_static",
 "log",
- "nix 0.23.0",
 "postgres",
 "rand",
 "routerify",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -15,7 +15,3 @@ members = [
 # This is useful for profiling and, to some extent, debug.
 # Besides, debug info should not affect the performance.
 debug = true
-panic = 'abort'
-
-[profile.dev]
-panic = 'abort'
--- a/control_plane/src/compute.rs
+++ b/control_plane/src/compute.rs
@@ -298,10 +298,9 @@ impl PostgresNode {
        conf.append("max_replication_slots", "10");
        conf.append("hot_standby", "on");
        conf.append("shared_buffers", "1MB");
-        conf.append("max_wal_size", "100GB");
        conf.append("fsync", "off");
        conf.append("max_connections", "100");
-        conf.append("wal_sender_timeout", "0");
+        conf.append("wal_sender_timeout", "10s");
        conf.append("wal_level", "replica");
        conf.append("listen_addresses", &self.address.ip().to_string());
        conf.append("port", &self.address.port().to_string());
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -37,16 +37,11 @@ async-trait = "0.1"
 const_format = "0.2.21"
 tracing = "0.1.27"
 signal-hook = {version = "0.3.10", features = ["extended-siginfo"] }
-nix = "0.23"
-#yakv = { path  = "../../yakv" }
-yakv = "0.2.7"
-lz4_flex = "0.9.0"

 postgres_ffi = { path = "../postgres_ffi" }
 zenith_metrics = { path = "../zenith_metrics" }
 zenith_utils = { path = "../zenith_utils" }
 workspace_hack = { path = "../workspace_hack" }
-parking_lot = "0.11.2"

 [dev-dependencies]
 hex-literal = "0.3"
--- a/pageserver/src/bin/dump_layerfile.rs
+++ b/pageserver/src/bin/dump_layerfile.rs
@@ -0,0 +1,25 @@
+//! Main entry point for the dump_layerfile executable
+//!
+//! A handy tool for debugging, that's all.
+use anyhow::Result;
+use clap::{App, Arg};
+use pageserver::layered_repository::dump_layerfile_from_path;
+use std::path::PathBuf;
+
+fn main() -> Result<()> {
+    let arg_matches = App::new("Zenith dump_layerfile utility")
+        .about("Dump contents of one layer file, for debugging")
+        .arg(
+            Arg::with_name("path")
+                .help("Path to file to dump")
+                .required(true)
+                .index(1),
+        )
+        .get_matches();
+
+    let path = PathBuf::from(arg_matches.value_of("path").unwrap());
+
+    dump_layerfile_from_path(&path)?;
+
+    Ok(())
+}
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -42,9 +42,6 @@ struct CfgFileParams {
    listen_http_addr: Option<String>,
    checkpoint_distance: Option<String>,
    checkpoint_period: Option<String>,
-    upload_distance: Option<String>,
-    upload_period: Option<String>,
-    reconstruct_threshold: Option<String>,
    gc_horizon: Option<String>,
    gc_period: Option<String>,
    pg_distrib_dir: Option<String>,
@@ -106,9 +103,6 @@ impl CfgFileParams {
            listen_http_addr: get_arg("listen-http"),
            checkpoint_distance: get_arg("checkpoint_distance"),
            checkpoint_period: get_arg("checkpoint_period"),
-            upload_distance: get_arg("upload_distance"),
-            upload_period: get_arg("upload_period"),
-            reconstruct_threshold: get_arg("reconstruct_threshold"),
            gc_horizon: get_arg("gc_horizon"),
            gc_period: get_arg("gc_period"),
            pg_distrib_dir: get_arg("postgres-distrib"),
@@ -127,9 +121,6 @@ impl CfgFileParams {
            listen_http_addr: self.listen_http_addr.or(other.listen_http_addr),
            checkpoint_distance: self.checkpoint_distance.or(other.checkpoint_distance),
            checkpoint_period: self.checkpoint_period.or(other.checkpoint_period),
-            upload_distance: self.upload_distance.or(other.upload_distance),
-            upload_period: self.upload_period.or(other.upload_period),
-            reconstruct_threshold: self.reconstruct_threshold.or(other.reconstruct_threshold),
            gc_horizon: self.gc_horizon.or(other.gc_horizon),
            gc_period: self.gc_period.or(other.gc_period),
            pg_distrib_dir: self.pg_distrib_dir.or(other.pg_distrib_dir),
@@ -167,20 +158,6 @@ impl CfgFileParams {
            None => DEFAULT_CHECKPOINT_PERIOD,
        };

-        let upload_distance: u64 = match self.upload_distance.as_ref() {
-            Some(upload_distance_str) => upload_distance_str.parse()?,
-            None => DEFAULT_UPLOAD_DISTANCE,
-        };
-        let upload_period = match self.upload_period.as_ref() {
-            Some(upload_period_str) => humantime::parse_duration(upload_period_str)?,
-            None => DEFAULT_UPLOAD_PERIOD,
-        };
-
-        let reconstruct_threshold: u64 = match self.reconstruct_threshold.as_ref() {
-            Some(reconstruct_threshold_str) => reconstruct_threshold_str.parse()?,
-            None => DEFAULT_RECONSTRUCT_THRESHOLD,
-        };
-
        let gc_horizon: u64 = match self.gc_horizon.as_ref() {
            Some(horizon_str) => horizon_str.parse()?,
            None => DEFAULT_GC_HORIZON,
@@ -259,9 +236,6 @@ impl CfgFileParams {
            listen_http_addr,
            checkpoint_distance,
            checkpoint_period,
-            upload_distance,
-            upload_period,
-            reconstruct_threshold,
            gc_horizon,
            gc_period,

@@ -322,24 +296,6 @@ fn main() -> Result<()> {
                .takes_value(true)
                .help("Interval between checkpoint iterations"),
        )
-        .arg(
-            Arg::with_name("upload_distance")
-                .long("upload_distance")
-                .takes_value(true)
-                .help("Distance from current LSN to perform checkpoint of in-memory layers"),
-        )
-        .arg(
-            Arg::with_name("upload_period")
-                .long("upload_period")
-                .takes_value(true)
-                .help("Interval between upload iterations"),
-        )
-        .arg(
-            Arg::with_name("reconstruct_threshold")
-                .long("reconstruct_threshold")
-                .takes_value(true)
-                .help("Minimal size of deltas after which page reconstruction (materialization) can be performed"),
-        )
        .arg(
            Arg::with_name("gc_horizon")
                .long("gc_horizon")
@@ -644,9 +600,6 @@ mod tests {
            listen_http_addr: Some("listen_http_addr_VALUE".to_string()),
            checkpoint_distance: Some("checkpoint_distance_VALUE".to_string()),
            checkpoint_period: Some("checkpoint_period_VALUE".to_string()),
-            upload_distance: Some("upload_distance_VALUE".to_string()),
-            upload_period: Some("upload_period_VALUE".to_string()),
-            reconstruct_threshold: Some("reconstruct_threshold_VALUE".to_string()),
            gc_horizon: Some("gc_horizon_VALUE".to_string()),
            gc_period: Some("gc_period_VALUE".to_string()),
            pg_distrib_dir: Some("pg_distrib_dir_VALUE".to_string()),
@@ -670,9 +623,6 @@ mod tests {
 listen_http_addr = 'listen_http_addr_VALUE'
 checkpoint_distance = 'checkpoint_distance_VALUE'
 checkpoint_period = 'checkpoint_period_VALUE'
-upload_distance = 'upload_distance_VALUE'
-upload_period = 'upload_period_VALUE'
-reconstruct_threshold = 'reconstruct_threshold_VALUE'
 gc_horizon = 'gc_horizon_VALUE'
 gc_period = 'gc_period_VALUE'
 pg_distrib_dir = 'pg_distrib_dir_VALUE'
@@ -707,9 +657,6 @@ local_path = 'relish_storage_local_VALUE'
            listen_http_addr: Some("listen_http_addr_VALUE".to_string()),
            checkpoint_distance: Some("checkpoint_distance_VALUE".to_string()),
            checkpoint_period: Some("checkpoint_period_VALUE".to_string()),
-            upload_distance: Some("upload_distance_VALUE".to_string()),
-            upload_period: Some("upload_period_VALUE".to_string()),
-            reconstruct_threshold: Some("reconstruct_threshold_VALUE".to_string()),
            gc_horizon: Some("gc_horizon_VALUE".to_string()),
            gc_period: Some("gc_period_VALUE".to_string()),
            pg_distrib_dir: Some("pg_distrib_dir_VALUE".to_string()),
@@ -736,9 +683,6 @@ local_path = 'relish_storage_local_VALUE'
 listen_http_addr = 'listen_http_addr_VALUE'
 checkpoint_distance = 'checkpoint_distance_VALUE'
 checkpoint_period = 'checkpoint_period_VALUE'
-upload_distance = 'upload_distance_VALUE'
-upload_period = 'upload_period_VALUE'
-reconstruct_threshold = 'reconstruct_threshold_VALUE'
 gc_horizon = 'gc_horizon_VALUE'
 gc_period = 'gc_period_VALUE'
 pg_distrib_dir = 'pg_distrib_dir_VALUE'
--- a/pageserver/src/branches.rs
+++ b/pageserver/src/branches.rs
@@ -147,7 +147,7 @@ pub fn create_repo(

    let tli = create_timeline(conf, None, &tenantid)?;

-    let repo = Arc::new(crate::buffered_repository::BufferedRepository::new(
+    let repo = Arc::new(crate::layered_repository::LayeredRepository::new(
        conf,
        wal_redo_manager,
        tenantid,
@@ -182,7 +182,6 @@ fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path) -> Result<()> {
    let initdb_output = Command::new(initdb_path)
        .args(&["-D", initdbpath.to_str().unwrap()])
        .args(&["-U", &conf.superuser])
-        .args(&["-E", "utf8"])
        .arg("--no-instructions")
        // This is only used for a temporary installation that is deleted shortly after,
        // so no need to fsync it
--- a/pageserver/src/buffered_repository.rs
+++ b/pageserver/src/buffered_repository.rs
--- a/pageserver/src/layered_repository.rs
+++ b/pageserver/src/layered_repository.rs
--- a/pageserver/src/layered_repository/delta_layer.rs
+++ b/pageserver/src/layered_repository/delta_layer.rs
@@ -39,8 +39,9 @@
 //!
 use crate::layered_repository::blob::BlobWriter;
 use crate::layered_repository::filename::{DeltaFileName, PathOrConf};
-use crate::layered_repository::storage_layer::{Layer, SegmentTag};
-use crate::repository::{PageReconstructData, PageReconstructResult, PageVersion};
+use crate::layered_repository::storage_layer::{
+    Layer, PageReconstructData, PageReconstructResult, PageVersion, SegmentTag,
+};
 use crate::waldecoder;
 use crate::PageServerConf;
 use crate::{ZTenantId, ZTimelineId};
@@ -147,10 +148,6 @@ pub struct DeltaLayerInner {
 }

 impl Layer for DeltaLayer {
-    fn get_tenant_id(&self) -> ZTenantId {
-        self.tenantid
-    }
-
    fn get_timeline_id(&self) -> ZTimelineId {
        self.timelineid
    }
@@ -204,22 +201,22 @@ impl Layer for DeltaLayer {
            for ((_blknum, pv_lsn), blob_range) in iter {
                let pv = PageVersion::des(&read_blob(&page_version_reader, blob_range)?)?;

-                match pv {
-                    PageVersion::Page(img) => {
-                        // Found a page image, return it
-                        reconstruct_data.page_img = Some(img);
+                if let Some(img) = pv.page_image {
+                    // Found a page image, return it
+                    reconstruct_data.page_img = Some(img);
+                    need_image = false;
+                    break;
+                } else if let Some(rec) = pv.record {
+                    let will_init = rec.will_init;
+                    reconstruct_data.records.push((*pv_lsn, rec));
+                    if will_init {
+                        // This WAL record initializes the page, so no need to go further back
                        need_image = false;
                        break;
                    }
-                    PageVersion::Wal(rec) => {
-                        let will_init = rec.will_init;
-                        reconstruct_data.records.push((*pv_lsn, rec));
-                        if will_init {
-                            // This WAL record initializes the page, so no need to go further back
-                            need_image = false;
-                            break;
-                        }
-                    }
+                } else {
+                    // No base image, and no WAL record. Huh?
+                    bail!("no page image or WAL record for requested page");
                }
            }

@@ -229,7 +226,7 @@ impl Layer for DeltaLayer {
        // If an older page image is needed to reconstruct the page, let the
        // caller know.
        if need_image {
-            Ok(PageReconstructResult::Continue(Lsn(self.start_lsn.0 - 1)))
+            Ok(PageReconstructResult::Continue(self.start_lsn))
        } else {
            Ok(PageReconstructResult::Complete)
        }
@@ -310,22 +307,19 @@ impl Layer for DeltaLayer {
            let buf = read_blob(&chapter, blob_range)?;
            let pv = PageVersion::des(&buf)?;

-            match pv {
-                PageVersion::Page(img) => {
-                    write!(&mut desc, " img {} bytes", img.len())?;
-                }
-                PageVersion::Wal(rec) => {
-                    let wal_desc = waldecoder::describe_wal_record(&rec.rec);
-                    write!(
-                        &mut desc,
-                        " rec {} bytes will_init: {} {}",
-                        rec.rec.len(),
-                        rec.will_init,
-                        wal_desc
-                    )?;
-                }
+            if let Some(img) = pv.page_image.as_ref() {
+                write!(&mut desc, " img {} bytes", img.len())?;
+            }
+            if let Some(rec) = pv.record.as_ref() {
+                let wal_desc = waldecoder::describe_wal_record(&rec.rec);
+                write!(
+                    &mut desc,
+                    " rec {} bytes will_init: {} {}",
+                    rec.rec.len(),
+                    rec.will_init,
+                    wal_desc
+                )?;
            }
-
            println!("  blk {} at {}: {}", blk, lsn, desc);
        }

@@ -334,19 +328,6 @@ impl Layer for DeltaLayer {
 }

 impl DeltaLayer {
-    /// debugging function to print out the contents of the layer
-    pub fn versions(&self) -> Result<Vec<(u32, Lsn, PageVersion)>> {
-        let mut versions: Vec<(u32, Lsn, PageVersion)> = Vec::new();
-        let inner = self.load()?;
-        let (_path, book) = self.open_book()?;
-        let chapter = book.chapter_reader(PAGE_VERSIONS_CHAPTER)?;
-        for ((blk, lsn), blob_range) in inner.page_version_metas.as_slice() {
-            let buf = read_blob(&chapter, blob_range)?;
-            versions.push((*blk, *lsn, PageVersion::des(&buf)?));
-        }
-        Ok(versions)
-    }
-
    fn path_for(
        path_or_conf: &PathOrConf,
        timelineid: ZTimelineId,
@@ -379,7 +360,6 @@ impl DeltaLayer {
        dropped: bool,
        page_versions: impl Iterator<Item = (u32, Lsn, &'a PageVersion)>,
        relsizes: VecMap<Lsn, u32>,
-        nosync: bool,
    ) -> Result<DeltaLayer> {
        if seg.rel.is_blocky() {
            assert!(!relsizes.is_empty());
@@ -452,9 +432,8 @@ impl DeltaLayer {

        // This flushes the underlying 'buf_writer'.
        let writer = book.close()?;
-        if !nosync {
-            writer.get_ref().sync_all()?;
-        }
+        writer.get_ref().sync_all()?;
+
        trace!("saved {}", &path.display());

        drop(inner);
@@ -463,7 +442,12 @@ impl DeltaLayer {
    }

    fn open_book(&self) -> Result<(PathBuf, Book<File>)> {
-        let path = self.path();
+        let path = Self::path_for(
+            &self.path_or_conf,
+            self.timelineid,
+            self.tenantid,
+            &self.layer_name(),
+        );

        let file = File::open(&path)?;
        let book = Book::new(file)?;
--- a/pageserver/src/layered_repository/filename.rs
+++ b/pageserver/src/layered_repository/filename.rs
@@ -13,6 +13,8 @@ use anyhow::Result;
 use log::*;
 use zenith_utils::lsn::Lsn;

+use super::METADATA_FILE_NAME;
+
 // Note: LayeredTimeline::load_layer_map() relies on this sort order
 #[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)]
 pub struct DeltaFileName {
@@ -290,7 +292,7 @@ pub fn list_files(
            deltafiles.push(deltafilename);
        } else if let Some(imgfilename) = ImageFileName::parse_str(fname) {
            imgfiles.push(imgfilename);
-        } else if fname == "metadata" || fname == "ancestor" || fname.ends_with(".old") {
+        } else if fname == METADATA_FILE_NAME || fname == "ancestor" || fname.ends_with(".old") {
            // ignore these
        } else {
            warn!("unrecognized filename in timeline dir: {}", fname);
--- a/pageserver/src/layered_repository/image_layer.rs
+++ b/pageserver/src/layered_repository/image_layer.rs
@@ -22,8 +22,11 @@
 //! For non-blocky relishes, the image can be found in NONBLOCKY_IMAGE_CHAPTER.
 //!
 use crate::layered_repository::filename::{ImageFileName, PathOrConf};
-use crate::layered_repository::storage_layer::{Layer, SegmentTag, RELISH_SEG_SIZE};
-use crate::repository::{PageReconstructData, PageReconstructResult};
+use crate::layered_repository::storage_layer::{
+    Layer, PageReconstructData, PageReconstructResult, SegmentTag,
+};
+use crate::layered_repository::LayeredTimeline;
+use crate::layered_repository::RELISH_SEG_SIZE;
 use crate::PageServerConf;
 use crate::{ZTenantId, ZTimelineId};
 use anyhow::{anyhow, bail, ensure, Result};
@@ -114,10 +117,6 @@ impl Layer for ImageLayer {
        PathBuf::from(self.layer_name().to_string())
    }

-    fn get_tenant_id(&self) -> ZTenantId {
-        self.tenantid
-    }
-
    fn get_timeline_id(&self) -> ZTimelineId {
        self.timelineid
    }
@@ -251,14 +250,13 @@ impl ImageLayer {
    }

    /// Create a new image file, using the given array of pages.
-    pub fn create(
+    fn create(
        conf: &'static PageServerConf,
        timelineid: ZTimelineId,
        tenantid: ZTenantId,
        seg: SegmentTag,
        lsn: Lsn,
        base_images: Vec<Bytes>,
-        nosync: bool,
    ) -> Result<ImageLayer> {
        let image_type = if seg.rel.is_blocky() {
            let num_blocks: u32 = base_images.len().try_into()?;
@@ -318,9 +316,8 @@ impl ImageLayer {

        // This flushes the underlying 'buf_writer'.
        let writer = book.close()?;
-        if !nosync {
-            writer.get_ref().sync_all()?;
-        }
+        writer.get_ref().sync_all()?;
+
        trace!("saved {}", path.display());

        drop(inner);
@@ -328,7 +325,6 @@ impl ImageLayer {
        Ok(layer)
    }

-    /*
    // Create a new image file by materializing every page in a source layer
    // at given LSN.
    pub fn create_from_src(
@@ -366,7 +362,6 @@ impl ImageLayer {

        Self::create(conf, timelineid, timeline.tenantid, seg, lsn, base_images)
    }
-    */

    ///
    /// Load the contents of the file into memory
--- a/pageserver/src/layered_repository/storage_layer.rs
+++ b/pageserver/src/layered_repository/storage_layer.rs
@@ -3,9 +3,10 @@
 //!

 use crate::relish::RelishTag;
-use crate::repository::{PageReconstructData, PageReconstructResult};
-use crate::{ZTenantId, ZTimelineId};
+use crate::repository::WALRecord;
+use crate::ZTimelineId;
 use anyhow::Result;
+use bytes::Bytes;
 use serde::{Deserialize, Serialize};
 use std::fmt;
 use std::path::PathBuf;
@@ -44,6 +45,56 @@ impl SegmentTag {
    }
 }

+///
+/// Represents a version of a page at a specific LSN. The LSN is the key of the
+/// entry in the 'page_versions' hash, it is not duplicated here.
+///
+/// A page version can be stored as a full page image, or as WAL record that needs
+/// to be applied over the previous page version to reconstruct this version.
+///
+/// It's also possible to have both a WAL record and a page image in the same
+/// PageVersion. That happens if page version is originally stored as a WAL record
+/// but it is later reconstructed by a GetPage@LSN request by performing WAL
+/// redo. The get_page_at_lsn() code will store the reconstructed pag image next to
+/// the WAL record in that case. TODO: That's pretty accidental, not the result
+/// of any grand design. If we want to keep reconstructed page versions around, we
+/// probably should have a separate buffer cache so that we could control the
+/// replacement policy globally. Or if we keep a reconstructed page image, we
+/// could throw away the WAL record.
+///
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct PageVersion {
+    /// an 8kb page image
+    pub page_image: Option<Bytes>,
+    /// WAL record to get from previous page version to this one.
+    pub record: Option<WALRecord>,
+}
+
+///
+/// Data needed to reconstruct a page version
+///
+/// 'page_img' is the old base image of the page to start the WAL replay with.
+/// It can be None, if the first WAL record initializes the page (will_init)
+/// 'records' contains the records to apply over the base image.
+///
+pub struct PageReconstructData {
+    pub records: Vec<(Lsn, WALRecord)>,
+    pub page_img: Option<Bytes>,
+}
+
+/// Return value from Layer::get_page_reconstruct_data
+pub enum PageReconstructResult {
+    /// Got all the data needed to reconstruct the requested page
+    Complete,
+    /// This layer didn't contain all the required data, the caller should look up
+    /// the predecessor layer at the returned LSN and collect more data from there.
+    Continue(Lsn),
+    /// This layer didn't contain data needed to reconstruct the page version at
+    /// the returned LSN. This is usually considered an error, but might be OK
+    /// in some circumstances.
+    Missing(Lsn),
+}
+
 ///
 /// A Layer corresponds to one RELISH_SEG_SIZE slice of a relish in a range of LSNs.
 /// There are two kinds of layers, in-memory and on-disk layers. In-memory
@@ -53,8 +104,6 @@ impl SegmentTag {
 /// in-memory and on-disk layers.
 ///
 pub trait Layer: Send + Sync {
-    fn get_tenant_id(&self) -> ZTenantId;
-
    /// Identify the timeline this relish belongs to
    fn get_timeline_id(&self) -> ZTimelineId;

--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -9,7 +9,6 @@ use zenith_metrics::{register_int_gauge_vec, IntGaugeVec};

 pub mod basebackup;
 pub mod branches;
-pub mod buffered_repository;
 pub mod http;
 pub mod layered_repository;
 pub mod page_service;
@@ -18,7 +17,6 @@ pub mod relish_storage;
 pub mod repository;
 pub mod restore_local_repo;
 pub mod tenant_mgr;
-pub mod toast_store;
 pub mod waldecoder;
 pub mod walreceiver;
 pub mod walredo;
@@ -32,17 +30,14 @@ pub mod defaults {
    pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898;
    pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}");

-    // Minimal size of WAL records chain to trigger materialization of the page
+    // FIXME: This current value is very low. I would imagine something like 1 GB or 10 GB
+    // would be more appropriate. But a low value forces the code to be exercised more,
+    // which is good for now to trigger bugs.
    pub const DEFAULT_CHECKPOINT_DISTANCE: u64 = 256 * 1024 * 1024;
-    pub const DEFAULT_CHECKPOINT_PERIOD: Duration = Duration::from_secs(10);
+    pub const DEFAULT_CHECKPOINT_PERIOD: Duration = Duration::from_secs(1);

-    pub const DEFAULT_UPLOAD_DISTANCE: u64 = 1024 * 1024 * 1024;
-    pub const DEFAULT_UPLOAD_PERIOD: Duration = Duration::from_secs(3600);
-
-    pub const DEFAULT_RECONSTRUCT_THRESHOLD: u64 = 0;
-
-    pub const DEFAULT_GC_HORIZON: u64 = 1024;
-    pub const DEFAULT_GC_PERIOD: Duration = Duration::from_secs(10);
+    pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
+    pub const DEFAULT_GC_PERIOD: Duration = Duration::from_secs(100);

    pub const DEFAULT_SUPERUSER: &str = "zenith_admin";
    pub const DEFAULT_RELISH_STORAGE_MAX_CONCURRENT_SYNC_LIMITS: usize = 100;
@@ -69,9 +64,6 @@ pub struct PageServerConf {
    // page server crashes.
    pub checkpoint_distance: u64,
    pub checkpoint_period: Duration,
-    pub upload_period: Duration,
-    pub upload_distance: u64,
-    pub reconstruct_threshold: u64,

    pub gc_horizon: u64,
    pub gc_period: Duration,
@@ -157,9 +149,6 @@ impl PageServerConf {
            daemonize: false,
            checkpoint_distance: defaults::DEFAULT_CHECKPOINT_DISTANCE,
            checkpoint_period: Duration::from_secs(10),
-            upload_distance: defaults::DEFAULT_UPLOAD_DISTANCE,
-            upload_period: defaults::DEFAULT_UPLOAD_PERIOD,
-            reconstruct_threshold: defaults::DEFAULT_RECONSTRUCT_THRESHOLD,
            gc_horizon: defaults::DEFAULT_GC_HORIZON,
            gc_period: Duration::from_secs(10),
            listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(),
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -693,21 +693,67 @@ impl postgres_backend::Handler for PageServerHandler {
            let result = repo.gc_iteration(Some(timelineid), gc_horizon, true)?;

            pgb.write_message_noflush(&BeMessage::RowDescription(&[
-                RowDescriptor::int8_col(b"meta_total"),
-                RowDescriptor::int8_col(b"meta_removed"),
-                RowDescriptor::int8_col(b"meta_dropped"),
-                RowDescriptor::int8_col(b"pages_total"),
-                RowDescriptor::int8_col(b"pages_removed"),
-                RowDescriptor::int8_col(b"pages_dropped"),
+                RowDescriptor::int8_col(b"layer_relfiles_total"),
+                RowDescriptor::int8_col(b"layer_relfiles_needed_by_cutoff"),
+                RowDescriptor::int8_col(b"layer_relfiles_needed_by_branches"),
+                RowDescriptor::int8_col(b"layer_relfiles_not_updated"),
+                RowDescriptor::int8_col(b"layer_relfiles_needed_as_tombstone"),
+                RowDescriptor::int8_col(b"layer_relfiles_removed"),
+                RowDescriptor::int8_col(b"layer_relfiles_dropped"),
+                RowDescriptor::int8_col(b"layer_nonrelfiles_total"),
+                RowDescriptor::int8_col(b"layer_nonrelfiles_needed_by_cutoff"),
+                RowDescriptor::int8_col(b"layer_nonrelfiles_needed_by_branches"),
+                RowDescriptor::int8_col(b"layer_nonrelfiles_not_updated"),
+                RowDescriptor::int8_col(b"layer_nonrelfiles_needed_as_tombstone"),
+                RowDescriptor::int8_col(b"layer_nonrelfiles_removed"),
+                RowDescriptor::int8_col(b"layer_nonrelfiles_dropped"),
                RowDescriptor::int8_col(b"elapsed"),
            ]))?
            .write_message_noflush(&BeMessage::DataRow(&[
-                Some(result.meta_total.to_string().as_bytes()),
-                Some(result.meta_removed.to_string().as_bytes()),
-                Some(result.meta_dropped.to_string().as_bytes()),
-                Some(result.pages_total.to_string().as_bytes()),
-                Some(result.pages_removed.to_string().as_bytes()),
-                Some(result.pages_dropped.to_string().as_bytes()),
+                Some(result.ondisk_relfiles_total.to_string().as_bytes()),
+                Some(
+                    result
+                        .ondisk_relfiles_needed_by_cutoff
+                        .to_string()
+                        .as_bytes(),
+                ),
+                Some(
+                    result
+                        .ondisk_relfiles_needed_by_branches
+                        .to_string()
+                        .as_bytes(),
+                ),
+                Some(result.ondisk_relfiles_not_updated.to_string().as_bytes()),
+                Some(
+                    result
+                        .ondisk_relfiles_needed_as_tombstone
+                        .to_string()
+                        .as_bytes(),
+                ),
+                Some(result.ondisk_relfiles_removed.to_string().as_bytes()),
+                Some(result.ondisk_relfiles_dropped.to_string().as_bytes()),
+                Some(result.ondisk_nonrelfiles_total.to_string().as_bytes()),
+                Some(
+                    result
+                        .ondisk_nonrelfiles_needed_by_cutoff
+                        .to_string()
+                        .as_bytes(),
+                ),
+                Some(
+                    result
+                        .ondisk_nonrelfiles_needed_by_branches
+                        .to_string()
+                        .as_bytes(),
+                ),
+                Some(result.ondisk_nonrelfiles_not_updated.to_string().as_bytes()),
+                Some(
+                    result
+                        .ondisk_nonrelfiles_needed_as_tombstone
+                        .to_string()
+                        .as_bytes(),
+                ),
+                Some(result.ondisk_nonrelfiles_removed.to_string().as_bytes()),
+                Some(result.ondisk_nonrelfiles_dropped.to_string().as_bytes()),
                Some(result.elapsed.as_millis().to_string().as_bytes()),
            ]))?
            .write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
--- a/pageserver/src/repository.rs
+++ b/pageserver/src/repository.rs
@@ -44,27 +44,45 @@ pub trait Repository: Send + Sync {
 ///
 /// Result of performing GC
 ///
-#[derive(Default, Debug)]
+#[derive(Default)]
 pub struct GcResult {
-    pub meta_removed: u64, // removed versions beyond PITR interval for which new page image exists
-    pub meta_dropped: u64, // removed versions beyond PITR interval of dropped relations
-    pub meta_total: u64,   // total number of metaobject version histories
+    pub ondisk_relfiles_total: u64,
+    pub ondisk_relfiles_needed_by_cutoff: u64,
+    pub ondisk_relfiles_needed_by_branches: u64,
+    pub ondisk_relfiles_not_updated: u64,
+    pub ondisk_relfiles_needed_as_tombstone: u64,
+    pub ondisk_relfiles_removed: u64, // # of layer files removed because they have been made obsolete by newer ondisk files.
+    pub ondisk_relfiles_dropped: u64, // # of layer files removed because the relation was dropped

-    pub pages_removed: u64, // removed versions beyond PITR interval for which new page image exists
-    pub pages_dropped: u64, // removed versions beyond PITR interval of dropped relations
-    pub pages_total: u64,   // total number of page vaersion histories
+    pub ondisk_nonrelfiles_total: u64,
+    pub ondisk_nonrelfiles_needed_by_cutoff: u64,
+    pub ondisk_nonrelfiles_needed_by_branches: u64,
+    pub ondisk_nonrelfiles_not_updated: u64,
+    pub ondisk_nonrelfiles_needed_as_tombstone: u64,
+    pub ondisk_nonrelfiles_removed: u64, // # of layer files removed because they have been made obsolete by newer ondisk files.
+    pub ondisk_nonrelfiles_dropped: u64, // # of layer files removed because the relation was dropped

    pub elapsed: Duration,
 }

 impl AddAssign for GcResult {
    fn add_assign(&mut self, other: Self) {
-        self.meta_total += other.meta_total;
-        self.meta_removed += other.meta_removed;
-        self.meta_dropped += other.meta_dropped;
-        self.pages_total += other.pages_total;
-        self.pages_removed += other.pages_removed;
-        self.pages_dropped += other.pages_dropped;
+        self.ondisk_relfiles_total += other.ondisk_relfiles_total;
+        self.ondisk_relfiles_needed_by_cutoff += other.ondisk_relfiles_needed_by_cutoff;
+        self.ondisk_relfiles_needed_by_branches += other.ondisk_relfiles_needed_by_branches;
+        self.ondisk_relfiles_not_updated += other.ondisk_relfiles_not_updated;
+        self.ondisk_relfiles_needed_as_tombstone += other.ondisk_relfiles_needed_as_tombstone;
+        self.ondisk_relfiles_removed += other.ondisk_relfiles_removed;
+        self.ondisk_relfiles_dropped += other.ondisk_relfiles_dropped;
+
+        self.ondisk_nonrelfiles_total += other.ondisk_nonrelfiles_total;
+        self.ondisk_nonrelfiles_needed_by_cutoff += other.ondisk_nonrelfiles_needed_by_cutoff;
+        self.ondisk_nonrelfiles_needed_by_branches += other.ondisk_nonrelfiles_needed_by_branches;
+        self.ondisk_nonrelfiles_not_updated += other.ondisk_nonrelfiles_not_updated;
+        self.ondisk_nonrelfiles_needed_as_tombstone += other.ondisk_nonrelfiles_needed_as_tombstone;
+        self.ondisk_nonrelfiles_removed += other.ondisk_nonrelfiles_removed;
+        self.ondisk_nonrelfiles_dropped += other.ondisk_nonrelfiles_dropped;
+
        self.elapsed += other.elapsed;
    }
 }
@@ -93,18 +111,14 @@ pub trait Timeline: Send + Sync {

    /// Get a list of all existing relations
    /// Pass RelTag to get relation objects or None to get nonrels.
+    fn list_relishes(&self, tag: Option<RelTag>, lsn: Lsn) -> Result<HashSet<RelishTag>>;
+
    /// Get a list of all existing relations in given tablespace and database.
    fn list_rels(&self, spcnode: u32, dbnode: u32, lsn: Lsn) -> Result<HashSet<RelishTag>>;

    /// Get a list of all existing non-relational objects
    fn list_nonrels(&self, lsn: Lsn) -> Result<HashSet<RelishTag>>;

-    ///
-    /// Export data as delats and image layers between 'start_lsn' to 'end_lsn'. The
-    /// start is inclusive, and end is exclusive.
-    ///
-    fn export_timeline(&self, start_lsn: Lsn, end_lsn: Lsn) -> Result<()>;
-
    /// Get the LSN where this branch was created
    fn get_ancestor_lsn(&self) -> Lsn;

@@ -120,6 +134,7 @@ pub trait Timeline: Send + Sync {
    fn get_last_record_lsn(&self) -> Lsn;
    fn get_prev_record_lsn(&self) -> Lsn;
    fn get_start_lsn(&self) -> Lsn;
+    fn get_disk_consistent_lsn(&self) -> Lsn;

    /// Mutate the timeline with a [`TimelineWriter`].
    fn writer<'a>(&'a self) -> Box<dyn TimelineWriter + 'a>;
@@ -167,16 +182,6 @@ pub trait TimelineWriter: Deref<Target = dyn Timeline> {
    /// Advance requires aligned LSN as an argument and would wake wait_lsn() callers.
    /// Previous last record LSN is stored alongside the latest and can be read.
    fn advance_last_record_lsn(&self, lsn: Lsn);
-
-    ///
-    /// Complete all delayed commits and advance disk_consistent_lsn
-    ///
-    fn checkpoint(&self) -> Result<()>;
-
-    ///
-    /// Import data from layer files
-    ///
-    fn import_timeline(&self, snapshot_lsn: Lsn) -> Result<()>;
 }

 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
@@ -209,39 +214,6 @@ impl WALRecord {
    }
 }

-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub enum PageVersion {
-    /// an 8kb page image
-    Page(Bytes),
-    /// WAL record to get from previous page version to this one.
-    Wal(WALRecord),
-}
-
-///
-/// Data needed to reconstruct a page version
-///
-/// 'page_img' is the old base image of the page to start the WAL replay with.
-/// It can be None, if the first WAL record initializes the page (will_init)
-/// 'records' contains the records to apply over the base image.
-///
-pub struct PageReconstructData {
-    pub records: Vec<(Lsn, WALRecord)>,
-    pub page_img: Option<Bytes>,
-}
-
-/// Return value from Layer::get_page_reconstruct_data
-pub enum PageReconstructResult {
-    /// Got all the data needed to reconstruct the requested page
-    Complete,
-    /// This layer didn't contain all the required data, the caller should look up
-    /// the predecessor layer at the returned LSN and collect more data from there.
-    Continue(Lsn),
-    /// This layer didn't contain data needed to reconstruct the page version at
-    /// the returned LSN. This is usually considered an error, but might be OK
-    /// in some circumstances.
-    Missing(Lsn),
-}
-
 ///
 /// Tests that should work the same with any Repository/Timeline implementation.
 ///
@@ -249,7 +221,7 @@ pub enum PageReconstructResult {
 #[cfg(test)]
 mod tests {
    use super::*;
-    use crate::buffered_repository::{BufferedRepository, METADATA_FILE_NAME};
+    use crate::layered_repository::{LayeredRepository, METADATA_FILE_NAME};
    use crate::walredo::{WalRedoError, WalRedoManager};
    use crate::PageServerConf;
    use hex_literal::hex;
@@ -325,7 +297,7 @@ mod tests {
        fn load(&self) -> Box<dyn Repository> {
            let walredo_mgr = Arc::new(TestRedoManager);

-            Box::new(BufferedRepository::new(
+            Box::new(LayeredRepository::new(
                self.conf,
                walredo_mgr,
                self.tenant_id,
--- a/pageserver/src/restore_local_repo.rs
+++ b/pageserver/src/restore_local_repo.rs
@@ -11,7 +11,7 @@ use std::io::{Read, Seek, SeekFrom};
 use std::path::{Path, PathBuf};

 use anyhow::{anyhow, bail, Result};
-use bytes::{Buf, Bytes, BytesMut};
+use bytes::{Buf, Bytes};
 use tracing::*;

 use crate::relish::*;
@@ -126,6 +126,7 @@ pub fn import_timeline_from_postgres_datadir(
        import_nonrel_file(writer, lsn, RelishTag::TwoPhase { xid }, &entry.path())?;
    }
    // TODO: Scan pg_tblspc
+
    writer.advance_last_record_lsn(lsn);

    // Import WAL. This is needed even when starting from a shutdown checkpoint, because
@@ -139,7 +140,6 @@ pub fn import_timeline_from_postgres_datadir(
        lsn,
        &mut pg_control.checkPointCopy.clone(),
    )?;
-    writer.checkpoint()?;

    Ok(())
 }
@@ -416,6 +416,7 @@ pub fn save_decoded_record(
    if checkpoint.update_next_xid(decoded.xl_xid) {
        *checkpoint_modified = true;
    }
+
    // Iterate through all the blocks that the record modifies, and
    // "put" a separate copy of the record for each block.
    for blk in decoded.blocks.iter() {
@@ -426,43 +427,13 @@ pub fn save_decoded_record(
            forknum: blk.forknum as u8,
        });

-        //
-        // Instead of storing full-page-image WAL record,
-        // it is better to store extracted image: we can skip wal-redo
-        // in this case. Also some FPI records may contain multiple (up to 32) pages,
-        // so them have to be copied multiple times.
-        //
-        if blk.apply_image
-            && blk.has_image
-            && decoded.xl_rmid == pg_constants::RM_XLOG_ID
-            && (decoded.xl_info == pg_constants::XLOG_FPI
-                || decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT)
-            // compression of WAL is not yet supported: fall back to storing the original WAL record
-            && (blk.bimg_info & pg_constants::BKPIMAGE_IS_COMPRESSED) == 0
-        {
-            // Extract page image from FPI record
-            let img_len = blk.bimg_len as usize;
-            let img_offs = blk.bimg_offset as usize;
-            let mut image = BytesMut::with_capacity(pg_constants::BLCKSZ as usize);
-            image.extend_from_slice(&recdata[img_offs..img_offs + img_len]);
+        let rec = WALRecord {
+            will_init: blk.will_init || blk.apply_image,
+            rec: recdata.clone(),
+            main_data_offset: decoded.main_data_offset as u32,
+        };

-            if blk.hole_length != 0 {
-                let tail = image.split_off(blk.hole_offset as usize);
-                image.resize(image.len() + blk.hole_length as usize, 0u8);
-                image.unsplit(tail);
-            }
-            image[0..4].copy_from_slice(&((lsn.0 >> 32) as u32).to_le_bytes());
-            image[4..8].copy_from_slice(&(lsn.0 as u32).to_le_bytes());
-            assert_eq!(image.len(), pg_constants::BLCKSZ as usize);
-            timeline.put_page_image(tag, blk.blkno, lsn, image.freeze())?;
-        } else {
-            let rec = WALRecord {
-                will_init: blk.will_init || blk.apply_image,
-                rec: recdata.clone(),
-                main_data_offset: decoded.main_data_offset as u32,
-            };
-            timeline.put_wal_record(lsn, tag, blk.blkno, rec)?;
-        }
+        timeline.put_wal_record(lsn, tag, blk.blkno, rec)?;
    }

    let mut buf = decoded.record.clone();
--- a/pageserver/src/tenant_mgr.rs
+++ b/pageserver/src/tenant_mgr.rs
@@ -2,7 +2,7 @@
 //! page server.

 use crate::branches;
-use crate::buffered_repository::BufferedRepository;
+use crate::layered_repository::LayeredRepository;
 use crate::repository::{Repository, Timeline};
 use crate::walredo::PostgresRedoManager;
 use crate::PageServerConf;
@@ -62,7 +62,6 @@ fn access_tenants() -> MutexGuard<'static, HashMap<ZTenantId, Tenant>> {

 struct TenantHandleEntry {
    checkpointer_handle: Option<JoinHandle<()>>,
-    uploader_handle: Option<JoinHandle<()>>,
    gc_handle: Option<JoinHandle<()>>,
 }

@@ -99,22 +98,20 @@ fn init_repo(conf: &'static PageServerConf, tenant_id: ZTenantId) {
    let walredo_mgr = PostgresRedoManager::new(conf, tenant_id);

    // Set up an object repository, for actual data storage.
-    let repo = Arc::new(BufferedRepository::new(
+    let repo = Arc::new(LayeredRepository::new(
        conf,
        Arc::new(walredo_mgr),
        tenant_id,
        true,
    ));

-    let checkpointer_handle = BufferedRepository::launch_checkpointer_thread(conf, repo.clone());
-    let gc_handle = BufferedRepository::launch_gc_thread(conf, repo.clone());
-    let uploader_handle = BufferedRepository::launch_upload_thread(conf, repo.clone());
+    let checkpointer_handle = LayeredRepository::launch_checkpointer_thread(conf, repo.clone());
+    let gc_handle = LayeredRepository::launch_gc_thread(conf, repo.clone());

    let mut handles = TENANT_HANDLES.lock().unwrap();
    let h = TenantHandleEntry {
        checkpointer_handle: Some(checkpointer_handle),
        gc_handle: Some(gc_handle),
-        uploader_handle: Some(uploader_handle),
    };

    handles.insert(tenant_id, h);
@@ -173,8 +170,6 @@ pub fn stop_tenant_threads(tenantid: ZTenantId) {
    if let Some(h) = handles.get_mut(&tenantid) {
        h.checkpointer_handle.take().map(JoinHandle::join);
        debug!("checkpointer for tenant {} has stopped", tenantid);
-        h.uploader_handle.take().map(JoinHandle::join);
-        debug!("uploader for tenant {} has stopped", tenantid);
        h.gc_handle.take().map(JoinHandle::join);
        debug!("gc for tenant {} has stopped", tenantid);
    }
--- a/pageserver/src/toast_store.rs
+++ b/pageserver/src/toast_store.rs
@@ -1,268 +0,0 @@
-use anyhow::{anyhow, Result};
-use lz4_flex;
-use std::convert::TryInto;
-use std::ops::{Bound, RangeBounds};
-use std::path::Path;
-
-use yakv::storage::{
-    Key,
-    Select,
-    //    ReadOnlyTransaction,
-    Snapshot,
-    Storage,
-    StorageConfig,
-    StorageIterator,
-    Transaction,
-    Value,
-};
-
-const TOAST_SEGMENT_SIZE: usize = 2000;
-const CACHE_SIZE: usize = 1024; // 8Mb
-
-///
-/// Toast storage consistof two KV databases: one for storing main index
-/// and second for storing sliced BLOB (values larger than 2kb).
-/// BLOBs and main data are stored in different databases to improve
-/// data locality and reduce key size for TOAST segments.
-///
-pub struct ToastStore {
-    db: Storage, // key-value database
-}
-
-pub struct ToastIterator<'a> {
-    iter: StorageIterator<'a>,
-}
-
-pub struct ToastSnapshot<'a> {
-    //    tx: ReadOnlyTransaction<'a>,
-    tx: Snapshot<'a>,
-}
-
-impl<'a> ToastSnapshot<'a> {
-    pub fn range<R: RangeBounds<Key>>(&self, range: R) -> ToastIterator<'_> {
-        let from = match range.start_bound() {
-            Bound::Included(key) => {
-                let mut key = key.clone();
-                key.extend_from_slice(&[0u8; 4]);
-                Bound::Included(key)
-            }
-            Bound::Excluded(key) => {
-                let mut key = key.clone();
-                key.extend_from_slice(&[0u8; 4]);
-                Bound::Excluded(key)
-            }
-            _ => Bound::Unbounded,
-        };
-        let till = match range.end_bound() {
-            Bound::Included(key) => {
-                let mut key = key.clone();
-                key.extend_from_slice(&[0xFFu8; 4]);
-                Bound::Included(key)
-            }
-            Bound::Excluded(key) => {
-                let mut key = key.clone();
-                key.extend_from_slice(&[0xFFu8; 4]);
-                Bound::Excluded(key)
-            }
-            _ => Bound::Unbounded,
-        };
-        ToastIterator {
-            iter: self.tx.range((from, till)),
-        }
-    }
-
-    pub fn iter(&self) -> ToastIterator<'_> {
-        self.range(..)
-    }
-}
-
-impl<'a> Iterator for ToastIterator<'a> {
-    type Item = Result<(Key, Value)>;
-    fn next(&mut self) -> Option<Self::Item> {
-        let mut toast: Option<Vec<u8>> = None;
-        let mut next_segno = 0u16;
-        for elem in &mut self.iter {
-            let res = if let Ok((key, value)) = elem {
-                let key_len = key.len();
-                let n_segments =
-                    u16::from_be_bytes(key[key_len - 4..key_len - 2].try_into().unwrap());
-                let segno = u16::from_be_bytes(key[key_len - 2..].try_into().unwrap());
-                let key = key[..key_len - 4].to_vec();
-                if n_segments != 0 {
-                    // TOAST
-                    assert_eq!(segno, next_segno);
-                    if next_segno == 0 {
-                        toast = Some(Vec::with_capacity(n_segments as usize * TOAST_SEGMENT_SIZE))
-                    }
-                    toast.as_mut().unwrap().extend_from_slice(&value);
-                    next_segno = segno + 1;
-                    if next_segno != n_segments {
-                        continue;
-                    }
-                    let res = lz4_flex::decompress_size_prepended(&toast.unwrap());
-                    if let Ok(decompressed_data) = res {
-                        Ok((key, decompressed_data))
-                    } else {
-                        Err(anyhow!(res.unwrap_err()))
-                    }
-                } else {
-                    Ok((key, value))
-                }
-            } else {
-                elem
-            };
-            return Some(res);
-        }
-        assert_eq!(next_segno, 0);
-        None
-    }
-}
-
-impl<'a> DoubleEndedIterator for ToastIterator<'a> {
-    fn next_back(&mut self) -> Option<Self::Item> {
-        let mut toast: Option<Vec<u8>> = None;
-        let mut next_segno = 0u16;
-        while let Some(elem) = self.iter.next_back() {
-            if let Ok((key, value)) = elem {
-                assert!(!value.is_empty());
-                let key_len = key.len();
-                let n_segments =
-                    u16::from_be_bytes(key[key_len - 4..key_len - 2].try_into().unwrap());
-                let segno = u16::from_be_bytes(key[key_len - 2..].try_into().unwrap());
-                let key = key[..key_len - 4].to_vec();
-                if n_segments != 0 {
-                    // TOAST
-                    assert!(segno + 1 == next_segno || next_segno == 0);
-                    if next_segno == 0 {
-                        let len = (n_segments - 1) as usize * TOAST_SEGMENT_SIZE + value.len();
-                        let mut vec = vec![0u8; len];
-                        vec[len - value.len()..].copy_from_slice(&value);
-                        toast = Some(vec);
-                    } else {
-                        toast.as_mut().unwrap()[segno as usize * TOAST_SEGMENT_SIZE
-                            ..(segno + 1) as usize * TOAST_SEGMENT_SIZE]
-                            .copy_from_slice(&value);
-                    }
-                    next_segno = segno;
-                    if next_segno == 0 {
-                        let toast = toast.unwrap();
-                        assert!(!toast.is_empty());
-                        let res = lz4_flex::decompress_size_prepended(&toast);
-                        return Some(if let Ok(decompressed_data) = res {
-                            Ok((key, decompressed_data))
-                        } else {
-                            Err(anyhow!(res.unwrap_err()))
-                        });
-                    }
-                } else {
-                    return Some(Ok((key, value)));
-                }
-            } else {
-                return Some(elem);
-            }
-        }
-        assert_eq!(next_segno, 0);
-        None
-    }
-}
-
-//
-// FIXME-KK: not using WAL now. Implement asynchronous or delayed commit.
-//
-impl ToastStore {
-    pub fn new(path: &Path) -> Result<ToastStore> {
-        Ok(ToastStore {
-            db: Storage::open(
-                &path.join("pageserver.db"),
-                StorageConfig {
-                    cache_size: CACHE_SIZE,
-                    nosync: false,
-                },
-            )?,
-        })
-    }
-
-    pub fn put(&self, key: Key, value: Value) -> Result<()> {
-        let mut tx = self.db.start_transaction();
-        self.tx_remove(&mut tx, &key)?;
-        let value_len = value.len();
-        let mut key = key;
-        if value_len >= TOAST_SEGMENT_SIZE {
-            let compressed_data = lz4_flex::compress_prepend_size(&value);
-            let compressed_data_len = compressed_data.len();
-            let mut offs: usize = 0;
-            let mut segno = 0u16;
-            let n_segments =
-                ((compressed_data_len + TOAST_SEGMENT_SIZE - 1) / TOAST_SEGMENT_SIZE) as u16;
-            assert!(n_segments != 0);
-            key.extend_from_slice(&n_segments.to_be_bytes());
-            key.extend_from_slice(&[0u8; 2]);
-            let key_len = key.len();
-            while offs + TOAST_SEGMENT_SIZE < compressed_data_len {
-                key[key_len - 2..].copy_from_slice(&segno.to_be_bytes());
-                tx.put(
-                    &key,
-                    &compressed_data[offs..offs + TOAST_SEGMENT_SIZE].to_vec(),
-                )?;
-                offs += TOAST_SEGMENT_SIZE;
-                segno += 1;
-            }
-            key[key_len - 2..].copy_from_slice(&segno.to_be_bytes());
-            tx.put(&key, &compressed_data[offs..].to_vec())?;
-        } else {
-            key.extend_from_slice(&[0u8; 4]);
-            tx.put(&key, &value)?;
-        }
-        tx.subcommit()?;
-        //tx.delay();
-        Ok(())
-    }
-
-    pub fn commit(&self) -> Result<()> {
-        let tx = self.db.start_transaction();
-        tx.commit()?;
-        Ok(())
-    }
-
-    pub fn take_snapshot(&self) -> ToastSnapshot<'_> {
-        ToastSnapshot {
-            //tx: self.db.read_only_transaction(),
-            tx: self.db.take_snapshot(),
-        }
-    }
-
-    pub fn remove(&self, key: Key) -> Result<()> {
-        let mut tx = self.db.start_transaction();
-        self.tx_remove(&mut tx, &key)?;
-        tx.subcommit()?;
-        //tx.delay();
-        Ok(())
-    }
-
-    pub fn tx_remove(&self, tx: &mut Transaction, key: &[u8]) -> Result<()> {
-        let mut min_key = key.to_vec();
-        let mut max_key = key.to_vec();
-        min_key.extend_from_slice(&[0u8; 4]);
-        max_key.extend_from_slice(&[0xFFu8; 4]);
-        let mut iter = tx.range(&min_key..&max_key);
-        if let Some(entry) = iter.next() {
-            let mut key = entry?.0;
-            let key_len = key.len();
-            let n_segments = u16::from_be_bytes(key[key_len - 4..key_len - 2].try_into().unwrap());
-            if n_segments != 0 {
-                // TOAST
-                for i in 0..n_segments {
-                    key[key_len - 2..].copy_from_slice(&i.to_be_bytes());
-                    tx.remove(&key)?;
-                }
-            } else {
-                tx.remove(&key)?;
-            }
-        }
-        Ok(())
-    }
-
-    pub fn size(&self) -> u64 {
-        self.db.get_database_info().db_used
-    }
-}
--- a/pageserver/src/waldecoder.rs
+++ b/pageserver/src/waldecoder.rs
@@ -229,18 +229,17 @@ pub struct DecodedBkpBlock {
    pub blkno: u32,

    /* copy of the fork_flags field from the XLogRecordBlockHeader */
-    pub flags: u8,
+    flags: u8,

    /* Information on full-page image, if any */
-    pub has_image: bool,   /* has image, even for consistency checking */
+    has_image: bool,       /* has image, even for consistency checking */
    pub apply_image: bool, /* has image that should be restored */
    pub will_init: bool,   /* record doesn't need previous page version to apply */
    //char	   *bkp_image;
-    pub hole_offset: u16,
-    pub hole_length: u16,
-    pub bimg_offset: u32,
-    pub bimg_len: u16,
-    pub bimg_info: u8,
+    hole_offset: u16,
+    hole_length: u16,
+    bimg_len: u16,
+    bimg_info: u8,

    /* Buffer holding the rmgr-specific data associated with this block */
    has_data: bool,
@@ -860,19 +859,8 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
    }

    // 3. Decode blocks.
-    let mut ptr = record.len() - buf.remaining();
-    for blk in blocks.iter_mut() {
-        if blk.has_image {
-            blk.bimg_offset = ptr as u32;
-            ptr += blk.bimg_len as usize;
-        }
-        if blk.has_data {
-            ptr += blk.data_len as usize;
-        }
-    }
    // We don't need them, so just skip blocks_total_len bytes
    buf.advance(blocks_total_len as usize);
-    assert_eq!(ptr, record.len() - buf.remaining());

    let main_data_offset = (xlogrec.xl_tot_len - main_data_len) as usize;

--- a/pageserver/src/walreceiver.rs
+++ b/pageserver/src/walreceiver.rs
@@ -284,12 +284,14 @@ fn walreceiver_main(
        if let Some(last_lsn) = status_update {
            // TODO: More thought should go into what values are sent here.
            let last_lsn = PgLsn::from(u64::from(last_lsn));
-            let write_lsn = last_lsn;
+            // We are using disk consistent LSN as `write_lsn`, i.e. LSN at which page server
+            // may guarantee persistence of all received data. Safekeeper is not free to remove
+            // WAL preceding `write_lsn`: it should not be requested by this page server.
+            let write_lsn = PgLsn::from(u64::from(timeline.get_disk_consistent_lsn()));
            let flush_lsn = last_lsn;
            let apply_lsn = PgLsn::from(0);
            let ts = SystemTime::now();
            const NO_REPLY: u8 = 0;
-
            physical_stream.standby_status_update(write_lsn, flush_lsn, apply_lsn, ts, NO_REPLY)?;
        }

--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -22,26 +22,23 @@ use byteorder::{ByteOrder, LittleEndian};
 use bytes::{Buf, BufMut, Bytes, BytesMut};
 use lazy_static::lazy_static;
 use log::*;
-use nix::poll::*;
 use serde::Serialize;
 use std::fs;
 use std::fs::OpenOptions;
 use std::io::prelude::*;
-use std::io::{Error, ErrorKind};
-use std::os::unix::io::AsRawFd;
+use std::io::Error;
 use std::path::PathBuf;
 use std::process::Stdio;
-use std::process::{Child, ChildStderr, ChildStdin, ChildStdout, Command};
-use std::sync::atomic::{AtomicUsize, Ordering};
-use std::sync::mpsc;
-use std::sync::mpsc::{Receiver, Sender, SyncSender};
 use std::sync::Mutex;
 use std::time::Duration;
 use std::time::Instant;
+use tokio::io::AsyncBufReadExt;
+use tokio::io::{AsyncReadExt, AsyncWriteExt};
+use tokio::process::{ChildStdin, ChildStdout, Command};
+use tokio::time::timeout;
 use zenith_metrics::{register_histogram, register_int_counter, Histogram, IntCounter};
 use zenith_utils::bin_ser::BeSer;
 use zenith_utils::lsn::Lsn;
-use zenith_utils::nonblock::set_nonblock;
 use zenith_utils::zid::ZTenantId;

 use crate::relish::*;
@@ -56,10 +53,6 @@ use postgres_ffi::nonrelfile_utils::transaction_id_set_status;
 use postgres_ffi::pg_constants;
 use postgres_ffi::XLogRecord;

-const N_CHANNELS: usize = 16;
-const CHANNEL_SIZE: usize = 1024 * 1024;
-type ChannelId = usize;
-
 ///
 /// `RelTag` + block number (`blknum`) gives us a unique id of the page in the cluster.
 ///
@@ -137,22 +130,17 @@ lazy_static! {

 ///
 /// This is the real implementation that uses a Postgres process to
-/// perform WAL replay. I multiplex requests from multiple threads
-/// using `sender` channel and send them to the postgres wal-redo process
-/// pipe by separate thread. Responses are returned through set of `receivers`
-/// channels, used in round robin manner.  Receiver thread is protected by mutex
-/// to prevent it's usage by more than one thread
-/// In the future, we might want to launch a pool of processes to allow concurrent
-/// replay of multiple records.
+/// perform WAL replay. Only one thread can use the processs at a time,
+/// that is controlled by the Mutex. In the future, we might want to
+/// launch a pool of processes to allow concurrent replay of multiple
+/// records.
 ///
 pub struct PostgresRedoManager {
-    // mutiplexor pipe: use sync_channel to allow sharing sender by multiple threads
-    // and limit size of buffer
-    sender: SyncSender<(ChannelId, Vec<u8>)>,
-    // set of receiver channels
-    receivers: Vec<Mutex<Receiver<Bytes>>>,
-    // atomicly incremented counter for choosing receiver
-    round_robin: AtomicUsize,
+    tenantid: ZTenantId,
+    conf: &'static PageServerConf,
+
+    runtime: tokio::runtime::Runtime,
+    process: Mutex<Option<PostgresRedoProcess>>,
 }

 #[derive(Debug)]
@@ -165,13 +153,6 @@ struct WalRedoRequest {
    records: Vec<(Lsn, WALRecord)>,
 }

-impl WalRedoRequest {
-    // Can this request be served by zenith redo funcitons
-    // or we need to pass it to wal-redo postgres process?
-    fn can_apply_in_zenith(&self) -> bool {
-        !matches!(self.rel, RelishTag::Relation(_))
-    }
-}
 /// An error happened in WAL redo
 #[derive(Debug, thiserror::Error)]
 pub enum WalRedoError {
@@ -180,8 +161,6 @@ pub enum WalRedoError {

    #[error("cannot perform WAL redo now")]
    InvalidState,
-    #[error("cannot perform WAL redo for this request")]
-    InvalidRequest,
 }

 ///
@@ -202,6 +181,10 @@ impl WalRedoManager for PostgresRedoManager {
        base_img: Option<Bytes>,
        records: Vec<(Lsn, WALRecord)>,
    ) -> Result<Bytes, WalRedoError> {
+        let start_time;
+        let lock_time;
+        let end_time;
+
        let request = WalRedoRequest {
            rel,
            blknum,
@@ -209,14 +192,28 @@ impl WalRedoManager for PostgresRedoManager {
            base_img,
            records,
        };
-        let start_time = Instant::now();
-        let result = if request.can_apply_in_zenith() {
-            self.handle_apply_request_zenith(&request)
-        } else {
-            self.handle_apply_request_postgres(&request)
+
+        start_time = Instant::now();
+        let result = {
+            let mut process_guard = self.process.lock().unwrap();
+            lock_time = Instant::now();
+
+            // launch the WAL redo process on first use
+            if process_guard.is_none() {
+                let p = self
+                    .runtime
+                    .block_on(PostgresRedoProcess::launch(self.conf, &self.tenantid))?;
+                *process_guard = Some(p);
+            }
+            let process = process_guard.as_mut().unwrap();
+
+            self.runtime
+                .block_on(self.handle_apply_request(process, &request))
        };
-        let end_time = Instant::now();
-        WAL_REDO_TIME.observe(end_time.duration_since(start_time).as_secs_f64());
+        end_time = Instant::now();
+
+        WAL_REDO_WAIT_TIME.observe(lock_time.duration_since(start_time).as_secs_f64());
+        WAL_REDO_TIME.observe(end_time.duration_since(lock_time).as_secs_f64());

        result
    }
@@ -226,109 +223,32 @@ impl PostgresRedoManager {
    ///
    /// Create a new PostgresRedoManager.
    ///
-    pub fn new(conf: &PageServerConf, tenantid: ZTenantId) -> PostgresRedoManager {
-        let (tx, rx): (
-            SyncSender<(ChannelId, Vec<u8>)>,
-            Receiver<(ChannelId, Vec<u8>)>,
-        ) = mpsc::sync_channel(CHANNEL_SIZE);
-        let mut senders: Vec<Sender<Bytes>> = Vec::with_capacity(N_CHANNELS);
-        let mut receivers: Vec<Mutex<Receiver<Bytes>>> = Vec::with_capacity(N_CHANNELS);
-        for _ in 0..N_CHANNELS {
-            let (tx, rx) = mpsc::channel();
-            senders.push(tx);
-            receivers.push(Mutex::new(rx));
-        }
-        if let Ok(mut proc) = PostgresRedoProcess::launch(conf, &tenantid) {
-            let _proxy = std::thread::spawn(move || loop {
-                let (id, data) = rx.recv().unwrap();
-                match proc.apply_wal_records(data) {
-                    Ok(page) => senders[id as usize].send(page).unwrap(),
-                    Err(err) => {
-                        info!("wal-redo failed with error {:?}", err);
-                        proc.kill();
-                        break;
-                    }
-                }
-            });
-            PostgresRedoManager {
-                sender: tx,
-                receivers,
-                round_robin: AtomicUsize::new(0),
-            }
-        } else {
-            panic!("Failed to launch wal-redo postgres");
+    pub fn new(conf: &'static PageServerConf, tenantid: ZTenantId) -> PostgresRedoManager {
+        // We block on waiting for requests on the walredo request channel, but
+        // use async I/O to communicate with the child process. Initialize the
+        // runtime for the async part.
+        let runtime = tokio::runtime::Builder::new_current_thread()
+            .enable_all()
+            .build()
+            .unwrap();
+
+        // The actual process is launched lazily, on first request.
+        PostgresRedoManager {
+            runtime,
+            tenantid,
+            conf,
+            process: Mutex::new(None),
        }
    }

-    fn apply_wal_records(
-        &self,
-        tag: BufferTag,
-        base_img: Option<Bytes>,
-        records: &[(Lsn, WALRecord)],
-    ) -> Result<Bytes, std::io::Error> {
-        // Serialize all the messages to send the WAL redo process first.
-        //
-        // This could be problematic if there are millions of records to replay,
-        // but in practice the number of records is usually so small that it doesn't
-        // matter, and it's better to keep this code simple.
-        let mut writebuf: Vec<u8> = Vec::new();
-        build_begin_redo_for_block_msg(tag, &mut writebuf);
-        if let Some(img) = base_img {
-            build_push_page_msg(tag, &img, &mut writebuf);
-        }
-        for (lsn, rec) in records.iter() {
-            build_apply_record_msg(*lsn, &rec.rec, &mut writebuf);
-        }
-        build_get_page_msg(tag, &mut writebuf);
-        WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64);
-
-        let id = self.round_robin.fetch_add(1, Ordering::Relaxed) % N_CHANNELS;
-        let rx = self.receivers[id].lock().unwrap();
-        self.sender.send((id, writebuf)).unwrap();
-        Ok(rx.recv().unwrap())
-    }
-
    ///
-    /// Process one request for WAL redo using wal-redo postgres
+    /// Process one request for WAL redo.
    ///
-    fn handle_apply_request_postgres(
+    async fn handle_apply_request(
        &self,
+        process: &mut PostgresRedoProcess,
        request: &WalRedoRequest,
    ) -> Result<Bytes, WalRedoError> {
-        let blknum = request.blknum;
-        let lsn = request.lsn;
-        let base_img = request.base_img.clone();
-        let records = &request.records;
-        let nrecords = records.len();
-
-        let start = Instant::now();
-
-        let apply_result: Result<Bytes, Error>;
-
-        if let RelishTag::Relation(rel) = request.rel {
-            // Relational WAL records are applied using wal-redo-postgres
-            let buf_tag = BufferTag { rel, blknum };
-            apply_result = self.apply_wal_records(buf_tag, base_img, records);
-
-            let duration = start.elapsed();
-
-            debug!(
-                "postgres applied {} WAL records in {} us to reconstruct page image at LSN {}",
-                nrecords,
-                duration.as_micros(),
-                lsn
-            );
-
-            apply_result.map_err(WalRedoError::IoError)
-        } else {
-            Err(WalRedoError::InvalidRequest)
-        }
-    }
-
-    ///
-    /// Process one request for WAL redo using custom zenith code
-    ///
-    fn handle_apply_request_zenith(&self, request: &WalRedoRequest) -> Result<Bytes, WalRedoError> {
        let rel = request.rel;
        let blknum = request.blknum;
        let lsn = request.lsn;
@@ -340,158 +260,178 @@ impl PostgresRedoManager {
        let start = Instant::now();

        let apply_result: Result<Bytes, Error>;
-
-        // Non-relational WAL records are handled here, with custom code that has the
-        // same effects as the corresponding Postgres WAL redo function.
-        const ZERO_PAGE: [u8; 8192] = [0u8; 8192];
-        let mut page = BytesMut::new();
-        if let Some(fpi) = base_img {
-            // If full-page image is provided, then use it...
-            page.extend_from_slice(&fpi[..]);
+        if let RelishTag::Relation(rel) = rel {
+            // Relational WAL records are applied using wal-redo-postgres
+            let buf_tag = BufferTag { rel, blknum };
+            apply_result = process.apply_wal_records(buf_tag, base_img, records).await;
        } else {
-            // otherwise initialize page with zeros
-            page.extend_from_slice(&ZERO_PAGE);
-        }
-        // Apply all collected WAL records
-        for (_lsn, record) in records {
-            let mut buf = record.rec.clone();
-
-            WAL_REDO_RECORD_COUNTER.inc();
-
-            // 1. Parse XLogRecord struct
-            // FIXME: refactor to avoid code duplication.
-            let xlogrec = XLogRecord::from_bytes(&mut buf);
-
-            //move to main data
-            // TODO probably, we should store some records in our special format
-            // to avoid this weird parsing on replay
-            let skip = (record.main_data_offset - pg_constants::SIZEOF_XLOGRECORD) as usize;
-            if buf.remaining() > skip {
-                buf.advance(skip);
+            // Non-relational WAL records are handled here, with custom code that has the
+            // same effects as the corresponding Postgres WAL redo function.
+            const ZERO_PAGE: [u8; 8192] = [0u8; 8192];
+            let mut page = BytesMut::new();
+            if let Some(fpi) = base_img {
+                // If full-page image is provided, then use it...
+                page.extend_from_slice(&fpi[..]);
+            } else {
+                // otherwise initialize page with zeros
+                page.extend_from_slice(&ZERO_PAGE);
            }
+            // Apply all collected WAL records
+            for (_lsn, record) in records {
+                let mut buf = record.rec.clone();

-            if xlogrec.xl_rmid == pg_constants::RM_XACT_ID {
-                // Transaction manager stuff
-                let rec_segno = match rel {
-                    RelishTag::Slru { slru, segno } => {
-                        assert!(
-                            slru == SlruKind::Clog,
-                            "Not valid XACT relish tag {:?}",
-                            rel
-                        );
-                        segno
-                    }
-                    _ => panic!("Not valid XACT relish tag {:?}", rel),
-                };
-                let parsed_xact =
-                    XlXactParsedRecord::decode(&mut buf, xlogrec.xl_xid, xlogrec.xl_info);
-                if parsed_xact.info == pg_constants::XLOG_XACT_COMMIT
-                    || parsed_xact.info == pg_constants::XLOG_XACT_COMMIT_PREPARED
-                {
-                    transaction_id_set_status(
-                        parsed_xact.xid,
-                        pg_constants::TRANSACTION_STATUS_COMMITTED,
-                        &mut page,
-                    );
-                    for subxact in &parsed_xact.subxacts {
-                        let pageno = *subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
-                        let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
-                        let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
-                        // only update xids on the requested page
-                        if rec_segno == segno && blknum == rpageno {
-                            transaction_id_set_status(
-                                *subxact,
-                                pg_constants::TRANSACTION_STATUS_COMMITTED,
-                                &mut page,
-                            );
-                        }
-                    }
-                } else if parsed_xact.info == pg_constants::XLOG_XACT_ABORT
-                    || parsed_xact.info == pg_constants::XLOG_XACT_ABORT_PREPARED
-                {
-                    transaction_id_set_status(
-                        parsed_xact.xid,
-                        pg_constants::TRANSACTION_STATUS_ABORTED,
-                        &mut page,
-                    );
-                    for subxact in &parsed_xact.subxacts {
-                        let pageno = *subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
-                        let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
-                        let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
-                        // only update xids on the requested page
-                        if rec_segno == segno && blknum == rpageno {
-                            transaction_id_set_status(
-                                *subxact,
-                                pg_constants::TRANSACTION_STATUS_ABORTED,
-                                &mut page,
-                            );
-                        }
-                    }
+                WAL_REDO_RECORD_COUNTER.inc();
+
+                // 1. Parse XLogRecord struct
+                // FIXME: refactor to avoid code duplication.
+                let xlogrec = XLogRecord::from_bytes(&mut buf);
+
+                //move to main data
+                // TODO probably, we should store some records in our special format
+                // to avoid this weird parsing on replay
+                let skip = (record.main_data_offset - pg_constants::SIZEOF_XLOGRECORD) as usize;
+                if buf.remaining() > skip {
+                    buf.advance(skip);
                }
-            } else if xlogrec.xl_rmid == pg_constants::RM_MULTIXACT_ID {
-                // Multixact operations
-                let info = xlogrec.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
-                if info == pg_constants::XLOG_MULTIXACT_CREATE_ID {
-                    let xlrec = XlMultiXactCreate::decode(&mut buf);
-                    if let RelishTag::Slru {
-                        slru,
-                        segno: rec_segno,
-                    } = rel
+
+                if xlogrec.xl_rmid == pg_constants::RM_XACT_ID {
+                    // Transaction manager stuff
+                    let rec_segno = match rel {
+                        RelishTag::Slru { slru, segno } => {
+                            assert!(
+                                slru == SlruKind::Clog,
+                                "Not valid XACT relish tag {:?}",
+                                rel
+                            );
+                            segno
+                        }
+                        _ => panic!("Not valid XACT relish tag {:?}", rel),
+                    };
+                    let parsed_xact =
+                        XlXactParsedRecord::decode(&mut buf, xlogrec.xl_xid, xlogrec.xl_info);
+                    if parsed_xact.info == pg_constants::XLOG_XACT_COMMIT
+                        || parsed_xact.info == pg_constants::XLOG_XACT_COMMIT_PREPARED
                    {
-                        if slru == SlruKind::MultiXactMembers {
-                            for i in 0..xlrec.nmembers {
-                                let pageno = i / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
-                                let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
-                                let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
-                                if segno == rec_segno && rpageno == blknum {
-                                    // update only target block
-                                    let offset = xlrec.moff + i;
-                                    let memberoff = mx_offset_to_member_offset(offset);
-                                    let flagsoff = mx_offset_to_flags_offset(offset);
-                                    let bshift = mx_offset_to_flags_bitshift(offset);
-                                    let mut flagsval =
-                                        LittleEndian::read_u32(&page[flagsoff..flagsoff + 4]);
-                                    flagsval &=
-                                        !(((1 << pg_constants::MXACT_MEMBER_BITS_PER_XACT) - 1)
+                        transaction_id_set_status(
+                            parsed_xact.xid,
+                            pg_constants::TRANSACTION_STATUS_COMMITTED,
+                            &mut page,
+                        );
+                        for subxact in &parsed_xact.subxacts {
+                            let pageno = *subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
+                            let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
+                            let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
+                            // only update xids on the requested page
+                            if rec_segno == segno && blknum == rpageno {
+                                transaction_id_set_status(
+                                    *subxact,
+                                    pg_constants::TRANSACTION_STATUS_COMMITTED,
+                                    &mut page,
+                                );
+                            }
+                        }
+                    } else if parsed_xact.info == pg_constants::XLOG_XACT_ABORT
+                        || parsed_xact.info == pg_constants::XLOG_XACT_ABORT_PREPARED
+                    {
+                        transaction_id_set_status(
+                            parsed_xact.xid,
+                            pg_constants::TRANSACTION_STATUS_ABORTED,
+                            &mut page,
+                        );
+                        for subxact in &parsed_xact.subxacts {
+                            let pageno = *subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
+                            let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
+                            let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
+                            // only update xids on the requested page
+                            if rec_segno == segno && blknum == rpageno {
+                                transaction_id_set_status(
+                                    *subxact,
+                                    pg_constants::TRANSACTION_STATUS_ABORTED,
+                                    &mut page,
+                                );
+                            }
+                        }
+                    }
+                } else if xlogrec.xl_rmid == pg_constants::RM_MULTIXACT_ID {
+                    // Multixact operations
+                    let info = xlogrec.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
+                    if info == pg_constants::XLOG_MULTIXACT_CREATE_ID {
+                        let xlrec = XlMultiXactCreate::decode(&mut buf);
+                        if let RelishTag::Slru {
+                            slru,
+                            segno: rec_segno,
+                        } = rel
+                        {
+                            if slru == SlruKind::MultiXactMembers {
+                                for i in 0..xlrec.nmembers {
+                                    let pageno =
+                                        i / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
+                                    let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
+                                    let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
+                                    if segno == rec_segno && rpageno == blknum {
+                                        // update only target block
+                                        let offset = xlrec.moff + i;
+                                        let memberoff = mx_offset_to_member_offset(offset);
+                                        let flagsoff = mx_offset_to_flags_offset(offset);
+                                        let bshift = mx_offset_to_flags_bitshift(offset);
+                                        let mut flagsval =
+                                            LittleEndian::read_u32(&page[flagsoff..flagsoff + 4]);
+                                        flagsval &= !(((1
+                                            << pg_constants::MXACT_MEMBER_BITS_PER_XACT)
+                                            - 1)
                                            << bshift);
-                                    flagsval |= xlrec.members[i as usize].status << bshift;
-                                    LittleEndian::write_u32(
-                                        &mut page[flagsoff..flagsoff + 4],
-                                        flagsval,
-                                    );
-                                    LittleEndian::write_u32(
-                                        &mut page[memberoff..memberoff + 4],
-                                        xlrec.members[i as usize].xid,
-                                    );
+                                        flagsval |= xlrec.members[i as usize].status << bshift;
+                                        LittleEndian::write_u32(
+                                            &mut page[flagsoff..flagsoff + 4],
+                                            flagsval,
+                                        );
+                                        LittleEndian::write_u32(
+                                            &mut page[memberoff..memberoff + 4],
+                                            xlrec.members[i as usize].xid,
+                                        );
+                                    }
                                }
+                            } else {
+                                // Multixact offsets SLRU
+                                let offs = (xlrec.mid
+                                    % pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32
+                                    * 4) as usize;
+                                LittleEndian::write_u32(&mut page[offs..offs + 4], xlrec.moff);
                            }
                        } else {
-                            // Multixact offsets SLRU
-                            let offs = (xlrec.mid % pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32
-                                * 4) as usize;
-                            LittleEndian::write_u32(&mut page[offs..offs + 4], xlrec.moff);
+                            panic!();
                        }
                    } else {
                        panic!();
                    }
-                } else {
-                    panic!();
                }
            }
-        }

-        apply_result = Ok::<Bytes, Error>(page.freeze());
+            apply_result = Ok::<Bytes, Error>(page.freeze());
+        }

        let duration = start.elapsed();

+        let result: Result<Bytes, WalRedoError>;
+
        debug!(
-            "zenith applied {} WAL records in {} ms to reconstruct page image at LSN {}",
+            "applied {} WAL records in {} ms to reconstruct page image at LSN {}",
            nrecords,
            duration.as_millis(),
            lsn
        );

-        apply_result.map_err(WalRedoError::IoError)
+        if let Err(e) = apply_result {
+            error!("could not apply WAL records: {:#}", e);
+            result = Err(WalRedoError::IoError(e));
+        } else {
+            let img = apply_result.unwrap();
+
+            result = Ok(img);
+        }
+
+        // The caller is responsible for sending the response
+        result
    }
 }

@@ -499,17 +439,18 @@ impl PostgresRedoManager {
 /// Handle to the Postgres WAL redo process
 ///
 struct PostgresRedoProcess {
-    child: Child,
    stdin: ChildStdin,
    stdout: ChildStdout,
-    stderr: ChildStderr,
 }

 impl PostgresRedoProcess {
    //
    // Start postgres binary in special WAL redo mode.
    //
-    fn launch(conf: &PageServerConf, tenantid: &ZTenantId) -> Result<PostgresRedoProcess, Error> {
+    async fn launch(
+        conf: &PageServerConf,
+        tenantid: &ZTenantId,
+    ) -> Result<PostgresRedoProcess, Error> {
        // FIXME: We need a dummy Postgres cluster to run the process in. Currently, we
        // just create one with constant name. That fails if you try to launch more than
        // one WAL redo manager concurrently.
@@ -530,6 +471,7 @@ impl PostgresRedoProcess {
            .env("LD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap())
            .env("DYLD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap())
            .output()
+            .await
            .expect("failed to execute initdb");

        if !initdb.status.success() {
@@ -566,114 +508,102 @@ impl PostgresRedoProcess {
            datadir.display()
        );

-        let stdin = child.stdin.take().unwrap();
-        let stdout = child.stdout.take().unwrap();
-        let stderr = child.stderr.take().unwrap();
+        let stdin = child.stdin.take().expect("failed to open child's stdin");
+        let stderr = child.stderr.take().expect("failed to open child's stderr");
+        let stdout = child.stdout.take().expect("failed to open child's stdout");

-        set_nonblock(stdin.as_raw_fd())?;
-        set_nonblock(stdout.as_raw_fd())?;
-        set_nonblock(stderr.as_raw_fd())?;
+        // This async block reads the child's stderr, and forwards it to the logger
+        let f_stderr = async {
+            let mut stderr_buffered = tokio::io::BufReader::new(stderr);

-        Ok(PostgresRedoProcess {
-            child,
-            stdin,
-            stdout,
-            stderr,
-        })
-    }
+            let mut line = String::new();
+            loop {
+                let res = stderr_buffered.read_line(&mut line).await;
+                if res.is_err() {
+                    debug!("could not convert line to utf-8");
+                    continue;
+                }
+                if res.unwrap() == 0 {
+                    break;
+                }
+                error!("wal-redo-postgres: {}", line.trim());
+                line.clear();
+            }
+            Ok::<(), Error>(())
+        };
+        tokio::spawn(f_stderr);

-    fn kill(mut self) {
-        let _ = self.child.kill();
-        if let Ok(exit_status) = self.child.wait() {
-            error!("wal-redo-postgres exited with code {}", exit_status);
-        }
-        drop(self);
+        Ok(PostgresRedoProcess { stdin, stdout })
    }

    //
    // Apply given WAL records ('records') over an old page image. Returns
    // new page image.
    //
-    fn apply_wal_records(&mut self, writebuf: Vec<u8>) -> Result<Bytes, std::io::Error> {
-        let mut nwrite = self.stdin.write(&writebuf)?;
-
-        // We expect the WAL redo process to respond with an 8k page image. We read it
-        // into this buffer.
-        let mut resultbuf = vec![0; pg_constants::BLCKSZ.into()];
-        let mut nresult: usize = 0; // # of bytes read into 'resultbuf' so far
-
-        // Prepare for calling poll()
-        let mut pollfds = [
-            PollFd::new(self.stdout.as_raw_fd(), PollFlags::POLLIN),
-            PollFd::new(self.stderr.as_raw_fd(), PollFlags::POLLIN),
-            PollFd::new(self.stdin.as_raw_fd(), PollFlags::POLLOUT),
-        ];
+    async fn apply_wal_records(
+        &mut self,
+        tag: BufferTag,
+        base_img: Option<Bytes>,
+        records: &[(Lsn, WALRecord)],
+    ) -> Result<Bytes, std::io::Error> {
+        let stdout = &mut self.stdout;
+        // Buffer the writes to avoid a lot of small syscalls.
+        let mut stdin = tokio::io::BufWriter::new(&mut self.stdin);

        // We do three things simultaneously: send the old base image and WAL records to
        // the child process's stdin, read the result from child's stdout, and forward any logging
        // information that the child writes to its stderr to the page server's log.
-        while nresult < pg_constants::BLCKSZ.into() {
-            // If we have more data to write, wake up if 'stdin' becomes writeable or
-            // we have data to read. Otherwise only wake up if there's data to read.
-            let nfds = if nwrite < writebuf.len() { 3 } else { 2 };
-            let n = nix::poll::poll(&mut pollfds[0..nfds], TIMEOUT.as_millis() as i32)?;
-
-            if n == 0 {
-                return Err(Error::new(ErrorKind::Other, "WAL redo timed out"));
+        //
+        // 'f_stdin' handles writing the base image and WAL records to the child process.
+        // 'f_stdout' below reads the result back. And 'f_stderr', which was spawned into the
+        // tokio runtime in the 'launch' function already, forwards the logging.
+        let f_stdin = async {
+            // Send base image, if any. (If the record initializes the page, previous page
+            // version is not needed.)
+            timeout(
+                TIMEOUT,
+                stdin.write_all(&build_begin_redo_for_block_msg(tag)),
+            )
+            .await??;
+            if let Some(img) = base_img {
+                timeout(TIMEOUT, stdin.write_all(&build_push_page_msg(tag, &img))).await??;
            }

-            // If we have some messages in stderr, forward them to the log.
-            let err_revents = pollfds[1].revents().unwrap();
-            if err_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() {
-                let mut errbuf: [u8; 16384] = [0; 16384];
-                let n = self.stderr.read(&mut errbuf)?;
+            // Send WAL records.
+            for (lsn, rec) in records.iter() {
+                WAL_REDO_RECORD_COUNTER.inc();

-                // The message might not be split correctly into lines here. But this is
-                // good enough, the important thing is to get the message to the log.
-                if n > 0 {
-                    error!(
-                        "wal-redo-postgres: {}",
-                        String::from_utf8_lossy(&errbuf[0..n])
-                    );
+                stdin
+                    .write_all(&build_apply_record_msg(*lsn, &rec.rec))
+                    .await?;

-                    // To make sure we capture all log from the process if it fails, keep
-                    // reading from the stderr, before checking the stdout.
-                    continue;
-                }
-            } else if err_revents.contains(PollFlags::POLLHUP) {
-                return Err(Error::new(
-                    ErrorKind::BrokenPipe,
-                    "WAL redo process closed its stderr unexpectedly",
-                ));
+                //debug!("sent WAL record to wal redo postgres process ({:X}/{:X}",
+                //       r.lsn >> 32, r.lsn & 0xffff_ffff);
            }
+            //debug!("sent {} WAL records to wal redo postgres process ({:X}/{:X}",
+            //       records.len(), lsn >> 32, lsn & 0xffff_ffff);

-            // If we have more data to write and 'stdin' is writeable, do write.
-            if nwrite < writebuf.len() {
-                let in_revents = pollfds[2].revents().unwrap();
-                if in_revents & (PollFlags::POLLERR | PollFlags::POLLOUT) != PollFlags::empty() {
-                    nwrite += self.stdin.write(&writebuf[nwrite..])?;
-                } else if in_revents.contains(PollFlags::POLLHUP) {
-                    // We still have more data to write, but the process closed the pipe.
-                    return Err(Error::new(
-                        ErrorKind::BrokenPipe,
-                        "WAL redo process closed its stdin unexpectedly",
-                    ));
-                }
-            }
+            // Send GetPage command to get the result back
+            timeout(TIMEOUT, stdin.write_all(&build_get_page_msg(tag))).await??;
+            timeout(TIMEOUT, stdin.flush()).await??;
+            //debug!("sent GetPage for {}", tag.blknum);
+            Ok::<(), Error>(())
+        };

-            // If we have some data in stdout, read it to the result buffer.
-            let out_revents = pollfds[0].revents().unwrap();
-            if out_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() {
-                nresult += self.stdout.read(&mut resultbuf[nresult..])?;
-            } else if out_revents.contains(PollFlags::POLLHUP) {
-                return Err(Error::new(
-                    ErrorKind::BrokenPipe,
-                    "WAL redo process closed its stdout unexpectedly",
-                ));
-            }
-        }
+        // Read back new page image
+        let f_stdout = async {
+            let mut buf = [0u8; 8192];

-        Ok(Bytes::from(resultbuf))
+            timeout(TIMEOUT, stdout.read_exact(&mut buf)).await??;
+            //debug!("got response for {}", tag.blknum);
+            Ok::<[u8; 8192], Error>(buf)
+        };
+
+        let res = tokio::try_join!(f_stdout, f_stdin)?;
+
+        let buf = res.0;
+
+        Ok::<Bytes, Error>(Bytes::from(std::vec::Vec::from(buf)))
    }
 }

@@ -681,42 +611,62 @@ impl PostgresRedoProcess {
 // process. See vendor/postgres/src/backend/tcop/zenith_wal_redo.c for
 // explanation of the protocol.

-fn build_begin_redo_for_block_msg(tag: BufferTag, buf: &mut Vec<u8>) {
+fn build_begin_redo_for_block_msg(tag: BufferTag) -> Vec<u8> {
    let len = 4 + 1 + 4 * 4;
+    let mut buf = Vec::with_capacity(1 + len);

    buf.put_u8(b'B');
    buf.put_u32(len as u32);

-    tag.ser_into(buf)
+    tag.ser_into(&mut buf)
        .expect("serialize BufferTag should always succeed");
+
+    debug_assert!(buf.len() == 1 + len);
+
+    buf
 }

-fn build_push_page_msg(tag: BufferTag, base_img: &[u8], buf: &mut Vec<u8>) {
+fn build_push_page_msg(tag: BufferTag, base_img: &[u8]) -> Vec<u8> {
    assert!(base_img.len() == 8192);

    let len = 4 + 1 + 4 * 4 + base_img.len();
+    let mut buf = Vec::with_capacity(1 + len);

    buf.put_u8(b'P');
    buf.put_u32(len as u32);
-    tag.ser_into(buf)
+    tag.ser_into(&mut buf)
        .expect("serialize BufferTag should always succeed");
    buf.put(base_img);
+
+    debug_assert!(buf.len() == 1 + len);
+
+    buf
 }

-fn build_apply_record_msg(endlsn: Lsn, rec: &[u8], buf: &mut Vec<u8>) {
+fn build_apply_record_msg(endlsn: Lsn, rec: &[u8]) -> Vec<u8> {
    let len = 4 + 8 + rec.len();
+    let mut buf: Vec<u8> = Vec::with_capacity(1 + len);

    buf.put_u8(b'A');
    buf.put_u32(len as u32);
    buf.put_u64(endlsn.0);
    buf.put(rec);
+
+    debug_assert!(buf.len() == 1 + len);
+
+    buf
 }

-fn build_get_page_msg(tag: BufferTag, buf: &mut Vec<u8>) {
+fn build_get_page_msg(tag: BufferTag) -> Vec<u8> {
    let len = 4 + 1 + 4 * 4;
+    let mut buf = Vec::with_capacity(1 + len);

    buf.put_u8(b'G');
    buf.put_u32(len as u32);
-    tag.ser_into(buf)
+    tag.ser_into(&mut buf)
        .expect("serialize BufferTag should always succeed");
+
+    debug_assert!(buf.len() == 1 + len);
+
+    buf
 }
--- a/vendor/postgres
+++ b/vendor/postgres
--- a/walkeeper/src/replication.rs
+++ b/walkeeper/src/replication.rs
@@ -2,7 +2,7 @@
 //! with the "START_REPLICATION" message.

 use crate::send_wal::SendWalHandler;
-use crate::timeline::{Timeline, TimelineTools};
+use crate::timeline::{ReplicaState, Timeline, TimelineTools};
 use anyhow::{anyhow, Context, Result};
 use bytes::Bytes;
 use log::*;
@@ -20,7 +20,7 @@ use std::{str, thread};
 use zenith_utils::bin_ser::BeSer;
 use zenith_utils::lsn::Lsn;
 use zenith_utils::postgres_backend::PostgresBackend;
-use zenith_utils::pq_proto::{BeMessage, FeMessage, XLogDataBody};
+use zenith_utils::pq_proto::{BeMessage, FeMessage, WalSndKeepAlive, XLogDataBody};
 use zenith_utils::sock_split::ReadStream;

 pub const END_REPLICATION_MARKER: Lsn = Lsn::MAX;
@@ -32,7 +32,7 @@ const STANDBY_STATUS_UPDATE_TAG_BYTE: u8 = b'r';
 type FullTransactionId = u64;

 /// Hot standby feedback received from replica
-#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
+#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
 pub struct HotStandbyFeedback {
    pub ts: TimestampTz,
    pub xmin: FullTransactionId,
@@ -49,6 +49,16 @@ impl HotStandbyFeedback {
    }
 }

+/// Standby status update
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct StandbyReply {
+    pub write_lsn: Lsn, // disk consistent lSN
+    pub flush_lsn: Lsn, // LSN committedby quorum
+    pub apply_lsn: Lsn, // not used
+    pub reply_ts: TimestampTz,
+    pub reply_requested: bool,
+}
+
 /// A network connection that's speaking the replication protocol.
 pub struct ReplicationConn {
    /// This is an `Option` because we will spawn a background thread that will
@@ -56,16 +66,15 @@ pub struct ReplicationConn {
    stream_in: Option<ReadStream>,
 }

-// TODO: move this to crate::timeline when there's more users
-// TODO: design a proper Timeline mock api
-trait HsFeedbackSubscriber {
-    fn add_hs_feedback(&self, _feedback: HotStandbyFeedback) {}
+/// Scope guard to unregister replication connection from timeline
+struct ReplicationConnGuard {
+    replica: usize, // replica internal ID assigned by timeline
+    timeline: Arc<Timeline>,
 }

-impl HsFeedbackSubscriber for Arc<Timeline> {
-    #[inline(always)]
-    fn add_hs_feedback(&self, feedback: HotStandbyFeedback) {
-        Timeline::add_hs_feedback(self, feedback);
+impl Drop for ReplicationConnGuard {
+    fn drop(&mut self) {
+        self.timeline.update_replica_state(self.replica, None);
    }
 }

@@ -79,26 +88,33 @@ impl ReplicationConn {

    /// Handle incoming messages from the network.
    /// This is spawned into the background by `handle_start_replication`.
-    fn background_thread(
-        mut stream_in: impl Read,
-        subscriber: impl HsFeedbackSubscriber,
-    ) -> Result<()> {
+    fn background_thread(mut stream_in: impl Read, timeline: Arc<Timeline>) -> Result<()> {
+        let mut state = ReplicaState::new();
+        let replica = timeline.add_replica(state);
+        let _guard = ReplicationConnGuard {
+            replica,
+            timeline: timeline.clone(),
+        };
        // Wait for replica's feedback.
        while let Some(msg) = FeMessage::read(&mut stream_in)? {
            match &msg {
                FeMessage::CopyData(m) => {
                    // There's two possible data messages that the client is supposed to send here:
-                    // `HotStandbyFeedback` and `StandbyStatusUpdate`. We only handle hot standby
-                    // feedback.
+                    // `HotStandbyFeedback` and `StandbyStatusUpdate`.

                    match m.first().cloned() {
                        Some(HOT_STANDBY_FEEDBACK_TAG_BYTE) => {
                            // Note: deserializing is on m[1..] because we skip the tag byte.
-                            let feedback = HotStandbyFeedback::des(&m[1..])
+                            state.hs_feedback = HotStandbyFeedback::des(&m[1..])
                                .context("failed to deserialize HotStandbyFeedback")?;
-                            subscriber.add_hs_feedback(feedback);
+                            timeline.update_replica_state(replica, Some(state));
+                        }
+                        Some(STANDBY_STATUS_UPDATE_TAG_BYTE) => {
+                            let reply = StandbyReply::des(&m[1..])
+                                .context("failed to deserialize StandbyReply")?;
+                            state.disk_consistent_lsn = reply.write_lsn;
+                            timeline.update_replica_state(replica, Some(state));
                        }
-                        Some(STANDBY_STATUS_UPDATE_TAG_BYTE) => (),
                        _ => warn!("unexpected message {:?}", msg),
                    }
                }
@@ -187,7 +203,7 @@ impl ReplicationConn {
        // switch to copy
        pgb.write_message(&BeMessage::CopyBothResponse)?;

-        let mut end_pos: Lsn;
+        let mut end_pos = Lsn(0);
        let mut wal_file: Option<File> = None;

        loop {
@@ -202,7 +218,18 @@ impl ReplicationConn {
            } else {
                /* normal mode */
                let timeline = swh.timeline.get();
-                end_pos = timeline.wait_for_lsn(start_pos);
+                if let Some(lsn) = timeline.wait_for_lsn(start_pos) {
+                    end_pos = lsn
+                } else {
+                    // timeout expired: request pageserver status
+                    pgb.write_message(&BeMessage::KeepAlive(WalSndKeepAlive {
+                        sent_ptr: end_pos.0,
+                        timestamp: get_current_timestamp(),
+                        request_reply: true,
+                    }))
+                    .context("Failed to send KeepAlive message")?;
+                    continue;
+                }
            }
            if end_pos == END_REPLICATION_MARKER {
                break;
@@ -257,18 +284,3 @@ impl ReplicationConn {
        Ok(())
    }
 }
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    // A no-op impl for tests
-    impl HsFeedbackSubscriber for () {}
-
-    #[test]
-    fn test_replication_conn_background_thread_eof() {
-        // Test that background_thread recognizes EOF
-        let stream: &[u8] = &[];
-        ReplicationConn::background_thread(stream, ()).unwrap();
-    }
-}
--- a/walkeeper/src/safekeeper.rs
+++ b/walkeeper/src/safekeeper.rs
@@ -191,6 +191,8 @@ pub struct AppendResponse {
    // We report back our awareness about which WAL is committed, as this is
    // a criterion for walproposer --sync mode exit
    pub commit_lsn: Lsn,
+    // Min disk consistent lsn of pageservers (portion of WAL applied and written to the disk by pageservers)
+    pub disk_consistent_lsn: Lsn,
    pub hs_feedback: HotStandbyFeedback,
 }

@@ -458,6 +460,7 @@ where
                epoch: self.s.acceptor_state.epoch,
                commit_lsn: Lsn(0),
                flush_lsn: Lsn(0),
+                disk_consistent_lsn: Lsn(0),
                hs_feedback: HotStandbyFeedback::empty(),
            };
            return Ok(AcceptorProposerMessage::AppendResponse(resp));
@@ -567,6 +570,7 @@ where
            epoch: self.s.acceptor_state.epoch,
            flush_lsn: self.flush_lsn,
            commit_lsn: self.s.commit_lsn,
+            disk_consistent_lsn: Lsn(0),
            // will be filled by caller code to avoid bothering safekeeper
            hs_feedback: HotStandbyFeedback::empty(),
        };
--- a/walkeeper/src/timeline.rs
+++ b/walkeeper/src/timeline.rs
@@ -11,9 +11,9 @@ use std::collections::HashMap;
 use std::fs::{self, File, OpenOptions};
 use std::io::{Seek, SeekFrom, Write};
 use std::sync::{Arc, Condvar, Mutex};
+use std::time::Duration;
 use zenith_utils::bin_ser::LeSer;
 use zenith_utils::lsn::Lsn;
-
 use zenith_utils::zid::{ZTenantId, ZTimelineId};

 use crate::replication::{HotStandbyFeedback, END_REPLICATION_MARKER};
@@ -25,6 +25,35 @@ use crate::WalAcceptorConf;
 use postgres_ffi::xlog_utils::{XLogFileName, XLOG_BLCKSZ};

 const CONTROL_FILE_NAME: &str = "safekeeper.control";
+const POLL_STATE_TIMEOUT: Duration = Duration::from_secs(1);
+
+/// Replica status: host standby feedback + disk consistent lsn
+#[derive(Debug, Clone, Copy)]
+pub struct ReplicaState {
+    /// combined disk_consistent_lsn of pageservers
+    pub disk_consistent_lsn: Lsn,
+    /// combined hot standby feedback from all replicas
+    pub hs_feedback: HotStandbyFeedback,
+}
+
+impl Default for ReplicaState {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl ReplicaState {
+    pub fn new() -> ReplicaState {
+        ReplicaState {
+            disk_consistent_lsn: Lsn(u64::MAX),
+            hs_feedback: HotStandbyFeedback {
+                ts: 0,
+                xmin: u64::MAX,
+                catalog_xmin: u64::MAX,
+            },
+        }
+    }
+}

 /// Shared state associated with database instance (tenant)
 struct SharedState {
@@ -33,8 +62,8 @@ struct SharedState {
    /// For receiving-sending wal cooperation
    /// quorum commit LSN we've notified walsenders about
    notified_commit_lsn: Lsn,
-    /// combined hot standby feedback from all replicas
-    hs_feedback: HotStandbyFeedback,
+    /// State of replicas
+    replicas: Vec<Option<ReplicaState>>,
 }

 // A named boolean.
@@ -45,6 +74,31 @@ pub enum CreateControlFile {
 }

 impl SharedState {
+    /// Get combined stateof all alive replicas
+    pub fn get_replicas_state(&self) -> ReplicaState {
+        let mut acc = ReplicaState::new();
+        for state in self.replicas.iter().flatten() {
+            acc.hs_feedback.ts = max(acc.hs_feedback.ts, state.hs_feedback.ts);
+            acc.hs_feedback.xmin = min(acc.hs_feedback.xmin, state.hs_feedback.xmin);
+            acc.hs_feedback.catalog_xmin =
+                min(acc.hs_feedback.catalog_xmin, state.hs_feedback.catalog_xmin);
+            acc.disk_consistent_lsn = Lsn::min(acc.disk_consistent_lsn, state.disk_consistent_lsn);
+        }
+        acc
+    }
+
+    /// Assign new replica ID. We choose first empty cell in the replicas vector
+    /// or extend the vector if there are not free items.
+    pub fn add_replica(&mut self, state: ReplicaState) -> usize {
+        if let Some(pos) = self.replicas.iter().position(|r| r.is_none()) {
+            self.replicas[pos] = Some(state);
+            return pos;
+        }
+        let pos = self.replicas.len();
+        self.replicas.push(Some(state));
+        pos
+    }
+
    /// Restore SharedState from control file. Locks the control file along the
    /// way to prevent running more than one instance of safekeeper on the same
    /// data dir.
@@ -74,21 +128,10 @@ impl SharedState {
        Ok(Self {
            notified_commit_lsn: Lsn(0),
            sk: SafeKeeper::new(Lsn(flush_lsn), tli, storage, state),
-            hs_feedback: HotStandbyFeedback {
-                ts: 0,
-                xmin: u64::MAX,
-                catalog_xmin: u64::MAX,
-            },
+            replicas: Vec::new(),
        })
    }

-    /// Accumulate hot standby feedbacks from replicas
-    pub fn add_hs_feedback(&mut self, feedback: HotStandbyFeedback) {
-        self.hs_feedback.xmin = min(self.hs_feedback.xmin, feedback.xmin);
-        self.hs_feedback.catalog_xmin = min(self.hs_feedback.catalog_xmin, feedback.catalog_xmin);
-        self.hs_feedback.ts = max(self.hs_feedback.ts, feedback.ts);
-    }
-
    /// Fetch and lock control file (prevent running more than one instance of safekeeper)
    /// If create=false and file doesn't exist, bails out.
    fn load_control_file(
@@ -178,20 +221,27 @@ impl Timeline {
        }
    }

-    /// Wait for an LSN to be committed.
+    /// Timed wait for an LSN to be committed.
    ///
    /// Returns the last committed LSN, which will be at least
-    /// as high as the LSN waited for.
+    /// as high as the LSN waited for, or None if timeout expired.
    ///
-    pub fn wait_for_lsn(&self, lsn: Lsn) -> Lsn {
+    pub fn wait_for_lsn(&self, lsn: Lsn) -> Option<Lsn> {
        let mut shared_state = self.mutex.lock().unwrap();
        loop {
            let commit_lsn = shared_state.notified_commit_lsn;
            // This must be `>`, not `>=`.
            if commit_lsn > lsn {
-                return commit_lsn;
+                return Some(commit_lsn);
            }
-            shared_state = self.cond.wait(shared_state).unwrap();
+            let result = self
+                .cond
+                .wait_timeout(shared_state, POLL_STATE_TIMEOUT)
+                .unwrap();
+            if result.1.timed_out() {
+                return None;
+            }
+            shared_state = result.0
        }
    }

@@ -219,9 +269,11 @@ impl Timeline {
            // commit_lsn if we are catching up safekeeper.
            commit_lsn = shared_state.sk.commit_lsn;

-            // if this is AppendResponse, fill in proper hot standby feedback
+            // if this is AppendResponse, fill in proper hot standby feedback and disk consistent lsn
            if let AcceptorProposerMessage::AppendResponse(ref mut resp) = rmsg {
-                resp.hs_feedback = shared_state.hs_feedback.clone();
+                let state = shared_state.get_replicas_state();
+                resp.hs_feedback = state.hs_feedback;
+                resp.disk_consistent_lsn = state.disk_consistent_lsn;
            }
        }
        // Ping wal sender that new data might be available.
@@ -233,15 +285,14 @@ impl Timeline {
        self.mutex.lock().unwrap().sk.s.clone()
    }

-    // Accumulate hot standby feedbacks from replicas
-    pub fn add_hs_feedback(&self, feedback: HotStandbyFeedback) {
+    pub fn add_replica(&self, state: ReplicaState) -> usize {
        let mut shared_state = self.mutex.lock().unwrap();
-        shared_state.add_hs_feedback(feedback);
+        shared_state.add_replica(state)
    }

-    pub fn get_hs_feedback(&self) -> HotStandbyFeedback {
-        let shared_state = self.mutex.lock().unwrap();
-        shared_state.hs_feedback.clone()
+    pub fn update_replica_state(&self, id: usize, state: Option<ReplicaState>) {
+        let mut shared_state = self.mutex.lock().unwrap();
+        shared_state.replicas[id] = state;
    }

    pub fn get_end_of_wal(&self) -> (Lsn, u32) {
--- a/zenith_utils/Cargo.toml
+++ b/zenith_utils/Cargo.toml
@@ -27,7 +27,6 @@ workspace_hack = { path = "../workspace_hack" }
 rand = "0.8.3"
 jsonwebtoken = "7"
 hex = { version = "0.4.3", features = ["serde"] }
-nix = "0.23.0"

 rustls = "0.19.1"
 rustls-split = "0.2.1"
--- a/zenith_utils/src/lib.rs
+++ b/zenith_utils/src/lib.rs
@@ -40,7 +40,3 @@ pub mod logging;

 // Misc
 pub mod accum;
-
-// Utility for putting a raw file descriptor into non-blocking mode
-pub mod nonblock;
-
--- a/zenith_utils/src/nonblock.rs
+++ b/zenith_utils/src/nonblock.rs
@@ -1,17 +0,0 @@
-use nix::fcntl::{fcntl, OFlag, F_GETFL, F_SETFL};
-use std::os::unix::io::RawFd;
-
-/// Put a file descriptor into non-blocking mode
-pub fn set_nonblock(fd: RawFd) -> Result<(), std::io::Error> {
-    let bits = fcntl(fd, F_GETFL)?;
-
-    // Safety: If F_GETFL returns some unknown bits, they should be valid
-    // for passing back to F_SETFL, too. If we left them out, the F_SETFL
-    // would effectively clear them, which is not what we want.
-    let mut flags = unsafe { OFlag::from_bits_unchecked(bits) };
-    flags |= OFlag::O_NONBLOCK;
-
-    fcntl(fd, F_SETFL(flags))?;
-
-    Ok(())
-}
--- a/zenith_utils/src/pq_proto.rs
+++ b/zenith_utils/src/pq_proto.rs
@@ -358,6 +358,7 @@ pub enum BeMessage<'a> {
    RowDescription(&'a [RowDescriptor<'a>]),
    XLogData(XLogDataBody<'a>),
    NoticeResponse(String),
+    KeepAlive(WalSndKeepAlive),
 }

 // One row desciption in RowDescription packet.
@@ -409,6 +410,13 @@ pub struct XLogDataBody<'a> {
    pub data: &'a [u8],
 }

+#[derive(Debug)]
+pub struct WalSndKeepAlive {
+    pub sent_ptr: u64,
+    pub timestamp: i64,
+    pub request_reply: bool,
+}
+
 pub static HELLO_WORLD_ROW: BeMessage = BeMessage::DataRow(&[Some(b"hello world")]);

 // single text column
@@ -721,6 +729,18 @@ impl<'a> BeMessage<'a> {
                })
                .unwrap();
            }
+
+            BeMessage::KeepAlive(req) => {
+                buf.put_u8(b'd');
+                write_body(buf, |buf| {
+                    buf.put_u8(b'k');
+                    buf.put_u64(req.sent_ptr);
+                    buf.put_i64(req.timestamp);
+                    buf.put_u8(if req.request_reply { 1u8 } else { 0u8 });
+                    Ok::<_, io::Error>(())
+                })
+                .unwrap();
+            }
        }
        Ok(())
    }
--- a/zenith_utils/src/vec_map.rs
+++ b/zenith_utils/src/vec_map.rs
@@ -87,10 +87,6 @@ impl<K: Ord, V> VecMap<K, V> {
        Ok(None)
    }

-    pub fn last(&self) -> Option<&(K, V)> {
-        self.0.last()
-    }
-
    /// Split the map into two.
    ///
    /// The left map contains everything before `cutoff` (exclusive).
Author	SHA1	Message	Date
Alexey Kondratov	58ee590bfd	cargo fmt	2021-10-20 14:01:00 +03:00
Alexey Kondratov	78773ee2c1	Bump vendor/postgres	2021-10-20 13:59:42 +03:00
Konstantin Knizhnik	a6d096a874	Merge with main	2021-10-20 13:19:50 +03:00
Konstantin Knizhnik	1ae336c356	Fix type in comment	2021-10-19 17:42:29 +03:00
Konstantin Knizhnik	57da296356	Fix race condition in add_replica	2021-10-19 12:01:24 +03:00
Konstantin Knizhnik	4ee30b6c36	Merge with main	2021-10-08 14:47:00 +03:00
Konstantin Knizhnik	4d90f2f316	Add separate back pressure threashold for write/flush LSNs	2021-09-30 19:55:38 +03:00
Konstantin Knizhnik	fc774c819e	MReduce default checkpoint distance to 64Mb	2021-09-30 17:32:57 +03:00
Konstantin Knizhnik	04a6652f16	Bump postgres version	2021-09-22 16:02:12 +03:00
Konstantin Knizhnik	25ca847295	Remove extra tracing	2021-09-22 11:07:27 +03:00
Konstantin Knizhnik	b472e28f3a	Fix indentation	2021-09-22 10:40:31 +03:00
Konstantin Knizhnik	e28960b816	Fix zerialization of KeepAlive message	2021-09-22 10:40:31 +03:00
Konstantin Knizhnik	379f6b8638	Fix clippy errors	2021-09-22 10:40:31 +03:00
Konstantin Knizhnik	018408e0af	Fix clippy error	2021-09-22 10:40:31 +03:00
Konstantin Knizhnik	bf5d17cbaa	Request disk consistent LSN through safekeeper	2021-09-22 10:40:31 +03:00
Konstantin Knizhnik	ae2232641d	Eliminate warnings	2021-09-22 10:40:22 +03:00
Konstantin Knizhnik	891633f2cd	Bump postgres version	2021-09-22 10:36:14 +03:00
Konstantin Knizhnik	6056ce7602	Implement backpressure for compute node to avoid WAL overflow	2021-09-22 10:36:14 +03:00
Konstantin Knizhnik	e53dd5240a	Adjust default parameters for chekpointer to avoid OOM	2021-09-22 10:35:17 +03:00
Konstantin Knizhnik	611abc5299	Use checkpoint_distance instead of OLDEST_INMEM_DISTANCE	2021-09-22 10:35:17 +03:00
Konstantin Knizhnik	09cced0855	Delay adding new layers if checpointing is too slow to avoid OOM	2021-09-22 10:35:17 +03:00