Compare commits


1 Commit

Author SHA1 Message Date
Patrick Insinger b3de51296a pageserver - OrderedBlockIter 2021-10-09 00:49:59 -07:00
47 changed files with 1180 additions and 1578 deletions

View File

@@ -260,7 +260,7 @@ jobs:
when: always
command: |
du -sh /tmp/test_output/*
find /tmp/test_output -type f ! -name "pg.log" ! -name "pageserver.log" ! -name "safekeeper.log" ! -name "regression.diffs" ! -name "junit.xml" ! -name "*.filediff" ! -name "*.stdout" ! -name "*.stderr" -delete
find /tmp/test_output -type f ! -name "pg.log" ! -name "pageserver.log" ! -name "wal_acceptor.log" ! -name "regression.diffs" ! -name "junit.xml" ! -name "*.filediff" ! -name "*.stdout" ! -name "*.stderr" -delete
du -sh /tmp/test_output/*
- store_artifacts:
path: /tmp/test_output

Cargo.lock (generated, 245 changed lines)
View File

@@ -26,21 +26,18 @@ dependencies = [
"winapi",
]
[[package]]
name = "ansi_term"
version = "0.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d52a9bb7ec0cf484c551830a7ce27bd20d67eac647e1befb56b0be4ee39a55d2"
dependencies = [
"winapi",
]
[[package]]
name = "anyhow"
version = "1.0.42"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "595d3cfa7a60d4555cb5067b99f07142a08ea778de5cf993f7b75c7d8fabc486"
[[package]]
name = "arc-swap"
version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e906254e445520903e7fc9da4f709886c84ae4bc4ddaf0e093188d66df4dc820"
[[package]]
name = "async-trait"
version = "0.1.50"
@@ -301,7 +298,7 @@ version = "2.33.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "37e58ac78573c40708d45522f0d80fa2f01cc4f9b4e2bf749807255454312002"
dependencies = [
"ansi_term 0.11.0",
"ansi_term",
"atty",
"bitflags",
"strsim",
@@ -390,6 +387,26 @@ dependencies = [
"rustc_version",
]
[[package]]
name = "crossbeam-channel"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "06ed27e177f16d65f0f0c22a213e17c696ace5dd64b14258b52f9417ccb52db4"
dependencies = [
"cfg-if 1.0.0",
"crossbeam-utils",
]
[[package]]
name = "crossbeam-utils"
version = "0.8.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d82cfc11ce7f2c3faef78d8a684447b40d503d9681acebed6cb728d45940c4db"
dependencies = [
"cfg-if 1.0.0",
"lazy_static",
]
[[package]]
name = "crypto-mac"
version = "0.10.0"
@@ -428,6 +445,16 @@ dependencies = [
"dirs-sys",
]
[[package]]
name = "dirs-next"
version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b98cf8ebf19c3d1b223e151f99a4f9f0690dca41414773390fc824184ac833e1"
dependencies = [
"cfg-if 1.0.0",
"dirs-sys-next",
]
[[package]]
name = "dirs-sys"
version = "0.3.6"
@@ -439,6 +466,17 @@ dependencies = [
"winapi",
]
[[package]]
name = "dirs-sys-next"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4ebda144c4fe02d1f7ea1a7d9641b6fc6b580adcfa024ae48797ecdeb6825b4d"
dependencies = [
"libc",
"redox_users",
"winapi",
]
[[package]]
name = "dlv-list"
version = "0.2.3"
@@ -918,15 +956,6 @@ dependencies = [
"cfg-if 1.0.0",
]
[[package]]
name = "matchers"
version = "0.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f099785f7595cc4b4553a174ce30dd7589ef93391ff414dbb67f62392b9e0ce1"
dependencies = [
"regex-automata",
]
[[package]]
name = "matches"
version = "0.1.8"
@@ -1191,12 +1220,10 @@ dependencies = [
"scopeguard",
"serde",
"serde_json",
"signal-hook",
"tar",
"thiserror",
"tokio",
"toml",
"tracing",
"workspace_hack",
"zenith_metrics",
"zenith_utils",
@@ -1504,15 +1531,6 @@ dependencies = [
"regex-syntax",
]
[[package]]
name = "regex-automata"
version = "0.1.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132"
dependencies = [
"regex-syntax",
]
[[package]]
name = "regex-syntax"
version = "0.6.25"
@@ -1671,6 +1689,12 @@ dependencies = [
"webpki",
]
[[package]]
name = "rustversion"
version = "1.0.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "61b3909d758bb75c79f23d4736fac9433868679d3ad2ea7a61e3c25cfda9a088"
[[package]]
name = "ryu"
version = "1.0.5"
@@ -1828,32 +1852,12 @@ dependencies = [
"opaque-debug",
]
[[package]]
name = "sharded-slab"
version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "740223c51853f3145fe7c90360d2d4232f2b62e3449489c207eccde818979982"
dependencies = [
"lazy_static",
]
[[package]]
name = "shlex"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "42a568c8f2cd051a4d283bd6eb0343ac214c1b0f1ac19f93e1175b2dee38c73d"
[[package]]
name = "signal-hook"
version = "0.3.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9c98891d737e271a2954825ef19e46bd16bdb98e2746f2eec4f7a4ef7946efd1"
dependencies = [
"cc",
"libc",
"signal-hook-registry",
]
[[package]]
name = "signal-hook-registry"
version = "1.4.0"
@@ -1886,6 +1890,59 @@ version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f173ac3d1a7e3b28003f40de0b5ce7fe2710f9b9dc3fc38664cebee46b3b6527"
[[package]]
name = "slog"
version = "2.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8347046d4ebd943127157b94d63abb990fcf729dc4e9978927fdf4ac3c998d06"
[[package]]
name = "slog-async"
version = "2.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c60813879f820c85dbc4eabf3269befe374591289019775898d56a81a804fbdc"
dependencies = [
"crossbeam-channel",
"slog",
"take_mut",
"thread_local",
]
[[package]]
name = "slog-scope"
version = "4.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2f95a4b4c3274cd2869549da82b57ccc930859bdbf5bcea0424bc5f140b3c786"
dependencies = [
"arc-swap",
"lazy_static",
"slog",
]
[[package]]
name = "slog-stdlog"
version = "4.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8228ab7302adbf4fcb37e66f3cda78003feb521e7fd9e3847ec117a7784d0f5a"
dependencies = [
"log",
"slog",
"slog-scope",
]
[[package]]
name = "slog-term"
version = "2.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "95c1e7e5aab61ced6006149ea772770b84a0d16ce0f7885def313e4829946d76"
dependencies = [
"atty",
"chrono",
"slog",
"term",
"thread_local",
]
[[package]]
name = "smallvec"
version = "1.6.1"
@@ -1941,6 +1998,12 @@ dependencies = [
"unicode-xid",
]
[[package]]
name = "take_mut"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f764005d11ee5f36500a149ace24e00e3da98b0158b3e2d53a7495660d3f4d60"
[[package]]
name = "tap"
version = "1.0.1"
@@ -1972,6 +2035,17 @@ dependencies = [
"winapi",
]
[[package]]
name = "term"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c59df8ac95d96ff9bede18eb7300b0fda5e5d8d90960e76f8e14ae765eedbf1f"
dependencies = [
"dirs-next",
"rustversion",
"winapi",
]
[[package]]
name = "termcolor"
version = "1.1.2"
@@ -2149,79 +2223,24 @@ checksum = "360dfd1d6d30e05fda32ace2c8c70e9c0a9da713275777f5a4dbb8a1893930c6"
[[package]]
name = "tracing"
version = "0.1.29"
version = "0.1.26"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "375a639232caf30edfc78e8d89b2d4c375515393e7af7e16f01cd96917fb2105"
checksum = "09adeb8c97449311ccd28a427f96fb563e7fd31aabf994189879d9da2394b89d"
dependencies = [
"cfg-if 1.0.0",
"pin-project-lite",
"tracing-attributes",
"tracing-core",
]
[[package]]
name = "tracing-attributes"
version = "0.1.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f4f480b8f81512e825f337ad51e94c1eb5d3bbdf2b363dcd01e2b19a9ffe3f8e"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "tracing-core"
version = "0.1.21"
version = "0.1.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1f4ed65637b8390770814083d20756f87bfa2c21bf2f110babdc5438351746e4"
checksum = "a9ff14f98b1a4b289c6248a023c1c2fa1491062964e9fed67ab29c4e4da4a052"
dependencies = [
"lazy_static",
]
[[package]]
name = "tracing-log"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a6923477a48e41c1951f1999ef8bb5a3023eb723ceadafe78ffb65dc366761e3"
dependencies = [
"lazy_static",
"log",
"tracing-core",
]
[[package]]
name = "tracing-serde"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fb65ea441fbb84f9f6748fd496cf7f63ec9af5bca94dd86456978d055e8eb28b"
dependencies = [
"serde",
"tracing-core",
]
[[package]]
name = "tracing-subscriber"
version = "0.2.25"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0e0d2eaa99c3c2e41547cfa109e910a68ea03823cccad4a0525dcbc9b01e8c71"
dependencies = [
"ansi_term 0.12.1",
"chrono",
"lazy_static",
"matchers",
"regex",
"serde",
"serde_json",
"sharded-slab",
"smallvec",
"thread_local",
"tracing",
"tracing-core",
"tracing-log",
"tracing-serde",
]
[[package]]
name = "try-lock"
version = "0.2.3"
@@ -2586,12 +2605,14 @@ dependencies = [
"rustls-split",
"serde",
"serde_json",
"slog",
"slog-async",
"slog-scope",
"slog-stdlog",
"slog-term",
"tempfile",
"thiserror",
"tokio",
"tracing",
"tracing-log",
"tracing-subscriber",
"webpki",
"workspace_hack",
"zenith_metrics",

View File

@@ -37,9 +37,7 @@ RUN apt-get update && apt-get -yq install libreadline-dev libseccomp-dev openssl
mkdir zenith_install
COPY --from=build /zenith/target/release/pageserver /usr/local/bin
COPY --from=build /zenith/target/release/safekeeper /usr/local/bin
# TODO: temporary alias for compatibility, see https://github.com/zenithdb/zenith/pull/740
RUN ln -s /usr/local/bin/safekeeper /usr/local/bin/wal_acceptor
COPY --from=build /zenith/target/release/wal_acceptor /usr/local/bin
COPY --from=build /zenith/target/release/proxy /usr/local/bin
COPY --from=pg-build /zenith/tmp_install postgres_install
COPY docker-entrypoint.sh /docker-entrypoint.sh

View File

@@ -81,9 +81,7 @@ FROM alpine:3.13
RUN apk add --update openssl build-base libseccomp-dev
RUN apk --no-cache --update --repository https://dl-cdn.alpinelinux.org/alpine/edge/testing add rocksdb
COPY --from=build /zenith/target/release/pageserver /usr/local/bin
COPY --from=build /zenith/target/release/safekeeper /usr/local/bin
# TODO: temporary alias for compatibility, see https://github.com/zenithdb/zenith/pull/740
RUN ln -s /usr/local/bin/safekeeper /usr/local/bin/wal_acceptor
COPY --from=build /zenith/target/release/wal_acceptor /usr/local/bin
COPY --from=build /zenith/target/release/proxy /usr/local/bin
COPY --from=pg-build /zenith/tmp_install /usr/local
COPY docker-entrypoint.sh /docker-entrypoint.sh

View File

@@ -10,43 +10,32 @@ endif
# We differentiate between release / debug build types using the BUILD_TYPE
# environment variable.
#
BUILD_TYPE ?= debug
ifeq ($(BUILD_TYPE),release)
PG_CONFIGURE_OPTS = --enable-debug
PG_CFLAGS = -O2 -g3 $(CFLAGS)
# Unfortunately, `--profile=...` is a nightly feature
CARGO_BUILD_FLAGS += --release
else ifeq ($(BUILD_TYPE),debug)
PG_CONFIGURE_OPTS = --enable-debug --enable-cassert --enable-depend
PG_CFLAGS = -O0 -g3 $(CFLAGS)
PG_CFLAGS = -O2 -g3 ${CFLAGS}
else
$(error Bad build type `$(BUILD_TYPE)', see Makefile for options)
PG_CONFIGURE_OPTS = --enable-debug --enable-cassert --enable-depend
PG_CFLAGS = -O0 -g3 ${CFLAGS}
endif
# Choose whether we should be silent or verbose
CARGO_BUILD_FLAGS += --$(if $(filter s,$(MAKEFLAGS)),quiet,verbose)
# Fix for a corner case when make doesn't pass a jobserver
CARGO_BUILD_FLAGS += $(filter -j1,$(MAKEFLAGS))
# This option has a side effect of passing make jobserver to cargo.
# However, we shouldn't do this if `make -n` (--dry-run) has been asked.
CARGO_CMD_PREFIX += $(if $(filter n,$(MAKEFLAGS)),,+)
# Force cargo not to print progress bar
CARGO_CMD_PREFIX += CARGO_TERM_PROGRESS_WHEN=never CI=1
#
# Top level Makefile to build Zenith and PostgreSQL
#
.PHONY: all
all: zenith postgres
# We don't want to run 'cargo build' in parallel with the postgres build,
# because interleaving cargo build output with postgres build output looks
# confusing. Also, 'cargo build' is parallel on its own, so it would be too
# much parallelism. (Recursive invocation of postgres target still gets any
# '-j' flag from the command line, so 'make -j' is still useful.)
.NOTPARALLEL:
### Zenith Rust bits
#
# The 'postgres_ffi' depends on the Postgres headers.
.PHONY: zenith
zenith: postgres-headers
+@echo "Compiling Zenith"
$(CARGO_CMD_PREFIX) cargo build $(CARGO_BUILD_FLAGS)
cargo build
### PostgreSQL parts
tmp_install/build/config.status:
@@ -68,10 +57,10 @@ postgres-headers: postgres-configure
+@echo "Installing PostgreSQL headers"
$(MAKE) -C tmp_install/build/src/include MAKELEVEL=0 install
# Compile and install PostgreSQL and contrib/zenith
.PHONY: postgres
postgres: postgres-configure \
postgres-headers # to prevent `make install` conflicts with zenith's `postgres-headers`
postgres: postgres-configure
+@echo "Compiling PostgreSQL"
$(MAKE) -C tmp_install/build MAKELEVEL=0 install
+@echo "Compiling contrib/zenith"
@@ -79,21 +68,18 @@ postgres: postgres-configure \
+@echo "Compiling contrib/zenith_test_utils"
$(MAKE) -C tmp_install/build/contrib/zenith_test_utils install
.PHONY: postgres-clean
postgres-clean:
$(MAKE) -C tmp_install/build MAKELEVEL=0 clean
# This doesn't remove the effects of 'configure'.
.PHONY: clean
clean:
cd tmp_install/build && $(MAKE) clean
$(CARGO_CMD_PREFIX) cargo clean
cd tmp_install/build && ${MAKE} clean
cargo clean
# This removes everything
.PHONY: distclean
distclean:
rm -rf tmp_install
$(CARGO_CMD_PREFIX) cargo clean
cargo clean
.PHONY: fmt
fmt:

View File

@@ -199,45 +199,23 @@ impl PageServerNode {
bail!("pageserver failed to start in {} seconds", RETRIES);
}
pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
pub fn stop(&self) -> anyhow::Result<()> {
let pid = read_pidfile(&self.pid_file())?;
let pid = Pid::from_raw(pid);
if immediate {
println!("Stop pageserver immediately");
if kill(pid, Signal::SIGQUIT).is_err() {
bail!("Failed to kill pageserver with pid {}", pid);
}
} else {
println!("Stop pageserver gracefully");
if kill(pid, Signal::SIGTERM).is_err() {
bail!("Failed to stop pageserver with pid {}", pid);
}
if kill(pid, Signal::SIGTERM).is_err() {
bail!("Failed to kill pageserver with pid {}", pid);
}
// wait for pageserver stop
let address = connection_address(&self.pg_connection_config);
// TODO Remove this "timeout" and handle it on caller side instead.
// Shutting down may take a long time,
// if pageserver checkpoints a lot of data
for _ in 0..100 {
if let Err(_e) = TcpStream::connect(&address) {
println!("Pageserver stopped receiving connections");
// Now check status
match self.check_status() {
Ok(_) => {
println!("Pageserver status is OK. Wait a bit.");
thread::sleep(Duration::from_secs(1));
}
Err(err) => {
println!("Pageserver status is: {}", err);
return Ok(());
}
}
} else {
println!("Pageserver still receives connections");
thread::sleep(Duration::from_secs(1));
for _ in 0..5 {
let stream = TcpStream::connect(&address);
thread::sleep(Duration::from_secs(1));
if let Err(_e) = stream {
println!("Pageserver stopped");
return Ok(());
}
println!("Stopping pageserver on {}", address);
}
bail!("Failed to stop pageserver with pid {}", pid);
@@ -335,9 +313,8 @@ impl PageServerNode {
impl Drop for PageServerNode {
fn drop(&mut self) {
// TODO Looks like this flag is never set
if self.kill_on_exit {
let _ = self.stop(true);
let _ = self.stop();
}
}
}
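
For reference, a minimal self-contained sketch of the signal-then-poll shutdown pattern the simplified stop() above uses, assuming nix for signalling and a TCP connect as the liveness probe (illustrative names, not code from this commit):

use std::{net::TcpStream, thread, time::Duration};

use anyhow::bail;
use nix::sys::signal::{kill, Signal};
use nix::unistd::Pid;

fn stop_by_pid(pid: i32, address: &str) -> anyhow::Result<()> {
    let pid = Pid::from_raw(pid);
    // Ask the process to terminate gracefully.
    if kill(pid, Signal::SIGTERM).is_err() {
        bail!("Failed to stop process with pid {}", pid);
    }
    // Poll the listening address; once connections are refused, it is gone.
    for _ in 0..5 {
        let probe = TcpStream::connect(address);
        thread::sleep(Duration::from_secs(1));
        if probe.is_err() {
            return Ok(());
        }
    }
    bail!("process {} did not stop in time", pid)
}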

View File

@@ -4,7 +4,7 @@
Currently we build two main images:
- [zenithdb/zenith](https://hub.docker.com/repository/docker/zenithdb/zenith) — image with pre-built `pageserver`, `safekeeper` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile).
- [zenithdb/zenith](https://hub.docker.com/repository/docker/zenithdb/zenith) — image with pre-built `pageserver`, `wal_acceptor` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile).
- [zenithdb/compute-node](https://hub.docker.com/repository/docker/zenithdb/compute-node) — compute node image with pre-built Postgres binaries from [zenithdb/postgres](https://github.com/zenithdb/postgres).
And two intermediate images used either to reduce build time or to deliver some additional binary tools from other repos:

View File

@@ -56,4 +56,4 @@ Tenant id is passed to postgres via GUC the same way as the timeline. Tenant id
### Safety
For now, a particular tenant can only appear on a particular pageserver. The set of safekeepers is also pinned to a particular (tenantid, timeline) pair, so there can be only one writer for a particular (tenantid, timeline).
For now, a particular tenant can only appear on a particular pageserver. The set of WAL acceptors is also pinned to a particular (tenantid, timeline) pair, so there can be only one writer for a particular (tenantid, timeline).

View File

@@ -17,7 +17,7 @@ lazy_static = "1.4.0"
log = "0.4.14"
clap = "2.33.0"
daemonize = "0.4.1"
tokio = { version = "1.11", features = ["process", "macros", "fs", "rt"] }
tokio = { version = "1.11", features = ["process", "macros", "fs"] }
postgres-types = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
@@ -35,8 +35,6 @@ scopeguard = "1.1.0"
rust-s3 = { version = "0.27.0-rc4", features = ["no-verify-ssl"] }
async-trait = "0.1"
const_format = "0.2.21"
tracing = "0.1.27"
signal-hook = {version = "0.3.10", features = ["extended-siginfo"] }
postgres_ffi = { path = "../postgres_ffi" }
zenith_metrics = { path = "../zenith_metrics" }

View File

@@ -31,7 +31,7 @@ use zenith_utils::lsn::Lsn;
pub struct Basebackup<'a> {
ar: Builder<&'a mut dyn Write>,
timeline: &'a Arc<dyn Timeline>,
pub lsn: Lsn,
lsn: Lsn,
prev_record_lsn: Lsn,
}
@@ -97,6 +97,7 @@ impl<'a> Basebackup<'a> {
pub fn send_tarball(&mut self) -> anyhow::Result<()> {
// Create pgdata subdirs structure
for dir in pg_constants::PGDATA_SUBDIRS.iter() {
info!("send subdir {:?}", *dir);
let header = new_tar_header_dir(*dir)?;
self.ar.append(&header, &mut io::empty())?;
}

View File

@@ -2,6 +2,7 @@
// Main entry point for the Page Server executable
//
use log::*;
use pageserver::defaults::*;
use serde::{Deserialize, Serialize};
use std::{
@@ -11,19 +12,9 @@ use std::{
str::FromStr,
thread,
};
use tracing::*;
use zenith_utils::{auth::JwtAuth, logging, postgres_backend::AuthType};
use anyhow::{bail, ensure, Context, Result};
use signal_hook::consts::signal::*;
use signal_hook::consts::TERM_SIGNALS;
use signal_hook::flag;
use signal_hook::iterator::exfiltrator::WithOrigin;
use signal_hook::iterator::SignalsInfo;
use std::process::exit;
use std::sync::atomic::AtomicBool;
use std::sync::Arc;
use clap::{App, Arg, ArgMatches};
use daemonize::Daemonize;
@@ -37,7 +28,6 @@ use pageserver::{
RelishStorageKind, S3Config, LOG_FILE_NAME,
};
use zenith_utils::http::endpoint;
use zenith_utils::postgres_backend;
use const_format::formatcp;
@@ -457,18 +447,7 @@ fn main() -> Result<()> {
fn start_pageserver(conf: &'static PageServerConf) -> Result<()> {
// Initialize logger
let log_file = logging::init(LOG_FILE_NAME, conf.daemonize)?;
let term_now = Arc::new(AtomicBool::new(false));
for sig in TERM_SIGNALS {
// When terminated by a second term signal, exit with exit code 1.
// This will do nothing the first time (because term_now is false).
flag::register_conditional_shutdown(*sig, 1, Arc::clone(&term_now))?;
// But this will "arm" the above for the second time, by setting it to true.
// The order of registering these is important, if you put this one first, it will
// first arm and then terminate all in the first round.
flag::register(*sig, Arc::clone(&term_now))?;
}
let (_scope_guard, log_file) = logging::init(LOG_FILE_NAME, conf.daemonize)?;
// TODO: Check that it looks like a valid repository before going further
@@ -501,7 +480,7 @@ fn start_pageserver(conf: &'static PageServerConf) -> Result<()> {
match daemonize.start() {
Ok(_) => info!("Success, daemonized"),
Err(err) => error!(%err, "could not daemonize"),
Err(e) => error!("could not daemonize: {:#}", e),
}
}
@@ -546,42 +525,13 @@ fn start_pageserver(conf: &'static PageServerConf) -> Result<()> {
page_service::thread_main(conf, auth, pageserver_listener, conf.auth_type)
})?;
for info in SignalsInfo::<WithOrigin>::new(TERM_SIGNALS)?.into_iter() {
match info.signal {
SIGQUIT => {
info!("Got SIGQUIT. Terminate pageserver in immediate shutdown mode");
exit(111);
}
SIGTERM => {
info!("Got SIGINT/SIGTERM. Terminate gracefully in fast shutdown mode");
// Terminate postgres backends
postgres_backend::set_pgbackend_shutdown_requested();
// Stop all tenants and flush their data
tenant_mgr::shutdown_all_tenants()?;
// Wait for pageservice thread to complete the job
page_service_thread
.join()
.expect("thread panicked")
.expect("thread exited with an error");
join_handles.push(page_service_thread);
// Shut down http router
endpoint::shutdown();
// Wait for all threads
for handle in join_handles.into_iter() {
handle
.join()
.expect("thread panicked")
.expect("thread exited with an error");
}
info!("Pageserver shut down successfully completed");
exit(0);
}
_ => {
debug!("Unknown signal.");
}
}
for handle in join_handles.into_iter() {
handle
.join()
.expect("thread panicked")
.expect("thread exited with an error")
}
Ok(())
}

View File

@@ -14,12 +14,12 @@ use std::{
str::FromStr,
sync::Arc,
};
use tracing::*;
use zenith_utils::zid::{ZTenantId, ZTimelineId};
use log::*;
use zenith_utils::crashsafe_dir;
use zenith_utils::logging;
use zenith_utils::lsn::Lsn;
use zenith_utils::zid::{ZTenantId, ZTimelineId};
use crate::tenant_mgr;
use crate::walredo::WalRedoManager;
@@ -100,7 +100,7 @@ pub struct PointInTime {
pub fn init_pageserver(conf: &'static PageServerConf, create_tenant: Option<&str>) -> Result<()> {
// Initialize logger
// use true as the daemonize parameter, because otherwise we pollute the zenith cli output with several pages of info messages
let _log_file = logging::init(LOG_FILE_NAME, true)?;
let (_scope_guard, _log_file) = logging::init(LOG_FILE_NAME, true)?;
// We don't use the real WAL redo manager, because we don't want to spawn the WAL redo
// process during repository initialization.
@@ -176,7 +176,7 @@ fn get_lsn_from_controlfile(path: &Path) -> Result<Lsn> {
// to get bootstrap data for timeline initialization.
//
fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path) -> Result<()> {
info!("running initdb in {}... ", initdbpath.display());
info!("running initdb... ");
let initdb_path = conf.pg_bin_dir().join("initdb");
let initdb_output = Command::new(initdb_path)
@@ -195,6 +195,7 @@ fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path) -> Result<()> {
String::from_utf8_lossy(&initdb_output.stderr)
);
}
info!("initdb succeeded");
Ok(())
}
@@ -209,8 +210,6 @@ fn bootstrap_timeline(
tli: ZTimelineId,
repo: &dyn Repository,
) -> Result<()> {
let _enter = info_span!("bootstrapping", timeline = %tli, tenant = %tenantid).entered();
let initdb_path = conf.tenant_path(&tenantid).join("tmp");
// Init temporarily repo to get bootstrap data
@@ -219,12 +218,14 @@ fn bootstrap_timeline(
let lsn = get_lsn_from_controlfile(&pgdata_path)?.align();
info!("bootstrap_timeline {:?} at lsn {}", pgdata_path, lsn);
// Import the contents of the data directory at the initial checkpoint
// LSN, and any WAL after that.
let timeline = repo.create_empty_timeline(tli)?;
restore_local_repo::import_timeline_from_postgres_datadir(
&pgdata_path,
timeline.writer().as_ref(),
timeline.as_ref(),
lsn,
)?;
timeline.checkpoint()?;

View File

@@ -6,7 +6,6 @@ use hyper::header;
use hyper::StatusCode;
use hyper::{Body, Request, Response, Uri};
use routerify::{ext::RequestExt, RouterBuilder};
use tracing::*;
use zenith_utils::auth::JwtAuth;
use zenith_utils::http::endpoint::attach_openapi_ui;
use zenith_utils::http::endpoint::auth_middleware;
@@ -99,7 +98,6 @@ async fn branch_create_handler(mut request: Request<Body>) -> Result<Response<Bo
check_permission(&request, Some(request_data.tenant_id))?;
let response_data = tokio::task::spawn_blocking(move || {
let _enter = info_span!("/branch_create", name = %request_data.name, tenant = %request_data.tenant_id, startpoint=%request_data.start_point).entered();
branches::create_branch(
get_config(&request),
&request_data.name,
@@ -118,7 +116,6 @@ async fn branch_list_handler(request: Request<Body>) -> Result<Response<Body>, A
check_permission(&request, Some(tenantid))?;
let response_data = tokio::task::spawn_blocking(move || {
let _enter = info_span!("branch_list", tenant = %tenantid).entered();
crate::branches::get_branches(get_config(&request), &tenantid)
})
.await
@@ -129,12 +126,11 @@ async fn branch_list_handler(request: Request<Body>) -> Result<Response<Body>, A
// TODO add to swagger
async fn branch_detail_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenantid: ZTenantId = parse_request_param(&request, "tenant_id")?;
let branch_name: String = get_request_param(&request, "branch_name")?.to_string();
let branch_name: &str = get_request_param(&request, "branch_name")?;
let conf = get_state(&request).conf;
let path = conf.branch_path(&branch_name, &tenantid);
let path = conf.branch_path(branch_name, &tenantid);
let response_data = tokio::task::spawn_blocking(move || {
let _enter = info_span!("branch_detail", tenant = %tenantid, branch=%branch_name).entered();
let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
BranchInfo::from_path(path, conf, &tenantid, &repo)
})
@@ -148,13 +144,10 @@ async fn tenant_list_handler(request: Request<Body>) -> Result<Response<Body>, A
// check for management permission
check_permission(&request, None)?;
let response_data = tokio::task::spawn_blocking(move || {
let _enter = info_span!("tenant_list").entered();
crate::branches::get_tenants(get_config(&request))
})
.await
.map_err(ApiError::from_err)??;
let response_data =
tokio::task::spawn_blocking(move || crate::branches::get_tenants(get_config(&request)))
.await
.map_err(ApiError::from_err)??;
Ok(json_response(StatusCode::OK, response_data)?)
}
@@ -165,7 +158,6 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
let request_data: TenantCreateRequest = json_request(&mut request).await?;
let response_data = tokio::task::spawn_blocking(move || {
let _enter = info_span!("tenant_create", tenant = %request_data.tenant_id).entered();
tenant_mgr::create_repository_for_tenant(get_config(&request), request_data.tenant_id)
})
.await

View File

@@ -15,9 +15,9 @@ use anyhow::{anyhow, bail, ensure, Context, Result};
use bookfile::Book;
use bytes::Bytes;
use lazy_static::lazy_static;
use log::*;
use postgres_ffi::pg_constants::BLCKSZ;
use serde::{Deserialize, Serialize};
use tracing::*;
use std::collections::hash_map::Entry;
use std::collections::HashMap;
@@ -26,18 +26,16 @@ use std::convert::TryInto;
use std::fs;
use std::fs::{File, OpenOptions};
use std::io::Write;
use std::ops::{Bound::Included, Deref};
use std::ops::Bound::Included;
use std::path::{Path, PathBuf};
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::{Arc, Mutex, MutexGuard};
use std::thread::JoinHandle;
use std::time::{Duration, Instant};
use crate::layered_repository::inmemory_layer::FreezeLayers;
use crate::relish::*;
use crate::relish_storage::schedule_timeline_upload;
use crate::repository::{GcResult, Repository, Timeline, TimelineWriter, WALRecord};
use crate::tenant_mgr;
use crate::walreceiver;
use crate::repository::{GcResult, Repository, Timeline, WALRecord};
use crate::walreceiver::IS_WAL_RECEIVER;
use crate::walredo::WalRedoManager;
use crate::PageServerConf;
@@ -71,6 +69,8 @@ use storage_layer::{
Layer, PageReconstructData, PageReconstructResult, SegmentTag, RELISH_SEG_SIZE,
};
use self::inmemory_layer::{NonWriteableError, WriteResult};
static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]);
// Timeout when waiting for WAL receiver to catch up to an LSN given in a GetPage@LSN call.
@@ -111,9 +111,6 @@ lazy_static! {
.expect("failed to define a metric");
}
/// The name of the metadata file pageserver creates per timeline.
pub const METADATA_FILE_NAME: &str = "metadata";
///
/// Repository consists of multiple timelines. Keep them in a hash table.
///
@@ -215,23 +212,6 @@ impl Repository for LayeredRepository {
self.gc_iteration_internal(target_timelineid, horizon, checkpoint_before_gc)
})
}
// Wait for all threads to complete and persist repository data before pageserver shutdown.
fn shutdown(&self) -> Result<()> {
trace!("LayeredRepository shutdown for tenant {}", self.tenantid);
let timelines = self.timelines.lock().unwrap();
for (timelineid, timeline) in timelines.iter() {
walreceiver::stop_wal_receiver(*timelineid);
// Wait for syncing data to disk
trace!("repo shutdown. checkpoint timeline {}", timelineid);
timeline.checkpoint()?;
// TODO: Wait for the walredo process to shut down too
}
Ok(())
}
}
/// Private functions
@@ -260,10 +240,6 @@ impl LayeredRepository {
None
};
let _enter =
info_span!("loading timeline", timeline = %timelineid, tenant = %self.tenantid)
.entered();
let mut timeline = LayeredTimeline::new(
self.conf,
metadata,
@@ -276,16 +252,7 @@ impl LayeredRepository {
)?;
// List the layers on disk, and load them into the layer map
let _loaded_layers = timeline.load_layer_map(disk_consistent_lsn)?;
if self.upload_relishes {
schedule_timeline_upload(());
// schedule_timeline_upload(
// self.tenantid,
// timelineid,
// loaded_layers,
// disk_consistent_lsn,
// );
}
timeline.load_layer_map(disk_consistent_lsn)?;
// needs to be after load_layer_map
timeline.init_current_logical_size()?;
@@ -315,24 +282,21 @@ impl LayeredRepository {
///
/// Launch the checkpointer thread in given repository.
///
pub fn launch_checkpointer_thread(
conf: &'static PageServerConf,
rc: Arc<LayeredRepository>,
) -> JoinHandle<()> {
std::thread::Builder::new()
pub fn launch_checkpointer_thread(conf: &'static PageServerConf, rc: Arc<LayeredRepository>) {
let _thread = std::thread::Builder::new()
.name("Checkpointer thread".into())
.spawn(move || {
// FIXME: relaunch it? Panic is not good.
rc.checkpoint_loop(conf).expect("Checkpointer thread died");
})
.unwrap()
.unwrap();
}
///
/// Checkpointer thread's main loop
///
fn checkpoint_loop(&self, conf: &'static PageServerConf) -> Result<()> {
while !tenant_mgr::shutdown_requested() {
loop {
std::thread::sleep(conf.checkpoint_period);
info!("checkpointer thread for tenant {} waking up", self.tenantid);
@@ -340,60 +304,44 @@ impl LayeredRepository {
// bytes of WAL since last checkpoint.
{
let timelines = self.timelines.lock().unwrap();
for (timelineid, timeline) in timelines.iter() {
let _entered =
info_span!("checkpoint", timeline = %timelineid, tenant = %self.tenantid)
.entered();
for (_timelineid, timeline) in timelines.iter() {
STORAGE_TIME
.with_label_values(&["checkpoint_timed"])
.observe_closure_duration(|| {
timeline.checkpoint_internal(conf.checkpoint_distance, false)
timeline.checkpoint_internal(conf.checkpoint_distance)
})?
}
// release lock on 'timelines'
}
}
trace!("Checkpointer thread shut down");
Ok(())
}
///
/// Launch the GC thread in given repository.
///
pub fn launch_gc_thread(
conf: &'static PageServerConf,
rc: Arc<LayeredRepository>,
) -> JoinHandle<()> {
std::thread::Builder::new()
pub fn launch_gc_thread(conf: &'static PageServerConf, rc: Arc<LayeredRepository>) {
let _thread = std::thread::Builder::new()
.name("GC thread".into())
.spawn(move || {
// FIXME: relaunch it? Panic is not good.
rc.gc_loop(conf).expect("GC thread died");
})
.unwrap()
.unwrap();
}
///
/// GC thread's main loop
///
fn gc_loop(&self, conf: &'static PageServerConf) -> Result<()> {
while !tenant_mgr::shutdown_requested() {
loop {
std::thread::sleep(conf.gc_period);
info!("gc thread for tenant {} waking up", self.tenantid);
// Garbage collect old files that are not needed for PITR anymore
if conf.gc_horizon > 0 {
self.gc_iteration(None, conf.gc_horizon, false).unwrap();
}
// TODO Write this in a more adequate way using
// condvar.wait_timeout() or something
let mut sleep_time = conf.gc_period.as_secs();
while sleep_time > 0 && !tenant_mgr::shutdown_requested() {
sleep_time -= 1;
std::thread::sleep(Duration::from_secs(1));
}
info!("gc thread for tenant {} waking up", self.tenantid);
}
Ok(())
}
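
As the TODO above notes, the one-second polling sleep could be replaced with Condvar::wait_timeout. A minimal, self-contained sketch of that interruptible-sleep pattern (illustrative names, not part of this commit):

use std::sync::{Condvar, Mutex};
use std::time::Duration;

// Shared shutdown flag plus a condvar to wake sleepers early.
struct Shutdown {
    requested: Mutex<bool>,
    cond: Condvar,
}

impl Shutdown {
    // Sleep up to `period`; return true early if shutdown was requested.
    fn sleep_or_shutdown(&self, period: Duration) -> bool {
        let mut requested = self.requested.lock().unwrap();
        while !*requested {
            let (guard, timeout) = self.cond.wait_timeout(requested, period).unwrap();
            requested = guard;
            if timeout.timed_out() {
                return false; // full period elapsed, keep running
            }
        }
        true // shutdown requested
    }

    fn request(&self) {
        *self.requested.lock().unwrap() = true;
        self.cond.notify_all();
    }
}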
/// Save timeline metadata to file
@@ -403,15 +351,17 @@ impl LayeredRepository {
tenantid: ZTenantId,
data: &TimelineMetadata,
first_save: bool,
) -> Result<()> {
let _enter = info_span!("saving metadata").entered();
let path = metadata_path(conf, timelineid, tenantid);
) -> Result<PathBuf> {
let timeline_path = conf.timeline_path(&timelineid, &tenantid);
let path = timeline_path.join("metadata");
// use OpenOptions to ensure file presence is consistent with first_save
let mut file = OpenOptions::new()
.write(true)
.create_new(first_save)
.open(&path)?;
info!("saving metadata {}", path.display());
let mut metadata_bytes = TimelineMetadata::ser(data)?;
assert!(metadata_bytes.len() <= METADATA_MAX_DATA_SIZE);
@@ -427,15 +377,11 @@ impl LayeredRepository {
// fsync the parent directory to ensure the directory entry is durable
if first_save {
let timeline_dir = File::open(
&path
.parent()
.expect("Metadata should always have a parent dir"),
)?;
let timeline_dir = File::open(&timeline_path)?;
timeline_dir.sync_all()?;
}
Ok(())
Ok(path)
}
fn load_metadata(
@@ -443,7 +389,7 @@ impl LayeredRepository {
timelineid: ZTimelineId,
tenantid: ZTenantId,
) -> Result<TimelineMetadata> {
let path = metadata_path(conf, timelineid, tenantid);
let path = conf.timeline_path(&timelineid, &tenantid).join("metadata");
let metadata_bytes = std::fs::read(&path)?;
ensure!(metadata_bytes.len() == METADATA_MAX_SAFE_SIZE);
@@ -523,7 +469,7 @@ impl LayeredRepository {
let timeline = self.get_timeline_locked(*timelineid, &mut *timelines)?;
if let Some(ancestor_timeline) = &timeline.ancestor_timeline {
// If target_timeline is specified, we only need to know branchpoints of its children
// If target_timeline is specified, we only need to know branchpoints of its childs
if let Some(timelineid) = target_timelineid {
if ancestor_timeline.timelineid == timelineid {
all_branchpoints
@@ -540,10 +486,6 @@ impl LayeredRepository {
// Ok, we now know all the branch points.
// Perform GC for each timeline.
for timelineid in timelineids {
if tenant_mgr::shutdown_requested() {
return Ok(totals);
}
// We have already loaded all timelines above
// so this operation is just a quick map lookup.
let timeline = self.get_timeline_locked(timelineid, &mut *timelines)?;
@@ -667,13 +609,6 @@ pub struct LayeredTimeline {
/// If `true`, will backup its timeline files to remote storage after freezing.
upload_relishes: bool,
/// Ensures layers aren't frozen by checkpointer between
/// [`LayeredTimeline::get_layer_for_write`] and layer reads.
/// Locked automatically by [`LayeredTimelineWriter`] and checkpointer.
/// Must always be acquired before the layer map/individual layer lock
/// to avoid deadlock.
write_lock: Mutex<()>,
}
/// Public interface functions
@@ -736,7 +671,13 @@ impl Timeline for LayeredTimeline {
let segsize;
if let Some((layer, lsn)) = self.get_layer_for_read(seg, lsn)? {
segsize = layer.get_seg_size(lsn)?;
trace!("get_seg_size: {} at {} -> {}", seg, lsn, segsize);
trace!(
"get_seg_size: {} at {}/{} -> {}",
seg,
self.timelineid,
lsn,
segsize
);
} else {
if segno == 0 {
return Ok(None);
@@ -838,13 +779,138 @@ impl Timeline for LayeredTimeline {
result.insert(new_relish);
trace!("List object {}", new_relish);
} else {
trace!("Filtered out dropped object {}", new_relish);
trace!("Filter out droped object {}", new_relish);
}
}
Ok(result)
}
fn put_wal_record(&self, rel: RelishTag, blknum: u32, rec: WALRecord) -> Result<()> {
if !rel.is_blocky() && blknum != 0 {
bail!(
"invalid request for block {} for non-blocky relish {}",
blknum,
rel
);
}
ensure!(rec.lsn.is_aligned(), "unaligned record LSN");
let seg = SegmentTag::from_blknum(rel, blknum);
let delta_size = self.perform_write_op(seg, rec.lsn, |layer| {
layer.put_wal_record(blknum, rec.clone())
})?;
self.increase_current_logical_size(delta_size * BLCKSZ as u32);
Ok(())
}
fn put_truncation(&self, rel: RelishTag, lsn: Lsn, relsize: u32) -> anyhow::Result<()> {
if !rel.is_blocky() {
bail!("invalid truncation for non-blocky relish {}", rel);
}
ensure!(lsn.is_aligned(), "unaligned record LSN");
debug!("put_truncation: {} to {} blocks at {}", rel, relsize, lsn);
let oldsize = self
.get_relish_size(rel, self.get_last_record_lsn())?
.ok_or_else(|| {
anyhow!(
"attempted to truncate non-existent relish {} at {}",
rel,
lsn
)
})?;
if oldsize <= relsize {
return Ok(());
}
let old_last_seg = (oldsize - 1) / RELISH_SEG_SIZE;
let last_remain_seg = if relsize == 0 {
0
} else {
(relsize - 1) / RELISH_SEG_SIZE
};
// Drop segments beyond the last remaining segment.
for remove_segno in (last_remain_seg + 1)..=old_last_seg {
let seg = SegmentTag {
rel,
segno: remove_segno,
};
self.perform_write_op(seg, lsn, |layer| layer.drop_segment(lsn))?;
}
// Truncate the last remaining segment to the specified size
if relsize == 0 || relsize % RELISH_SEG_SIZE != 0 {
let seg = SegmentTag {
rel,
segno: last_remain_seg,
};
self.perform_write_op(seg, lsn, |layer| {
layer.put_truncation(lsn, relsize % RELISH_SEG_SIZE)
})?;
}
self.decrease_current_logical_size((oldsize - relsize) * BLCKSZ as u32);
Ok(())
}
fn drop_relish(&self, rel: RelishTag, lsn: Lsn) -> Result<()> {
trace!("drop_segment: {} at {}", rel, lsn);
if rel.is_blocky() {
if let Some(oldsize) = self.get_relish_size(rel, self.get_last_record_lsn())? {
let old_last_seg = if oldsize == 0 {
0
} else {
(oldsize - 1) / RELISH_SEG_SIZE
};
// Drop all segments of the relish
for remove_segno in 0..=old_last_seg {
let seg = SegmentTag {
rel,
segno: remove_segno,
};
self.perform_write_op(seg, lsn, |layer| layer.drop_segment(lsn))?;
}
self.decrease_current_logical_size(oldsize * BLCKSZ as u32);
} else {
warn!(
"drop_segment called on non-existent relish {} at {}",
rel, lsn
);
}
} else {
// TODO handle TwoPhase relishes
let seg = SegmentTag::from_blknum(rel, 0);
self.perform_write_op(seg, lsn, |layer| layer.drop_segment(lsn))?;
}
Ok(())
}
fn put_page_image(&self, rel: RelishTag, blknum: u32, lsn: Lsn, img: Bytes) -> Result<()> {
if !rel.is_blocky() && blknum != 0 {
bail!(
"invalid request for block {} for non-blocky relish {}",
blknum,
rel
);
}
ensure!(lsn.is_aligned(), "unaligned record LSN");
let seg = SegmentTag::from_blknum(rel, blknum);
let delta_size = self.perform_write_op(seg, lsn, |layer| {
layer.put_page_image(blknum, lsn, img.clone())
})?;
self.increase_current_logical_size(delta_size * BLCKSZ as u32);
Ok(())
}
/// Public entry point for checkpoint(). All the logic is in the private
/// checkpoint_internal function; this public facade just wraps it for
/// metrics collection.
@@ -852,7 +918,16 @@ impl Timeline for LayeredTimeline {
STORAGE_TIME
.with_label_values(&["checkpoint_force"])
//pass checkpoint_distance=0 to force checkpoint
.observe_closure_duration(|| self.checkpoint_internal(0, true))
.observe_closure_duration(|| self.checkpoint_internal(0))
}
///
/// Remember the (end of) the last valid WAL record in the timeline.
///
fn advance_last_record_lsn(&self, new_lsn: Lsn) {
assert!(new_lsn.is_aligned());
self.last_record_lsn.advance(new_lsn);
}
fn get_last_record_lsn(&self) -> Lsn {
@@ -882,8 +957,6 @@ impl Timeline for LayeredTimeline {
fn get_current_logical_size_non_incremental(&self, lsn: Lsn) -> Result<usize> {
let mut total_blocks: usize = 0;
let _enter = info_span!("calc logical size", %lsn).entered();
// list of all relations in this timeline, including ancestor timelines
let all_rels = self.list_rels(0, 0, lsn)?;
@@ -905,13 +978,6 @@ impl Timeline for LayeredTimeline {
Ok(total_blocks * BLCKSZ as usize)
}
fn writer<'a>(&'a self) -> Box<dyn TimelineWriter + 'a> {
Box::new(LayeredTimelineWriter {
tl: self,
_write_guard: self.write_lock.lock().unwrap(),
})
}
}
impl LayeredTimeline {
@@ -952,26 +1018,26 @@ impl LayeredTimeline {
current_logical_size: AtomicUsize::new(current_logical_size),
current_logical_size_gauge,
upload_relishes,
write_lock: Mutex::new(()),
};
Ok(timeline)
}
///
/// Scan the timeline directory to populate the layer map.
/// Returns all timeline-related files that were found and loaded.
/// Scan the timeline directory to populate the layer map
///
fn load_layer_map(&self, disk_consistent_lsn: Lsn) -> anyhow::Result<Vec<PathBuf>> {
fn load_layer_map(&self, disk_consistent_lsn: Lsn) -> anyhow::Result<()> {
info!(
"loading layer map for timeline {} into memory",
self.timelineid
);
let mut layers = self.layers.lock().unwrap();
let mut num_layers = 0;
let (imgfilenames, deltafilenames) =
filename::list_files(self.conf, self.timelineid, self.tenantid)?;
let timeline_path = self.conf.timeline_path(&self.timelineid, &self.tenantid);
let mut local_layers = Vec::with_capacity(imgfilenames.len() + deltafilenames.len());
// First create ImageLayer structs for each image file.
for filename in &imgfilenames {
for filename in imgfilenames.iter() {
if filename.lsn > disk_consistent_lsn {
warn!(
"found future image layer {} on timeline {}",
@@ -984,13 +1050,17 @@ impl LayeredTimeline {
let layer = ImageLayer::new(self.conf, self.timelineid, self.tenantid, filename);
trace!("found layer {}", layer.filename().display());
local_layers.push(layer.path());
info!(
"found layer {} {} on timeline {}",
layer.get_seg_tag(),
layer.get_start_lsn(),
self.timelineid
);
layers.insert_historic(Arc::new(layer));
num_layers += 1;
}
for filename in &deltafilenames {
// Then for the Delta files.
for filename in deltafilenames.iter() {
ensure!(filename.start_lsn < filename.end_lsn);
if filename.end_lsn > disk_consistent_lsn {
warn!(
@@ -1004,14 +1074,15 @@ impl LayeredTimeline {
let layer = DeltaLayer::new(self.conf, self.timelineid, self.tenantid, filename);
trace!("found layer {}", layer.filename().display());
local_layers.push(layer.path());
info!(
"found layer {} on timeline {}",
layer.filename().display(),
self.timelineid,
);
layers.insert_historic(Arc::new(layer));
num_layers += 1;
}
info!("loaded layer map with {} layers", num_layers);
Ok(local_layers)
Ok(())
}
///
@@ -1060,7 +1131,12 @@ impl LayeredTimeline {
lsn: Lsn,
self_layers: &MutexGuard<LayerMap>,
) -> Result<Option<(Arc<dyn Layer>, Lsn)>> {
trace!("get_layer_for_read called for {} at {}", seg, lsn);
trace!(
"get_layer_for_read called for {} at {}/{}",
seg,
self.timelineid,
lsn
);
// If you requested a page at an older LSN, before the branch point, dig into
// the right ancestor timeline. This can only happen if you launch a read-only
@@ -1178,15 +1254,17 @@ impl LayeredTimeline {
// First modification on this timeline
start_lsn = self.ancestor_lsn + 1;
trace!(
"creating layer for write for {} at branch point {}",
"creating layer for write for {} at branch point {}/{}",
seg,
self.timelineid,
start_lsn
);
} else {
start_lsn = prev_layer.get_end_lsn();
trace!(
"creating layer for write for {} after previous layer {}",
"creating layer for write for {} after previous layer {}/{}",
seg,
self.timelineid,
start_lsn
);
}
@@ -1227,20 +1305,31 @@ impl LayeredTimeline {
/// Flush to disk all data that was written with the put_* functions
///
/// NOTE: This has nothing to do with checkpoint in PostgreSQL.
fn checkpoint_internal(&self, checkpoint_distance: u64, forced: bool) -> Result<()> {
let mut write_guard = self.write_lock.lock().unwrap();
fn checkpoint_internal(&self, checkpoint_distance: u64) -> Result<()> {
// Grab lock on the layer map.
//
// TODO: We hold it locked throughout the checkpoint operation. That's bad,
// the checkpointing could take many seconds, and any incoming get_page_at_lsn()
// requests will block.
let mut layers = self.layers.lock().unwrap();
// Bump the generation number in the layer map, so that we can distinguish
// entries inserted after the checkpoint started
let current_generation = layers.increment_generation();
// Read 'last_record_lsn'. That becomes the cutoff LSN for frozen layers.
let RecordLsn {
last: last_record_lsn,
prev: prev_record_lsn,
} = self.last_record_lsn.load();
trace!("checkpoint starting at {}", last_record_lsn);
trace!(
"checkpointing timeline {} at {}",
self.timelineid,
last_record_lsn
);
let timeline_dir = File::open(self.conf.timeline_path(&self.timelineid, &self.tenantid))?;
// Take the in-memory layer with the oldest WAL record. If it's older
// than the threshold, write it out to disk as a new image and delta file.
@@ -1254,14 +1343,10 @@ impl LayeredTimeline {
let mut disk_consistent_lsn = last_record_lsn;
let mut created_historics = false;
let mut layer_uploads = Vec::new();
while let Some((oldest_layer, oldest_generation)) = layers.peek_oldest_open() {
let oldest_pending_lsn = oldest_layer.get_oldest_pending_lsn();
if tenant_mgr::shutdown_requested() && !forced {
return Ok(());
}
// Does this layer need freezing?
//
// Write out all in-memory layers that contain WAL older than CHECKPOINT_DISTANCE.
@@ -1284,24 +1369,32 @@ impl LayeredTimeline {
break;
}
// Mark the layer as no longer accepting writes and record the end_lsn.
// This happens in-place, no new layers are created now.
// We call `get_last_record_lsn` again, which may be different from the
// original load, as we may have released the write lock since then.
oldest_layer.freeze(self.get_last_record_lsn());
// Freeze the layer.
//
// This is a two-step process. First, we "freeze" the in-memory
// layer, to close it for new writes, and replace the original
// layer with the new frozen in-memory layer (and possibly a new
// open layer to hold changes newer than the cutoff.) Then we write
// the frozen layer to disk, and replace the in-memory frozen layer
// with the new on-disk layers.
let FreezeLayers {
frozen,
open: maybe_new_open,
} = oldest_layer.freeze(last_record_lsn)?;
// The layer is no longer open, update the layer map to reflect this.
// We will replace it with on-disk historics below.
// replace this layer with the new layers that 'freeze' returned
layers.pop_oldest_open();
layers.insert_historic(oldest_layer.clone());
if let Some(new_open) = maybe_new_open.clone() {
layers.insert_open(new_open);
}
// We temporarily insert InMemory layer into historic list here.
// TODO: check that all possible concurrent users of 'historic' treat it right
layers.insert_historic(frozen.clone());
// Write the now-frozen layer to disk. That could take a while, so release the lock while we do it
drop(layers);
drop(write_guard);
let new_historics = oldest_layer.write_to_disk(self)?;
write_guard = self.write_lock.lock().unwrap();
let new_historics = frozen.write_to_disk(self)?;
layers = self.layers.lock().unwrap();
if !new_historics.is_empty() {
@@ -1309,16 +1402,11 @@ impl LayeredTimeline {
}
// Finally, replace the frozen in-memory layer with the new on-disk layers
layers.remove_historic(oldest_layer);
layers.remove_historic(frozen.clone());
// Add the historics to the LayerMap
for delta_layer in new_historics.delta_layers {
layer_uploads.push(delta_layer.path());
layers.insert_historic(Arc::new(delta_layer));
}
for image_layer in new_historics.image_layers {
layer_uploads.push(image_layer.path());
layers.insert_historic(Arc::new(image_layer));
for n in new_historics {
layers.insert_historic(n);
}
}
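
In brief, the two-step freeze the comments above describe separates the in-lock bookkeeping from the slow disk write. A compact, self-contained model of that locking pattern (all types are stand-ins, not the real layer code):

use std::sync::{Arc, Mutex};

struct Frozen; // stand-in: in-memory layer closed for writes
struct OnDisk; // stand-in: persisted historic layer

struct LayerMap { historic: Vec<Arc<Frozen>>, disk: Vec<OnDisk> }

fn checkpoint_step(layers_lock: &Mutex<LayerMap>, frozen: Arc<Frozen>) {
    // Step 1 (under lock): publish the frozen layer so reads still succeed.
    {
        let mut layers = layers_lock.lock().unwrap();
        layers.historic.push(frozen.clone());
    } // lock released here
    // Step 2 (no lock held): the slow part - write the frozen layer to disk.
    let new_historics = write_to_disk(&frozen);
    // Step 3 (under lock again): swap the in-memory frozen layer for the
    // on-disk layers that replaced it.
    let mut layers = layers_lock.lock().unwrap();
    layers.historic.retain(|l| !Arc::ptr_eq(l, &frozen));
    layers.disk.extend(new_historics);
}

fn write_to_disk(_f: &Frozen) -> Vec<OnDisk> { vec![OnDisk] }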
@@ -1330,13 +1418,10 @@ impl LayeredTimeline {
}
drop(layers);
drop(write_guard);
if created_historics {
// We must fsync the timeline dir to ensure the directory entries for
// new layer files are durable
let timeline_dir =
File::open(self.conf.timeline_path(&self.timelineid, &self.tenantid))?;
timeline_dir.sync_all()?;
}
@@ -1364,7 +1449,7 @@ impl LayeredTimeline {
ancestor_timeline: ancestor_timelineid,
ancestor_lsn: self.ancestor_lsn,
};
LayeredRepository::save_metadata(
let _metadata_path = LayeredRepository::save_metadata(
self.conf,
self.timelineid,
self.tenantid,
@@ -1373,10 +1458,12 @@ impl LayeredTimeline {
)?;
if self.upload_relishes {
schedule_timeline_upload(())
// schedule_timeline_upload(
// self.tenantid,
// self.timelineid,
// layer_uploads,
// schedule_timeline_upload(LocalTimeline {
// tenant_id: self.tenantid,
// timeline_id: self.timelineid,
// metadata_path,
// image_layers: image_layer_uploads,
// delta_layers: delta_layer_uploads,
// disk_consistent_lsn,
// });
}
@@ -1413,11 +1500,11 @@ impl LayeredTimeline {
let now = Instant::now();
let mut result: GcResult = Default::default();
let _enter = info_span!("garbage collection", timeline = %self.timelineid, tenant = %self.tenantid, cutoff = %cutoff).entered();
info!("GC starting");
debug!("retain_lsns: {:?}", retain_lsns);
info!(
"running GC on timeline {}, cutoff {}",
self.timelineid, cutoff
);
info!("retain_lsns: {:?}", retain_lsns);
let mut layers_to_remove: Vec<Arc<dyn Layer>> = Vec::new();
@@ -1679,9 +1766,10 @@ impl LayeredTimeline {
if data.records.is_empty() {
if let Some(img) = &data.page_img {
trace!(
"found page image for blk {} in {} at {}, no WAL redo required",
"found page image for blk {} in {} at {}/{}, no WAL redo required",
blknum,
rel,
self.timelineid,
request_lsn
);
Ok(img.clone())
@@ -1707,9 +1795,9 @@ impl LayeredTimeline {
Ok(ZERO_PAGE.clone())
} else {
if data.page_img.is_some() {
trace!("found {} WAL records and a base image for blk {} in {} at {}, performing WAL redo", data.records.len(), blknum, rel, request_lsn);
trace!("found {} WAL records and a base image for blk {} in {} at {}/{}, performing WAL redo", data.records.len(), blknum, rel, self.timelineid, request_lsn);
} else {
trace!("found {} WAL records that will init the page for blk {} in {} at {}, performing WAL redo", data.records.len(), blknum, rel, request_lsn);
trace!("found {} WAL records that will init the page for blk {} in {} at {}/{}, performing WAL redo", data.records.len(), blknum, rel, self.timelineid, request_lsn);
}
let img = self.walredo_mgr.request_redo(
rel,
@@ -1757,163 +1845,36 @@ impl LayeredTimeline {
self.current_logical_size_gauge
.set(val as i64 - diff as i64);
}
}
struct LayeredTimelineWriter<'a> {
tl: &'a LayeredTimeline,
_write_guard: MutexGuard<'a, ()>,
}
impl Deref for LayeredTimelineWriter<'_> {
type Target = dyn Timeline;
fn deref(&self) -> &Self::Target {
self.tl
}
}
impl<'a> TimelineWriter for LayeredTimelineWriter<'a> {
fn put_wal_record(&self, rel: RelishTag, blknum: u32, rec: WALRecord) -> Result<()> {
if !rel.is_blocky() && blknum != 0 {
bail!(
"invalid request for block {} for non-blocky relish {}",
blknum,
rel
);
}
ensure!(rec.lsn.is_aligned(), "unaligned record LSN");
let seg = SegmentTag::from_blknum(rel, blknum);
let layer = self.tl.get_layer_for_write(seg, rec.lsn)?;
let delta_size = layer.put_wal_record(blknum, rec);
self.tl
.increase_current_logical_size(delta_size * BLCKSZ as u32);
Ok(())
}
fn put_page_image(&self, rel: RelishTag, blknum: u32, lsn: Lsn, img: Bytes) -> Result<()> {
if !rel.is_blocky() && blknum != 0 {
bail!(
"invalid request for block {} for non-blocky relish {}",
blknum,
rel
);
}
ensure!(lsn.is_aligned(), "unaligned record LSN");
let seg = SegmentTag::from_blknum(rel, blknum);
let layer = self.tl.get_layer_for_write(seg, lsn)?;
let delta_size = layer.put_page_image(blknum, lsn, img);
self.tl
.increase_current_logical_size(delta_size * BLCKSZ as u32);
Ok(())
}
fn put_truncation(&self, rel: RelishTag, lsn: Lsn, relsize: u32) -> Result<()> {
if !rel.is_blocky() {
bail!("invalid truncation for non-blocky relish {}", rel);
}
ensure!(lsn.is_aligned(), "unaligned record LSN");
debug!("put_truncation: {} to {} blocks at {}", rel, relsize, lsn);
let oldsize = self
.tl
.get_relish_size(rel, self.tl.get_last_record_lsn())?
.ok_or_else(|| {
anyhow!(
"attempted to truncate non-existent relish {} at {}",
rel,
lsn
)
})?;
if oldsize <= relsize {
return Ok(());
}
let old_last_seg = (oldsize - 1) / RELISH_SEG_SIZE;
let last_remain_seg = if relsize == 0 {
0
} else {
(relsize - 1) / RELISH_SEG_SIZE
};
// Drop segments beyond the last remaining segment.
for remove_segno in (last_remain_seg + 1)..=old_last_seg {
let seg = SegmentTag {
rel,
segno: remove_segno,
};
let layer = self.tl.get_layer_for_write(seg, lsn)?;
layer.drop_segment(lsn);
}
// Truncate the last remaining segment to the specified size
if relsize == 0 || relsize % RELISH_SEG_SIZE != 0 {
let seg = SegmentTag {
rel,
segno: last_remain_seg,
};
let layer = self.tl.get_layer_for_write(seg, lsn)?;
layer.put_truncation(lsn, relsize % RELISH_SEG_SIZE)
}
self.tl
.decrease_current_logical_size((oldsize - relsize) * BLCKSZ as u32);
Ok(())
}
fn drop_relish(&self, rel: RelishTag, lsn: Lsn) -> Result<()> {
trace!("drop_segment: {} at {}", rel, lsn);
if rel.is_blocky() {
if let Some(oldsize) = self
.tl
.get_relish_size(rel, self.tl.get_last_record_lsn())?
{
let old_last_seg = if oldsize == 0 {
0
} else {
(oldsize - 1) / RELISH_SEG_SIZE
};
// Drop all segments of the relish
for remove_segno in 0..=old_last_seg {
let seg = SegmentTag {
rel,
segno: remove_segno,
};
let layer = self.tl.get_layer_for_write(seg, lsn)?;
layer.drop_segment(lsn);
}
self.tl
.decrease_current_logical_size(oldsize * BLCKSZ as u32);
} else {
warn!(
"drop_segment called on non-existent relish {} at {}",
rel, lsn
);
/// If a layer is in the process of being replaced in [`LayerMap`], write
/// operations will fail with [`NonWriteableError`]. This may happen due to
/// a race: the checkpointer thread freezes a layer just after
/// [`Self::get_layer_for_write`] returned it. To handle this error, we try
/// again getting the layer and attempt the write.
fn perform_write_op<R>(
&self,
seg: SegmentTag,
lsn: Lsn,
write_op: impl Fn(&Arc<InMemoryLayer>) -> WriteResult<R>,
) -> anyhow::Result<R> {
let mut layer = self.get_layer_for_write(seg, lsn)?;
loop {
match write_op(&layer) {
Ok(r) => return Ok(r),
Err(NonWriteableError {}) => {}
}
} else {
// TODO handle TwoPhase relishes
let seg = SegmentTag::from_blknum(rel, 0);
let layer = self.tl.get_layer_for_write(seg, lsn)?;
layer.drop_segment(lsn);
info!(
"attempted to write to non-writeable layer, retrying {} {}",
seg, lsn
);
// layer was non-writeable, try again
let new_layer = self.get_layer_for_write(seg, lsn)?;
// the new layer does not have to be writeable, but it should at least be different
assert!(!Arc::ptr_eq(&layer, &new_layer));
layer = new_layer;
}
Ok(())
}
///
/// Remember the (end of) the last valid WAL record in the timeline.
///
fn advance_last_record_lsn(&self, new_lsn: Lsn) {
assert!(new_lsn.is_aligned());
self.tl.last_record_lsn.advance(new_lsn);
}
}
@@ -1935,15 +1896,6 @@ pub fn dump_layerfile_from_path(path: &Path) -> Result<()> {
Ok(())
}
fn metadata_path(
conf: &'static PageServerConf,
timelineid: ZTimelineId,
tenantid: ZTenantId,
) -> PathBuf {
conf.timeline_path(&timelineid, &tenantid)
.join(METADATA_FILE_NAME)
}
/// Add a suffix to a layer file's name: .{num}.old
/// Uses the first available num (starts at 0)
fn rename_to_backup(path: PathBuf) -> anyhow::Result<()> {

View File

@@ -65,6 +65,7 @@ use zenith_utils::bin_ser::BeSer;
use zenith_utils::lsn::Lsn;
use super::blob::{read_blob, BlobRange};
use super::page_versions::OrderedBlockIter;
// Magic constant to identify a Zenith delta file
pub const DELTA_FILE_MAGIC: u32 = 0x5A616E01;
@@ -169,7 +170,29 @@ impl Layer for DeltaLayer {
}
fn filename(&self) -> PathBuf {
PathBuf::from(self.layer_name().to_string())
PathBuf::from(
DeltaFileName {
seg: self.seg,
start_lsn: self.start_lsn,
end_lsn: self.end_lsn,
dropped: self.dropped,
}
.to_string(),
)
}
fn path(&self) -> Option<PathBuf> {
Some(Self::path_for(
&self.path_or_conf,
self.timelineid,
self.tenantid,
&DeltaFileName {
seg: self.seg,
start_lsn: self.start_lsn,
end_lsn: self.end_lsn,
dropped: self.dropped,
},
))
}
/// Look up given page in the cache.
@@ -278,7 +301,9 @@ impl Layer for DeltaLayer {
fn delete(&self) -> Result<()> {
// delete underlying file
fs::remove_file(self.path())?;
if let Some(path) = self.path() {
fs::remove_file(path)?;
}
Ok(())
}
@@ -358,7 +383,7 @@ impl DeltaLayer {
start_lsn: Lsn,
end_lsn: Lsn,
dropped: bool,
page_versions: impl Iterator<Item = (u32, Lsn, PageVersion)>,
page_versions: OrderedBlockIter,
relsizes: VecMap<Lsn, u32>,
) -> Result<DeltaLayer> {
if seg.rel.is_blocky() {
@@ -382,7 +407,9 @@ impl DeltaLayer {
let mut inner = delta_layer.inner.lock().unwrap();
// Write the in-memory btreemaps into a file
let path = delta_layer.path();
let path = delta_layer
.path()
.expect("DeltaLayer is supposed to have a layer path on disk");
// Note: This overwrites any existing file. There shouldn't be any.
// FIXME: throw an error instead?
@@ -392,15 +419,20 @@ impl DeltaLayer {
let mut page_version_writer = BlobWriter::new(book, PAGE_VERSIONS_CHAPTER);
        for (blknum, lsn, page_version) in page_versions {
            // TODO avoid deserializing and then reserializing
            let buf = PageVersion::ser(&page_version)?;
            let blob_range = page_version_writer.write_blob(&buf)?;

            inner
                .page_version_metas
                .append((blknum, lsn), blob_range)
                .unwrap();
        }

        for (blknum, history) in page_versions {
            for (lsn, page_version) in history.as_slice() {
                if lsn >= &end_lsn {
                    continue;
                }

                let buf = PageVersion::ser(page_version)?;
                let blob_range = page_version_writer.write_blob(&buf)?;

                inner
                    .page_version_metas
                    .append((blknum, *lsn), blob_range)
                    .unwrap();
            }
        }
let book = page_version_writer.close()?;
@@ -447,7 +479,12 @@ impl DeltaLayer {
&self.path_or_conf,
self.timelineid,
self.tenantid,
&self.layer_name(),
&DeltaFileName {
seg: self.seg,
start_lsn: self.start_lsn,
end_lsn: self.end_lsn,
dropped: self.dropped,
},
);
let file = File::open(&path)?;
@@ -556,23 +593,4 @@ impl DeltaLayer {
}),
})
}
fn layer_name(&self) -> DeltaFileName {
DeltaFileName {
seg: self.seg,
start_lsn: self.start_lsn,
end_lsn: self.end_lsn,
dropped: self.dropped,
}
}
/// Path to the layer file in pageserver workdir.
pub fn path(&self) -> PathBuf {
Self::path_for(
&self.path_or_conf,
self.timelineid,
self.tenantid,
&self.layer_name(),
)
}
}

View File

@@ -13,8 +13,6 @@ use anyhow::Result;
use log::*;
use zenith_utils::lsn::Lsn;
use super::METADATA_FILE_NAME;
// Note: LayeredTimeline::load_layer_map() relies on this sort order
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)]
pub struct DeltaFileName {
@@ -37,7 +35,7 @@ impl DeltaFileName {
/// Parse a string as a delta file name. Returns None if the filename does not
/// match the expected pattern.
///
pub fn parse_str(fname: &str) -> Option<Self> {
pub fn from_str(fname: &str) -> Option<Self> {
let rel;
let mut parts;
if let Some(rest) = fname.strip_prefix("rel_") {
@@ -170,7 +168,7 @@ impl ImageFileName {
/// Parse a string as an image file name. Returns None if the filename does not
/// match the expected pattern.
///
pub fn parse_str(fname: &str) -> Option<Self> {
pub fn from_str(fname: &str) -> Option<Self> {
let rel;
let mut parts;
if let Some(rest) = fname.strip_prefix("rel_") {
@@ -288,11 +286,11 @@ pub fn list_files(
let fname = direntry?.file_name();
let fname = fname.to_str().unwrap();
if let Some(deltafilename) = DeltaFileName::parse_str(fname) {
if let Some(deltafilename) = DeltaFileName::from_str(fname) {
deltafiles.push(deltafilename);
} else if let Some(imgfilename) = ImageFileName::parse_str(fname) {
} else if let Some(imgfilename) = ImageFileName::from_str(fname) {
imgfiles.push(imgfilename);
} else if fname == METADATA_FILE_NAME || fname == "ancestor" || fname.ends_with(".old") {
} else if fname == "metadata" || fname == "ancestor" || fname.ends_with(".old") {
// ignore these
} else {
warn!("unrecognized filename in timeline dir: {}", fname);
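
A detail worth noting here: the `#[derive(..., PartialOrd, Ord, ...)]` on these file-name structs is what `load_layer_map()` relies on, and a derived `Ord` compares fields strictly in declaration order. A minimal sketch with simplified fields (FileName is a stand-in, not the real DeltaFileName):

// Simplified stand-in for DeltaFileName: derived Ord compares fields
// in declaration order (seg first, then start_lsn, then end_lsn).
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)]
struct FileName {
    seg: u32,
    start_lsn: u64,
    end_lsn: u64,
}

fn main() {
    let mut files = vec![
        FileName { seg: 1, start_lsn: 30, end_lsn: 40 },
        FileName { seg: 0, start_lsn: 20, end_lsn: 30 },
        FileName { seg: 0, start_lsn: 10, end_lsn: 20 },
    ];
    files.sort();
    // Sorted by seg first, then by start_lsn within the same segment.
    assert_eq!(files[0], FileName { seg: 0, start_lsn: 10, end_lsn: 20 });
    assert_eq!(files[1], FileName { seg: 0, start_lsn: 20, end_lsn: 30 });
    assert_eq!(files[2], FileName { seg: 1, start_lsn: 30, end_lsn: 40 });
}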

View File

@@ -114,7 +114,25 @@ pub struct ImageLayerInner {
impl Layer for ImageLayer {
fn filename(&self) -> PathBuf {
PathBuf::from(self.layer_name().to_string())
PathBuf::from(
ImageFileName {
seg: self.seg,
lsn: self.lsn,
}
.to_string(),
)
}
fn path(&self) -> Option<PathBuf> {
Some(Self::path_for(
&self.path_or_conf,
self.timelineid,
self.tenantid,
&ImageFileName {
seg: self.seg,
lsn: self.lsn,
},
))
}
fn get_timeline_id(&self) -> ZTimelineId {
@@ -204,7 +222,9 @@ impl Layer for ImageLayer {
fn delete(&self) -> Result<()> {
// delete underlying file
fs::remove_file(self.path())?;
if let Some(path) = self.path() {
fs::remove_file(path)?;
}
Ok(())
}
@@ -280,7 +300,9 @@ impl ImageLayer {
let inner = layer.inner.lock().unwrap();
// Write the images into a file
let path = layer.path();
let path = layer
.path()
.expect("ImageLayer is supposed to have a layer path on disk");
// Note: This overwrites any existing file. There shouldn't be any.
// FIXME: throw an error instead?
let file = File::create(&path)?;
@@ -318,7 +340,7 @@ impl ImageLayer {
let writer = book.close()?;
writer.get_ref().sync_all()?;
trace!("saved {}", path.display());
trace!("saved {}", &path.display());
drop(inner);
@@ -423,7 +445,15 @@ impl ImageLayer {
}
fn open_book(&self) -> Result<(PathBuf, Book<File>)> {
let path = self.path();
let path = Self::path_for(
&self.path_or_conf,
self.timelineid,
self.tenantid,
&ImageFileName {
seg: self.seg,
lsn: self.lsn,
},
);
let file = File::open(&path)?;
let book = Book::new(file)?;
@@ -470,21 +500,4 @@ impl ImageLayer {
}),
})
}
fn layer_name(&self) -> ImageFileName {
ImageFileName {
seg: self.seg,
lsn: self.lsn,
}
}
/// Path to the layer file in pageserver workdir.
pub fn path(&self) -> PathBuf {
Self::path_for(
&self.path_or_conf,
self.timelineid,
self.tenantid,
&self.layer_name(),
)
}
}

View File

@@ -15,10 +15,12 @@ use crate::{ZTenantId, ZTimelineId};
use anyhow::{bail, ensure, Result};
use bytes::Bytes;
use log::*;
use std::cmp::min;
use std::path::PathBuf;
use std::sync::{Arc, RwLock};
use zenith_utils::vec_map::VecMap;
use zenith_utils::accum::Accum;
use zenith_utils::lsn::Lsn;
use super::page_versions::PageVersions;
@@ -35,6 +37,9 @@ pub struct InMemoryLayer {
///
start_lsn: Lsn,
/// Frozen in-memory layers have an inclusive end LSN.
end_lsn: Option<Lsn>,
/// LSN of the oldest page version stored in this layer
oldest_pending_lsn: Lsn,
@@ -47,13 +52,8 @@ pub struct InMemoryLayer {
}
pub struct InMemoryLayerInner {
/// Frozen in-memory layers have an exclusive end LSN.
/// Writes are only allowed when this is None
end_lsn: Option<Lsn>,
/// If this relation was dropped, remember when that happened.
/// The drop LSN is recorded in [`end_lsn`].
dropped: bool,
drop_lsn: Option<Lsn>,
///
/// All versions of all pages in the layer are are kept here.
@@ -69,11 +69,19 @@ pub struct InMemoryLayerInner {
/// a non-blocky rel, 'segsizes' is not used and is always empty.
///
segsizes: VecMap<Lsn, u32>,
/// Writes are only allowed when true.
/// Set to false when this layer is in the process of being replaced.
writeable: bool,
}
impl InMemoryLayerInner {
    fn assert_writeable(&self) {
        assert!(self.end_lsn.is_none());
    }

    fn check_writeable(&self) -> WriteResult<()> {
        if self.writeable {
            Ok(())
        } else {
            Err(NonWriteableError)
        }
    }
fn get_seg_size(&self, lsn: Lsn) -> u32 {
@@ -96,23 +104,30 @@ impl Layer for InMemoryLayer {
let inner = self.inner.read().unwrap();
let end_lsn;
if let Some(drop_lsn) = inner.end_lsn {
let dropped;
if let Some(drop_lsn) = inner.drop_lsn {
end_lsn = drop_lsn;
dropped = true;
} else {
end_lsn = Lsn(u64::MAX);
dropped = false;
}
let delta_filename = DeltaFileName {
seg: self.seg,
start_lsn: self.start_lsn,
end_lsn,
dropped: inner.dropped,
dropped,
}
.to_string();
PathBuf::from(format!("inmem-{}", delta_filename))
}
fn path(&self) -> Option<PathBuf> {
None
}
fn get_timeline_id(&self) -> ZTimelineId {
self.timelineid
}
@@ -126,10 +141,14 @@ impl Layer for InMemoryLayer {
}
fn get_end_lsn(&self) -> Lsn {
if let Some(end_lsn) = self.end_lsn {
return Lsn(end_lsn.0 + 1);
}
let inner = self.inner.read().unwrap();
if let Some(end_lsn) = inner.end_lsn {
end_lsn
if let Some(drop_lsn) = inner.drop_lsn {
drop_lsn
} else {
Lsn(u64::MAX)
}
@@ -137,7 +156,7 @@ impl Layer for InMemoryLayer {
fn is_dropped(&self) -> bool {
let inner = self.inner.read().unwrap();
inner.dropped
inner.drop_lsn.is_some()
}
/// Look up given page in the cache.
@@ -157,7 +176,8 @@ impl Layer for InMemoryLayer {
// Scan the page versions backwards, starting from `lsn`.
let iter = inner
.page_versions
.iter_block_lsn_range(blknum, ..=lsn)
.get_block_lsn_range(blknum, ..=lsn)
.iter()
.rev();
for (_entry_lsn, entry) in iter {
if let Some(img) = &entry.page_image {
@@ -214,8 +234,8 @@ impl Layer for InMemoryLayer {
assert!(lsn >= self.start_lsn);
// Is the requested LSN after the segment was dropped?
if let Some(end_lsn) = inner.end_lsn {
if lsn >= end_lsn {
if let Some(drop_lsn) = inner.drop_lsn {
if lsn >= drop_lsn {
return Ok(false);
}
}
@@ -246,47 +266,56 @@ impl Layer for InMemoryLayer {
let inner = self.inner.read().unwrap();
let end_str = inner
.end_lsn
.drop_lsn
.as_ref()
.map(Lsn::to_string)
.map(|drop_lsn| drop_lsn.to_string())
.unwrap_or_default();
println!(
"----- in-memory layer for tli {} seg {} {}-{} {} ----",
self.timelineid, self.seg, self.start_lsn, end_str, inner.dropped,
"----- in-memory layer for tli {} seg {} {}-{} ----",
self.timelineid, self.seg, self.start_lsn, end_str
);
for (k, v) in inner.segsizes.as_slice() {
println!("segsizes {}: {}", k, v);
}
for (blknum, lsn, pv) in inner.page_versions.ordered_page_version_iter(None) {
println!(
"blk {} at {}: {}/{}\n",
blknum,
lsn,
pv.page_image.is_some(),
pv.record.is_some()
);
for (blknum, history) in inner.page_versions.ordered_block_iter() {
for (lsn, pv) in history.as_slice() {
println!(
"blk {} at {}: {}/{}\n",
blknum,
lsn,
pv.page_image.is_some(),
pv.record.is_some()
);
}
}
Ok(())
}
}
/// A result of an inmemory layer data being written to disk.
pub struct LayersOnDisk {
pub delta_layers: Vec<DeltaLayer>,
pub image_layers: Vec<ImageLayer>,
}
/// Write failed because the layer is in process of being replaced.
/// See [`LayeredTimeline::perform_write_op`] for how to handle this error.
#[derive(Debug)]
pub struct NonWriteableError;
impl LayersOnDisk {
pub fn is_empty(&self) -> bool {
self.delta_layers.is_empty() && self.image_layers.is_empty()
}
pub type WriteResult<T> = std::result::Result<T, NonWriteableError>;
/// Helper struct to clean up the `InMemoryLayer::freeze` return signature.
pub struct FreezeLayers {
/// Replacement layer for the layer which freeze was called on.
pub frozen: Arc<InMemoryLayer>,
/// New open layer containing leftover data.
pub open: Option<Arc<InMemoryLayer>>,
}
impl InMemoryLayer {
fn assert_not_frozen(&self) {
assert!(self.end_lsn.is_none());
}
/// Return the oldest page version that's stored in this layer
pub fn get_oldest_pending_lsn(&self) -> Lsn {
self.oldest_pending_lsn
@@ -322,13 +351,14 @@ impl InMemoryLayer {
tenantid,
seg,
start_lsn,
end_lsn: None,
oldest_pending_lsn,
incremental: false,
inner: RwLock::new(InMemoryLayerInner {
end_lsn: None,
dropped: false,
drop_lsn: None,
page_versions: PageVersions::default(),
segsizes,
writeable: true,
}),
})
}
@@ -336,7 +366,7 @@ impl InMemoryLayer {
// Write operations
/// Remember new page version, as a WAL record over previous version
pub fn put_wal_record(&self, blknum: u32, rec: WALRecord) -> u32 {
pub fn put_wal_record(&self, blknum: u32, rec: WALRecord) -> WriteResult<u32> {
self.put_page_version(
blknum,
rec.lsn,
@@ -348,7 +378,7 @@ impl InMemoryLayer {
}
/// Remember new page version, as a full page image
pub fn put_page_image(&self, blknum: u32, lsn: Lsn, img: Bytes) -> u32 {
pub fn put_page_image(&self, blknum: u32, lsn: Lsn, img: Bytes) -> WriteResult<u32> {
self.put_page_version(
blknum,
lsn,
@@ -361,7 +391,8 @@ impl InMemoryLayer {
/// Common subroutine of the public put_wal_record() and put_page_image() functions.
/// Adds the page version to the in-memory tree
pub fn put_page_version(&self, blknum: u32, lsn: Lsn, pv: PageVersion) -> u32 {
pub fn put_page_version(&self, blknum: u32, lsn: Lsn, pv: PageVersion) -> WriteResult<u32> {
self.assert_not_frozen();
assert!(self.seg.blknum_in_seg(blknum));
trace!(
@@ -373,7 +404,7 @@ impl InMemoryLayer {
);
let mut inner = self.inner.write().unwrap();
inner.assert_writeable();
inner.check_writeable()?;
let old = inner.page_versions.append_or_update_last(blknum, lsn, pv);
@@ -434,22 +465,22 @@ impl InMemoryLayer {
}
inner.segsizes.append_or_update_last(lsn, newsize).unwrap();
return newsize - oldsize;
return Ok(newsize - oldsize);
}
}
0
Ok(0)
}
/// Remember that the relation was truncated at given LSN
pub fn put_truncation(&self, lsn: Lsn, segsize: u32) {
pub fn put_truncation(&self, lsn: Lsn, segsize: u32) -> WriteResult<()> {
assert!(
self.seg.rel.is_blocky(),
"put_truncation() called on a non-blocky rel"
);
self.assert_not_frozen();
let mut inner = self.inner.write().unwrap();
inner.assert_writeable();
inner.check_writeable()?;
// check that we truncate to a smaller size than the segment was before the truncation
let oldsize = inner.get_seg_size(lsn);
@@ -461,19 +492,25 @@ impl InMemoryLayer {
// We already had an entry for this LSN. That's odd..
warn!("Inserting truncation, but had an entry for the LSN already");
}
Ok(())
}
/// Remember that the segment was dropped at given LSN
pub fn drop_segment(&self, lsn: Lsn) {
pub fn drop_segment(&self, lsn: Lsn) -> WriteResult<()> {
self.assert_not_frozen();
let mut inner = self.inner.write().unwrap();
assert!(inner.end_lsn.is_none());
assert!(!inner.dropped);
inner.dropped = true;
assert!(self.start_lsn < lsn);
inner.end_lsn = Some(lsn);
inner.check_writeable()?;
assert!(inner.drop_lsn.is_none());
inner.drop_lsn = Some(lsn);
inner.writeable = false;
trace!("dropped segment {} at {}", self.seg, lsn);
Ok(())
}
///
@@ -513,43 +550,117 @@ impl InMemoryLayer {
tenantid,
seg,
start_lsn,
end_lsn: None,
oldest_pending_lsn,
incremental: true,
inner: RwLock::new(InMemoryLayerInner {
end_lsn: None,
dropped: false,
drop_lsn: None,
page_versions: PageVersions::default(),
segsizes,
writeable: true,
}),
})
}
pub fn is_writeable(&self) -> bool {
let inner = self.inner.read().unwrap();
inner.end_lsn.is_none()
inner.writeable
}
    /// Make the layer non-writeable. Only call once.
    /// Records the end_lsn for non-dropped layers.
    /// `end_lsn` is inclusive
    pub fn freeze(&self, end_lsn: Lsn) {
        let mut inner = self.inner.write().unwrap();

        if inner.end_lsn.is_some() {
            assert!(inner.dropped);
        } else {
            assert!(!inner.dropped);
            assert!(self.start_lsn < end_lsn + 1);
            inner.end_lsn = Some(Lsn(end_lsn.0 + 1));

            if let Some((lsn, _)) = inner.segsizes.as_slice().last() {
                assert!(lsn <= &end_lsn, "{:?} {:?}", lsn, end_lsn);
            }

            for (_blk, lsn, _pv) in inner.page_versions.ordered_page_version_iter(None) {
                assert!(lsn <= end_lsn);
            }
        }
    }

    /// Splits `self` into two InMemoryLayers: `frozen` and `open`.
    /// All data up to and including `cutoff_lsn`
    /// is copied to `frozen`, while the remaining data is copied to `open`.
    /// After completion, self is non-writeable, but not frozen.
    pub fn freeze(self: Arc<Self>, cutoff_lsn: Lsn) -> Result<FreezeLayers> {
        info!(
            "freezing in memory layer {} on timeline {} at {} (oldest {})",
            self.filename().display(),
            self.timelineid,
            cutoff_lsn,
            self.oldest_pending_lsn
        );

        self.assert_not_frozen();

        let self_ref = self.clone();
        let mut inner = self_ref.inner.write().unwrap();

        // Dropped layers don't need any special freeze actions;
        // they are marked as non-writeable at drop and just
        // written out to disk by the checkpointer.
        if inner.drop_lsn.is_some() {
            assert!(!inner.writeable);
            info!(
                "freezing in memory layer for {} on timeline {} is dropped at {}",
                self.seg,
                self.timelineid,
                inner.drop_lsn.unwrap()
            );
            // There should be no newer layer that refers to this non-writeable
            // layer, because a layer created after a dropped one represents a new rel.
            return Ok(FreezeLayers {
                frozen: self,
                open: None,
            });
        }
assert!(inner.writeable);
inner.writeable = false;
// Divide all the page versions into old and new
// at the 'cutoff_lsn' point.
let mut after_oldest_lsn: Accum<Lsn> = Accum(None);
let cutoff_lsn_exclusive = Lsn(cutoff_lsn.0 + 1);
let (before_segsizes, mut after_segsizes) = inner.segsizes.split_at(&cutoff_lsn_exclusive);
if let Some((lsn, _size)) = after_segsizes.as_slice().first() {
after_oldest_lsn.accum(min, *lsn);
}
let (before_page_versions, after_page_versions) = inner
.page_versions
.split_at(cutoff_lsn_exclusive, &mut after_oldest_lsn);
let frozen = Arc::new(InMemoryLayer {
conf: self.conf,
tenantid: self.tenantid,
timelineid: self.timelineid,
seg: self.seg,
start_lsn: self.start_lsn,
end_lsn: Some(cutoff_lsn),
oldest_pending_lsn: self.start_lsn,
incremental: self.incremental,
inner: RwLock::new(InMemoryLayerInner {
drop_lsn: inner.drop_lsn,
page_versions: before_page_versions,
segsizes: before_segsizes,
writeable: false,
}),
});
let open = if !after_segsizes.is_empty() || !after_page_versions.is_empty() {
let mut new_open = Self::create_successor_layer(
self.conf,
frozen.clone(),
self.timelineid,
self.tenantid,
cutoff_lsn + 1,
after_oldest_lsn.0.unwrap(),
)?;
let new_inner = new_open.inner.get_mut().unwrap();
// Ensure page_versions doesn't contain anything
// so we can just replace it
assert!(new_inner.page_versions.is_empty());
new_inner.page_versions = after_page_versions;
new_inner.segsizes.extend(&mut after_segsizes).unwrap();
Some(Arc::new(new_open))
} else {
None
};
Ok(FreezeLayers { frozen, open })
}
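
Conceptually, `freeze` partitions the layer's history at the cutoff. A minimal model of that cutoff semantics, using a plain BTreeMap keyed by LSN in place of PageVersions (names here are illustrative):

use std::collections::BTreeMap;

// Everything <= cutoff stays in `frozen`; strictly newer entries move to
// `open`, which is None when there is no leftover data.
fn freeze(
    versions: BTreeMap<u64, String>,
    cutoff: u64,
) -> (BTreeMap<u64, String>, Option<BTreeMap<u64, String>>) {
    let mut frozen = versions;
    // split_off keeps keys < cutoff + 1 and returns the rest.
    let open = frozen.split_off(&(cutoff + 1));
    let open = if open.is_empty() { None } else { Some(open) };
    (frozen, open)
}

fn main() {
    let versions: BTreeMap<u64, String> = (1..=6)
        .map(|n| (n * 10, format!("v@{}", n * 10)))
        .collect();
    let (frozen, open) = freeze(versions, 30);
    assert!(frozen.keys().all(|&lsn| lsn <= 30));
    assert!(open.unwrap().keys().all(|&lsn| lsn > 30));
}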
/// Write this frozen in-memory layer to disk.
@@ -560,15 +671,16 @@ impl InMemoryLayer {
/// WAL records between start and end LSN. (The delta layer is not needed
/// when a new relish is created with a single LSN, so that the start and
/// end LSN are the same.)
pub fn write_to_disk(&self, timeline: &LayeredTimeline) -> Result<LayersOnDisk> {
pub fn write_to_disk(&self, timeline: &LayeredTimeline) -> Result<Vec<Arc<dyn Layer>>> {
trace!(
"write_to_disk {} get_end_lsn is {}",
"write_to_disk {} end_lsn is {} get_end_lsn is {}",
self.filename().display(),
self.end_lsn.unwrap_or(Lsn(0)),
self.get_end_lsn()
);
// Grab the lock in read-mode. We hold it over the I/O, but because this
// layer is not writeable anymore, no one should be trying to acquire the
// layer is not writeable anymore, no one should be trying to aquire the
// write lock on it, so we shouldn't block anyone. There's one exception
// though: another thread might have grabbed a reference to this layer
// in `get_layer_for_write' just before the checkpointer called
@@ -577,45 +689,36 @@ impl InMemoryLayer {
// would have to wait until we release it. That race condition is very
// rare though, so we just accept the potential latency hit for now.
let inner = self.inner.read().unwrap();
let end_lsn_exclusive = inner.end_lsn.unwrap();
assert!(!inner.writeable);
if inner.dropped {
if let Some(drop_lsn) = inner.drop_lsn {
let delta_layer = DeltaLayer::create(
self.conf,
self.timelineid,
self.tenantid,
self.seg,
self.start_lsn,
end_lsn_exclusive,
drop_lsn,
true,
inner.page_versions.ordered_page_version_iter(None),
inner.page_versions.ordered_block_iter(),
inner.segsizes.clone(),
)?;
trace!(
"freeze: created delta layer for dropped segment {} {}-{}",
self.seg,
self.start_lsn,
end_lsn_exclusive
drop_lsn
);
return Ok(LayersOnDisk {
delta_layers: vec![delta_layer],
image_layers: Vec::new(),
});
return Ok(vec![Arc::new(delta_layer)]);
}
        // Since `end_lsn` is inclusive, subtract 1.
        // We want to make an ImageLayer for the last included LSN,
        // so the DeltaLayer should exclude that LSN.
        let end_lsn_inclusive = Lsn(end_lsn_exclusive.0 - 1);
        let mut page_versions = inner
            .page_versions
            .ordered_page_version_iter(Some(end_lsn_inclusive));
        let mut delta_layers = Vec::new();
        if self.start_lsn != end_lsn_inclusive {
            let (segsizes, _) = inner.segsizes.split_at(&end_lsn_exclusive);

        let end_lsn = self.end_lsn.unwrap();
        let mut frozen_layers: Vec<Arc<dyn Layer>> = Vec::new();
        if self.start_lsn != end_lsn {
            let (before_segsizes, _after_segsizes) = inner.segsizes.split_at(&Lsn(end_lsn.0 + 1));
// Write the page versions before the cutoff to disk.
let delta_layer = DeltaLayer::create(
self.conf,
@@ -623,36 +726,32 @@ impl InMemoryLayer {
self.tenantid,
self.seg,
self.start_lsn,
end_lsn_inclusive,
end_lsn,
false,
page_versions,
segsizes,
inner.page_versions.ordered_block_iter(),
before_segsizes,
)?;
delta_layers.push(delta_layer);
frozen_layers.push(Arc::new(delta_layer));
trace!(
"freeze: created delta layer {} {}-{}",
self.seg,
self.start_lsn,
end_lsn_inclusive
end_lsn
);
} else {
assert!(page_versions.next().is_none());
for (_blknum, history) in inner.page_versions.ordered_block_iter() {
let (lsn, _pv) = history.as_slice().first().unwrap();
assert!(lsn >= &end_lsn);
}
}
drop(inner);
// Write a new base image layer at the cutoff point
let image_layer =
ImageLayer::create_from_src(self.conf, timeline, self, end_lsn_inclusive)?;
trace!(
"freeze: created image layer {} at {}",
self.seg,
end_lsn_inclusive
);
let image_layer = ImageLayer::create_from_src(self.conf, timeline, self, end_lsn)?;
frozen_layers.push(Arc::new(image_layer));
trace!("freeze: created image layer {} at {}", self.seg, end_lsn);
Ok(LayersOnDisk {
delta_layers,
image_layers: vec![image_layer],
})
Ok(frozen_layers)
}
}
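
The switch from the two-field `LayersOnDisk` result to a flat `Vec<Arc<dyn Layer>>` trades per-kind fields for uniform handling. A small sketch of the trait-object shape (types simplified, not the real `Layer` trait):

use std::sync::Arc;

trait Layer {
    fn describe(&self) -> String;
}

struct DeltaLayer {
    start: u64,
    end: u64,
}
struct ImageLayer {
    at: u64,
}

impl Layer for DeltaLayer {
    fn describe(&self) -> String {
        format!("delta {}-{}", self.start, self.end)
    }
}

impl Layer for ImageLayer {
    fn describe(&self) -> String {
        format!("image at {}", self.at)
    }
}

// With trait objects, the caller iterates one list whatever the mix of
// delta and image layers happens to be.
fn write_to_disk() -> Vec<Arc<dyn Layer>> {
    vec![
        Arc::new(DeltaLayer { start: 10, end: 30 }) as Arc<dyn Layer>,
        Arc::new(ImageLayer { at: 30 }),
    ]
}

fn main() {
    for layer in write_to_disk() {
        println!("{}", layer.describe());
    }
}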

View File

@@ -1,184 +1,96 @@
use std::{
collections::HashMap,
ops::{Range, RangeBounds},
slice,
};
use std::{collections::HashMap, ops::RangeBounds};
use zenith_utils::{bin_ser::LeSer, lsn::Lsn, vec_map::VecMap};
use zenith_utils::{accum::Accum, lsn::Lsn, vec_map::VecMap};
use super::storage_layer::PageVersion;
const EMPTY_SLICE: &[(Lsn, Range<usize>)] = &[];
const EMPTY_SLICE: &[(Lsn, PageVersion)] = &[];
#[derive(Debug, Default)]
pub struct PageVersions {
heap: Vec<u8>,
ranges: HashMap<u32, VecMap<Lsn, Range<usize>>>,
}
pub struct PageVersions(HashMap<u32, VecMap<Lsn, PageVersion>>);
impl PageVersions {
pub fn is_empty(&self) -> bool {
self.0.is_empty()
}
pub fn append_or_update_last(
&mut self,
blknum: u32,
lsn: Lsn,
page_version: PageVersion,
) -> Option<PageVersion> {
let mut new_bytes = PageVersion::ser(&page_version).unwrap();
let map = self.ranges.entry(blknum).or_insert_with(VecMap::default);
if let Some((last_lsn, last_range)) = map.as_slice().last() {
if lsn == *last_lsn {
let old_bytes = &self.heap[last_range.clone()];
if old_bytes == new_bytes {
return Some(page_version);
}
// TODO optimize for case when old_bytes.len() >= new_bytes.len()
}
}
let new_range = self.heap.len()..self.heap.len() + new_bytes.len();
self.heap.append(&mut new_bytes);
map.append_or_update_last(lsn, new_range)
.unwrap()
.map(|old_range| {
let old_bytes = &self.heap[old_range];
PageVersion::des(old_bytes).unwrap()
})
}
/// Get all [`PageVersion`]s in a block
pub fn iter_block(&self, blknum: u32) -> BlockVersionIter<'_> {
let range_iter = self
.ranges
.get(&blknum)
.map(VecMap::as_slice)
.unwrap_or(EMPTY_SLICE)
.iter();
BlockVersionIter {
heap: &self.heap,
range_iter,
}
let map = self.0.entry(blknum).or_insert_with(VecMap::default);
map.append_or_update_last(lsn, page_version).unwrap()
}
/// Get a range of [`PageVersions`] in a block
pub fn iter_block_lsn_range<R: RangeBounds<Lsn>>(
pub fn get_block_lsn_range<R: RangeBounds<Lsn>>(
&self,
blknum: u32,
range: R,
) -> BlockVersionIter<'_> {
let range_iter = self
.ranges
) -> &[(Lsn, PageVersion)] {
self.0
.get(&blknum)
.map(|vec_map| vec_map.slice_range(range))
.unwrap_or(EMPTY_SLICE)
.iter();
BlockVersionIter {
heap: &self.heap,
range_iter,
}
}
/// Iterate through [`PageVersion`]s in (block, lsn) order.
/// If a [`cutoff_lsn`] is set, only show versions with `lsn < cutoff_lsn`
pub fn ordered_page_version_iter(&self, cutoff_lsn: Option<Lsn>) -> OrderedPageVersionIter<'_> {
let mut ordered_blocks: Vec<u32> = self.ranges.keys().cloned().collect();
/// Split the page version map into two.
///
/// Left contains everything up to and not including [`cutoff_lsn`].
/// Right contains [`cutoff_lsn`] and everything after.
pub fn split_at(&self, cutoff_lsn: Lsn, after_oldest_lsn: &mut Accum<Lsn>) -> (Self, Self) {
let mut before_blocks = HashMap::new();
let mut after_blocks = HashMap::new();
for (blknum, vec_map) in self.0.iter() {
let (before_versions, after_versions) = vec_map.split_at(&cutoff_lsn);
if !before_versions.is_empty() {
let old = before_blocks.insert(*blknum, before_versions);
assert!(old.is_none());
}
if !after_versions.is_empty() {
let (first_lsn, _first_pv) = &after_versions.as_slice()[0];
after_oldest_lsn.accum(std::cmp::min, *first_lsn);
let old = after_blocks.insert(*blknum, after_versions);
assert!(old.is_none());
}
}
(Self(before_blocks), Self(after_blocks))
}
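
The boundary convention matters here: the left half is strictly below the cutoff and the right half starts at it, which is why `freeze` passes `cutoff_lsn + 1` to keep the cutoff itself in the frozen half. A tiny model of that convention over a sorted slice (illustrative, not the VecMap implementation):

// Minimal model of the split_at convention: left gets keys < cutoff,
// right gets keys >= cutoff.
fn split_at<T: Clone>(sorted: &[(u64, T)], cutoff: u64) -> (Vec<(u64, T)>, Vec<(u64, T)>) {
    let idx = sorted.partition_point(|(lsn, _)| *lsn < cutoff);
    (sorted[..idx].to_vec(), sorted[idx..].to_vec())
}

fn main() {
    let history = vec![(10, 'a'), (20, 'b'), (30, 'c'), (40, 'd')];
    let (before, after) = split_at(&history, 30);
    assert_eq!(before, vec![(10, 'a'), (20, 'b')]); // strictly below the cutoff
    assert_eq!(after, vec![(30, 'c'), (40, 'd')]); // cutoff and everything after
}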
/// Iterate through block-history pairs in block order.
pub fn ordered_block_iter(&self) -> OrderedBlockIter<'_> {
let mut ordered_blocks: Vec<u32> = self.0.keys().cloned().collect();
ordered_blocks.sort_unstable();
let cur_block_iter = ordered_blocks
.first()
.map(|&blknum| self.iter_block(blknum))
.unwrap_or_else(|| {
let empty_iter = EMPTY_SLICE.iter();
BlockVersionIter {
heap: &self.heap,
range_iter: empty_iter,
}
});
OrderedPageVersionIter {
OrderedBlockIter {
page_versions: self,
ordered_blocks,
cur_block_idx: 0,
cutoff_lsn,
cur_block_iter,
}
}
}
pub struct BlockVersionIter<'a> {
heap: &'a Vec<u8>,
range_iter: slice::Iter<'a, (Lsn, Range<usize>)>,
}
impl BlockVersionIter<'_> {
fn get_iter_result(&self, tuple: Option<&(Lsn, Range<usize>)>) -> Option<(Lsn, PageVersion)> {
let (lsn, range) = tuple?;
let range = range.clone();
let pv_bytes = &self.heap[range];
let page_version = PageVersion::des(pv_bytes).unwrap();
Some((*lsn, page_version))
}
}
impl Iterator for BlockVersionIter<'_> {
type Item = (Lsn, PageVersion);
fn next(&mut self) -> Option<Self::Item> {
let tuple = self.range_iter.next();
self.get_iter_result(tuple)
}
}
impl DoubleEndedIterator for BlockVersionIter<'_> {
fn next_back(&mut self) -> Option<Self::Item> {
let tuple = self.range_iter.next_back();
self.get_iter_result(tuple)
}
}
pub struct OrderedPageVersionIter<'a> {
pub struct OrderedBlockIter<'a> {
page_versions: &'a PageVersions,
ordered_blocks: Vec<u32>,
cur_block_idx: usize,
cutoff_lsn: Option<Lsn>,
cur_block_iter: BlockVersionIter<'a>,
}
impl OrderedPageVersionIter<'_> {
fn is_lsn_before_cutoff(&self, lsn: Lsn) -> bool {
if let Some(cutoff_lsn) = self.cutoff_lsn.as_ref() {
lsn < *cutoff_lsn
} else {
true
}
}
}
impl Iterator for OrderedPageVersionIter<'_> {
type Item = (u32, Lsn, PageVersion);
impl<'a> Iterator for OrderedBlockIter<'a> {
type Item = (u32, &'a VecMap<Lsn, PageVersion>);
fn next(&mut self) -> Option<Self::Item> {
loop {
if let Some((lsn, page_version)) = self.cur_block_iter.next() {
if self.is_lsn_before_cutoff(lsn) {
let blknum = self.ordered_blocks[self.cur_block_idx];
return Some((blknum, lsn, page_version));
}
}
let next_block_idx = self.cur_block_idx + 1;
let blknum: u32 = *self.ordered_blocks.get(next_block_idx)?;
self.cur_block_idx = next_block_idx;
self.cur_block_iter = self.page_versions.iter_block(blknum);
}
let blknum: u32 = *self.ordered_blocks.get(self.cur_block_idx)?;
self.cur_block_idx += 1;
Some((blknum, self.page_versions.0.get(&blknum).unwrap()))
}
}
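
Since HashMap iteration order is arbitrary, the iterator above snapshots and sorts the keys once and then indexes back into the map. The same idea in a compact, self-contained form:

use std::collections::HashMap;

// Yield (block, history) pairs in ascending block order even though the
// backing HashMap iterates in arbitrary order.
fn ordered_blocks(
    map: &HashMap<u32, Vec<(u64, String)>>,
) -> impl Iterator<Item = (u32, &Vec<(u64, String)>)> + '_ {
    let mut keys: Vec<u32> = map.keys().copied().collect();
    keys.sort_unstable();
    keys.into_iter().map(move |k| (k, &map[&k]))
}

fn main() {
    let mut map = HashMap::new();
    map.insert(2, vec![(20, "two".to_string())]);
    map.insert(0, vec![(10, "zero".to_string())]);
    map.insert(1, vec![(15, "one".to_string())]);

    let blocks: Vec<u32> = ordered_blocks(&map).map(|(blk, _)| blk).collect();
    assert_eq!(blocks, vec![0, 1, 2]); // sorted regardless of insertion order
}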
@@ -204,24 +116,14 @@ mod tests {
}
}
    let mut iter = page_versions.ordered_page_version_iter(None);
    for blknum in 0..BLOCKS {
        for lsn in 0..LSNS {
            let (actual_blknum, actual_lsn, _pv) = iter.next().unwrap();
            assert_eq!(actual_blknum, blknum);
            assert_eq!(Lsn(lsn), actual_lsn);
        }
    }
    assert!(iter.next().is_none());
    assert!(iter.next().is_none()); // should be robust against excessive next() calls

    const CUTOFF_LSN: Lsn = Lsn(30);
    let mut iter = page_versions.ordered_page_version_iter(Some(CUTOFF_LSN));
    for blknum in 0..BLOCKS {
        for lsn in 0..CUTOFF_LSN.0 {
            let (actual_blknum, actual_lsn, _pv) = iter.next().unwrap();
            assert_eq!(actual_blknum, blknum);
            assert_eq!(Lsn(lsn), actual_lsn);
        }
    }
    assert!(iter.next().is_none());

    let mut iter = page_versions.ordered_block_iter();
    for blknum in 0..BLOCKS {
        let (actual_blknum, vec_map) = iter.next().unwrap();
        let slice = vec_map.as_slice();
        assert_eq!(actual_blknum, blknum);
        assert_eq!(slice.len(), LSNS as usize);
        for lsn in 0..LSNS {
            assert_eq!(Lsn(lsn), slice[lsn as usize].0);
        }
    }
    assert!(iter.next().is_none());
    assert!(iter.next().is_none()); // should be robust against excessive next() calls

View File

@@ -123,6 +123,10 @@ pub trait Layer: Send + Sync {
/// Is the segment represented by this layer dropped by PostgreSQL?
fn is_dropped(&self) -> bool;
/// Gets the physical location of the layer on disk.
/// Some layers, such as in-memory, might not have the location.
fn path(&self) -> Option<PathBuf>;
/// Filename used to store this layer on disk. (Even in-memory layers
/// implement this, to print a handy unique identifier for the layer for
/// log messages, even though they're never stored on disk.)
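
A compact model of why `path()` returns an `Option`: operations that touch the file system can skip layers that have no backing file, exactly the pattern `delete()` follows. Trait and types below are simplified for illustration:

use std::path::PathBuf;

// Simplified model of the Layer::path() contract: on-disk layers report
// their file location; in-memory layers report None and are skipped by
// file-system operations such as delete().
trait Layer {
    fn path(&self) -> Option<PathBuf>;

    fn delete(&self) -> std::io::Result<()> {
        if let Some(path) = self.path() {
            std::fs::remove_file(path)?;
        }
        Ok(())
    }
}

struct OnDisk(PathBuf);
struct InMem;

impl Layer for OnDisk {
    fn path(&self) -> Option<PathBuf> {
        Some(self.0.clone())
    }
}

impl Layer for InMem {
    fn path(&self) -> Option<PathBuf> {
        None // nothing to remove for an in-memory layer
    }
}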

View File

@@ -13,6 +13,7 @@
use anyhow::{anyhow, bail, ensure, Result};
use bytes::{Buf, BufMut, Bytes, BytesMut};
use lazy_static::lazy_static;
use log::*;
use regex::Regex;
use std::net::TcpListener;
use std::str;
@@ -20,12 +21,10 @@ use std::str::FromStr;
use std::sync::Arc;
use std::thread;
use std::{io, net::TcpStream};
use tracing::*;
use zenith_metrics::{register_histogram_vec, HistogramVec};
use zenith_utils::auth::{self, JwtAuth};
use zenith_utils::auth::{Claims, Scope};
use zenith_utils::lsn::Lsn;
use zenith_utils::postgres_backend::is_socket_read_timed_out;
use zenith_utils::postgres_backend::PostgresBackend;
use zenith_utils::postgres_backend::{self, AuthType};
use zenith_utils::pq_proto::{
@@ -188,32 +187,17 @@ pub fn thread_main(
listener: TcpListener,
auth_type: AuthType,
) -> anyhow::Result<()> {
let mut join_handles = Vec::new();
while !tenant_mgr::shutdown_requested() {
loop {
let (socket, peer_addr) = listener.accept()?;
debug!("accepted connection from {}", peer_addr);
socket.set_nodelay(true).unwrap();
let local_auth = auth.clone();
let handle = thread::Builder::new()
.name("serving Page Service thread".into())
.spawn(move || {
if let Err(err) = page_service_conn_main(conf, local_auth, socket, auth_type) {
error!(%err, "page server thread exited with error");
}
})
.unwrap();
join_handles.push(handle);
thread::spawn(move || {
if let Err(err) = page_service_conn_main(conf, local_auth, socket, auth_type) {
error!("page server thread exiting with error: {:#}", err);
}
});
}
debug!("page_service loop terminated. wait for connections to cancel");
for handle in join_handles.into_iter() {
handle.join().unwrap();
}
Ok(())
}
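
A condensed, self-contained sketch of the join-handle variant shown above (connection handling is stubbed out; note that `accept()` blocks, so the shutdown flag is only rechecked between connections):

use std::net::TcpListener;
use std::sync::atomic::{AtomicBool, Ordering};
use std::thread;

static SHUTDOWN: AtomicBool = AtomicBool::new(false);

fn accept_loop(listener: TcpListener) -> std::io::Result<()> {
    let mut join_handles = Vec::new();

    while !SHUTDOWN.load(Ordering::Relaxed) {
        let (socket, peer_addr) = listener.accept()?;
        let handle = thread::Builder::new()
            .name("serving connection".into())
            .spawn(move || {
                // handle the connection; log errors instead of propagating
                let _ = (socket, peer_addr);
            })?;
        join_handles.push(handle);
    }

    // Wait for in-flight connections before returning.
    for handle in join_handles {
        handle.join().unwrap();
    }
    Ok(())
}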
fn page_service_conn_main(
@@ -232,7 +216,7 @@ fn page_service_conn_main(
}
let mut conn_handler = PageServerHandler::new(conf, auth);
let pgbackend = PostgresBackend::new(socket, auth_type, None, true)?;
let pgbackend = PostgresBackend::new(socket, auth_type, None)?;
pgbackend.run(&mut conn_handler)
}
@@ -276,66 +260,50 @@ impl PageServerHandler {
timelineid: ZTimelineId,
tenantid: ZTenantId,
) -> anyhow::Result<()> {
let _enter = info_span!("pagestream", timeline = %timelineid, tenant = %tenantid).entered();
// Check that the timeline exists
let timeline = tenant_mgr::get_timeline_for_tenant(tenantid, timelineid)?;
/* switch client to COPYBOTH */
pgb.write_message(&BeMessage::CopyBothResponse)?;
        while !tenant_mgr::shutdown_requested() {
            match pgb.read_message() {
                Ok(message) => {
                    if let Some(message) = message {
                        trace!("query: {:?}", message);

                        let copy_data_bytes = match message {
                            FeMessage::CopyData(bytes) => bytes,
                            _ => continue,
                        };

                        let zenith_fe_msg = PagestreamFeMessage::parse(copy_data_bytes)?;

                        let response = match zenith_fe_msg {
                            PagestreamFeMessage::Exists(req) => SMGR_QUERY_TIME
                                .with_label_values(&["get_rel_exists"])
                                .observe_closure_duration(|| {
                                    self.handle_get_rel_exists_request(&*timeline, &req)
                                }),
                            PagestreamFeMessage::Nblocks(req) => SMGR_QUERY_TIME
                                .with_label_values(&["get_rel_size"])
                                .observe_closure_duration(|| {
                                    self.handle_get_nblocks_request(&*timeline, &req)
                                }),
                            PagestreamFeMessage::GetPage(req) => SMGR_QUERY_TIME
                                .with_label_values(&["get_page_at_lsn"])
                                .observe_closure_duration(|| {
                                    self.handle_get_page_at_lsn_request(&*timeline, &req)
                                }),
                        };

                        let response = response.unwrap_or_else(|e| {
                            // print all the details to the log with {:#}, but for the client the
                            // error message is enough
                            error!("error reading relation or page version: {:#}", e);
                            PagestreamBeMessage::Error(PagestreamErrorResponse {
                                message: e.to_string(),
                            })
                        });

                        pgb.write_message(&BeMessage::CopyData(&response.serialize()))?;
                    } else {
                        break;
                    }
                }
                Err(e) => {
                    if !is_socket_read_timed_out(&e) {
                        return Err(e);
                    }
                }
            }
        }

        while let Some(message) = pgb.read_message()? {
            trace!("query({:?}): {:?}", timelineid, message);

            let copy_data_bytes = match message {
                FeMessage::CopyData(bytes) => bytes,
                _ => continue,
            };

            let zenith_fe_msg = PagestreamFeMessage::parse(copy_data_bytes)?;

            let response = match zenith_fe_msg {
                PagestreamFeMessage::Exists(req) => SMGR_QUERY_TIME
                    .with_label_values(&["get_rel_exists"])
                    .observe_closure_duration(|| {
                        self.handle_get_rel_exists_request(&*timeline, &req)
                    }),
                PagestreamFeMessage::Nblocks(req) => SMGR_QUERY_TIME
                    .with_label_values(&["get_rel_size"])
                    .observe_closure_duration(|| self.handle_get_nblocks_request(&*timeline, &req)),
                PagestreamFeMessage::GetPage(req) => SMGR_QUERY_TIME
                    .with_label_values(&["get_page_at_lsn"])
                    .observe_closure_duration(|| {
                        self.handle_get_page_at_lsn_request(&*timeline, &req)
                    }),
            };

            let response = response.unwrap_or_else(|e| {
                // print all the details to the log with {:#}, but for the client the
                // error message is enough
                error!("error reading relation or page version: {:#}", e);
                PagestreamBeMessage::Error(PagestreamErrorResponse {
                    message: e.to_string(),
                })
            });

            pgb.write_message(&BeMessage::CopyData(&response.serialize()))?;
        }
Ok(())
}
@@ -395,8 +363,6 @@ impl PageServerHandler {
timeline: &dyn Timeline,
req: &PagestreamExistsRequest,
) -> Result<PagestreamBeMessage> {
let _enter = info_span!("get_rel_exists", rel = %req.rel, req_lsn = %req.lsn).entered();
let tag = RelishTag::Relation(req.rel);
let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest)?;
@@ -412,7 +378,6 @@ impl PageServerHandler {
timeline: &dyn Timeline,
req: &PagestreamNblocksRequest,
) -> Result<PagestreamBeMessage> {
let _enter = info_span!("get_nblocks", rel = %req.rel, req_lsn = %req.lsn).entered();
let tag = RelishTag::Relation(req.rel);
let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest)?;
@@ -432,8 +397,6 @@ impl PageServerHandler {
timeline: &dyn Timeline,
req: &PagestreamGetPageRequest,
) -> Result<PagestreamBeMessage> {
let _enter = info_span!("get_page", rel = %req.rel, blkno = &req.blkno, req_lsn = %req.lsn)
.entered();
let tag = RelishTag::Relation(req.rel);
let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest)?;
@@ -451,20 +414,17 @@ impl PageServerHandler {
lsn: Option<Lsn>,
tenantid: ZTenantId,
) -> anyhow::Result<()> {
let span = info_span!("basebackup", timeline = %timelineid, tenant = %tenantid, lsn = field::Empty);
let _enter = span.enter();
// check that the timeline exists
let timeline = tenant_mgr::get_timeline_for_tenant(tenantid, timelineid)?;
// switch client to COPYOUT
/* switch client to COPYOUT */
pgb.write_message(&BeMessage::CopyOutResponse)?;
info!("sent CopyOut");
/* Send a tarball of the latest layer on the timeline */
{
let mut writer = CopyDataSink { pgb };
let mut basebackup = basebackup::Basebackup::new(&mut writer, &timeline, lsn)?;
span.record("lsn", &basebackup.lsn.to_string().as_str());
basebackup.send_tarball()?;
}
pgb.write_message(&BeMessage::CopyDone)?;
@@ -569,6 +529,11 @@ impl postgres_backend::Handler for PageServerHandler {
None
};
info!(
"got basebackup command. tenantid=\"{}\" timelineid=\"{}\" lsn=\"{:#?}\"",
tenantid, timelineid, lsn
);
// Check that the timeline exists
self.handle_basebackup_request(pgb, timelineid, lsn, tenantid)?;
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
@@ -586,9 +551,6 @@ impl postgres_backend::Handler for PageServerHandler {
self.check_permission(Some(tenantid))?;
let _enter =
info_span!("callmemaybe", timeline = %timelineid, tenant = %tenantid).entered();
// Check that the timeline exists
tenant_mgr::get_timeline_for_tenant(tenantid, timelineid)?;
@@ -611,9 +573,6 @@ impl postgres_backend::Handler for PageServerHandler {
self.check_permission(Some(tenantid))?;
let _enter =
info_span!("branch_create", name = %branchname, tenant = %tenantid).entered();
let branch =
branches::create_branch(self.conf, &branchname, &startpoint_str, &tenantid)?;
let branch = serde_json::to_vec(&branch)?;

View File

@@ -12,12 +12,14 @@ mod rust_s3;
/// local page server layer files with external storage.
mod synced_storage;
use std::{path::Path, thread};
use std::path::Path;
use std::thread;
use anyhow::Context;
use self::local_fs::LocalFs;
pub use self::synced_storage::schedule_timeline_upload;
use self::{local_fs::LocalFs, rust_s3::RustS3};
use crate::relish_storage::rust_s3::RustS3;
use crate::{PageServerConf, RelishStorageKind};
pub fn run_storage_sync_thread(

View File

@@ -5,10 +5,9 @@ use std::path::Path;
use anyhow::Context;
use s3::{bucket::Bucket, creds::Credentials, region::Region};
use crate::{
relish_storage::{strip_workspace_prefix, RelishStorage},
S3Config,
};
use crate::{relish_storage::strip_workspace_prefix, S3Config};
use super::RelishStorage;
const S3_FILE_SEPARATOR: char = '/';

View File

@@ -1,7 +1,6 @@
use std::time::Duration;
use std::{collections::BinaryHeap, sync::Mutex, thread};
use crate::tenant_mgr;
use crate::{relish_storage::RelishStorage, PageServerConf};
lazy_static::lazy_static! {
@@ -32,26 +31,22 @@ pub fn run_storage_sync_thread<
let handle = thread::Builder::new()
.name("Queue based relish storage sync".to_string())
.spawn(move || {
while !tenant_mgr::shutdown_requested() {
let mut queue_accessor = UPLOAD_QUEUE.lock().unwrap();
log::debug!("Upload queue length: {}", queue_accessor.len());
let next_task = queue_accessor.pop();
drop(queue_accessor);
match next_task {
Some(task) => runtime.block_on(async {
// suppress warnings
let _ = (config, task, &relish_storage, max_concurrent_sync);
todo!("omitted for brevity")
}),
None => {
thread::sleep(Duration::from_secs(1));
continue;
}
.spawn(move || loop {
let mut queue_accessor = UPLOAD_QUEUE.lock().unwrap();
log::debug!("Upload queue length: {}", queue_accessor.len());
let next_task = queue_accessor.pop();
drop(queue_accessor);
match next_task {
Some(task) => runtime.block_on(async {
// suppress warnings
let _ = (config, task, &relish_storage, max_concurrent_sync);
todo!("omitted for brevity")
}),
None => {
thread::sleep(Duration::from_secs(1));
continue;
}
}
log::debug!("Queue based relish storage sync thread shut down");
Ok(())
})?;
Ok(Some(handle))
}
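
Both loop variants above share the same queue discipline: pop under the lock, release it, then do the slow work. A minimal, self-contained model of that discipline (task type and timing are placeholders):

use std::collections::BinaryHeap;
use std::sync::Mutex;
use std::thread;
use std::time::Duration;

lazy_static::lazy_static! {
    static ref UPLOAD_QUEUE: Mutex<BinaryHeap<i32>> = Mutex::new(BinaryHeap::new());
}

fn schedule(task: i32) {
    UPLOAD_QUEUE.lock().unwrap().push(task);
}

fn sync_loop() {
    loop {
        // Pop the highest-priority task; the temporary lock guard is
        // dropped at the end of this statement, so producers are never
        // blocked on a slow upload.
        let next_task = UPLOAD_QUEUE.lock().unwrap().pop();

        match next_task {
            Some(task) => {
                // perform the upload for `task` (elided)
                let _ = task;
            }
            None => thread::sleep(Duration::from_secs(1)),
        }
    }
}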

View File

@@ -3,7 +3,7 @@ use anyhow::Result;
use bytes::{Buf, BufMut, Bytes, BytesMut};
use serde::{Deserialize, Serialize};
use std::collections::HashSet;
use std::ops::{AddAssign, Deref};
use std::ops::AddAssign;
use std::sync::Arc;
use std::time::Duration;
use zenith_utils::lsn::{Lsn, RecordLsn};
@@ -13,8 +13,6 @@ use zenith_utils::zid::ZTimelineId;
/// A repository corresponds to one .zenith directory. One repository holds multiple
/// timelines, forked off from the same initial call to 'initdb'.
pub trait Repository: Send + Sync {
fn shutdown(&self) -> Result<()>;
/// Get Timeline handle for given zenith timeline ID.
fn get_timeline(&self, timelineid: ZTimelineId) -> Result<Arc<dyn Timeline>>;
@@ -125,39 +123,6 @@ pub trait Timeline: Send + Sync {
// These are called by the WAL receiver to digest WAL records.
//------------------------------------------------------------------------------
/// Atomically get both last and prev.
fn get_last_record_rlsn(&self) -> RecordLsn;
/// Get last or prev record separately. Same as get_last_record_rlsn().last/prev.
fn get_last_record_lsn(&self) -> Lsn;
fn get_prev_record_lsn(&self) -> Lsn;
fn get_start_lsn(&self) -> Lsn;
/// Mutate the timeline with a [`TimelineWriter`].
fn writer<'a>(&'a self) -> Box<dyn TimelineWriter + 'a>;
///
/// Flush to disk all data that was written with the put_* functions
///
/// NOTE: This has nothing to do with checkpoint in PostgreSQL. We don't
/// know anything about them here in the repository.
fn checkpoint(&self) -> Result<()>;
/// Retrieve current logical size of the timeline
///
/// NOTE: counted incrementally, includes ancestors,
/// doesn't support TwoPhase relishes yet
fn get_current_logical_size(&self) -> usize;
/// Does the same as get_current_logical_size but counted on demand.
/// Used in tests to ensure that incremental and non-incremental variants match.
fn get_current_logical_size_non_incremental(&self, lsn: Lsn) -> Result<usize>;
}
/// Various functions to mutate the timeline.
// TODO Currently, Deref is used to allow easy access to read methods from this trait.
// This is probably considered a bad practice in Rust and should be fixed eventually,
// but doing so would require large code changes.
pub trait TimelineWriter: Deref<Target = dyn Timeline> {
/// Put a new page version that can be constructed from a WAL record
///
/// This will implicitly extend the relation, if the page is beyond the
@@ -178,6 +143,29 @@ pub trait TimelineWriter: Deref<Target = dyn Timeline> {
/// Advance requires aligned LSN as an argument and would wake wait_lsn() callers.
/// Previous last record LSN is stored alongside the latest and can be read.
fn advance_last_record_lsn(&self, lsn: Lsn);
/// Atomically get both last and prev.
fn get_last_record_rlsn(&self) -> RecordLsn;
/// Get last or prev record separately. Same as get_last_record_rlsn().last/prev.
fn get_last_record_lsn(&self) -> Lsn;
fn get_prev_record_lsn(&self) -> Lsn;
fn get_start_lsn(&self) -> Lsn;
///
/// Flush to disk all data that was written with the put_* functions
///
/// NOTE: This has nothing to do with checkpoint in PostgreSQL. We don't
/// know anything about them here in the repository.
fn checkpoint(&self) -> Result<()>;
/// Retrieve current logical size of the timeline
///
/// NOTE: counted incrementally, includes ancestors,
/// doesn't support TwoPhase relishes yet
fn get_current_logical_size(&self) -> usize;
/// Does the same as get_current_logical_size but counted on demand.
/// Used in tests to ensure that incremental and non-incremental variants match.
fn get_current_logical_size_non_incremental(&self, lsn: Lsn) -> Result<usize>;
}
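
The Deref arrangement mentioned in the TODO above lets every writer reach the read-only `Timeline` methods without restating them. A minimal sketch of the pattern (types are illustrative, not the repository's own):

use std::ops::Deref;

// Read-only API.
trait Timeline {
    fn get_last_lsn(&self) -> u64;
}

struct MemTimeline {
    last_lsn: u64,
}

impl Timeline for MemTimeline {
    fn get_last_lsn(&self) -> u64 {
        self.last_lsn
    }
}

// Write API; Deref gives every writer the read methods for free.
trait TimelineWriter: Deref<Target = dyn Timeline> {
    fn put(&mut self, lsn: u64);
}

struct MemWriter(MemTimeline);

impl Deref for MemWriter {
    type Target = dyn Timeline;
    fn deref(&self) -> &Self::Target {
        &self.0
    }
}

impl TimelineWriter for MemWriter {
    fn put(&mut self, lsn: u64) {
        self.0.last_lsn = lsn;
    }
}

fn main() {
    let mut w = MemWriter(MemTimeline { last_lsn: 0 });
    w.put(42);
    // Read methods are reachable through Deref:
    assert_eq!(w.get_last_lsn(), 42);
}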
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
@@ -221,7 +209,7 @@ impl WALRecord {
#[cfg(test)]
mod tests {
use super::*;
use crate::layered_repository::{LayeredRepository, METADATA_FILE_NAME};
use crate::layered_repository::LayeredRepository;
use crate::walredo::{WalRedoError, WalRedoManager};
use crate::PageServerConf;
use hex_literal::hex;
@@ -318,15 +306,14 @@ mod tests {
// Create timeline to work on
let tline = repo.create_empty_timeline(TIMELINE_ID)?;
let writer = tline.writer();
writer.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
writer.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
writer.put_page_image(TESTREL_A, 0, Lsn(0x30), TEST_IMG("foo blk 0 at 3"))?;
writer.put_page_image(TESTREL_A, 1, Lsn(0x40), TEST_IMG("foo blk 1 at 4"))?;
writer.put_page_image(TESTREL_A, 2, Lsn(0x50), TEST_IMG("foo blk 2 at 5"))?;
tline.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
tline.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
tline.put_page_image(TESTREL_A, 0, Lsn(0x30), TEST_IMG("foo blk 0 at 3"))?;
tline.put_page_image(TESTREL_A, 1, Lsn(0x40), TEST_IMG("foo blk 1 at 4"))?;
tline.put_page_image(TESTREL_A, 2, Lsn(0x50), TEST_IMG("foo blk 2 at 5"))?;
writer.advance_last_record_lsn(Lsn(0x50));
tline.advance_last_record_lsn(Lsn(0x50));
assert_current_logical_size(&tline, Lsn(0x50));
@@ -372,8 +359,8 @@ mod tests {
);
// Truncate last block
writer.put_truncation(TESTREL_A, Lsn(0x60), 2)?;
writer.advance_last_record_lsn(Lsn(0x60));
tline.put_truncation(TESTREL_A, Lsn(0x60), 2)?;
tline.advance_last_record_lsn(Lsn(0x60));
assert_current_logical_size(&tline, Lsn(0x60));
// Check reported size and contents after truncation
@@ -395,13 +382,13 @@ mod tests {
);
// Truncate to zero length
writer.put_truncation(TESTREL_A, Lsn(0x68), 0)?;
writer.advance_last_record_lsn(Lsn(0x68));
tline.put_truncation(TESTREL_A, Lsn(0x68), 0)?;
tline.advance_last_record_lsn(Lsn(0x68));
assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x68))?.unwrap(), 0);
// Extend from 0 to 2 blocks, leaving a gap
writer.put_page_image(TESTREL_A, 1, Lsn(0x70), TEST_IMG("foo blk 1"))?;
writer.advance_last_record_lsn(Lsn(0x70));
tline.put_page_image(TESTREL_A, 1, Lsn(0x70), TEST_IMG("foo blk 1"))?;
tline.advance_last_record_lsn(Lsn(0x70));
assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x70))?.unwrap(), 2);
assert_eq!(tline.get_page_at_lsn(TESTREL_A, 0, Lsn(0x70))?, ZERO_PAGE);
assert_eq!(
@@ -436,26 +423,25 @@ mod tests {
// Create timeline to work on
let tline = repo.create_empty_timeline(TIMELINE_ID)?;
let writer = tline.writer();
writer.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
writer.advance_last_record_lsn(Lsn(0x20));
tline.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
tline.advance_last_record_lsn(Lsn(0x20));
// Check that rel exists and size is correct
assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x20))?, true);
assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x20))?.unwrap(), 1);
// Drop relish
writer.drop_relish(TESTREL_A, Lsn(0x30))?;
writer.advance_last_record_lsn(Lsn(0x30));
tline.drop_relish(TESTREL_A, Lsn(0x30))?;
tline.advance_last_record_lsn(Lsn(0x30));
// Check that rel is not visible anymore
assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x30))?, false);
assert!(tline.get_relish_size(TESTREL_A, Lsn(0x30))?.is_none());
// Extend it again
writer.put_page_image(TESTREL_A, 0, Lsn(0x40), TEST_IMG("foo blk 0 at 4"))?;
writer.advance_last_record_lsn(Lsn(0x40));
tline.put_page_image(TESTREL_A, 0, Lsn(0x40), TEST_IMG("foo blk 0 at 4"))?;
tline.advance_last_record_lsn(Lsn(0x40));
// Check that rel exists and size is correct
assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x40))?, true);
@@ -473,7 +459,6 @@ mod tests {
// Create timeline to work on
let tline = repo.create_empty_timeline(TIMELINE_ID)?;
let writer = tline.writer();
//from storage_layer.rs
const RELISH_SEG_SIZE: u32 = 10 * 1024 * 1024 / 8192;
@@ -483,10 +468,10 @@ mod tests {
for blkno in 0..relsize {
let lsn = Lsn(0x20);
let data = format!("foo blk {} at {}", blkno, lsn);
writer.put_page_image(TESTREL_A, blkno, lsn, TEST_IMG(&data))?;
tline.put_page_image(TESTREL_A, blkno, lsn, TEST_IMG(&data))?;
}
writer.advance_last_record_lsn(Lsn(0x20));
tline.advance_last_record_lsn(Lsn(0x20));
// The relation was created at LSN 2, not visible at LSN 1 yet.
assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x10))?, false);
@@ -510,8 +495,8 @@ mod tests {
// Truncate relation so that second segment was dropped
// - only leave one page
writer.put_truncation(TESTREL_A, Lsn(0x60), 1)?;
writer.advance_last_record_lsn(Lsn(0x60));
tline.put_truncation(TESTREL_A, Lsn(0x60), 1)?;
tline.advance_last_record_lsn(Lsn(0x60));
// Check reported size and contents after truncation
assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x60))?.unwrap(), 1);
@@ -544,9 +529,9 @@ mod tests {
for blkno in 0..relsize {
let lsn = Lsn(0x80);
let data = format!("foo blk {} at {}", blkno, lsn);
writer.put_page_image(TESTREL_A, blkno, lsn, TEST_IMG(&data))?;
tline.put_page_image(TESTREL_A, blkno, lsn, TEST_IMG(&data))?;
}
writer.advance_last_record_lsn(Lsn(0x80));
tline.advance_last_record_lsn(Lsn(0x80));
assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x80))?, true);
assert_eq!(
@@ -572,15 +557,14 @@ mod tests {
fn test_large_rel() -> Result<()> {
let repo = RepoHarness::create("test_large_rel")?.load();
let tline = repo.create_empty_timeline(TIMELINE_ID)?;
let writer = tline.writer();
let mut lsn = 0x10;
for blknum in 0..pg_constants::RELSEG_SIZE + 1 {
let img = TEST_IMG(&format!("foo blk {} at {}", blknum, Lsn(lsn)));
lsn += 0x10;
writer.put_page_image(TESTREL_A, blknum as u32, Lsn(lsn), img)?;
tline.put_page_image(TESTREL_A, blknum as u32, Lsn(lsn), img)?;
}
writer.advance_last_record_lsn(Lsn(lsn));
tline.advance_last_record_lsn(Lsn(lsn));
assert_current_logical_size(&tline, Lsn(lsn));
@@ -591,8 +575,8 @@ mod tests {
// Truncate one block
lsn += 0x10;
writer.put_truncation(TESTREL_A, Lsn(lsn), pg_constants::RELSEG_SIZE)?;
writer.advance_last_record_lsn(Lsn(lsn));
tline.put_truncation(TESTREL_A, Lsn(lsn), pg_constants::RELSEG_SIZE)?;
tline.advance_last_record_lsn(Lsn(lsn));
assert_eq!(
tline.get_relish_size(TESTREL_A, Lsn(lsn))?.unwrap(),
pg_constants::RELSEG_SIZE
@@ -601,8 +585,8 @@ mod tests {
// Truncate another block
lsn += 0x10;
writer.put_truncation(TESTREL_A, Lsn(lsn), pg_constants::RELSEG_SIZE - 1)?;
writer.advance_last_record_lsn(Lsn(lsn));
tline.put_truncation(TESTREL_A, Lsn(lsn), pg_constants::RELSEG_SIZE - 1)?;
tline.advance_last_record_lsn(Lsn(lsn));
assert_eq!(
tline.get_relish_size(TESTREL_A, Lsn(lsn))?.unwrap(),
pg_constants::RELSEG_SIZE - 1
@@ -614,8 +598,8 @@ mod tests {
let mut size: i32 = 3000;
while size >= 0 {
lsn += 0x10;
writer.put_truncation(TESTREL_A, Lsn(lsn), size as u32)?;
writer.advance_last_record_lsn(Lsn(lsn));
tline.put_truncation(TESTREL_A, Lsn(lsn), size as u32)?;
tline.advance_last_record_lsn(Lsn(lsn));
assert_eq!(
tline.get_relish_size(TESTREL_A, Lsn(lsn))?.unwrap(),
size as u32
@@ -635,17 +619,16 @@ mod tests {
fn test_list_rels_drop() -> Result<()> {
let repo = RepoHarness::create("test_list_rels_drop")?.load();
let tline = repo.create_empty_timeline(TIMELINE_ID)?;
let writer = tline.writer();
const TESTDB: u32 = 111;
// Import initial dummy checkpoint record, otherwise the get_timeline() call
// after branching fails below
writer.put_page_image(RelishTag::Checkpoint, 0, Lsn(0x10), ZERO_CHECKPOINT.clone())?;
tline.put_page_image(RelishTag::Checkpoint, 0, Lsn(0x10), ZERO_CHECKPOINT.clone())?;
// Create a relation on the timeline
writer.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
tline.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
writer.advance_last_record_lsn(Lsn(0x30));
tline.advance_last_record_lsn(Lsn(0x30));
// Check that list_rels() lists it after LSN 2, but no before it
assert!(!tline.list_rels(0, TESTDB, Lsn(0x10))?.contains(&TESTREL_A));
@@ -655,17 +638,14 @@ mod tests {
// Create a branch, check that the relation is visible there
repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x30))?;
let newtline = repo.get_timeline(NEW_TIMELINE_ID)?;
let new_writer = newtline.writer();
assert!(newtline
.list_rels(0, TESTDB, Lsn(0x30))?
.contains(&TESTREL_A));
// Drop it on the branch
new_writer.drop_relish(TESTREL_A, Lsn(0x40))?;
new_writer.advance_last_record_lsn(Lsn(0x40));
drop(new_writer);
newtline.drop_relish(TESTREL_A, Lsn(0x40))?;
newtline.advance_last_record_lsn(Lsn(0x40));
// Check that it's no longer listed on the branch after the point where it was dropped
assert!(newtline
@@ -693,30 +673,28 @@ mod tests {
fn test_branch() -> Result<()> {
let repo = RepoHarness::create("test_branch")?.load();
let tline = repo.create_empty_timeline(TIMELINE_ID)?;
let writer = tline.writer();
// Import initial dummy checkpoint record, otherwise the get_timeline() call
// after branching fails below
writer.put_page_image(RelishTag::Checkpoint, 0, Lsn(0x10), ZERO_CHECKPOINT.clone())?;
tline.put_page_image(RelishTag::Checkpoint, 0, Lsn(0x10), ZERO_CHECKPOINT.clone())?;
// Create a relation on the timeline
writer.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
writer.put_page_image(TESTREL_A, 0, Lsn(0x30), TEST_IMG("foo blk 0 at 3"))?;
writer.put_page_image(TESTREL_A, 0, Lsn(0x40), TEST_IMG("foo blk 0 at 4"))?;
tline.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
tline.put_page_image(TESTREL_A, 0, Lsn(0x30), TEST_IMG("foo blk 0 at 3"))?;
tline.put_page_image(TESTREL_A, 0, Lsn(0x40), TEST_IMG("foo blk 0 at 4"))?;
// Create another relation
writer.put_page_image(TESTREL_B, 0, Lsn(0x20), TEST_IMG("foobar blk 0 at 2"))?;
tline.put_page_image(TESTREL_B, 0, Lsn(0x20), TEST_IMG("foobar blk 0 at 2"))?;
writer.advance_last_record_lsn(Lsn(0x40));
tline.advance_last_record_lsn(Lsn(0x40));
assert_current_logical_size(&tline, Lsn(0x40));
// Branch the history, modify relation differently on the new timeline
repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x30))?;
let newtline = repo.get_timeline(NEW_TIMELINE_ID)?;
let new_writer = newtline.writer();
new_writer.put_page_image(TESTREL_A, 0, Lsn(0x40), TEST_IMG("bar blk 0 at 4"))?;
new_writer.advance_last_record_lsn(Lsn(0x40));
newtline.put_page_image(TESTREL_A, 0, Lsn(0x40), TEST_IMG("bar blk 0 at 4"))?;
newtline.advance_last_record_lsn(Lsn(0x40));
// Check page contents on both branches
assert_eq!(
@@ -750,7 +728,7 @@ mod tests {
repo.create_empty_timeline(TIMELINE_ID)?;
drop(repo);
let metadata_path = harness.timeline_path(&TIMELINE_ID).join(METADATA_FILE_NAME);
let metadata_path = harness.timeline_path(&TIMELINE_ID).join("metadata");
assert!(metadata_path.is_file());

View File

@@ -2,6 +2,7 @@
//! Import data and WAL from a PostgreSQL data directory and WAL segments into
//! zenith Timeline.
//!
use log::*;
use postgres_ffi::nonrelfile_utils::clogpage_precedes;
use postgres_ffi::nonrelfile_utils::slru_may_delete_clogsegment;
use std::cmp::min;
@@ -12,7 +13,6 @@ use std::path::Path;
use anyhow::{bail, Result};
use bytes::{Buf, Bytes};
use tracing::*;
use crate::relish::*;
use crate::repository::*;
@@ -34,7 +34,7 @@ static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]);
///
pub fn import_timeline_from_postgres_datadir(
path: &Path,
writer: &dyn TimelineWriter,
timeline: &dyn Timeline,
lsn: Lsn,
) -> Result<()> {
// Scan 'global'
@@ -44,10 +44,10 @@ pub fn import_timeline_from_postgres_datadir(
None => continue,
Some("pg_control") => {
import_control_file(writer, lsn, &direntry.path())?;
import_control_file(timeline, lsn, &direntry.path())?;
}
Some("pg_filenode.map") => import_nonrel_file(
writer,
timeline,
lsn,
RelishTag::FileNodeMap {
spcnode: pg_constants::GLOBALTABLESPACE_OID,
@@ -59,7 +59,7 @@ pub fn import_timeline_from_postgres_datadir(
// Load any relation files into the page server
_ => import_relfile(
&direntry.path(),
writer,
timeline,
lsn,
pg_constants::GLOBALTABLESPACE_OID,
0,
@@ -86,7 +86,7 @@ pub fn import_timeline_from_postgres_datadir(
Some("PG_VERSION") => continue,
Some("pg_filenode.map") => import_nonrel_file(
writer,
timeline,
lsn,
RelishTag::FileNodeMap {
spcnode: pg_constants::DEFAULTTABLESPACE_OID,
@@ -98,7 +98,7 @@ pub fn import_timeline_from_postgres_datadir(
// Load any relation files into the page server
_ => import_relfile(
&direntry.path(),
writer,
timeline,
lsn,
pg_constants::DEFAULTTABLESPACE_OID,
dboid,
@@ -108,24 +108,24 @@ pub fn import_timeline_from_postgres_datadir(
}
for entry in fs::read_dir(path.join("pg_xact"))? {
let entry = entry?;
import_slru_file(writer, lsn, SlruKind::Clog, &entry.path())?;
import_slru_file(timeline, lsn, SlruKind::Clog, &entry.path())?;
}
for entry in fs::read_dir(path.join("pg_multixact").join("members"))? {
let entry = entry?;
import_slru_file(writer, lsn, SlruKind::MultiXactMembers, &entry.path())?;
import_slru_file(timeline, lsn, SlruKind::MultiXactMembers, &entry.path())?;
}
for entry in fs::read_dir(path.join("pg_multixact").join("offsets"))? {
let entry = entry?;
import_slru_file(writer, lsn, SlruKind::MultiXactOffsets, &entry.path())?;
import_slru_file(timeline, lsn, SlruKind::MultiXactOffsets, &entry.path())?;
}
for entry in fs::read_dir(path.join("pg_twophase"))? {
let entry = entry?;
let xid = u32::from_str_radix(entry.path().to_str().unwrap(), 16)?;
import_nonrel_file(writer, lsn, RelishTag::TwoPhase { xid }, &entry.path())?;
import_nonrel_file(timeline, lsn, RelishTag::TwoPhase { xid }, &entry.path())?;
}
// TODO: Scan pg_tblspc
writer.advance_last_record_lsn(lsn);
timeline.advance_last_record_lsn(lsn);
Ok(())
}
@@ -133,13 +133,12 @@ pub fn import_timeline_from_postgres_datadir(
// subroutine of import_timeline_from_postgres_datadir(), to load one relation file.
fn import_relfile(
path: &Path,
timeline: &dyn TimelineWriter,
timeline: &dyn Timeline,
lsn: Lsn,
spcoid: Oid,
dboid: Oid,
) -> Result<()> {
// Does it look like a relation file?
trace!("importing rel file {}", path.display());
let p = parse_relfilename(path.file_name().unwrap().to_str().unwrap());
if let Err(e) = p {
@@ -167,14 +166,14 @@ fn import_relfile(
}
// TODO: UnexpectedEof is expected
Err(err) => match err.kind() {
Err(e) => match e.kind() {
std::io::ErrorKind::UnexpectedEof => {
// reached EOF. That's expected.
// FIXME: maybe check that we read the full length of the file?
break;
}
_ => {
bail!("error reading file {}: {:#}", path.display(), err);
bail!("error reading file {}: {:#}", path.display(), e);
}
},
};
@@ -191,7 +190,7 @@ fn import_relfile(
/// are just slurped into the repository as one blob.
///
fn import_nonrel_file(
timeline: &dyn TimelineWriter,
timeline: &dyn Timeline,
lsn: Lsn,
tag: RelishTag,
path: &Path,
@@ -201,7 +200,7 @@ fn import_nonrel_file(
// read the whole file
file.read_to_end(&mut buffer)?;
trace!("importing non-rel file {}", path.display());
info!("importing non-rel file {}", path.display());
timeline.put_page_image(tag, 0, lsn, Bytes::copy_from_slice(&buffer[..]))?;
Ok(())
@@ -212,13 +211,13 @@ fn import_nonrel_file(
///
/// The control file is imported as is, but we also extract the checkpoint record
/// from it and store it separated.
fn import_control_file(timeline: &dyn TimelineWriter, lsn: Lsn, path: &Path) -> Result<()> {
fn import_control_file(timeline: &dyn Timeline, lsn: Lsn, path: &Path) -> Result<()> {
let mut file = File::open(path)?;
let mut buffer = Vec::new();
// read the whole file
file.read_to_end(&mut buffer)?;
trace!("importing control file {}", path.display());
info!("importing control file {}", path.display());
// Import it as ControlFile
timeline.put_page_image(
@@ -239,18 +238,13 @@ fn import_control_file(timeline: &dyn TimelineWriter, lsn: Lsn, path: &Path) ->
///
/// Import an SLRU segment file
///
fn import_slru_file(
timeline: &dyn TimelineWriter,
lsn: Lsn,
slru: SlruKind,
path: &Path,
) -> Result<()> {
fn import_slru_file(timeline: &dyn Timeline, lsn: Lsn, slru: SlruKind, path: &Path) -> Result<()> {
// Does it look like an SLRU file?
let mut file = File::open(path)?;
let mut buf: [u8; 8192] = [0u8; 8192];
let segno = u32::from_str_radix(path.file_name().unwrap().to_str().unwrap(), 16)?;
trace!("importing slru file {}", path.display());
info!("importing slru file {}", path.display());
let mut rpageno = 0;
loop {
@@ -266,14 +260,14 @@ fn import_slru_file(
}
// TODO: UnexpectedEof is expected
Err(err) => match err.kind() {
Err(e) => match e.kind() {
std::io::ErrorKind::UnexpectedEof => {
// reached EOF. That's expected.
// FIXME: maybe check that we read the full length of the file?
break;
}
_ => {
bail!("error reading file {}: {:#}", path.display(), err);
bail!("error reading file {}: {:#}", path.display(), e);
}
},
};
@@ -291,15 +285,12 @@ fn import_slru_file(
///
pub fn save_decoded_record(
checkpoint: &mut CheckPoint,
checkpoint_modified: &mut bool,
timeline: &dyn TimelineWriter,
timeline: &dyn Timeline,
decoded: &DecodedWALRecord,
recdata: Bytes,
lsn: Lsn,
) -> Result<()> {
if checkpoint.update_next_xid(decoded.xl_xid) {
*checkpoint_modified = true;
}
checkpoint.update_next_xid(decoded.xl_xid);
// Iterate through all the blocks that the record modifies, and
// "put" a separate copy of the record for each block.
@@ -383,7 +374,7 @@ pub fn save_decoded_record(
} else {
assert!(info == pg_constants::CLOG_TRUNCATE);
let xlrec = XlClogTruncate::decode(&mut buf);
save_clog_truncate_record(checkpoint, checkpoint_modified, timeline, lsn, &xlrec)?;
save_clog_truncate_record(checkpoint, timeline, lsn, &xlrec)?;
}
} else if decoded.xl_rmid == pg_constants::RM_XACT_ID {
let info = decoded.xl_info & pg_constants::XLOG_XACT_OPMASK;
@@ -452,17 +443,10 @@ pub fn save_decoded_record(
)?;
} else if info == pg_constants::XLOG_MULTIXACT_CREATE_ID {
let xlrec = XlMultiXactCreate::decode(&mut buf);
save_multixact_create_record(
checkpoint,
checkpoint_modified,
timeline,
lsn,
&xlrec,
decoded,
)?;
save_multixact_create_record(checkpoint, timeline, lsn, &xlrec, decoded)?;
} else if info == pg_constants::XLOG_MULTIXACT_TRUNCATE_ID {
let xlrec = XlMultiXactTruncate::decode(&mut buf);
save_multixact_truncate_record(checkpoint, checkpoint_modified, timeline, lsn, &xlrec)?;
save_multixact_truncate_record(checkpoint, timeline, lsn, &xlrec)?;
}
} else if decoded.xl_rmid == pg_constants::RM_RELMAP_ID {
let xlrec = XlRelmapUpdate::decode(&mut buf);
@@ -471,10 +455,7 @@ pub fn save_decoded_record(
let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
if info == pg_constants::XLOG_NEXTOID {
let next_oid = buf.get_u32_le();
if checkpoint.nextOid != next_oid {
checkpoint.nextOid = next_oid;
*checkpoint_modified = true;
}
checkpoint.nextOid = next_oid;
} else if info == pg_constants::XLOG_CHECKPOINT_ONLINE
|| info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN
{
@@ -490,7 +471,6 @@ pub fn save_decoded_record(
);
if (checkpoint.oldestXid.wrapping_sub(xlog_checkpoint.oldestXid) as i32) < 0 {
checkpoint.oldestXid = xlog_checkpoint.oldestXid;
*checkpoint_modified = true;
}
}
}
@@ -498,11 +478,7 @@ pub fn save_decoded_record(
}
/// Subroutine of save_decoded_record(), to handle an XLOG_DBASE_CREATE record.
fn save_xlog_dbase_create(
timeline: &dyn TimelineWriter,
lsn: Lsn,
rec: &XlCreateDatabase,
) -> Result<()> {
fn save_xlog_dbase_create(timeline: &dyn Timeline, lsn: Lsn, rec: &XlCreateDatabase) -> Result<()> {
let db_id = rec.db_id;
let tablespace_id = rec.tablespace_id;
let src_db_id = rec.src_db_id;
@@ -579,11 +555,7 @@ fn save_xlog_dbase_create(
/// Subroutine of save_decoded_record(), to handle an XLOG_SMGR_TRUNCATE record.
///
/// This is the same logic as in PostgreSQL's smgr_redo() function.
fn save_xlog_smgr_truncate(
timeline: &dyn TimelineWriter,
lsn: Lsn,
rec: &XlSmgrTruncate,
) -> Result<()> {
fn save_xlog_smgr_truncate(timeline: &dyn Timeline, lsn: Lsn, rec: &XlSmgrTruncate) -> Result<()> {
let spcnode = rec.rnode.spcnode;
let dbnode = rec.rnode.dbnode;
let relnode = rec.rnode.relnode;
@@ -645,7 +617,7 @@ fn save_xlog_smgr_truncate(
/// Subroutine of save_decoded_record(), to handle XLOG_XACT_* records.
///
fn save_xact_record(
timeline: &dyn TimelineWriter,
timeline: &dyn Timeline,
lsn: Lsn,
parsed: &XlXactParsedRecord,
decoded: &DecodedWALRecord,
@@ -702,8 +674,7 @@ fn save_xact_record(
fn save_clog_truncate_record(
checkpoint: &mut CheckPoint,
checkpoint_modified: &mut bool,
timeline: &dyn TimelineWriter,
timeline: &dyn Timeline,
lsn: Lsn,
xlrec: &XlClogTruncate,
) -> Result<()> {
@@ -721,7 +692,6 @@ fn save_clog_truncate_record(
// TODO Figure out if there will be any issues with replicas.
checkpoint.oldestXid = xlrec.oldest_xid;
checkpoint.oldestXidDB = xlrec.oldest_xid_db;
*checkpoint_modified = true;
// TODO Treat AdvanceOldestClogXid() or write a comment why we don't need it
@@ -764,8 +734,7 @@ fn save_clog_truncate_record(
fn save_multixact_create_record(
checkpoint: &mut CheckPoint,
checkpoint_modified: &mut bool,
timeline: &dyn TimelineWriter,
timeline: &dyn Timeline,
lsn: Lsn,
xlrec: &XlMultiXactCreate,
decoded: &DecodedWALRecord,
@@ -821,11 +790,9 @@ fn save_multixact_create_record(
}
if xlrec.mid >= checkpoint.nextMulti {
checkpoint.nextMulti = xlrec.mid + 1;
*checkpoint_modified = true;
}
if xlrec.moff + xlrec.nmembers > checkpoint.nextMultiOffset {
checkpoint.nextMultiOffset = xlrec.moff + xlrec.nmembers;
*checkpoint_modified = true;
}
let max_mbr_xid = xlrec.members.iter().fold(0u32, |acc, mbr| {
if mbr.xid.wrapping_sub(acc) as i32 > 0 {
@@ -835,22 +802,18 @@ fn save_multixact_create_record(
}
});
if checkpoint.update_next_xid(max_mbr_xid) {
*checkpoint_modified = true;
}
checkpoint.update_next_xid(max_mbr_xid);
Ok(())
}
fn save_multixact_truncate_record(
checkpoint: &mut CheckPoint,
checkpoint_modified: &mut bool,
timeline: &dyn TimelineWriter,
timeline: &dyn Timeline,
lsn: Lsn,
xlrec: &XlMultiXactTruncate,
) -> Result<()> {
checkpoint.oldestMulti = xlrec.end_trunc_off;
checkpoint.oldestMultiDB = xlrec.oldest_multi_db;
*checkpoint_modified = true;
// PerformMembersTruncation
let maxsegment: i32 = mx_offset_to_member_segment(pg_constants::MAX_MULTIXACT_OFFSET);
@@ -884,7 +847,7 @@ fn save_multixact_truncate_record(
}
fn save_relmap_page(
timeline: &dyn TimelineWriter,
timeline: &dyn Timeline,
lsn: Lsn,
xlrec: &XlRelmapUpdate,
decoded: &DecodedWALRecord,


@@ -8,14 +8,12 @@ use crate::walredo::PostgresRedoManager;
use crate::PageServerConf;
use anyhow::{anyhow, bail, Context, Result};
use lazy_static::lazy_static;
use log::{debug, info};
use log::info;
use std::collections::hash_map::Entry;
use std::collections::HashMap;
use std::fs;
use std::str::FromStr;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::{Arc, Mutex, MutexGuard};
use std::thread::JoinHandle;
use zenith_utils::zid::{ZTenantId, ZTimelineId};
lazy_static! {
@@ -26,19 +24,6 @@ lazy_static! {
fn access_repository() -> MutexGuard<'static, HashMap<ZTenantId, Arc<dyn Repository>>> {
REPOSITORY.lock().unwrap()
}
struct TenantHandleEntry {
checkpointer_handle: Option<JoinHandle<()>>,
gc_handle: Option<JoinHandle<()>>,
}
// Logically these handles belong to Repository,
// but it's just simpler to store them separately
lazy_static! {
static ref TENANT_HANDLES: Mutex<HashMap<ZTenantId, TenantHandleEntry>> =
Mutex::new(HashMap::new());
}
static SHUTDOWN_REQUESTED: AtomicBool = AtomicBool::new(false);
pub fn init(conf: &'static PageServerConf) {
let mut m = access_repository();
@@ -62,18 +47,8 @@ fn init_repo(conf: &'static PageServerConf, tenant_id: ZTenantId) -> Arc<Layered
tenant_id,
true,
));
let checkpointer_handle = LayeredRepository::launch_checkpointer_thread(conf, repo.clone());
let gc_handle = LayeredRepository::launch_gc_thread(conf, repo.clone());
let mut handles = TENANT_HANDLES.lock().unwrap();
let h = TenantHandleEntry {
checkpointer_handle: Some(checkpointer_handle),
gc_handle: Some(gc_handle),
};
handles.insert(tenant_id, h);
LayeredRepository::launch_checkpointer_thread(conf, repo.clone());
LayeredRepository::launch_gc_thread(conf, repo.clone());
repo
}
@@ -107,35 +82,6 @@ fn init_timeline(repo: &dyn Repository, timeline_id: ZTimelineId) {
}
}
// Check this flag in the thread loops to know when to exit
pub fn shutdown_requested() -> bool {
SHUTDOWN_REQUESTED.load(Ordering::Relaxed)
}
pub fn stop_tenant_threads(tenantid: ZTenantId) {
let mut handles = TENANT_HANDLES.lock().unwrap();
if let Some(h) = handles.get_mut(&tenantid) {
h.checkpointer_handle.take().map(JoinHandle::join);
debug!("checkpointer for tenant {} has stopped", tenantid);
h.gc_handle.take().map(JoinHandle::join);
debug!("gc for tenant {} has stopped", tenantid);
}
}
pub fn shutdown_all_tenants() -> Result<()> {
SHUTDOWN_REQUESTED.swap(true, Ordering::Relaxed);
let tenants = list_tenants()?;
for tenantid in tenants {
stop_tenant_threads(tenantid);
let repo = get_repository_for_tenant(tenantid)?;
debug!("shutdown tenant {}", tenantid);
repo.shutdown()?;
}
Ok(())
}
pub fn create_repository_for_tenant(
conf: &'static PageServerConf,
tenantid: ZTenantId,
@@ -169,14 +115,3 @@ pub fn get_timeline_for_tenant(
.get_timeline(timelineid)
.with_context(|| format!("cannot fetch timeline {}", timelineid))
}
fn list_tenants() -> Result<Vec<ZTenantId>> {
let o = &mut REPOSITORY.lock().unwrap();
o.iter()
.map(|tenant| {
let (tenantid, _) = tenant;
Ok(*tenantid)
})
.collect()
}


@@ -12,6 +12,7 @@ use crate::waldecoder::*;
use crate::PageServerConf;
use anyhow::{bail, Error, Result};
use lazy_static::lazy_static;
use log::*;
use postgres::fallible_iterator::FallibleIterator;
use postgres::replication::ReplicationIter;
use postgres::{Client, NoTls, SimpleQueryMessage, SimpleQueryRow};
@@ -24,10 +25,8 @@ use std::str::FromStr;
use std::sync::Mutex;
use std::thread;
use std::thread::sleep;
use std::thread::JoinHandle;
use std::thread_local;
use std::time::{Duration, SystemTime};
use tracing::*;
use zenith_utils::lsn::Lsn;
use zenith_utils::zid::ZTenantId;
use zenith_utils::zid::ZTimelineId;
@@ -37,7 +36,6 @@ use zenith_utils::zid::ZTimelineId;
//
struct WalReceiverEntry {
wal_producer_connstr: String,
wal_receiver_handle: Option<JoinHandle<()>>,
}
lazy_static! {
@@ -52,19 +50,6 @@ thread_local! {
pub(crate) static IS_WAL_RECEIVER: Cell<bool> = Cell::new(false);
}
// Wait for walreceiver to stop
// Now it stops when pageserver shutdown is requested.
// In future we can make this more granular and send shutdown signals
// per tenant/timeline to cancel inactive walreceivers.
// TODO deal with blocking pg connections
pub fn stop_wal_receiver(timelineid: ZTimelineId) {
let mut receivers = WAL_RECEIVERS.lock().unwrap();
if let Some(r) = receivers.get_mut(&timelineid) {
r.wal_receiver_handle.take();
// r.wal_receiver_handle.take().map(JoinHandle::join);
}
}
// Launch a new WAL receiver, or tell one that's already running about a change in the connection string
pub fn launch_wal_receiver(
conf: &'static PageServerConf,
@@ -79,19 +64,19 @@ pub fn launch_wal_receiver(
receiver.wal_producer_connstr = wal_producer_connstr.into();
}
None => {
let wal_receiver_handle = thread::Builder::new()
let receiver = WalReceiverEntry {
wal_producer_connstr: wal_producer_connstr.into(),
};
receivers.insert(timelineid, receiver);
// Also launch a new thread to handle this connection
let _walreceiver_thread = thread::Builder::new()
.name("WAL receiver thread".into())
.spawn(move || {
IS_WAL_RECEIVER.with(|c| c.set(true));
thread_main(conf, timelineid, tenantid);
})
.unwrap();
let receiver = WalReceiverEntry {
wal_producer_connstr: wal_producer_connstr.into(),
wal_receiver_handle: Some(wal_receiver_handle),
};
receivers.insert(timelineid, receiver);
}
};
}
@@ -111,14 +96,16 @@ fn get_wal_producer_connstr(timelineid: ZTimelineId) -> String {
// This is the entry point for the WAL receiver thread.
//
fn thread_main(conf: &'static PageServerConf, timelineid: ZTimelineId, tenantid: ZTenantId) {
let _enter = info_span!("WAL receiver", timeline = %timelineid, tenant = %tenantid).entered();
info!("WAL receiver thread started");
info!(
"WAL receiver thread started for timeline : '{}'",
timelineid
);
//
// Make a connection to the WAL safekeeper, or directly to the primary PostgreSQL server,
// and start streaming WAL from it. If the connection is lost, keep retrying.
//
while !tenant_mgr::shutdown_requested() {
loop {
// Look up the current WAL producer address
let wal_producer_connstr = get_wal_producer_connstr(timelineid);
@@ -132,7 +119,6 @@ fn thread_main(conf: &'static PageServerConf, timelineid: ZTimelineId, tenantid:
sleep(Duration::from_secs(1));
}
}
debug!("WAL streaming shut down");
}
fn walreceiver_main(
@@ -183,8 +169,8 @@ fn walreceiver_main(
startpoint += startpoint.calc_padding(8u32);
info!(
"last_record_lsn {} starting replication from {}, server is at {}...",
last_rec_lsn, startpoint, end_of_wal
"last_record_lsn {} starting replication from {} for timeline {}, server is at {}...",
last_rec_lsn, startpoint, timelineid, end_of_wal
);
let query = format!("START_REPLICATION PHYSICAL {}", startpoint);
@@ -212,32 +198,27 @@ fn walreceiver_main(
waldecoder.feed_bytes(data);
while let Some((lsn, recdata)) = waldecoder.poll_decode()? {
let _enter = info_span!("processing record", lsn = %lsn).entered();
// Save old checkpoint value to compare with it after decoding WAL record
let old_checkpoint_bytes = checkpoint.encode();
let decoded = decode_wal_record(recdata.clone());
// It is important to deal with aligned records, as the LSN in getPage@LSN is
// aligned and can be several bytes bigger. Without this alignment we are
// at risk of hitting a deadlock.
assert!(lsn.is_aligned());
let writer = timeline.writer();
let mut checkpoint_modified = false;
let decoded = decode_wal_record(recdata.clone());
restore_local_repo::save_decoded_record(
&mut checkpoint,
&mut checkpoint_modified,
writer.as_ref(),
&*timeline,
&decoded,
recdata,
lsn,
)?;
let new_checkpoint_bytes = checkpoint.encode();
// Check if checkpoint data was updated by save_decoded_record
if checkpoint_modified {
let new_checkpoint_bytes = checkpoint.encode();
writer.put_page_image(
if new_checkpoint_bytes != old_checkpoint_bytes {
timeline.put_page_image(
RelishTag::Checkpoint,
0,
lsn,
@@ -247,7 +228,7 @@ fn walreceiver_main(
// Now that this record has been fully handled, including updating the
// checkpoint data, let the repository know that it is up-to-date to this LSN
writer.advance_last_record_lsn(lsn);
timeline.advance_last_record_lsn(lsn);
last_rec_lsn = lsn;
}
@@ -292,11 +273,6 @@ fn walreceiver_main(
physical_stream.standby_status_update(write_lsn, flush_lsn, apply_lsn, ts, NO_REPLY)?;
}
if tenant_mgr::shutdown_requested() {
debug!("stop walreceiver because pageserver shutdown is requested");
break;
}
}
Ok(())
}


@@ -565,16 +565,22 @@ impl PostgresRedoProcess {
stdin.write_all(&build_begin_redo_for_block_msg(tag)),
)
.await??;
if let Some(img) = base_img {
timeout(TIMEOUT, stdin.write_all(&build_push_page_msg(tag, &img))).await??;
if base_img.is_some() {
timeout(
TIMEOUT,
stdin.write_all(&build_push_page_msg(tag, base_img.unwrap())),
)
.await??;
}
// Send WAL records.
for rec in records.iter() {
let r = rec.clone();
WAL_REDO_RECORD_COUNTER.inc();
stdin
.write_all(&build_apply_record_msg(rec.lsn, &rec.rec))
.write_all(&build_apply_record_msg(r.lsn, r.rec))
.await?;
//debug!("sent WAL record to wal redo postgres process ({:X}/{:X}",
@@ -611,41 +617,58 @@ impl PostgresRedoProcess {
// process. See vendor/postgres/src/backend/tcop/zenith_wal_redo.c for
// explanation of the protocol.
fn build_begin_redo_for_block_msg(tag: BufferTag) -> Vec<u8> {
fn build_begin_redo_for_block_msg(tag: BufferTag) -> Bytes {
let len = 4 + 1 + 4 * 4;
let mut buf = Vec::with_capacity(1 + len);
let mut buf = BytesMut::with_capacity(1 + len);
buf.put_u8(b'B');
buf.put_u32(len as u32);
tag.ser_into(&mut buf)
// FIXME: this is a temporary hack that should go away when we refactor
// the postgres protocol serialization + handlers.
//
// BytesMut is a dynamic growable buffer, used a lot in tokio code but
// not in the std library. To write to a BytesMut from a serde serializer,
// we need to either:
// - pre-allocate the required buffer space. This is annoying because we
// shouldn't care what the exact serialized size is-- that's the
// serializer's job.
// - Or, we need to create a temporary "writer" (which implements the
// `Write` trait). It's a bit awkward, because the writer consumes the
// underlying BytesMut, and we need to extract it later with
// `into_inner`.
let mut writer = buf.writer();
tag.ser_into(&mut writer)
.expect("serialize BufferTag should always succeed");
let buf = writer.into_inner();
debug_assert!(buf.len() == 1 + len);
buf
buf.freeze()
}
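// Editor's sketch (not part of the original diff): the writer round-trip from
// the FIXME above, as a self-contained helper. BufMut::writer() and
// Writer::into_inner() are real `bytes` crate APIs; the function itself is
// purely illustrative.
fn _demo_writer_roundtrip() -> Bytes {
let buf = BytesMut::with_capacity(16);
// Wrap the BytesMut in an adapter that implements std::io::Write,
// so serializers that expect a Write impl can target it.
let mut writer = buf.writer();
std::io::Write::write_all(&mut writer, b"payload").expect("writes to BytesMut are infallible");
// Take the BytesMut back out, then freeze it into immutable Bytes.
writer.into_inner().freeze()
}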
fn build_push_page_msg(tag: BufferTag, base_img: &[u8]) -> Vec<u8> {
fn build_push_page_msg(tag: BufferTag, base_img: Bytes) -> Bytes {
assert!(base_img.len() == 8192);
let len = 4 + 1 + 4 * 4 + base_img.len();
let mut buf = Vec::with_capacity(1 + len);
let mut buf = BytesMut::with_capacity(1 + len);
buf.put_u8(b'P');
buf.put_u32(len as u32);
tag.ser_into(&mut buf)
let mut writer = buf.writer();
tag.ser_into(&mut writer)
.expect("serialize BufferTag should always succeed");
let mut buf = writer.into_inner();
buf.put(base_img);
debug_assert!(buf.len() == 1 + len);
buf
buf.freeze()
}
fn build_apply_record_msg(endlsn: Lsn, rec: &[u8]) -> Vec<u8> {
fn build_apply_record_msg(endlsn: Lsn, rec: Bytes) -> Bytes {
let len = 4 + 8 + rec.len();
let mut buf: Vec<u8> = Vec::with_capacity(1 + len);
let mut buf = BytesMut::with_capacity(1 + len);
buf.put_u8(b'A');
buf.put_u32(len as u32);
@@ -654,19 +677,21 @@ fn build_apply_record_msg(endlsn: Lsn, rec: &[u8]) -> Vec<u8> {
debug_assert!(buf.len() == 1 + len);
buf
buf.freeze()
}
fn build_get_page_msg(tag: BufferTag) -> Vec<u8> {
fn build_get_page_msg(tag: BufferTag) -> Bytes {
let len = 4 + 1 + 4 * 4;
let mut buf = Vec::with_capacity(1 + len);
let mut buf = BytesMut::with_capacity(1 + len);
buf.put_u8(b'G');
buf.put_u32(len as u32);
tag.ser_into(&mut buf)
let mut writer = buf.writer();
tag.ser_into(&mut writer)
.expect("serialize BufferTag should always succeed");
let buf = writer.into_inner();
debug_assert!(buf.len() == 1 + len);
buf
buf.freeze()
}
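// Editor's summary, derived from the four builders above, of the redo-process
// message framing (each message is one tag byte followed by a u32 length):
//
// 'B' | u32 len | BufferTag                  -- begin redo for one block
// 'P' | u32 len | BufferTag | 8192-byte page -- push base page image
// 'A' | u32 len | Lsn | WAL record bytes     -- apply one WAL record
// 'G' | u32 len | BufferTag                  -- get reconstructed page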


@@ -377,12 +377,10 @@ impl CheckPoint {
Ok(CheckPoint::des(buf)?)
}
/// Update next XID based on provided new_xid and stored epoch.
/// Next XID should be greater than new_xid. This handles 32-bit
/// XID wraparound correctly.
///
/// Returns 'true' if the XID was updated.
pub fn update_next_xid(&mut self, xid: u32) -> bool {
// Update next XID based on provided new_xid and stored epoch.
// Next XID should be greater than new_xid.
// Also take into account 32-bit wrap-around.
pub fn update_next_xid(&mut self, xid: u32) {
let xid = xid.wrapping_add(XID_CHECKPOINT_INTERVAL - 1) & !(XID_CHECKPOINT_INTERVAL - 1);
let full_xid = self.nextXid.value;
let new_xid = std::cmp::max(xid + 1, pg_constants::FIRST_NORMAL_TRANSACTION_ID);
@@ -393,14 +391,10 @@ impl CheckPoint {
// wrap-around
epoch += 1;
}
let nextXid = (epoch << 32) | new_xid as u64;
if nextXid != self.nextXid.value {
self.nextXid = FullTransactionId { value: nextXid };
return true;
}
self.nextXid = FullTransactionId {
value: (epoch << 32) | new_xid as u64,
};
}
false
}
}
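// Editor's worked example for update_next_xid (XID_CHECKPOINT_INTERVAL is
// assumed to be 1024 here purely for illustration; the real constant is
// defined elsewhere):
//
// update_next_xid(1000): 1000 is rounded up to the next interval boundary,
// 1024, so the low half of nextXid becomes max(1024 + 1,
// FIRST_NORMAL_TRANSACTION_ID) = 1025.
//
// If the rounded value falls numerically below the old low 32 bits of
// nextXid, the 32-bit XID counter has wrapped, so the epoch stored in the
// high 32 bits is incremented rather than letting nextXid move backwards.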


@@ -34,7 +34,7 @@ pub fn thread_main(state: &'static ProxyState, listener: TcpListener) -> anyhow:
pub fn mgmt_conn_main(state: &'static ProxyState, socket: TcpStream) -> anyhow::Result<()> {
let mut conn_handler = MgmtHandler { state };
let pgbackend = PostgresBackend::new(socket, AuthType::Trust, None, true)?;
let pgbackend = PostgresBackend::new(socket, AuthType::Trust, None)?;
pgbackend.run(&mut conn_handler)
}


@@ -64,7 +64,6 @@ pub fn proxy_conn_main(
socket,
postgres_backend::AuthType::MD5,
state.conf.ssl_config.clone(),
false,
)?,
md5_salt: [0u8; 4],
psql_session_id: "".into(),


@@ -64,7 +64,7 @@ def pytest_configure(config):
raise Exception('Too many workers configured. Cannot distribute ports for services.')
# does not use -c as it is not supported on macOS
cmd = ['pgrep', 'pageserver|postgres|safekeeper']
cmd = ['pgrep', 'pageserver|postgres|wal_acceptor']
result = subprocess.run(cmd, stdout=subprocess.DEVNULL)
if result.returncode == 0:
# returncode of 0 means it found something.
@@ -72,7 +72,7 @@ def pytest_configure(config):
# result of the test.
# NOTE this shows as an internal pytest error, there might be a better way
raise Exception(
'Found interfering processes running. Stop all Zenith pageservers, nodes, safekeepers, as well as stand-alone Postgres.'
'Found interfering processes running. Stop all Zenith pageservers, nodes, WAL acceptors, as well as stand-alone Postgres.'
)
@@ -375,7 +375,6 @@ class ZenithPageserver(PgProtocol):
Start the page server.
Returns self.
"""
assert self.running == False
self.zenith_cli.run(['start'])
self.running = True
@@ -383,18 +382,14 @@ class ZenithPageserver(PgProtocol):
self.initial_tenant = self.zenith_cli.run(['tenant', 'list']).stdout.strip()
return self
def stop(self, immediate=False) -> 'ZenithPageserver':
def stop(self) -> 'ZenithPageserver':
"""
Stop the page server.
Returns self.
"""
cmd = ['stop']
if immediate:
cmd.append('immediate')
print(cmd)
if self.running:
self.zenith_cli.run(cmd)
self.zenith_cli.run(['stop'])
self.running = False
return self
@@ -403,7 +398,7 @@ class ZenithPageserver(PgProtocol):
return self
def __exit__(self, exc_type, exc, tb):
self.stop(True)
self.stop()
@cached_property
def auth_keys(self) -> AuthKeys:
@@ -449,7 +444,7 @@ def pageserver(zenith_cli: ZenithCli, repo_dir: str, pageserver_port: Pageserver
# After the yield comes any cleanup code we need.
print('Starting pageserver cleanup')
ps.stop(True)
ps.stop()
class PgBin:
""" A helper class for executing postgres binaries """
@@ -843,7 +838,7 @@ class WalAcceptor:
@property
def pidfile(self) -> Path:
return self.data_dir / "safekeeper.pid"
return self.data_dir / "wal_acceptor.pid"
def get_pid(self) -> Optional[int]:
if not self.pidfile.exists():
@@ -895,7 +890,7 @@ class WalAcceptor:
class WalAcceptorFactory:
""" An object representing multiple running wal acceptors. """
def __init__(self, zenith_binpath: Path, data_dir: Path, pageserver_port: int, port_distributor: PortDistributor):
self.wa_bin_path = zenith_binpath / 'safekeeper'
self.wa_bin_path = zenith_binpath / 'wal_acceptor'
self.data_dir = data_dir
self.instances: List[WalAcceptor] = []
self.port_distributor = port_distributor


@@ -1,48 +0,0 @@
import os
from contextlib import closing
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
pytest_plugins = ("fixtures.zenith_fixtures", "fixtures.benchmark_fixture")
#
# Test buffering GiST build. It WAL-logs the whole relation, in 32-page chunks.
# As of this writing, we duplicate those giant WAL records for each page,
# which makes the delta layer about 32x larger than it needs to be.
#
def test_gist_buffering_build(postgres: PostgresFactory, pageserver: ZenithPageserver, pg_bin, zenith_cli, zenbenchmark, repo_dir: str):
# Create a branch for us
zenith_cli.run(["branch", "test_gist_buffering_build", "empty"])
pg = postgres.create_start('test_gist_buffering_build')
print("postgres is running on 'test_gist_buffering_build' branch")
# Open a connection directly to the page server that we'll use to force
# flushing the layers to disk
psconn = pageserver.connect()
pscur = psconn.cursor()
# Get the timeline ID of our branch. We need it for the 'do_gc' command
with closing(pg.connect()) as conn:
with conn.cursor() as cur:
cur.execute("SHOW zenith.zenith_timeline")
timeline = cur.fetchone()[0]
# Create test table.
cur.execute("create table gist_point_tbl(id int4, p point)");
cur.execute("insert into gist_point_tbl select g, point(g, g) from generate_series(1, 1000000) g;");
# Build the index.
with zenbenchmark.record_pageserver_writes(pageserver, 'pageserver_writes'):
with zenbenchmark.record_duration('build'):
cur.execute("create index gist_pointidx2 on gist_point_tbl using gist(p) with (buffering = on)");
# Flush the layers from memory to disk. This is included in the reported
# time and I/O
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 1000000")
# Record peak memory usage
zenbenchmark.record("peak_mem", zenbenchmark.get_peak_mem(pageserver) / 1024, 'MB')
# Report disk space used by the repository
timeline_size = zenbenchmark.get_timeline_size(repo_dir, pageserver.initial_tenant, timeline)
zenbenchmark.record('size', timeline_size / (1024*1024), 'MB')


@@ -89,12 +89,12 @@ A: Page Server is a single server which can be lost. As our primary
Q: What if the compute node evicts a page, needs it back, but the page is yet
to reach the Page Server?
A: If the compute node has evicted a page, changes to it have been WAL-logged
(that's why it is called Write Ahead logging; there are a few exceptions, such
as index builds). These WAL records will eventually
reach the Page Server. The Page Server notes that the compute node requests
pages with a very recent LSN and will not respond to the compute node until a
corresponding WAL is received from WAL safekeepers.
A: If the compute node has evicted a page, all changes from that page are
already committed, i.e. they are saved on a majority of WAL safekeepers. These
WAL records will eventually reach the Page Server. The Page Server notes
that the compute node requests pages with a very recent LSN and will not
respond to the compute node until a corresponding WAL is received from WAL
safekeepers.
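Editor's sketch of the waiting behaviour described above (the names are
hypothetical, not the actual pageserver API):

    fn get_page_at_lsn(timeline: &dyn Timeline, tag: BufferTag, lsn: Lsn) -> Result<Bytes> {
        // Block until WAL up to `lsn` has been received from the safekeepers.
        timeline.wait_lsn(lsn)?;
        timeline.get_page(tag, lsn)
    }

That is, the response is delayed rather than served from a stale page version.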
Q: How long may the Page Server wait?
A: Not too long, hopefully. If a page is evicted, it probably was not used for


@@ -1,5 +1,5 @@
//
// Main entry point for the safekeeper executable
// Main entry point for the wal_acceptor executable
//
use anyhow::Result;
use clap::{App, Arg};
@@ -20,14 +20,14 @@ use walkeeper::WalAcceptorConf;
fn main() -> Result<()> {
zenith_metrics::set_common_metrics_prefix("safekeeper");
let arg_matches = App::new("Zenith safekeeper")
let arg_matches = App::new("Zenith wal_acceptor")
.about("Store WAL stream to local file system and push it to WAL receivers")
.arg(
Arg::with_name("datadir")
.short("D")
.long("dir")
.takes_value(true)
.help("Path to the safekeeper data directory"),
.help("Path to the WAL acceptor data directory"),
)
.arg(
Arg::with_name("listen-pg")
@@ -128,20 +128,14 @@ fn main() -> Result<()> {
}
fn start_wal_acceptor(conf: WalAcceptorConf) -> Result<()> {
let log_filename = conf.data_dir.join("safekeeper.log");
let log_file = logging::init(log_filename, conf.daemonize)?;
let log_filename = conf.data_dir.join("wal_acceptor.log");
let (_scope_guard, log_file) = logging::init(log_filename, conf.daemonize)?;
let http_listener = TcpListener::bind(conf.listen_http_addr.clone()).map_err(|e| {
error!("failed to bind to address {}: {}", conf.listen_http_addr, e);
e
})?;
info!("Starting safekeeper on {}", conf.listen_pg_addr);
let pg_listener = TcpListener::bind(conf.listen_pg_addr.clone()).map_err(|e| {
error!("failed to bind to address {}: {}", conf.listen_pg_addr, e);
e
})?;
if conf.daemonize {
info!("daemonizing...");
@@ -151,7 +145,7 @@ fn start_wal_acceptor(conf: WalAcceptorConf) -> Result<()> {
let stderr = log_file;
let daemonize = Daemonize::new()
.pid_file("safekeeper.pid")
.pid_file("wal_acceptor.pid")
.working_directory(Path::new("."))
.stdout(stdout)
.stderr(stderr);
@@ -166,7 +160,7 @@ fn start_wal_acceptor(conf: WalAcceptorConf) -> Result<()> {
let http_endpoint_thread = thread::Builder::new()
.name("http_endpoint_thread".into())
.spawn(|| {
.spawn(move || {
// No authentication at all: read-only metrics only, early stage.
let router = endpoint::make_router();
endpoint::serve_thread_main(router, http_listener).unwrap();
@@ -190,7 +184,7 @@ fn start_wal_acceptor(conf: WalAcceptorConf) -> Result<()> {
.name("WAL acceptor thread".into())
.spawn(|| {
// thread code
let thread_result = wal_service::thread_main(conf, pg_listener);
let thread_result = wal_service::thread_main(conf);
if let Err(e) = thread_result {
info!("wal_service thread terminated: {}", e);
}


@@ -20,6 +20,7 @@ use crate::timeline::CreateControlFile;
/// Handler for streaming WAL from acceptor
pub struct SendWalHandler {
/// wal acceptor configuration
pub conf: WalAcceptorConf,
/// assigned application name
pub appname: Option<String>,


@@ -112,7 +112,7 @@ impl SharedState {
}
match opts.open(&control_file_path) {
Ok(mut file) => {
// Lock file to prevent two or more active safekeepers
// Lock file to prevent two or more active wal_acceptors
match file.try_lock_exclusive() {
Ok(()) => {}
Err(e) => {


@@ -12,7 +12,13 @@ use crate::WalAcceptorConf;
use zenith_utils::postgres_backend::{AuthType, PostgresBackend};
/// Accept incoming TCP connections and spawn them into a background thread.
pub fn thread_main(conf: WalAcceptorConf, listener: TcpListener) -> Result<()> {
pub fn thread_main(conf: WalAcceptorConf) -> Result<()> {
info!("Starting wal acceptor on {}", conf.listen_pg_addr);
let listener = TcpListener::bind(conf.listen_pg_addr.clone()).map_err(|e| {
error!("failed to bind to address {}: {}", conf.listen_pg_addr, e);
e
})?;
loop {
match listener.accept() {
Ok((socket, peer_addr)) => {
@@ -35,8 +41,8 @@ fn handle_socket(socket: TcpStream, conf: WalAcceptorConf) -> Result<()> {
socket.set_nodelay(true)?;
let mut conn_handler = SendWalHandler::new(conf);
let pgbackend = PostgresBackend::new(socket, AuthType::Trust, None, false)?;
// libpq replication protocol between safekeeper and replicas/pagers
let pgbackend = PostgresBackend::new(socket, AuthType::Trust, None)?;
// libpq replication protocol between wal_acceptor and replicas/pagers
pgbackend.run(&mut conn_handler)?;
Ok(())


@@ -88,12 +88,7 @@ fn main() -> Result<()> {
)
.subcommand(SubCommand::with_name("status"))
.subcommand(SubCommand::with_name("start").about("Start local pageserver"))
.subcommand(SubCommand::with_name("stop").about("Stop local pageserver")
.arg(Arg::with_name("immediate")
.help("Don't flush repository data at shutdown")
.required(false)
)
)
.subcommand(SubCommand::with_name("stop").about("Stop local pageserver"))
.subcommand(SubCommand::with_name("restart").about("Restart local pageserver"))
.subcommand(
SubCommand::with_name("pg")
@@ -201,12 +196,10 @@ fn main() -> Result<()> {
}
}
("stop", Some(stop_match)) => {
("stop", Some(_sub_m)) => {
let pageserver = PageServerNode::from_env(&env);
let immediate = stop_match.is_present("immediate");
if let Err(e) = pageserver.stop(immediate) {
if let Err(e) = pageserver.stop() {
eprintln!("pageserver stop failed: {}", e);
exit(1);
}
@@ -215,8 +208,7 @@ fn main() -> Result<()> {
("restart", Some(_sub_m)) => {
let pageserver = PageServerNode::from_env(&env);
//TODO what shutdown strategy should we use here?
if let Err(e) = pageserver.stop(false) {
if let Err(e) = pageserver.stop() {
eprintln!("pageserver stop failed: {}", e);
exit(1);
}


@@ -46,7 +46,7 @@ pub fn set_common_metrics_prefix(prefix: &'static str) {
}
/// Prepends a prefix to a common metric name so they are distinguished between
/// different services, see <https://github.com/zenithdb/zenith/pull/681>
/// different services, see https://github.com/zenithdb/zenith/pull/681
/// A call to set_common_metrics_prefix() is necessary prior to calling this.
pub fn new_common_metric_name(unprefixed_metric_name: &str) -> String {
// Not unwrap() because metrics may be initialized after multiple threads have been started.


@@ -18,9 +18,12 @@ serde = { version = "1.0", features = ["derive"] }
serde_json = "1"
thiserror = "1.0"
tokio = "1.11"
tracing = "0.1"
tracing-log = "0.1"
tracing-subscriber = "0.2"
slog-async = "2.6.0"
slog-stdlog = "4.1.0"
slog-scope = "4.4.0"
slog-term = "2.8.0"
slog = "2.7.0"
zenith_metrics = { path = "../zenith_metrics" }
workspace_hack = { path = "../workspace_hack" }


@@ -12,17 +12,8 @@ use std::net::TcpListener;
use zenith_metrics::{new_common_metric_name, register_int_counter, IntCounter};
use zenith_metrics::{Encoder, TextEncoder};
use std::sync::Mutex;
use tokio::sync::oneshot::Sender;
use super::error::ApiError;
lazy_static! {
/// Channel used to send shutdown signal - wrapped in an Option to allow
/// it to be taken by value (since oneshot channels consume themselves on send)
static ref SHUTDOWN_SENDER: Mutex<Option<Sender<()>>> = Mutex::new(None);
}
lazy_static! {
static ref SERVE_METRICS_COUNT: IntCounter = register_int_counter!(
new_common_metric_name("serve_metrics_count"),
@@ -152,18 +143,11 @@ pub fn check_permission(req: &Request<Body>, tenantid: Option<ZTenantId>) -> Res
}
}
// Send shutdown signal
pub fn shutdown() {
if let Some(tx) = SHUTDOWN_SENDER.lock().unwrap().take() {
let _ = tx.send(());
}
}
pub fn serve_thread_main(
router_builder: RouterBuilder<hyper::Body, ApiError>,
listener: TcpListener,
) -> anyhow::Result<()> {
log::info!("Starting a http endpoint at {}", listener.local_addr()?);
log::info!("Starting a http endoint at {}", listener.local_addr()?);
// Create a Service from the router above to handle incoming requests.
let service = RouterService::new(router_builder.build().map_err(|err| anyhow!(err))?).unwrap();
@@ -175,14 +159,7 @@ pub fn serve_thread_main(
let _guard = runtime.enter();
let (send, recv) = tokio::sync::oneshot::channel::<()>();
*SHUTDOWN_SENDER.lock().unwrap() = Some(send);
let server = Server::from_tcp(listener)?
.serve(service)
.with_graceful_shutdown(async {
recv.await.ok();
});
let server = Server::from_tcp(listener)?.serve(service);
runtime.block_on(server)?;


@@ -1,3 +1,4 @@
use slog::{Drain, Level};
use std::{
fs::{File, OpenOptions},
path::Path,
@@ -5,12 +6,10 @@ use std::{
use anyhow::{Context, Result};
use tracing::subscriber::set_global_default;
use tracing_log::LogTracer;
use tracing_subscriber::fmt;
use tracing_subscriber::{layer::SubscriberExt, EnvFilter, Registry};
pub fn init(log_filename: impl AsRef<Path>, daemonize: bool) -> Result<File> {
pub fn init(
log_filename: impl AsRef<Path>,
daemonize: bool,
) -> Result<(slog_scope::GlobalLoggerGuard, File)> {
// Don't open the same file for output multiple times;
// the different fds could overwrite each other's output.
let log_file = OpenOptions::new()
@@ -19,38 +18,30 @@ pub fn init(log_filename: impl AsRef<Path>, daemonize: bool) -> Result<File> {
.open(&log_filename)
.with_context(|| format!("failed to open {:?}", log_filename.as_ref()))?;
let default_filter_str = "info";
// We fall back to printing all spans at info-level or above if
// the RUST_LOG environment variable is not set.
let env_filter =
EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new(default_filter_str));
// We clone and return the log file in order to allow redirecting daemonized stdout and stderr to it.
// If we do not use daemonization (e.g. in Docker), it is better to log to stdout directly,
// for example to be in line with the `docker logs` command, which expects logs coming from stdout.
//
// TODO: perhaps use a more human-readable format when !daemonize
if daemonize {
let x = log_file.try_clone().unwrap();
let fmt_layer = fmt::layer()
.pretty()
.with_target(false) // don't include event targets
.with_ansi(false) // don't use colors in log file
.with_writer(move || x.try_clone().unwrap());
let subscriber = Registry::default().with(env_filter).with(fmt_layer);
set_global_default(subscriber).expect("Failed to set subscriber");
let guard = if daemonize {
let decorator = slog_term::PlainSyncDecorator::new(log_file.try_clone()?);
let drain = slog_term::FullFormat::new(decorator)
.build()
.filter_level(Level::Info)
.fuse();
let logger = slog::Logger::root(drain, slog::o!());
slog_scope::set_global_logger(logger)
} else {
let fmt_layer = fmt::layer().with_target(false); // don't include event targets
let subscriber = Registry::default().with(env_filter).with(fmt_layer);
let decorator = slog_term::TermDecorator::new().build();
let drain = slog_term::FullFormat::new(decorator)
.build()
.filter_level(Level::Info)
.fuse();
let drain = slog_async::Async::new(drain).chan_size(1000).build().fuse();
let logger = slog::Logger::root(drain, slog::o!());
slog_scope::set_global_logger(logger)
};
set_global_default(subscriber).expect("Failed to set subscriber");
}
// initialise forwarding of std log calls
slog_stdlog::init()?;
// Redirect all `log`'s events to our subscriber
LogTracer::init().expect("Failed to set logger");
Ok(log_file)
Ok((guard, log_file))
}
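// Editor's usage note (grounded in slog-scope semantics): the returned
// GlobalLoggerGuard must be kept alive for the life of the process, because
// dropping it tears down slog-scope's global logger. A typical call site
// therefore binds it to a local, as the safekeeper does:
//
// let (_scope_guard, log_file) = logging::init(log_filename, conf.daemonize)?;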


@@ -13,11 +13,7 @@ use serde::{Deserialize, Serialize};
use std::io::{self, Write};
use std::net::{Shutdown, SocketAddr, TcpStream};
use std::str::FromStr;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
use std::time::Duration;
static PGBACKEND_SHUTDOWN_REQUESTED: AtomicBool = AtomicBool::new(false);
pub trait Handler {
/// Handle single query.
@@ -139,32 +135,13 @@ pub fn query_from_cstring(query_string: Bytes) -> Vec<u8> {
query_string
}
// Helper function for socket read loops
pub fn is_socket_read_timed_out(error: &anyhow::Error) -> bool {
for cause in error.chain() {
if let Some(io_error) = cause.downcast_ref::<io::Error>() {
if io_error.kind() == std::io::ErrorKind::WouldBlock {
return true;
}
}
}
false
}
impl PostgresBackend {
pub fn new(
socket: TcpStream,
auth_type: AuthType,
tls_config: Option<Arc<rustls::ServerConfig>>,
set_read_timeout: bool,
) -> io::Result<Self> {
let peer_addr = socket.peer_addr()?;
if set_read_timeout {
socket
.set_read_timeout(Some(Duration::from_secs(5)))
.unwrap();
}
Ok(Self {
stream: Some(Stream::Bidirectional(BidiStream::from_tcp(socket))),
buf_out: BytesMut::with_capacity(10 * 1024),
@@ -252,26 +229,12 @@ impl PostgresBackend {
let mut unnamed_query_string = Bytes::new();
while !PGBACKEND_SHUTDOWN_REQUESTED.load(Ordering::Relaxed) {
match self.read_message() {
Ok(message) => {
if let Some(msg) = message {
trace!("got message {:?}", msg);
while let Some(msg) = self.read_message()? {
trace!("got message {:?}", msg);
match self.process_message(handler, msg, &mut unnamed_query_string)? {
ProcessMsgResult::Continue => continue,
ProcessMsgResult::Break => break,
}
} else {
break;
}
}
Err(e) => {
// If it is a timeout error, continue the loop
if !is_socket_read_timed_out(&e) {
return Err(e);
}
}
match self.process_message(handler, msg, &mut unnamed_query_string)? {
ProcessMsgResult::Continue => continue,
ProcessMsgResult::Break => break,
}
}
@@ -464,8 +427,3 @@ impl PostgresBackend {
Ok(ProcessMsgResult::Continue)
}
}
// Set the flag to inform connections to cancel
pub fn set_pgbackend_shutdown_requested() {
PGBACKEND_SHUTDOWN_REQUESTED.swap(true, Ordering::Relaxed);
}


@@ -55,7 +55,7 @@ impl<K: Ord, V> VecMap<K, V> {
}
/// Add a key value pair to the map.
/// If `key` is less than or equal to the current maximum key
/// If [`key`] is less than or equal to the current maximum key
/// the pair will not be added and InvalidKey error will be returned.
pub fn append(&mut self, key: K, value: V) -> Result<(), InvalidKey> {
if let Some((last_key, _last_value)) = self.0.last() {
@@ -69,7 +69,7 @@ impl<K: Ord, V> VecMap<K, V> {
}
/// Update the maximum key value pair or add a new key value pair to the map.
/// If `key` is less than the current maximum key no updates or additions
/// If [`key`] is less than the current maximum key no updates or additions
/// will occur and InvalidKey error will be returned.
pub fn append_or_update_last(&mut self, key: K, mut value: V) -> Result<Option<V>, InvalidKey> {
if let Some((last_key, last_value)) = self.0.last_mut() {
@@ -89,8 +89,8 @@ impl<K: Ord, V> VecMap<K, V> {
/// Split the map into two.
///
/// The left map contains everything before `cutoff` (exclusive).
/// Right map contains `cutoff` and everything after (inclusive).
/// The left map contains everything before [`cutoff`] (exclusive).
/// Right map contains [`cutoff`] and everything after (inclusive).
pub fn split_at(&self, cutoff: &K) -> (Self, Self)
where
K: Clone,
@@ -107,9 +107,9 @@ impl<K: Ord, V> VecMap<K, V> {
)
}
/// Move items from `other` to the end of `self`, leaving `other` empty.
/// If any key in `other` is less than or equal to any key in `self`,
/// `InvalidKey` error will be returned and no mutation will occur.
/// Move items from [`other`] to the end of [`self`], leaving [`other`] empty.
/// If any key in [`other`] is less than or equal to any key in [`self`],
/// [`InvalidKey`] error will be returned and no mutation will occur.
pub fn extend(&mut self, other: &mut Self) -> Result<(), InvalidKey> {
let self_last_opt = self.0.last().map(extract_key);
let other_first_opt = other.0.first().map(extract_key);
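// Editor's usage sketch for the VecMap API documented above (assumes an
// empty constructor such as VecMap::default(), which this diff does not show):
//
// let mut map: VecMap<u32, &str> = VecMap::default();
// map.append(1, "one")?;
// map.append(2, "two")?;
// assert!(map.append(2, "dup").is_err()); // keys must strictly increase
// let (left, right) = map.split_at(&2);   // left holds {1}, right holds {2}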


@@ -110,7 +110,7 @@ fn ssl() {
.unwrap();
let tls_config = Some(Arc::new(cfg));
let pgb = PostgresBackend::new(server_sock, AuthType::Trust, tls_config, true).unwrap();
let pgb = PostgresBackend::new(server_sock, AuthType::Trust, tls_config).unwrap();
pgb.run(&mut handler).unwrap();
assert!(handler.got_query);
@@ -150,7 +150,7 @@ fn no_ssl() {
let mut handler = TestHandler;
let pgb = PostgresBackend::new(server_sock, AuthType::Trust, None, true).unwrap();
let pgb = PostgresBackend::new(server_sock, AuthType::Trust, None).unwrap();
pgb.run(&mut handler).unwrap();
client_jh.join().unwrap();
@@ -214,7 +214,7 @@ fn server_forces_ssl() {
.unwrap();
let tls_config = Some(Arc::new(cfg));
let pgb = PostgresBackend::new(server_sock, AuthType::Trust, tls_config, true).unwrap();
let pgb = PostgresBackend::new(server_sock, AuthType::Trust, tls_config).unwrap();
let res = pgb.run(&mut handler).unwrap_err();
assert_eq!("client did not connect with TLS", format!("{}", res));